diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2eee8e6148f7..a2fcaec9c394 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -624,7 +624,8 @@ if (CLANG_ENABLE_BOOTSTRAP)
   set(COMPILER_OPTIONS
     -DCMAKE_CXX_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${CXX_COMPILER}
     -DCMAKE_C_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${C_COMPILER}
-    -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${C_COMPILER})
+    -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${C_COMPILER}
+    -DCMAKE_ASM_COMPILER_ID=Clang)
 
   if(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED)
     add_dependencies(clang-bootstrap-deps llvm-profdata)
diff --git a/cmake/caches/Fuchsia-stage2.cmake b/cmake/caches/Fuchsia-stage2.cmake
index 7ad2d9c60765..03b1e3255352 100644
--- a/cmake/caches/Fuchsia-stage2.cmake
+++ b/cmake/caches/Fuchsia-stage2.cmake
@@ -17,10 +17,7 @@ set(LLVM_ENABLE_LTO ON CACHE BOOL "")
 if(NOT APPLE)
   set(LLVM_ENABLE_LLD ON CACHE BOOL "")
   set(CLANG_DEFAULT_LINKER lld CACHE STRING "")
-endif()
-
-if(APPLE)
-  set(LLDB_CODESIGN_IDENTITY "" CACHE STRING "")
+  set(CLANG_DEFAULT_OBJCOPY llvm-objcopy CACHE STRING "")
 endif()
 
 set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
@@ -83,8 +80,6 @@ set(LLVM_TOOLCHAIN_TOOLS
 set(LLVM_DISTRIBUTION_COMPONENTS
   clang
   lld
-  lldb
-  liblldb
   LTO
   clang-format
   clang-headers
diff --git a/docs/ClangCommandLineReference.rst b/docs/ClangCommandLineReference.rst
index 4d095883a4b7..6a0eab438d4a 100644
--- a/docs/ClangCommandLineReference.rst
+++ b/docs/ClangCommandLineReference.rst
@@ -116,10 +116,18 @@ Output path for the plist report
 .. option:: -bundle\_loader <arg>
 .. program:: clang
 
+.. option:: -cfguard
+
+Emit tables required for Windows Control Flow Guard.
+
 .. option:: -client\_name<arg>
 
 .. option:: -compatibility\_version<arg>
 
+.. option:: --config <arg>
+
+Specifies configuration file
+
 .. option:: --constant-cfstrings
 
 .. option:: -coverage, --coverage
@@ -1175,6 +1183,10 @@ Load the clang builtins module map file.
 
 .. option:: -fcaret-diagnostics, -fno-caret-diagnostics
 
+.. option:: -fcf-protection=<arg>, -fcf-protection (equivalent to -fcf-protection=full)
+
+Instrument control-flow architecture protection. Options: return, branch, full, none.
+
 .. option:: -fclasspath=<arg>, --CLASSPATH <arg>, --CLASSPATH=<arg>, --classpath <arg>, --classpath=<arg>
 
 .. option:: -fcolor-diagnostics, -fno-color-diagnostics
@@ -1545,6 +1557,10 @@ Enable ARC-style weak references in Objective-C
 
 OpenMP target code is compiled as relocatable using the -c flag. For OpenMP targets the code is relocatable by default.
 
+.. option:: -fopenmp-simd, -fno-openmp-simd
+
+Emit OpenMP code only for SIMD-based constructs.
+
 .. option:: -fopenmp-use-tls
 
 .. option:: -fopenmp-version=<arg>
@@ -1748,6 +1764,10 @@ Force the usage of stack protectors for all functions
 
 Use a strong heuristic to apply stack protectors to functions
 
+.. option:: -fstack-size-section, -fno-stack-size-section
+
+Emit section containing metadata on function stack sizes
+
 .. option:: -fstandalone-debug, -fno-limit-debug-info, -fno-standalone-debug
 
 Emit full debug info for all types used by the program
@@ -1998,7 +2018,7 @@ Link stack frames through backchain on System Z
 
 .. option:: -mconsole<arg>
 
-.. option:: -mcpu=<arg>, -mv4 (equivalent to -mcpu=hexagonv4), -mv5 (equivalent to -mcpu=hexagonv5), -mv55 (equivalent to -mcpu=hexagonv55), -mv60 (equivalent to -mcpu=hexagonv60), -mv62 (equivalent to -mcpu=hexagonv62)
+.. option:: -mcpu=<arg>, -mv4 (equivalent to -mcpu=hexagonv4), -mv5 (equivalent to -mcpu=hexagonv5), -mv55 (equivalent to -mcpu=hexagonv55), -mv60 (equivalent to -mcpu=hexagonv60), -mv62 (equivalent to -mcpu=hexagonv62), -mv65 (equivalent to -mcpu=hexagonv65)
 
 .. option:: -mdefault-build-attributes<arg>, -mno-default-build-attributes<arg>
 
@@ -2312,6 +2332,8 @@ WebAssembly
 -----------
 .. option:: -mnontrapping-fptoint, -mno-nontrapping-fptoint
 
+.. option:: -msign-ext, -mno-sign-ext
+
 .. option:: -msimd128, -mno-simd128
 
 X86
@@ -2328,6 +2350,8 @@ X86
 
 .. option:: -mavx2, -mno-avx2
 
+.. option:: -mavx512bitalg, -mno-avx512bitalg
+
 .. option:: -mavx512bw, -mno-avx512bw
 
 .. option:: -mavx512cd, -mno-avx512cd
@@ -2344,8 +2368,12 @@ X86
 
 .. option:: -mavx512vbmi, -mno-avx512vbmi
 
+.. option:: -mavx512vbmi2, -mno-avx512vbmi2
+
 .. option:: -mavx512vl, -mno-avx512vl
 
+.. option:: -mavx512vnni, -mno-avx512vnni
+
 .. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
 
 .. option:: -mbmi, -mno-bmi
@@ -2370,6 +2398,8 @@ X86
 
 .. option:: -mfxsr, -mno-fxsr
 
+.. option:: -mgfni, -mno-gfni
+
 .. option:: -mibt, -mno-ibt
 
 .. option:: -mlwp, -mno-lwp
@@ -2394,10 +2424,16 @@ X86
 
 .. option:: -mprfchw, -mno-prfchw
 
+.. option:: -mrdpid, -mno-rdpid
+
 .. option:: -mrdrnd, -mno-rdrnd
 
 .. option:: -mrdseed, -mno-rdseed
 
+.. option:: -mretpoline, -mno-retpoline
+
+.. option:: -mretpoline-external-thunk, -mno-retpoline-external-thunk
+
 .. option:: -mrtm, -mno-rtm
 
 .. option:: -msgx, -mno-sgx
@@ -2424,6 +2460,10 @@ X86
 
 .. option:: -mtbm, -mno-tbm
 
+.. option:: -mvaes, -mno-vaes
+
+.. option:: -mvpclmulqdq, -mno-vpclmulqdq
+
 .. option:: -mx87, -m80387, -mno-x87
 
 .. option:: -mxop, -mno-xop
diff --git a/docs/ClangFormatStyleOptions.rst b/docs/ClangFormatStyleOptions.rst
index 04267928a92a..ea43f19a3c7b 100644
--- a/docs/ClangFormatStyleOptions.rst
+++ b/docs/ClangFormatStyleOptions.rst
@@ -994,9 +994,9 @@ the configuration (without a prefix: ``Auto``).
 
     .. code-block:: c++
 
-      Constructor()
-          : initializer1(),
-            initializer2()
+    Constructor()
+        : initializer1(),
+          initializer2()
 
   * ``BCIS_BeforeComma`` (in configuration: ``BeforeComma``)
     Break constructor initializers before the colon and commas, and align
@@ -1004,18 +1004,18 @@ the configuration (without a prefix: ``Auto``).
 
     .. code-block:: c++
 
-      Constructor()
-          : initializer1()
-          , initializer2()
+    Constructor()
+        : initializer1()
+        , initializer2()
 
   * ``BCIS_AfterColon`` (in configuration: ``AfterColon``)
     Break constructor initializers after the colon and commas.
 
     .. code-block:: c++
 
-      Constructor() :
-          initializer1(),
-          initializer2()
+    Constructor() :
+        initializer1(),
+        initializer2()
 
 
 
@@ -1201,7 +1201,8 @@ the configuration (without a prefix: ``Auto``).
 
   * ``IBS_Regroup`` (in configuration: ``Regroup``)
     Merge multiple ``#include`` blocks together and sort as one.
-    Then split into groups based on category priority. See ``IncludeCategories``.
+    Then split into groups based on category priority. See
+    ``IncludeCategories``.
 
     .. code-block:: c++
 
@@ -1577,24 +1578,42 @@ the configuration (without a prefix: ``Auto``).
 
 
 **RawStringFormats** (``std::vector<RawStringFormat>``)
-  Raw string delimiters denoting that the raw string contents are
-  code in a particular language and can be reformatted.
+  Defines hints for detecting supported languages code blocks in raw
+  strings.
+
+  A raw string with a matching delimiter or a matching enclosing function
+  name will be reformatted assuming the specified language based on the
+  style for that language defined in the .clang-format file. If no style has
+  been defined in the .clang-format file for the specific language, a
+  predefined style given by 'BasedOnStyle' is used. If 'BasedOnStyle' is not
+  found, the formatting is based on llvm style. A matching delimiter takes
+  precedence over a matching enclosing function name for determining the
+  language of the raw string contents.
+
+  If a canonical delimiter is specified, occurrences of other delimiters for
+  the same language will be updated to the canonical if possible.
 
-  A raw string with a matching delimiter will be reformatted assuming the
-  specified language based on a predefined style given by 'BasedOnStyle'.
-  If 'BasedOnStyle' is not found, the formatting is based on llvm style.
+  There should be at most one specification per language and each delimiter
+  and enclosing function should not occur in multiple specifications.
 
   To configure this in the .clang-format file, use:
 
   .. code-block:: yaml
 
     RawStringFormats:
-      - Delimiter: 'pb'
-        Language:  TextProto
-        BasedOnStyle: llvm
-      - Delimiter: 'proto'
-        Language:  TextProto
-        BasedOnStyle: google
+      - Language: TextProto
+          Delimiters:
+            - 'pb'
+            - 'proto'
+          EnclosingFunctions:
+            - 'PARSE_TEXT_PROTO'
+          BasedOnStyle: google
+      - Language: Cpp
+          Delimiters:
+            - 'cc'
+            - 'cpp'
+          BasedOnStyle: llvm
+          CanonicalDelimiter: 'cc'
 
 **ReflowComments** (``bool``)
   If ``true``, clang-format will attempt to re-flow comments.
diff --git a/docs/InternalsManual.rst b/docs/InternalsManual.rst
index 058c63f0afd6..b23d31468d0e 100644
--- a/docs/InternalsManual.rst
+++ b/docs/InternalsManual.rst
@@ -1823,7 +1823,7 @@ Note that setting this member to 1 will opt out of common attribute semantic
 handling, requiring extra implementation efforts to ensure the attribute
 appertains to the appropriate subject, etc.
 
-If the attribute should not be propagated from from a template declaration to an
+If the attribute should not be propagated from a template declaration to an
 instantiation of the template, set the ``Clone`` member to 0. By default, all
 attributes will be cloned to template instantiations.
 
diff --git a/docs/LanguageExtensions.rst b/docs/LanguageExtensions.rst
index 9b407f31d973..41f161302f2b 100644
--- a/docs/LanguageExtensions.rst
+++ b/docs/LanguageExtensions.rst
@@ -1096,6 +1096,11 @@ The following type trait primitives are supported by Clang:
 * ``__is_constructible`` (MSVC 2013, clang)
 * ``__is_nothrow_constructible`` (MSVC 2013, clang)
 * ``__is_assignable`` (MSVC 2015, clang)
+* ``__reference_binds_to_temporary(T, U)`` (Clang):  Determines whether a
+  reference of type ``T`` bound to an expression of type ``U`` would bind to a
+  materialized temporary object. If ``T`` is not a reference type the result
+  is false. Note this trait will also return false when the initialization of
+  ``T`` from ``U`` is ill-formed.
 
 Blocks
 ======
diff --git a/docs/LibASTMatchersReference.html b/docs/LibASTMatchersReference.html
index ed1ec193d8df..a807672121b2 100644
--- a/docs/LibASTMatchersReference.html
+++ b/docs/LibASTMatchersReference.html
@@ -700,7 +700,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 Given
   switch(a) { case 42: break; default: break; }
 caseStmt()
-  matches 'case 42: break;'.
+  matches 'case 42:'.
 </pre></td></tr>
 
 
@@ -741,7 +741,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('compoundStmt0')"><a name="compoundStmt0Anchor">compoundStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CompoundStmt.html">CompoundStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="compoundStmt0"><pre>Matches compound statements.
 
-Example matches '{}' and '{{}}'in 'for (;;) {{}}'
+Example matches '{}' and '{{}}' in 'for (;;) {{}}'
   for (;;) {{}}
 </pre></td></tr>
 
@@ -1028,7 +1028,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 Given
   switch(a) { case 42: break; default: break; }
 defaultStmt()
-  matches 'default: break;'.
+  matches 'default:'.
 </pre></td></tr>
 
 
@@ -1197,9 +1197,9 @@ <h2 id="decl-matchers">Node Matchers</h2>
 materializeTemporaryExpr() matches 'f()' in these statements
   T u(f());
   g(f());
+  f().func();
 but does not match
   f();
-  f().func();
 </pre></td></tr>
 
 
@@ -1255,10 +1255,9 @@ <h2 id="decl-matchers">Node Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('objcThrowStmt0')"><a name="objcThrowStmt0Anchor">objcThrowStmt</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ObjCAtThrowStmt.html">ObjCAtThrowStmt</a>&gt;...</td></tr>
-<tr><td colspan="4" class="doc" id="objcThrowStmt0"><pre>Matches Objective-C @throw statements.
+<tr><td colspan="4" class="doc" id="objcThrowStmt0"><pre>Matches Objective-C statements.
 
-Example matches @throw
-  @throw obj;
+Example matches @throw obj;
 </pre></td></tr>
 
 
@@ -1370,7 +1369,7 @@ <h2 id="decl-matchers">Node Matchers</h2>
 Given
   switch(a) { case 42: break; default: break; }
 switchCase()
-  matches 'case 42: break;' and 'default: break;'.
+  matches 'case 42:' and 'default:'.
 </pre></td></tr>
 
 
@@ -1927,7 +1926,11 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals2')"><a name="equals2Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals5')"><a name="equals5Anchor">equals</a></td><td>bool Value</td></tr>
+<tr><td colspan="4" class="doc" id="equals5"><pre></pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals2')"><a name="equals2Anchor">equals</a></td><td>const ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals2"><pre>Matches literals that are equal to the given value of type ValueT.
 
 Given
@@ -1953,10 +1956,6 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals5')"><a name="equals5Anchor">equals</a></td><td>bool Value</td></tr>
-<tr><td colspan="4" class="doc" id="equals5"><pre></pre></td></tr>
-
-
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals11')"><a name="equals11Anchor">equals</a></td><td>double Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals11"><pre></pre></td></tr>
 
@@ -2409,7 +2408,11 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals3')"><a name="equals3Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals4')"><a name="equals4Anchor">equals</a></td><td>bool Value</td></tr>
+<tr><td colspan="4" class="doc" id="equals4"><pre></pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals3')"><a name="equals3Anchor">equals</a></td><td>const ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals3"><pre>Matches literals that are equal to the given value of type ValueT.
 
 Given
@@ -2435,10 +2438,6 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals4')"><a name="equals4Anchor">equals</a></td><td>bool Value</td></tr>
-<tr><td colspan="4" class="doc" id="equals4"><pre></pre></td></tr>
-
-
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CharacterLiteral.html">CharacterLiteral</a>&gt;</td><td class="name" onclick="toggle('equals10')"><a name="equals10Anchor">equals</a></td><td>double Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals10"><pre></pre></td></tr>
 
@@ -2645,6 +2644,15 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumDecl.html">EnumDecl</a>&gt;</td><td class="name" onclick="toggle('isScoped0')"><a name="isScoped0Anchor">isScoped</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isScoped0"><pre>Matches C++11 scoped enum declaration.
+
+Example matches Y (matcher = enumDecl(isScoped()))
+enum X {};
+enum class Y {};
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FieldDecl.html">FieldDecl</a>&gt;</td><td class="name" onclick="toggle('hasBitWidth0')"><a name="hasBitWidth0Anchor">hasBitWidth</a></td><td>unsigned Width</td></tr>
 <tr><td colspan="4" class="doc" id="hasBitWidth0"><pre>Matches non-static data members that are bit-fields of the specified
 bit width.
@@ -2673,7 +2681,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;</td><td class="name" onclick="toggle('equals1')"><a name="equals1Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FloatingLiteral.html">FloatingLiteral</a>&gt;</td><td class="name" onclick="toggle('equals1')"><a name="equals1Anchor">equals</a></td><td>const ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals1"><pre>Matches literals that are equal to the given value of type ValueT.
 
 Given
@@ -2741,6 +2749,15 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('hasTrailingReturn0')"><a name="hasTrailingReturn0Anchor">hasTrailingReturn</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="hasTrailingReturn0"><pre>Matches a function declared with a trailing return type.
+
+Example matches Y (matcher = functionDecl(hasTrailingReturn()))
+int X() {}
+auto Y() -&gt; int {}
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isConstexpr1')"><a name="isConstexpr1Anchor">isConstexpr</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isConstexpr1"><pre>Matches constexpr variable and function declarations.
 
@@ -2811,6 +2828,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isExternC0')"><a name="isExternC0Anchor">isExternC</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExternC0"><pre>Matches extern "C" function or variable declarations.
 
@@ -2827,6 +2845,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
   matches the declaration of x and y, but not the declaration of z.
 </pre></td></tr>
 
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isInline1')"><a name="isInline1Anchor">isInline</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isInline1"><pre>Matches function and namespace declarations that are marked with
 the inline keyword.
@@ -2842,6 +2861,20 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isNoReturn0')"><a name="isNoReturn0Anchor">isNoReturn</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isNoReturn0"><pre>Matches FunctionDecls that have a noreturn attribute.
+
+Given
+  void nope();
+  [[noreturn]] void a();
+  __attribute__((noreturn)) void b();
+  struct c { [[noreturn]] c(); };
+functionDecl(isNoReturn())
+  matches all of those except
+  void nope();
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('isNoThrow0')"><a name="isNoThrow0Anchor">isNoThrow</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isNoThrow0"><pre>Matches functions that have a non-throwing exception specification.
 
@@ -2916,11 +2949,11 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
   void j(int i);
   void k(int x, int y, int z, ...);
 functionDecl(parameterCountIs(2))
-  matches void g(int i, int j) {}
+  matches g and h
 functionProtoType(parameterCountIs(2))
-  matches void h(int i, int j)
+  matches g and h
 functionProtoType(parameterCountIs(3))
-  matches void k(int x, int y, int z, ...);
+  matches k
 </pre></td></tr>
 
 
@@ -2966,15 +2999,19 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
   void j(int i);
   void k(int x, int y, int z, ...);
 functionDecl(parameterCountIs(2))
-  matches void g(int i, int j) {}
+  matches g and h
 functionProtoType(parameterCountIs(2))
-  matches void h(int i, int j)
+  matches g and h
 functionProtoType(parameterCountIs(3))
-  matches void k(int x, int y, int z, ...);
+  matches k
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals0')"><a name="equals0Anchor">equals</a></td><td>ValueT  Value</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals6')"><a name="equals6Anchor">equals</a></td><td>bool Value</td></tr>
+<tr><td colspan="4" class="doc" id="equals6"><pre></pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals0')"><a name="equals0Anchor">equals</a></td><td>const ValueT  Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals0"><pre>Matches literals that are equal to the given value of type ValueT.
 
 Given
@@ -3000,10 +3037,6 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals6')"><a name="equals6Anchor">equals</a></td><td>bool Value</td></tr>
-<tr><td colspan="4" class="doc" id="equals6"><pre></pre></td></tr>
-
-
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1IntegerLiteral.html">IntegerLiteral</a>&gt;</td><td class="name" onclick="toggle('equals13')"><a name="equals13Anchor">equals</a></td><td>double Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals13"><pre></pre></td></tr>
 
@@ -3049,7 +3082,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('hasName0')"><a name="hasName0Anchor">hasName</a></td><td>std::string  Name</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1NamedDecl.html">NamedDecl</a>&gt;</td><td class="name" onclick="toggle('hasName0')"><a name="hasName0Anchor">hasName</a></td><td>const std::string  Name</td></tr>
 <tr><td colspan="4" class="doc" id="hasName0"><pre>Matches NamedDecl nodes that have the specified name.
 
 Supports specifying enclosing namespaces or classes by prefixing the name
@@ -3510,7 +3543,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 representation of that integral value in base 10.
 
 Given
-  template&lt;int T&gt; struct A {};
+  template&lt;int T&gt; struct C {};
   C&lt;42&gt; c;
 classTemplateSpecializationDecl(
   hasAnyTemplateArgument(equalsIntegralValue("42")))
@@ -3522,7 +3555,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 <tr><td colspan="4" class="doc" id="isIntegral0"><pre>Matches a TemplateArgument that is an integral value.
 
 Given
-  template&lt;int T&gt; struct A {};
+  template&lt;int T&gt; struct C {};
   C&lt;42&gt; c;
 classTemplateSpecializationDecl(
   hasAnyTemplateArgument(isIntegral()))
@@ -3798,6 +3831,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isExternC1')"><a name="isExternC1Anchor">isExternC</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isExternC1"><pre>Matches extern "C" function or variable declarations.
 
@@ -3814,6 +3848,7 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
   matches the declaration of x and y, but not the declaration of z.
 </pre></td></tr>
 
+
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;</td><td class="name" onclick="toggle('isStaticStorageClass1')"><a name="isStaticStorageClass1Anchor">isStaticStorageClass</a></td><td></td></tr>
 <tr><td colspan="4" class="doc" id="isStaticStorageClass1"><pre>Matches variablefunction declarations that have "static" storage
 class specifier ("static" keyword) written in the source.
@@ -3947,10 +3982,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 <tr><td colspan="4" class="doc" id="forEachDescendant0"><pre>Matches AST nodes that have descendant AST nodes that match the
 provided matcher.
 
-Example matches X, A, B, C
+Example matches X, A, A::X, B, B::C, B::C::X
   (matcher = cxxRecordDecl(forEachDescendant(cxxRecordDecl(hasName("X")))))
-  class X {};  Matches X, because X::X is a class of name X inside X.
-  class A { class X {}; };
+  class X {};
+  class A { class X {}; };  Matches A, because A::X is a class of name
+                            X inside A.
   class B { class C { class X {}; }; };
 
 DescendantT must be an AST base type.
@@ -3973,10 +4009,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 <tr><td colspan="4" class="doc" id="forEach0"><pre>Matches AST nodes that have child AST nodes that match the
 provided matcher.
 
-Example matches X, Y
+Example matches X, Y, Y::X, Z::Y, Z::Y::X
   (matcher = cxxRecordDecl(forEach(cxxRecordDecl(hasName("X")))
-  class X {};  Matches X, because X::X is a class of name X inside X.
-  class Y { class X {}; };
+  class X {};
+  class Y { class X {}; };  Matches Y, because Y::X is a class of name X
+                            inside Y.
   class Z { class Y { class X {}; }; };  Does not match Z.
 
 ChildT must be an AST base type.
@@ -4079,7 +4116,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration15')"><a name="hasDeclaration15Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration15')"><a name="hasDeclaration15Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration15"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4090,8 +4127,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -4214,7 +4260,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasEitherOperand0')"><a name="hasEitherOperand0Anchor">hasEitherOperand</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1BinaryOperator.html">BinaryOperator</a>&gt;</td><td class="name" onclick="toggle('hasEitherOperand0')"><a name="hasEitherOperand0Anchor">hasEitherOperand</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasEitherOperand0"><pre>Matches if either the left hand side or the right hand side of a
 binary operator matches.
 </pre></td></tr>
@@ -4311,7 +4357,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration13')"><a name="hasDeclaration13Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration13"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4322,8 +4368,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -4435,7 +4490,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 Example matches y.x()
   (matcher = cxxMemberCallExpr(on(hasType(cxxRecordDecl(hasName("Y"))))))
   class Y { public: void x(); };
-  void z() { Y y; y.x(); }",
+  void z() { Y y; y.x(); }
 
 FIXME: Overload to allow directly matching types?
 </pre></td></tr>
@@ -4505,7 +4560,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration12')"><a name="hasDeclaration12Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration12"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4516,8 +4571,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -4640,7 +4704,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration14')"><a name="hasDeclaration14Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration14')"><a name="hasDeclaration14Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration14"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4651,8 +4715,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -4676,7 +4749,18 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CastExpr.html">CastExpr</a>&gt;</td><td class="name" onclick="toggle('hasSourceExpression0')"><a name="hasSourceExpression0Anchor">hasSourceExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasSourceExpression0"><pre></pre></td></tr>
+<tr><td colspan="4" class="doc" id="hasSourceExpression0"><pre>Matches if the cast's source expression
+or opaque value's source expression matches the given matcher.
+
+Example 1: matches "a string"
+(matcher = castExpr(hasSourceExpression(cxxConstructExpr())))
+class URL { URL(string); };
+URL url = "a string";
+
+Example 2: matches 'b' (matcher =
+opaqueValueExpr(hasSourceExpression(implicitCastExpr(declRefExpr())))
+int a = b ?: 1;
+</pre></td></tr>
 
 
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyTemplateArgument0')"><a name="hasAnyTemplateArgument0Anchor">hasAnyTemplateArgument</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
@@ -4722,7 +4806,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
   A&lt;bool, int&gt; b;
   A&lt;int, bool&gt; c;
 
-  template&lt;typename T&gt; f() {};
+  template&lt;typename T&gt; void f() {}
   void func() { f&lt;int&gt;(); };
 classTemplateSpecializationDecl(hasTemplateArgument(
     1, refersToType(asString("int"))))
@@ -4781,7 +4865,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration11')"><a name="hasDeclaration11Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration11"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4792,8 +4876,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -4945,7 +5038,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1EnumType.html">EnumType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration10')"><a name="hasDeclaration10Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration10"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -4956,8 +5049,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5221,7 +5323,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
   A&lt;bool, int&gt; b;
   A&lt;int, bool&gt; c;
 
-  template&lt;typename T&gt; f() {};
+  template&lt;typename T&gt; void f() {}
   void func() { f&lt;int&gt;(); };
 classTemplateSpecializationDecl(hasTemplateArgument(
     1, refersToType(asString("int"))))
@@ -5293,7 +5395,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1InjectedClassNameType.html">InjectedClassNameType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration9')"><a name="hasDeclaration9Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration9"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5304,8 +5406,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5317,7 +5428,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1LabelStmt.html">LabelStmt</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration8')"><a name="hasDeclaration8Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration8"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5328,8 +5439,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5341,7 +5461,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1MemberExpr.html">MemberExpr</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration7')"><a name="hasDeclaration7Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration7"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5352,8 +5472,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5522,7 +5651,18 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1OpaqueValueExpr.html">OpaqueValueExpr</a>&gt;</td><td class="name" onclick="toggle('hasSourceExpression1')"><a name="hasSourceExpression1Anchor">hasSourceExpression</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
-<tr><td colspan="4" class="doc" id="hasSourceExpression1"><pre></pre></td></tr>
+<tr><td colspan="4" class="doc" id="hasSourceExpression1"><pre>Matches if the cast's source expression
+or opaque value's source expression matches the given matcher.
+
+Example 1: matches "a string"
+(matcher = castExpr(hasSourceExpression(cxxConstructExpr())))
+class URL { URL(string); };
+URL url = "a string";
+
+Example 2: matches 'b' (matcher =
+opaqueValueExpr(hasSourceExpression(implicitCastExpr(declRefExpr())))
+int a = b ?: 1;
+</pre></td></tr>
 
 
 <tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1OverloadExpr.html">OverloadExpr</a>&gt;</td><td class="name" onclick="toggle('hasAnyDeclaration0')"><a name="hasAnyDeclaration0Anchor">hasAnyDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
@@ -5601,7 +5741,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1QualType.html">QualType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration6')"><a name="hasDeclaration6Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration6"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5612,8 +5752,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5673,7 +5822,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1RecordType.html">RecordType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration5')"><a name="hasDeclaration5Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration5"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5684,8 +5833,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5754,7 +5912,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('alignOfExpr0')"><a name="alignOfExpr0Anchor">alignOfExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('alignOfExpr0')"><a name="alignOfExpr0Anchor">alignOfExpr</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="alignOfExpr0"><pre>Same as unaryExprOrTypeTraitExpr, but only matching
 alignof.
 </pre></td></tr>
@@ -5774,7 +5932,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('sizeOfExpr0')"><a name="sizeOfExpr0Anchor">sizeOfExpr</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('sizeOfExpr0')"><a name="sizeOfExpr0Anchor">sizeOfExpr</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnaryExprOrTypeTraitExpr.html">UnaryExprOrTypeTraitExpr</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="sizeOfExpr0"><pre>Same as unaryExprOrTypeTraitExpr, but only matching
 sizeof.
 </pre></td></tr>
@@ -5816,7 +5974,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TagType.html">TagType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration4')"><a name="hasDeclaration4Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration4"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5827,8 +5985,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5844,8 +6011,8 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 <tr><td colspan="4" class="doc" id="isExpr0"><pre>Matches a sugar TemplateArgument that refers to a certain expression.
 
 Given
-  template&lt;typename T&gt; struct A {};
-  struct B { B* next; };
+  struct B { int next; };
+  template&lt;int(B::*next_ptr)&gt; struct A {};
   A&lt;&amp;B::next&gt; a;
 templateSpecializationType(hasAnyTemplateArgument(
   isExpr(hasDescendant(declRefExpr(to(fieldDecl(hasName("next"))))))))
@@ -5859,11 +6026,11 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 declaration.
 
 Given
-  template&lt;typename T&gt; struct A {};
-  struct B { B* next; };
+  struct B { int next; };
+  template&lt;int(B::*next_ptr)&gt; struct A {};
   A&lt;&amp;B::next&gt; a;
 classTemplateSpecializationDecl(hasAnyTemplateArgument(
-    refersToDeclaration(fieldDecl(hasName("next"))))
+    refersToDeclaration(fieldDecl(hasName("next")))))
   matches the specialization A&lt;&amp;B::next&gt; with fieldDecl(...) matching
     B::next
 </pre></td></tr>
@@ -5873,7 +6040,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 <tr><td colspan="4" class="doc" id="refersToIntegralType0"><pre>Matches a TemplateArgument that referes to an integral type.
 
 Given
-  template&lt;int T&gt; struct A {};
+  template&lt;int T&gt; struct C {};
   C&lt;42&gt; c;
 classTemplateSpecializationDecl(
   hasAnyTemplateArgument(refersToIntegralType(asString("int"))))
@@ -5929,7 +6096,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration3')"><a name="hasDeclaration3Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration3"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5940,8 +6107,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5962,7 +6138,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
   A&lt;bool, int&gt; b;
   A&lt;int, bool&gt; c;
 
-  template&lt;typename T&gt; f() {};
+  template&lt;typename T&gt; void f() {}
   void func() { f&lt;int&gt;(); };
 classTemplateSpecializationDecl(hasTemplateArgument(
     1, refersToType(asString("int"))))
@@ -5973,7 +6149,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TemplateTypeParmType.html">TemplateTypeParmType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration2')"><a name="hasDeclaration2Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration2"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -5984,8 +6160,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -5997,7 +6182,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;T&gt;</td><td class="name" onclick="toggle('findAll0')"><a name="findAll0Anchor">findAll</a></td><td>Matcher&lt;T&gt;  Matcher</td></tr>
+<tr><td>Matcher&lt;T&gt;</td><td class="name" onclick="toggle('findAll0')"><a name="findAll0Anchor">findAll</a></td><td>const Matcher&lt;T&gt;  Matcher</td></tr>
 <tr><td colspan="4" class="doc" id="findAll0"><pre>Matches if the node or any descendant matches.
 
 Generates results for each match.
@@ -6026,7 +6211,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1TypedefType.html">TypedefType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration1')"><a name="hasDeclaration1Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration1"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -6037,8 +6222,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
@@ -6057,7 +6251,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 For example, in:
   class A {};
   using B = A;
-The matcher type(hasUniqualifeidDesugaredType(recordType())) matches
+The matcher type(hasUnqualifeidDesugaredType(recordType())) matches
 both B and A.
 </pre></td></tr>
 
@@ -6081,7 +6275,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1UnresolvedUsingType.html">UnresolvedUsingType</a>&gt;</td><td class="name" onclick="toggle('hasDeclaration0')"><a name="hasDeclaration0Anchor">hasDeclaration</a></td><td>const Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;  InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasDeclaration0"><pre>Matches a node if the declaration associated with that node
 matches the given matcher.
 
@@ -6092,8 +6286,17 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 - for CXXConstructExpr, the declaration of the constructor
 - for CXXNewExpr, the declaration of the operator new
 
-Also usable as Matcher&lt;T&gt; for any T supporting the getDecl() member
-function. e.g. various subtypes of clang::Type and various expressions.
+For type nodes, hasDeclaration will generally match the declaration of the
+sugared type. Given
+  class X {};
+  typedef X Y;
+  Y y;
+in varDecl(hasType(hasDeclaration(decl()))) the decl will match the
+typedefDecl. A common use case is to match the underlying, desugared type.
+This can be achieved by using the hasUnqualifiedDesugaredType matcher:
+  varDecl(hasType(hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(decl())))))
+In this matcher, the decl will match the CXXRecordDecl of class X.
 
 Usable as: Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1AddrLabelExpr.html">AddrLabelExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CallExpr.html">CallExpr</a>&gt;,
   Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1CXXNewExpr.html">CXXNewExpr</a>&gt;, Matcher&lt;<a href="http://clang.llvm.org/doxygen/classclang_1_1DeclRefExpr.html">DeclRefExpr</a>&gt;,
diff --git a/docs/OpenMPSupport.rst b/docs/OpenMPSupport.rst
new file mode 100644
index 000000000000..f34049473f82
--- /dev/null
+++ b/docs/OpenMPSupport.rst
@@ -0,0 +1,68 @@
+.. raw:: html
+
+  <style type="text/css">
+    .none { background-color: #FFCCCC }
+    .partial { background-color: #FFFF99 }
+    .good { background-color: #CCFF99 }
+  </style>
+
+.. role:: none
+.. role:: partial
+.. role:: good
+
+==================
+OpenMP Support
+==================
+
+Clang fully supports OpenMP 3.1 + some elements of OpenMP 4.5. Clang supports offloading to X86_64, AArch64 and PPC64[LE] devices.
+Support for Cuda devices is not ready yet.
+The status of major OpenMP 4.5 features support in Clang.
+
+Standalone directives
+=====================
+
+* #pragma omp [for] simd: :good:`Complete`.
+
+* #pragma omp declare simd: :partial:`Partial`.  We support parsing/semantic
+  analysis + generation of special attributes for X86 target, but still
+  missing the LLVM pass for vectorization.
+
+* #pragma omp taskloop [simd]: :good:`Complete`.
+
+* #pragma omp target [enter|exit] data: :good:`Complete`.
+
+* #pragma omp target update: :good:`Complete`.
+
+* #pragma omp target: :good:`Complete`.
+
+* #pragma omp declare target: :partial:`Partial`.  No full codegen support.
+
+* #pragma omp teams: :good:`Complete`.
+
+* #pragma omp distribute [simd]: :good:`Complete`.
+
+* #pragma omp distribute parallel for [simd]: :good:`Complete`.
+
+Combined directives
+===================
+
+* #pragma omp parallel for simd: :good:`Complete`.
+
+* #pragma omp target parallel: :good:`Complete`.
+
+* #pragma omp target parallel for [simd]: :good:`Complete`.
+
+* #pragma omp target simd: :good:`Complete`.
+
+* #pragma omp target teams: :good:`Complete`.
+
+* #pragma omp teams distribute [simd]: :good:`Complete`.
+
+* #pragma omp target teams distribute [simd]: :good:`Complete`.
+
+* #pragma omp teams distribute parallel for [simd]: :good:`Complete`.
+
+* #pragma omp target teams distribute parallel for [simd]: :good:`Complete`.
+
+Clang does not support any constructs/updates from upcoming OpenMP 5.0 except for `reduction`-based clauses in the `task` and `target`-based directives.
+In addition, the LLVM OpenMP runtime `libomp` supports the OpenMP Tools Interface (OMPT) on x86, x86_64, AArch64, and PPC64 on Linux, Windows, and mac OS.
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 56edfeaa7285..b29cd0627d05 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -1,5 +1,5 @@
 =======================================
-Clang 6.0.0 (In-Progress) Release Notes
+Clang 7.0.0 (In-Progress) Release Notes
 =======================================
 
 .. contents::
@@ -10,7 +10,7 @@ Written by the `LLVM Team <http://llvm.org/>`_
 
 .. warning::
 
-   These are in-progress notes for the upcoming Clang 6 release.
+   These are in-progress notes for the upcoming Clang 7 release.
    Release notes for previous releases can be found on
    `the Download Page <http://releases.llvm.org/download.html>`_.
 
@@ -18,7 +18,7 @@ Introduction
 ============
 
 This document contains the release notes for the Clang C/C++/Objective-C
-frontend, part of the LLVM Compiler Infrastructure, release 6.0.0. Here we
+frontend, part of the LLVM Compiler Infrastructure, release 7.0.0. Here we
 describe the status of Clang in some detail, including major
 improvements from the previous release and new feature work. For the
 general LLVM release notes, see `the LLVM
@@ -35,7 +35,7 @@ main Clang web page, this document applies to the *next* release, not
 the current one. To see the release notes for a specific release, please
 see the `releases page <http://llvm.org/releases/>`_.
 
-What's New in Clang 6.0.0?
+What's New in Clang 7.0.0?
 ==========================
 
 Some of the major new features and improvements to Clang are listed
@@ -51,86 +51,17 @@ Major New Features
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- ``-Wpragma-pack`` is a new warning that warns in the following cases:
-
-  - When a translation unit is missing terminating ``#pragma pack (pop)``
-    directives.
-
-  - When leaving an included file that changes the current alignment value,
-    i.e. when the alignment before ``#include`` is different to the alignment
-    after ``#include``.
-
-  - ``-Wpragma-pack-suspicious-include`` (disabled by default) warns on an
-    ``#include`` when the included file contains structures or unions affected by
-    a non-default alignment that has been specified using a ``#pragma pack``
-    directive prior to the ``#include``.
-
-- ``-Wobjc-messaging-id`` is a new, non-default warning that warns about
-  message sends to unqualified ``id`` in Objective-C. This warning is useful
-  for projects that would like to avoid any potential future compiler
-  errors/warnings, as the system frameworks might add a method with the same
-  selector which could make the message send to ``id`` ambiguous.
-
-- ``-Wtautological-compare`` now warns when comparing an unsigned integer and 0
-  regardless of whether the constant is signed or unsigned."
-
-- ``-Wtautological-compare`` now warns about comparing a signed integer and 0
-  when the signed integer is coerced to an unsigned type for the comparison.
-  ``-Wsign-compare`` was adjusted not to warn in this case.
-
-- ``-Wtautological-constant-compare`` is a new warning that warns on
-  tautological comparisons between integer variable of the type ``T`` and the
-  largest/smallest possible integer constant of that same type.
-
-- For C code, ``-Wsign-compare``, ``-Wsign-conversion``,
-  ``-Wtautological-constant-compare`` and
-  ``-Wtautological-constant-out-of-range-compare`` were adjusted to use the
-  underlying datatype of ``enum``.
-
-- ``-Wnull-pointer-arithmetic`` now warns about performing pointer arithmetic
-  on a null pointer. Such pointer arithmetic has an undefined behavior if the
-  offset is nonzero. It also now warns about arithmetic on a null pointer
-  treated as a cast from integer to pointer (GNU extension).
-
-- ``-Wzero-as-null-pointer-constant`` was adjusted not to warn on null pointer
-  constants that originate from system macros, except ``NULL`` macro.
+- ...
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
 
-- Bitrig OS was merged back into OpenBSD, so Bitrig support has been
-  removed from Clang/LLVM.
-
-- The default value of _MSC_VER was raised from 1800 to 1911, making it
-  compatible with the Visual Studio 2015 and 2017 C++ standard library headers.
-  Users should generally expect this to be regularly raised to match the most
-  recently released version of the Visual C++ compiler.
-
-- clang now defaults to ``.init_array`` if no gcc installation can be found.
-  If a gcc installation is found, it still prefers ``.ctors`` if the found
-  gcc is older than 4.7.0.
-
-- The new builtin preprocessor macros ``__is_target_arch``,
-  ``__is_target_vendor``, ``__is_target_os``, and ``__is_target_environment``
-  can be used to to examine the individual components of the target triple.
+- ...
 
 New Compiler Flags
 ------------------
 
-- --autocomplete was implemented to obtain a list of flags and its arguments. This is used for shell autocompletion.
-
-- The ``-fdouble-square-bracket-attributes`` and corresponding
-  ``-fno-double-square-bracket-attributes`` flags were added to enable or
-  disable [[]] attributes in any language mode. Currently, only a limited
-  number of attributes are supported outside of C++ mode. See the Clang
-  attribute documentation for more information about which attributes are
-  supported for each syntax.
-  
-- Added the ``-std=c17``, ``-std=gnu17``, and ``-std=iso9899:2017`` language
-  mode flags for compatibility with GCC. This enables support for the next
-  version of the C standard, expected to be published by ISO in 2018. The only
-  difference between the ``-std=c17`` and ``-std=c11`` language modes is the
-  value of the ``__STDC_VERSION__`` macro, as C17 is a bug fix release.
+- ...
 
 Deprecated Compiler Flags
 -------------------------
@@ -149,20 +80,12 @@ Clang now supports the ...
 Attribute Changes in Clang
 --------------------------
 
-- Clang now supports the majority of its attributes under both the GNU-style
-  spelling (``__attribute((name))``) and the double square-bracket spelling
-  in the ``clang`` vendor namespace (``[[clang::name]]``). Attributes whose
-  syntax is specified by some other standard (such as CUDA and OpenCL
-  attributes) continue to follow their respective specification.
-  
-- Added the ``__has_c_attribute()`` builtin preprocessor macro which allows
-  users to dynamically detect whether a double square-bracket attribute is
-  supported in C mode. This attribute syntax can be enabled with the
-  ``-fdouble-square-bracket-attributes`` flag.
-  
-- The presence of __attribute__((availability(...))) on a declaration no longer
-  implies default visibility for that declaration on macOS.
-
+- Clang now supports function multiversioning with attribute 'target' on ELF
+  based x86/x86-64 environments by using indirect functions. This implementation
+  has a few minor limitations over the GCC implementation for the sake of AST
+  sanity, however it is otherwise compatible with existing code using this
+  feature for GCC. Consult the documentation for the target attribute for more
+  information.
 - ...
 
 Windows Support
@@ -186,10 +109,7 @@ C11 Feature Support
 C++ Language Changes in Clang
 -----------------------------
 
-- Clang's default C++ dialect is now ``gnu++14`` instead of ``gnu++98``. This
-  means Clang will by default accept code using features from C++14 and
-  conforming GNU extensions. Projects incompatible with C++14 can add
-  ``-std=gnu++98`` to their build settings to restore the previous behaviour.
+- ...
 
 C++1z Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -209,23 +129,12 @@ OpenCL C Language Changes in Clang
 OpenMP Support in Clang
 ----------------------------------
 
-- Added options `-f[no]-openmp-simd` that support code emission only foe OpenMP
-  SIMD-based directives, like `#pragma omp simd`, `#pragma omp parallel for simd`
-  etc. The code is emitted only for simd-based part of the combined directives
-  and clauses.
-
-- Added support for almost all target-based directives except for
-  `#pragma omp target teams distribute parallel for [simd]`. Although, please
-  note that `depend` clauses on target-based directives are not supported yet.
-  Clang supports offloading to X86_64, AArch64 and PPC64[LE] devices.
-
-- Added support for `reduction`-based clauses on `task`-based directives from
-  upcoming OpenMP 5.0.
+- ...
 
 Internal API Changes
 --------------------
 
-These are major API changes that have happened since the 4.0.0 release of
+These are major API changes that have happened since the 6.0.0 release of
 Clang. If upgrading an external codebase that uses Clang as a library,
 this section should help get you past the largest hurdles of upgrading.
 
@@ -234,67 +143,12 @@ this section should help get you past the largest hurdles of upgrading.
 AST Matchers
 ------------
 
-The hasDeclaration matcher now works the same for Type and QualType and only
-ever looks through one level of sugaring in a limited number of cases.
-
-There are two main patterns affected by this:
-
--  qualType(hasDeclaration(recordDecl(...))): previously, we would look through
-   sugar like TypedefType to get at the underlying recordDecl; now, we need
-   to explicitly remove the sugaring:
-   qualType(hasUnqualifiedDesugaredType(hasDeclaration(recordDecl(...))))
-
--  hasType(recordDecl(...)): hasType internally uses hasDeclaration; previously,
-   this matcher used to match for example TypedefTypes of the RecordType, but
-   after the change they don't; to fix, use:
-
-::
-   hasType(hasUnqualifiedDesugaredType(
-       recordType(hasDeclaration(recordDecl(...)))))
-
--  templateSpecializationType(hasDeclaration(classTemplateDecl(...))):
-   previously, we would directly match the underlying ClassTemplateDecl;
-   now, we can explicitly match the ClassTemplateSpecializationDecl, but that
-   requires to explicitly get the ClassTemplateDecl:
-
-::
-   templateSpecializationType(hasDeclaration(
-       classTemplateSpecializationDecl(
-           hasSpecializedTemplate(classTemplateDecl(...)))))
+- ...
 
 clang-format
 ------------
 
-* Option *IndentPPDirectives* added to indent preprocessor directives on
-  conditionals.
-
-  +----------------------+----------------------+
-  | Before               | After                |
-  +======================+======================+
-  |  .. code-block:: c++ | .. code-block:: c++  |
-  |                      |                      |
-  |    #if FOO           |   #if FOO            |
-  |    #if BAR           |   #  if BAR          |
-  |    #include <foo>    |   #    include <foo> |
-  |    #endif            |   #  endif           |
-  |    #endif            |   #endif             |
-  +----------------------+----------------------+
-
-* Option -verbose added to the command line.
-  Shows the list of processed files.
-
-* Option *IncludeBlocks* added to merge and regroup multiple ``#include`` blocks during sorting.
-
-  +-------------------------+-------------------------+-------------------------+
-  | Before (Preserve)       | Merge                   | Regroup                 |
-  +=========================+=========================+=========================+
-  |  .. code-block:: c++    | .. code-block:: c++     | .. code-block:: c++     |
-  |                         |                         |                         |
-  |   #include "b.h"        |   #include "a.h"        |   #include "a.h"        |
-  |                         |   #include "b.h"        |   #include "b.h"        |
-  |   #include "a.b"        |   #include <lib/main.h> |                         |
-  |   #include <lib/main.h> |                         |   #include <lib/main.h> |
-  +-------------------------+-------------------------+-------------------------+
+- ...
 
 libclang
 --------
@@ -305,18 +159,14 @@ libclang
 Static Analyzer
 ---------------
 
-- Static Analyzer can now properly detect and diagnose unary pre-/post-
-  increment/decrement on an uninitialized value.
+- ...
 
 ...
 
 Undefined Behavior Sanitizer (UBSan)
 ------------------------------------
 
-* A minimal runtime is now available. It is suitable for use in production
-  environments, and has a small attack surface. It only provides very basic
-  issue logging and deduplication, and does not support ``-fsanitize=vptr``
-  checking.
+* ...
 
 Core Analysis Improvements
 ==========================
diff --git a/docs/ThinLTO.rst b/docs/ThinLTO.rst
index 14c098b052df..38873f464c29 100644
--- a/docs/ThinLTO.rst
+++ b/docs/ThinLTO.rst
@@ -156,7 +156,7 @@ A policy string is a series of key-value pairs separated by ``:`` characters.
 Possible key-value pairs are:
 
 - ``cache_size=X%``: The maximum size for the cache directory is ``X`` percent
-  of the available space on the the disk. Set to 100 to indicate no limit,
+  of the available space on the disk. Set to 100 to indicate no limit,
   50 to indicate that the cache size will not be left over half the available
   disk space. A value over 100 is invalid. A value of 0 disables the percentage
   size-based pruning. The default is 75%.
diff --git a/docs/Toolchain.rst b/docs/Toolchain.rst
index e727ccdc7c1a..06bde35c3da6 100644
--- a/docs/Toolchain.rst
+++ b/docs/Toolchain.rst
@@ -106,7 +106,7 @@ Assember
 Clang can either use LLVM's integrated assembler or an external system-specific
 tool (for instance, the GNU Assembler on GNU OSes) to produce machine code from
 assembly.
-By default, Clang uses LLVM's integrataed assembler on all targets where it is
+By default, Clang uses LLVM's integrated assembler on all targets where it is
 supported. If you wish to use the system assember instead, use the
 ``-fno-integrated-as`` option.
 
diff --git a/docs/UsersManual.rst b/docs/UsersManual.rst
index 9596fb1cbded..84dfe9fc3d73 100644
--- a/docs/UsersManual.rst
+++ b/docs/UsersManual.rst
@@ -694,6 +694,79 @@ a special character, which is the convention used by GNU Make. The -MV
 option tells Clang to put double-quotes around the entire filename, which
 is the convention used by NMake and Jom.
 
+Configuration files
+-------------------
+
+Configuration files group command-line options and allow all of them to be
+specified just by referencing the configuration file. They may be used, for
+example, to collect options required to tune compilation for particular
+target, such as -L, -I, -l, --sysroot, codegen options, etc.
+
+The command line option `--config` can be used to specify configuration
+file in a Clang invocation. For example:
+
+::
+
+    clang --config /home/user/cfgs/testing.txt
+    clang --config debug.cfg
+
+If the provided argument contains a directory separator, it is considered as
+a file path, and options are read from that file. Otherwise the argument is
+treated as a file name and is searched for sequentially in the directories:
+
+    - user directory,
+    - system directory,
+    - the directory where Clang executable resides.
+
+Both user and system directories for configuration files are specified during
+clang build using CMake parameters, CLANG_CONFIG_FILE_USER_DIR and
+CLANG_CONFIG_FILE_SYSTEM_DIR respectively. The first file found is used. It is
+an error if the required file cannot be found.
+
+Another way to specify a configuration file is to encode it in executable name.
+For example, if the Clang executable is named `armv7l-clang` (it may be a
+symbolic link to `clang`), then Clang will search for file `armv7l.cfg` in the
+directory where Clang resides.
+
+If a driver mode is specified in invocation, Clang tries to find a file specific
+for the specified mode. For example, if the executable file is named
+`x86_64-clang-cl`, Clang first looks for `x86_64-cl.cfg` and if it is not found,
+looks for `x86_64.cfg`.
+
+If the command line contains options that effectively change target architecture
+(these are -m32, -EL, and some others) and the configuration file starts with an
+architecture name, Clang tries to load the configuration file for the effective
+architecture. For example, invocation:
+
+::
+
+    x86_64-clang -m32 abc.c
+
+causes Clang search for a file `i368.cfg` first, and if no such file is found,
+Clang looks for the file `x86_64.cfg`.
+
+The configuration file consists of command-line options specified on one or
+more lines. Lines composed of whitespace characters only are ignored as well as
+lines in which the first non-blank character is `#`. Long options may be split
+between several lines by a trailing backslash. Here is example of a
+configuration file:
+
+::
+
+    # Several options on line
+    -c --target=x86_64-unknown-linux-gnu
+
+    # Long option split between lines
+    -I/usr/lib/gcc/x86_64-linux-gnu/5.4.0/../../../../\
+    include/c++/5.4.0
+
+    # other config files may be included
+    @linux.options
+
+Files included by `@file` directives in configuration files are resolved
+relative to the including file. For example, if a configuration file
+`~/.llvm/target.cfg` contains the directive `@os/linux.opts`, the file
+`linux.opts` is searched for in the directory `~/.llvm/os`.
 
 Language and Target-Independent Features
 ========================================
@@ -1002,7 +1075,7 @@ location.
 Building a relocatable precompiled header requires two additional
 arguments. First, pass the ``--relocatable-pch`` flag to indicate that
 the resulting PCH file should be relocatable. Second, pass
-`-isysroot /path/to/build`, which makes all includes for your library
+``-isysroot /path/to/build``, which makes all includes for your library
 relative to the build directory. For example:
 
 .. code-block:: console
@@ -1012,9 +1085,9 @@ relative to the build directory. For example:
 When loading the relocatable PCH file, the various headers used in the
 PCH file are found from the system header root. For example, ``mylib.h``
 can be found in ``/usr/include/mylib.h``. If the headers are installed
-in some other system root, the `-isysroot` option can be used provide
+in some other system root, the ``-isysroot`` option can be used provide
 a different system root from which the headers will be based. For
-example, `-isysroot /Developer/SDKs/MacOSX10.4u.sdk` will look for
+example, ``-isysroot /Developer/SDKs/MacOSX10.4u.sdk`` will look for
 ``mylib.h`` in ``/Developer/SDKs/MacOSX10.4u.sdk/usr/include/mylib.h``.
 
 Relocatable precompiled headers are intended to be used in a limited
@@ -1302,7 +1375,8 @@ Profile Guided Optimization
 Profile information enables better optimization. For example, knowing that a
 branch is taken very frequently helps the compiler make better decisions when
 ordering basic blocks. Knowing that a function ``foo`` is called more
-frequently than another function ``bar`` helps the inliner.
+frequently than another function ``bar`` helps the inliner. Optimization
+levels ``-O2`` and above are recommended for use of profile guided optimization.
 
 Clang supports profile guided optimization with two different kinds of
 profiling. A sampling profiler can generate a profile with very low runtime
@@ -2659,6 +2733,7 @@ Execute ``clang-cl /?`` to see a list of supported options:
       /Gd                     Set __cdecl as a default calling convention
       /GF-                    Disable string pooling
       /GR-                    Disable emission of RTTI data
+      /Gregcall               Set __regcall as a default calling convention
       /GR                     Enable emission of RTTI data
       /Gr                     Set __fastcall as a default calling convention
       /GS-                    Disable buffer security check
@@ -2715,7 +2790,7 @@ Execute ``clang-cl /?`` to see a list of supported options:
       /W2                     Enable -Wall
       /W3                     Enable -Wall
       /W4                     Enable -Wall and -Wextra
-      /Wall                   Enable -Wall and -Wextra
+      /Wall                   Enable -Weverything
       /WX-                    Do not treat warnings as errors
       /WX                     Treat warnings as errors
       /w                      Disable all warnings
@@ -2772,6 +2847,8 @@ Execute ``clang-cl /?`` to see a list of supported options:
                               Disable specified features of coverage instrumentation for Sanitizers
       -fno-sanitize-memory-track-origins
                               Disable origins tracking in MemorySanitizer
+      -fno-sanitize-memory-use-after-dtor
+                              Disable use-after-destroy detection in MemorySanitizer
       -fno-sanitize-recover=<value>
                               Disable recovery for specified sanitizers
       -fno-sanitize-stats     Disable sanitizer statistics gathering.
@@ -2802,6 +2879,8 @@ Execute ``clang-cl /?`` to see a list of supported options:
                               Path to blacklist file for sanitizers
       -fsanitize-cfi-cross-dso
                               Enable control flow integrity (CFI) checks for cross-DSO calls.
+      -fsanitize-cfi-icall-generalize-pointers
+                              Generalize pointers in CFI indirect call type signature checks
       -fsanitize-coverage=<value>
                               Specify the type of coverage instrumentation for Sanitizers
       -fsanitize-memory-track-origins=<value>
@@ -2825,6 +2904,7 @@ Execute ``clang-cl /?`` to see a list of supported options:
       -fsanitize=<check>      Turn on runtime checks for various forms of undefined or suspicious
                               behavior. See user manual for available checks
       -fstandalone-debug      Emit full debug info for all types used by the program
+      -fwhole-program-vtables Enables whole-program vtable optimization. Requires -flto
       -gcodeview              Generate CodeView debug information
       -gline-tables-only      Emit debug line number tables only
       -miamcu                 Use Intel MCU ABI
@@ -2833,6 +2913,7 @@ Execute ``clang-cl /?`` to see a list of supported options:
       -Qunused-arguments      Don't emit warning for unused driver arguments
       -R<remark>              Enable the specified remark
       --target=<value>        Generate code for the given target
+      --version               Print version information
       -v                      Show commands to run and use verbose output
       -W<warning>             Enable the specified warning
       -Xclang <arg>           Pass <arg> to the clang compiler
diff --git a/docs/conf.py b/docs/conf.py
index a12f99ad6d9a..b38c93af23c2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -49,9 +49,9 @@
 # built documents.
 #
 # The short version.
-version = '6'
+version = '7'
 # The full version, including alpha/beta/rc tags.
-release = '6'
+release = '7'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/doxygen.cfg.in b/docs/doxygen.cfg.in
index 13ed72222bea..08f4253d851b 100644
--- a/docs/doxygen.cfg.in
+++ b/docs/doxygen.cfg.in
@@ -284,7 +284,7 @@ MARKDOWN_SUPPORT       = YES
 
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by by putting a % sign in front of the word
+# be prevented in individual cases by putting a % sign in front of the word
 # or globally by setting AUTOLINK_SUPPORT to NO.
 # The default value is: YES.
 
diff --git a/docs/index.rst b/docs/index.rst
index 342ab74d2d80..ed479534ee97 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,7 @@ Using Clang as a Compiler
    SourceBasedCodeCoverage
    Modules
    MSVCCompatibility
+   OpenMPSupport
    ThinLTO
    CommandGuide/index
    FAQ
diff --git a/docs/tools/dump_ast_matchers.py b/docs/tools/dump_ast_matchers.py
index 45540405de97..d38977548fd9 100755
--- a/docs/tools/dump_ast_matchers.py
+++ b/docs/tools/dump_ast_matchers.py
@@ -95,7 +95,7 @@ def strip_doxygen(comment):
 def unify_arguments(args):
   """Gets rid of anything the user doesn't care about in the argument list."""
   args = re.sub(r'internal::', r'', args)
-  args = re.sub(r'const\s+(.*)&', r'\1 ', args)
+  args = re.sub(r'extern const\s+(.*)&', r'\1 ', args)
   args = re.sub(r'&', r' ', args)
   args = re.sub(r'(^|\s)M\d?(\s)', r'\1Matcher<*>\2', args)
   return args
@@ -150,11 +150,11 @@ def act_on_decl(declaration, comment, allowed_types):
                   comment, is_dyncast=True)
       return
 
-    # Parse the various matcher definition macros.
-    m = re.match(""".*AST_TYPE_MATCHER\(
-                       \s*([^\s,]+\s*),
-                       \s*([^\s,]+\s*)
-                     \)\s*;\s*$""", declaration, flags=re.X)
+    # Special case of type matchers:
+    #   AstTypeMatcher<ArgumentType> name
+    m = re.match(r""".*AstTypeMatcher\s*<
+                       \s*([^\s>]+)\s*>
+                       \s*([^\s;]+)\s*;\s*$""", declaration, flags=re.X)
     if m:
       inner, name = m.groups()
       add_matcher('Type', name, 'Matcher<%s>...' % inner,
@@ -165,7 +165,8 @@ def act_on_decl(declaration, comment, allowed_types):
       #             comment, is_dyncast=True)
       return
 
-    m = re.match(""".*AST_TYPE(LOC)?_TRAVERSE_MATCHER\(
+    # Parse the various matcher definition macros.
+    m = re.match(""".*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\(
                        \s*([^\s,]+\s*),
                        \s*(?:[^\s,]+\s*),
                        \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
@@ -236,7 +237,7 @@ def act_on_decl(declaration, comment, allowed_types):
                        (?:,\s*([^\s,]+)\s*
                           ,\s*([^\s,]+)\s*)?
                        (?:,\s*\d+\s*)?
-                      \)\s*{\s*$""", declaration, flags=re.X)
+                      \)\s*{""", declaration, flags=re.X)
     if m:
       p, n, result, name = m.groups()[0:4]
       args = m.groups()[4:]
@@ -256,8 +257,8 @@ def act_on_decl(declaration, comment, allowed_types):
 
     # Parse ArgumentAdapting matchers.
     m = re.match(
-        r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*(?:LLVM_ATTRIBUTE_UNUSED\s*)
-              ([a-zA-Z]*)\s*=\s*{};$""",
+        r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*
+              ([a-zA-Z]*);$""",
         declaration, flags=re.X)
     if m:
       name = m.groups()[0]
@@ -267,7 +268,7 @@ def act_on_decl(declaration, comment, allowed_types):
     # Parse Variadic functions.
     m = re.match(
         r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
-              ([a-zA-Z]*)\s*=\s*{.*};$""",
+              ([a-zA-Z]*);$""",
         declaration, flags=re.X)
     if m:
       result, arg, name = m.groups()[:3]
@@ -276,15 +277,15 @@ def act_on_decl(declaration, comment, allowed_types):
 
     # Parse Variadic operator matchers.
     m = re.match(
-        r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s>]+)\s*>\s*
-              ([a-zA-Z]*)\s*=\s*{.*};$""",
+        r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s*
+              ([a-zA-Z]*);$""",
         declaration, flags=re.X)
     if m:
       min_args, max_args, name = m.groups()[:3]
       if max_args == '1':
         add_matcher('*', name, 'Matcher<*>', comment)
         return
-      elif max_args == 'UINT_MAX':
+      elif max_args == 'std::numeric_limits<unsigned>::max()':
         add_matcher('*', name, 'Matcher<*>, ..., Matcher<*>', comment)
         return
 
diff --git a/examples/analyzer-plugin/MainCallChecker.cpp b/examples/analyzer-plugin/MainCallChecker.cpp
index 7f08760e3d4c..74fe663e981f 100644
--- a/examples/analyzer-plugin/MainCallChecker.cpp
+++ b/examples/analyzer-plugin/MainCallChecker.cpp
@@ -16,10 +16,8 @@ class MainCallChecker : public Checker < check::PreStmt<CallExpr> > {
 } // end anonymous namespace
 
 void MainCallChecker::checkPreStmt(const CallExpr *CE, CheckerContext &C) const {
-  const ProgramStateRef state = C.getState();
-  const LocationContext *LC = C.getLocationContext();
   const Expr *Callee = CE->getCallee();
-  const FunctionDecl *FD = state->getSVal(Callee, LC).getAsFunctionDecl();
+  const FunctionDecl *FD = C.getSVal(Callee).getAsFunctionDecl();
 
   if (!FD)
     return;
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index 587008a7210b..ebbf38975a1e 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -32,7 +32,7 @@
  * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
  */
 #define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 45
+#define CINDEX_VERSION_MINOR 47
 
 #define CINDEX_VERSION_ENCODE(major, minor) ( \
       ((major) * 10000)                       \
@@ -2641,6 +2641,16 @@ CINDEX_LINKAGE enum CXCursorKind clang_getCursorKind(CXCursor);
  */
 CINDEX_LINKAGE unsigned clang_isDeclaration(enum CXCursorKind);
 
+/**
+ * \brief Determine whether the given declaration is invalid.
+ *
+ * A declaration is invalid if it could not be parsed successfully.
+ *
+ * \returns non-zero if the cursor represents a declaration and it is
+ * invalid, otherwise NULL.
+ */
+CINDEX_LINKAGE unsigned clang_isInvalidDeclaration(CXCursor);
+
 /**
  * \brief Determine whether the given cursor kind represents a simple
  * reference.
@@ -4081,6 +4091,89 @@ CINDEX_LINKAGE CXSourceRange clang_Cursor_getSpellingNameRange(CXCursor,
                                                           unsigned pieceIndex,
                                                           unsigned options);
 
+/**
+ * \brief Opaque pointer representing a policy that controls pretty printing
+ * for \c clang_getCursorPrettyPrinted.
+ */
+typedef void *CXPrintingPolicy;
+
+/**
+ * \brief Properties for the printing policy.
+ *
+ * See \c clang::PrintingPolicy for more information.
+ */
+enum CXPrintingPolicyProperty {
+  CXPrintingPolicy_Indentation,
+  CXPrintingPolicy_SuppressSpecifiers,
+  CXPrintingPolicy_SuppressTagKeyword,
+  CXPrintingPolicy_IncludeTagDefinition,
+  CXPrintingPolicy_SuppressScope,
+  CXPrintingPolicy_SuppressUnwrittenScope,
+  CXPrintingPolicy_SuppressInitializers,
+  CXPrintingPolicy_ConstantArraySizeAsWritten,
+  CXPrintingPolicy_AnonymousTagLocations,
+  CXPrintingPolicy_SuppressStrongLifetime,
+  CXPrintingPolicy_SuppressLifetimeQualifiers,
+  CXPrintingPolicy_SuppressTemplateArgsInCXXConstructors,
+  CXPrintingPolicy_Bool,
+  CXPrintingPolicy_Restrict,
+  CXPrintingPolicy_Alignof,
+  CXPrintingPolicy_UnderscoreAlignof,
+  CXPrintingPolicy_UseVoidForZeroParams,
+  CXPrintingPolicy_TerseOutput,
+  CXPrintingPolicy_PolishForDeclaration,
+  CXPrintingPolicy_Half,
+  CXPrintingPolicy_MSWChar,
+  CXPrintingPolicy_IncludeNewlines,
+  CXPrintingPolicy_MSVCFormatting,
+  CXPrintingPolicy_ConstantsAsWritten,
+  CXPrintingPolicy_SuppressImplicitBase,
+  CXPrintingPolicy_FullyQualifiedName,
+
+  CXPrintingPolicy_LastProperty = CXPrintingPolicy_FullyQualifiedName
+};
+
+/**
+ * \brief Get a property value for the given printing policy.
+ */
+CINDEX_LINKAGE unsigned
+clang_PrintingPolicy_getProperty(CXPrintingPolicy Policy,
+                                 enum CXPrintingPolicyProperty Property);
+
+/**
+ * \brief Set a property value for the given printing policy.
+ */
+CINDEX_LINKAGE void clang_PrintingPolicy_setProperty(CXPrintingPolicy Policy,
+                                                     enum CXPrintingPolicyProperty Property,
+                                                     unsigned Value);
+
+/**
+ * \brief Retrieve the default policy for the cursor.
+ *
+ * The policy should be released after use with \c
+ * clang_PrintingPolicy_dispose.
+ */
+CINDEX_LINKAGE CXPrintingPolicy clang_getCursorPrintingPolicy(CXCursor);
+
+/**
+ * \brief Release a printing policy.
+ */
+CINDEX_LINKAGE void clang_PrintingPolicy_dispose(CXPrintingPolicy Policy);
+
+/**
+ * \brief Pretty print declarations.
+ *
+ * \param Cursor The cursor representing a declaration.
+ *
+ * \param Policy The policy to control the entities being printed. If
+ * NULL, a default policy is used.
+ *
+ * \returns The pretty printed declaration or the empty string for
+ * other cursors.
+ */
+CINDEX_LINKAGE CXString clang_getCursorPrettyPrinted(CXCursor Cursor,
+                                                     CXPrintingPolicy Policy);
+
 /**
  * \brief Retrieve the display name for the entity referenced by this cursor.
  *
@@ -5151,7 +5244,14 @@ enum CXCodeComplete_Flags {
    * \brief Whether to include brief documentation within the set of code
    * completions returned.
    */
-  CXCodeComplete_IncludeBriefComments = 0x04
+  CXCodeComplete_IncludeBriefComments = 0x04,
+
+  /**
+   * Whether to speed up completion by omitting top- or namespace-level entities
+   * defined in the preamble. There's no guarantee any particular entity is
+   * omitted. This may be useful if the headers are indexed externally.
+   */
+  CXCodeComplete_SkipPreamble = 0x08
 };
 
 /**
diff --git a/include/clang/AST/ASTContext.h b/include/clang/AST/ASTContext.h
index a5d080035df0..e152223776e5 100644
--- a/include/clang/AST/ASTContext.h
+++ b/include/clang/AST/ASTContext.h
@@ -1162,6 +1162,13 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// \brief Change the result type of a function type once it is deduced.
   void adjustDeducedFunctionResultType(FunctionDecl *FD, QualType ResultType);
 
+  /// Get a function type and produce the equivalent function type with the
+  /// specified exception specification. Type sugar that can be present on a
+  /// declaration of a function with an exception specification is permitted
+  /// and preserved. Other type sugar (for instance, typedefs) is not.
+  QualType getFunctionTypeWithExceptionSpec(
+      QualType Orig, const FunctionProtoType::ExceptionSpecInfo &ESI);
+
   /// \brief Determine whether two function types are the same, ignoring
   /// exception specifications in cases where they're part of the type.
   bool hasSameFunctionTypeIgnoringExceptionSpec(QualType T, QualType U);
@@ -2633,6 +2640,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// it is not used.
   bool DeclMustBeEmitted(const Decl *D);
 
+  /// \brief Visits all versions of a multiversioned function with the passed
+  /// predicate.
+  void forEachMultiversionedFunctionVersion(
+      const FunctionDecl *FD,
+      llvm::function_ref<void(const FunctionDecl *)> Pred) const;
+
   const CXXConstructorDecl *
   getCopyConstructorForExceptionObject(CXXRecordDecl *RD);
 
diff --git a/include/clang/AST/Attr.h b/include/clang/AST/Attr.h
index bbe320c28a3b..a7c766a65ce1 100644
--- a/include/clang/AST/Attr.h
+++ b/include/clang/AST/Attr.h
@@ -52,8 +52,10 @@ class Attr {
   unsigned Inherited : 1;
   unsigned IsPackExpansion : 1;
   unsigned Implicit : 1;
+  // FIXME: These are properties of the attribute kind, not state for this
+  // instance of the attribute.
   unsigned IsLateParsed : 1;
-  unsigned DuplicatesAllowed : 1;
+  unsigned InheritEvenIfAlreadyPresent : 1;
 
   void *operator new(size_t bytes) noexcept {
     llvm_unreachable("Attrs cannot be allocated with regular 'new'.");
@@ -74,10 +76,10 @@ class Attr {
 
 protected:
   Attr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
-       bool IsLateParsed, bool DuplicatesAllowed)
+       bool IsLateParsed)
     : Range(R), AttrKind(AK), SpellingListIndex(SpellingListIndex),
       Inherited(false), IsPackExpansion(false), Implicit(false),
-      IsLateParsed(IsLateParsed), DuplicatesAllowed(DuplicatesAllowed) {}
+      IsLateParsed(IsLateParsed), InheritEvenIfAlreadyPresent(false) {}
 
 public:
 
@@ -109,18 +111,13 @@ class Attr {
 
   // Pretty print this attribute.
   void printPretty(raw_ostream &OS, const PrintingPolicy &Policy) const;
-
-  /// \brief By default, attributes cannot be duplicated when being merged;
-  /// however, an attribute can override this. Returns true if the attribute
-  /// can be duplicated when merging.
-  bool duplicatesAllowed() const { return DuplicatesAllowed; }
 };
 
 class StmtAttr : public Attr {
 protected:
   StmtAttr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
-                  bool IsLateParsed, bool DuplicatesAllowed)
-      : Attr(AK, R, SpellingListIndex, IsLateParsed, DuplicatesAllowed) {}
+                  bool IsLateParsed)
+      : Attr(AK, R, SpellingListIndex, IsLateParsed) {}
 
 public:
   static bool classof(const Attr *A) {
@@ -132,12 +129,20 @@ class StmtAttr : public Attr {
 class InheritableAttr : public Attr {
 protected:
   InheritableAttr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
-                  bool IsLateParsed, bool DuplicatesAllowed)
-      : Attr(AK, R, SpellingListIndex, IsLateParsed, DuplicatesAllowed) {}
+                  bool IsLateParsed, bool InheritEvenIfAlreadyPresent)
+      : Attr(AK, R, SpellingListIndex, IsLateParsed) {
+    this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent;
+  }
 
 public:
   void setInherited(bool I) { Inherited = I; }
 
+  /// Should this attribute be inherited from a prior declaration even if it's
+  /// explicitly provided in the current declaration?
+  bool shouldInheritEvenIfAlreadyPresent() const {
+    return InheritEvenIfAlreadyPresent;
+  }
+
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Attr *A) {
     return A->getKind() >= attr::FirstInheritableAttr &&
@@ -148,9 +153,9 @@ class InheritableAttr : public Attr {
 class InheritableParamAttr : public InheritableAttr {
 protected:
   InheritableParamAttr(attr::Kind AK, SourceRange R, unsigned SpellingListIndex,
-                       bool IsLateParsed, bool DuplicatesAllowed)
+                       bool IsLateParsed, bool InheritEvenIfAlreadyPresent)
       : InheritableAttr(AK, R, SpellingListIndex, IsLateParsed,
-                        DuplicatesAllowed) {}
+                        InheritEvenIfAlreadyPresent) {}
 
 public:
   // Implement isa/cast/dyncast/etc.
@@ -166,9 +171,9 @@ class ParameterABIAttr : public InheritableParamAttr {
 protected:
   ParameterABIAttr(attr::Kind AK, SourceRange R,
                    unsigned SpellingListIndex, bool IsLateParsed,
-                   bool DuplicatesAllowed)
+                   bool InheritEvenIfAlreadyPresent)
     : InheritableParamAttr(AK, R, SpellingListIndex, IsLateParsed,
-                           DuplicatesAllowed) {}
+                           InheritEvenIfAlreadyPresent) {}
 
 public:
   ParameterABI getABI() const {
diff --git a/include/clang/AST/Decl.h b/include/clang/AST/Decl.h
index 04a832e552a4..b2d3ef5d5470 100644
--- a/include/clang/AST/Decl.h
+++ b/include/clang/AST/Decl.h
@@ -83,7 +83,7 @@ class VarTemplateDecl;
 /// TypeLoc TL = TypeSourceInfo->getTypeLoc();
 /// TL.getStartLoc().print(OS, SrcMgr);
 /// @endcode
-class TypeSourceInfo {
+class LLVM_ALIGNAS(8) TypeSourceInfo {
   // Contains a memory block after the class, used for type source information,
   // allocated by ASTContext.
   friend class ASTContext;
@@ -1750,6 +1750,10 @@ class FunctionDecl : public DeclaratorDecl, public DeclContext,
   /// parsing it.
   unsigned WillHaveBody : 1;
 
+  /// Indicates that this function is a multiversioned function using attribute
+  /// 'target'.
+  unsigned IsMultiVersion : 1;
+
 protected:
   /// [C++17] Only used by CXXDeductionGuideDecl. Declared here to avoid
   /// increasing the size of CXXDeductionGuideDecl by the size of an unsigned
@@ -1846,9 +1850,9 @@ class FunctionDecl : public DeclaratorDecl, public DeclContext,
         IsExplicitlyDefaulted(false), HasImplicitReturnZero(false),
         IsLateTemplateParsed(false), IsConstexpr(isConstexprSpecified),
         InstantiationIsPending(false), UsesSEHTry(false), HasSkippedBody(false),
-        WillHaveBody(false), IsCopyDeductionCandidate(false), HasODRHash(false),
-        ODRHash(0), EndRangeLoc(NameInfo.getEndLoc()),
-        DNLoc(NameInfo.getInfo()) {}
+        WillHaveBody(false), IsMultiVersion(false),
+        IsCopyDeductionCandidate(false), HasODRHash(false), ODRHash(0),
+        EndRangeLoc(NameInfo.getEndLoc()), DNLoc(NameInfo.getInfo()) {}
 
   using redeclarable_base = Redeclarable<FunctionDecl>;
 
@@ -2154,6 +2158,15 @@ class FunctionDecl : public DeclaratorDecl, public DeclContext,
   bool willHaveBody() const { return WillHaveBody; }
   void setWillHaveBody(bool V = true) { WillHaveBody = V; }
 
+  /// True if this function is considered a multiversioned function.
+  bool isMultiVersion() const { return getCanonicalDecl()->IsMultiVersion; }
+
+  /// Sets the multiversion state for this declaration and all of its
+  /// redeclarations.
+  void setIsMultiVersion(bool V = true) {
+    getCanonicalDecl()->IsMultiVersion = V;
+  }
+
   void setPreviousDeclaration(FunctionDecl * PrevDecl);
 
   FunctionDecl *getCanonicalDecl() override;
@@ -2800,13 +2813,16 @@ class TypeDecl : public NamedDecl {
 
 /// Base class for declarations which introduce a typedef-name.
 class TypedefNameDecl : public TypeDecl, public Redeclarable<TypedefNameDecl> {
-  using ModedTInfo = std::pair<TypeSourceInfo *, QualType>;
-  llvm::PointerUnion<TypeSourceInfo *, ModedTInfo *> MaybeModedTInfo;
+  struct LLVM_ALIGNAS(8) ModedTInfo {
+    TypeSourceInfo *first;
+    QualType second;
+  };
 
-  // FIXME: This can be packed into the bitfields in Decl.
-  /// If 0, we have not computed IsTransparentTag.
-  /// Otherwise, IsTransparentTag is (CacheIsTransparentTag >> 1).
-  mutable unsigned CacheIsTransparentTag : 2;
+  /// If int part is 0, we have not computed IsTransparentTag.
+  /// Otherwise, IsTransparentTag is (getInt() >> 1).
+  mutable llvm::PointerIntPair<
+      llvm::PointerUnion<TypeSourceInfo *, ModedTInfo *>, 2>
+      MaybeModedTInfo;
 
   void anchor() override;
 
@@ -2815,7 +2831,7 @@ class TypedefNameDecl : public TypeDecl, public Redeclarable<TypedefNameDecl> {
                   SourceLocation StartLoc, SourceLocation IdLoc,
                   IdentifierInfo *Id, TypeSourceInfo *TInfo)
       : TypeDecl(DK, DC, IdLoc, Id, StartLoc), redeclarable_base(C),
-        MaybeModedTInfo(TInfo), CacheIsTransparentTag(0) {}
+        MaybeModedTInfo(TInfo, 0) {}
 
   using redeclarable_base = Redeclarable<TypedefNameDecl>;
 
@@ -2842,26 +2858,29 @@ class TypedefNameDecl : public TypeDecl, public Redeclarable<TypedefNameDecl> {
   using redeclarable_base::getMostRecentDecl;
   using redeclarable_base::isFirstDecl;
 
-  bool isModed() const { return MaybeModedTInfo.is<ModedTInfo*>(); }
+  bool isModed() const {
+    return MaybeModedTInfo.getPointer().is<ModedTInfo *>();
+  }
 
   TypeSourceInfo *getTypeSourceInfo() const {
-    return isModed()
-      ? MaybeModedTInfo.get<ModedTInfo*>()->first
-      : MaybeModedTInfo.get<TypeSourceInfo*>();
+    return isModed() ? MaybeModedTInfo.getPointer().get<ModedTInfo *>()->first
+                     : MaybeModedTInfo.getPointer().get<TypeSourceInfo *>();
   }
 
   QualType getUnderlyingType() const {
-    return isModed()
-      ? MaybeModedTInfo.get<ModedTInfo*>()->second
-      : MaybeModedTInfo.get<TypeSourceInfo*>()->getType();
+    return isModed() ? MaybeModedTInfo.getPointer().get<ModedTInfo *>()->second
+                     : MaybeModedTInfo.getPointer()
+                           .get<TypeSourceInfo *>()
+                           ->getType();
   }
 
   void setTypeSourceInfo(TypeSourceInfo *newType) {
-    MaybeModedTInfo = newType;
+    MaybeModedTInfo.setPointer(newType);
   }
 
   void setModedTypeSourceInfo(TypeSourceInfo *unmodedTSI, QualType modedTy) {
-    MaybeModedTInfo = new (getASTContext()) ModedTInfo(unmodedTSI, modedTy);
+    MaybeModedTInfo.setPointer(new (getASTContext(), 8)
+                                   ModedTInfo({unmodedTSI, modedTy}));
   }
 
   /// Retrieves the canonical declaration of this typedef-name.
@@ -2878,8 +2897,8 @@ class TypedefNameDecl : public TypeDecl, public Redeclarable<TypedefNameDecl> {
   /// Determines if this typedef shares a name and spelling location with its
   /// underlying tag type, as is the case with the NS_ENUM macro.
   bool isTransparentTag() const {
-    if (CacheIsTransparentTag)
-      return CacheIsTransparentTag & 0x2;
+    if (MaybeModedTInfo.getInt())
+      return MaybeModedTInfo.getInt() & 0x2;
     return isTransparentTagSlow();
   }
 
diff --git a/include/clang/AST/DeclBase.h b/include/clang/AST/DeclBase.h
index f93c9f0b9aaa..f17c34e26b2b 100644
--- a/include/clang/AST/DeclBase.h
+++ b/include/clang/AST/DeclBase.h
@@ -1823,7 +1823,9 @@ class DeclContext {
   using lookups_range = llvm::iterator_range<all_lookups_iterator>;
 
   lookups_range lookups() const;
-  lookups_range noload_lookups() const;
+  // Like lookups(), but avoids loading external declarations.
+  // If PreserveInternalState, avoids building lookup data structures too.
+  lookups_range noload_lookups(bool PreserveInternalState) const;
 
   /// \brief Iterators over all possible lookups within this context.
   all_lookups_iterator lookups_begin() const;
@@ -1943,6 +1945,7 @@ class DeclContext {
 
   StoredDeclsMap *CreateStoredDeclsMap(ASTContext &C) const;
 
+  void loadLazyLocalLexicalLookups();
   void buildLookupImpl(DeclContext *DCtx, bool Internal);
   void makeDeclVisibleInContextWithFlags(NamedDecl *D, bool Internal,
                                          bool Rediscoverable);
diff --git a/include/clang/AST/DeclCXX.h b/include/clang/AST/DeclCXX.h
index 88dc9a655917..9f19c060f2c1 100644
--- a/include/clang/AST/DeclCXX.h
+++ b/include/clang/AST/DeclCXX.h
@@ -3129,10 +3129,14 @@ class UsingShadowDecl : public NamedDecl, public Redeclarable<UsingShadowDecl> {
 
   /// \brief Sets the underlying declaration which has been brought into the
   /// local scope.
-  void setTargetDecl(NamedDecl* ND) {
+  void setTargetDecl(NamedDecl *ND) {
     assert(ND && "Target decl is null!");
     Underlying = ND;
-    IdentifierNamespace = ND->getIdentifierNamespace();
+    // A UsingShadowDecl is never a friend or local extern declaration, even
+    // if it is a shadow declaration for one.
+    IdentifierNamespace =
+        ND->getIdentifierNamespace() &
+        ~(IDNS_OrdinaryFriend | IDNS_TagFriend | IDNS_LocalExtern);
   }
 
   /// \brief Gets the using declaration to which this declaration is tied.
diff --git a/include/clang/AST/DeclLookups.h b/include/clang/AST/DeclLookups.h
index 2fff05582563..64eb3f24b370 100644
--- a/include/clang/AST/DeclLookups.h
+++ b/include/clang/AST/DeclLookups.h
@@ -86,16 +86,11 @@ inline DeclContext::lookups_range DeclContext::lookups() const {
   return lookups_range(all_lookups_iterator(), all_lookups_iterator());
 }
 
-inline DeclContext::all_lookups_iterator DeclContext::lookups_begin() const {
-  return lookups().begin();
-}
-
-inline DeclContext::all_lookups_iterator DeclContext::lookups_end() const {
-  return lookups().end();
-}
-
-inline DeclContext::lookups_range DeclContext::noload_lookups() const {
+inline DeclContext::lookups_range
+DeclContext::noload_lookups(bool PreserveInternalState) const {
   DeclContext *Primary = const_cast<DeclContext*>(this)->getPrimaryContext();
+  if (!PreserveInternalState)
+    Primary->loadLazyLocalLexicalLookups();
   if (StoredDeclsMap *Map = Primary->getLookupPtr())
     return lookups_range(all_lookups_iterator(Map->begin(), Map->end()),
                          all_lookups_iterator(Map->end(), Map->end()));
@@ -105,16 +100,6 @@ inline DeclContext::lookups_range DeclContext::noload_lookups() const {
   return lookups_range(all_lookups_iterator(), all_lookups_iterator());
 }
 
-inline
-DeclContext::all_lookups_iterator DeclContext::noload_lookups_begin() const {
-  return noload_lookups().begin();
-}
-
-inline
-DeclContext::all_lookups_iterator DeclContext::noload_lookups_end() const {
-  return noload_lookups().end();
-}
-
 } // namespace clang
 
 #endif // LLVM_CLANG_AST_DECLLOOKUPS_H
diff --git a/include/clang/AST/DeclOpenMP.h b/include/clang/AST/DeclOpenMP.h
index 2a329c3732cb..76fbe01acbe0 100644
--- a/include/clang/AST/DeclOpenMP.h
+++ b/include/clang/AST/DeclOpenMP.h
@@ -190,8 +190,8 @@ class OMPCapturedExprDecl final : public VarDecl {
 
   OMPCapturedExprDecl(ASTContext &C, DeclContext *DC, IdentifierInfo *Id,
                       QualType Type, SourceLocation StartLoc)
-      : VarDecl(OMPCapturedExpr, C, DC, StartLoc, SourceLocation(), Id, Type,
-                nullptr, SC_None) {
+      : VarDecl(OMPCapturedExpr, C, DC, StartLoc, StartLoc, Id, Type, nullptr,
+                SC_None) {
     setImplicit();
   }
 
diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h
index f2770940372f..9d7a29e909f9 100644
--- a/include/clang/AST/Expr.h
+++ b/include/clang/AST/Expr.h
@@ -1720,19 +1720,19 @@ class UnaryOperator : public Expr {
 
 private:
   unsigned Opc : 5;
+  unsigned CanOverflow : 1;
   SourceLocation Loc;
   Stmt *Val;
 public:
-
-  UnaryOperator(Expr *input, Opcode opc, QualType type,
-                ExprValueKind VK, ExprObjectKind OK, SourceLocation l)
-    : Expr(UnaryOperatorClass, type, VK, OK,
-           input->isTypeDependent() || type->isDependentType(),
-           input->isValueDependent(),
-           (input->isInstantiationDependent() ||
-            type->isInstantiationDependentType()),
-           input->containsUnexpandedParameterPack()),
-      Opc(opc), Loc(l), Val(input) {}
+  UnaryOperator(Expr *input, Opcode opc, QualType type, ExprValueKind VK,
+                ExprObjectKind OK, SourceLocation l, bool CanOverflow)
+      : Expr(UnaryOperatorClass, type, VK, OK,
+             input->isTypeDependent() || type->isDependentType(),
+             input->isValueDependent(),
+             (input->isInstantiationDependent() ||
+              type->isInstantiationDependentType()),
+             input->containsUnexpandedParameterPack()),
+        Opc(opc), CanOverflow(CanOverflow), Loc(l), Val(input) {}
 
   /// \brief Build an empty unary operator.
   explicit UnaryOperator(EmptyShell Empty)
@@ -1748,6 +1748,15 @@ class UnaryOperator : public Expr {
   SourceLocation getOperatorLoc() const { return Loc; }
   void setOperatorLoc(SourceLocation L) { Loc = L; }
 
+  /// Returns true if the unary operator can cause an overflow. For instance,
+  ///   signed int i = INT_MAX; i++;
+  ///   signed char c = CHAR_MAX; c++;
+  /// Due to integer promotions, c++ is promoted to an int before the postfix
+  /// increment, and the result is an int that cannot overflow. However, i++
+  /// can overflow.
+  bool canOverflow() const { return CanOverflow; }
+  void setCanOverflow(bool C) { CanOverflow = C; }
+
   /// isPostfix - Return true if this is a postfix operation, like x++.
   static bool isPostfix(Opcode Op) {
     return Op == UO_PostInc || Op == UO_PostDec;
diff --git a/include/clang/AST/ExprCXX.h b/include/clang/AST/ExprCXX.h
index 58054dda31aa..3fb2c6130f86 100644
--- a/include/clang/AST/ExprCXX.h
+++ b/include/clang/AST/ExprCXX.h
@@ -1504,6 +1504,9 @@ class CXXFunctionalCastExpr final
   SourceLocation getRParenLoc() const { return RParenLoc; }
   void setRParenLoc(SourceLocation L) { RParenLoc = L; }
 
+  /// Determine whether this expression models list-initialization.
+  bool isListInitialization() const { return LParenLoc.isInvalid(); }
+
   SourceLocation getLocStart() const LLVM_READONLY;
   SourceLocation getLocEnd() const LLVM_READONLY;
 
diff --git a/include/clang/AST/OpenMPClause.h b/include/clang/AST/OpenMPClause.h
index 1f732f67517a..abcd1cbefcfa 100644
--- a/include/clang/AST/OpenMPClause.h
+++ b/include/clang/AST/OpenMPClause.h
@@ -3192,14 +3192,17 @@ class OMPDeviceClause : public OMPClause, public OMPClauseWithPreInit {
   /// \brief Build 'device' clause.
   ///
   /// \param E Expression associated with this clause.
+  /// \param CaptureRegion Innermost OpenMP region where expressions in this
+  /// clause must be captured.
   /// \param StartLoc Starting location of the clause.
   /// \param LParenLoc Location of '('.
   /// \param EndLoc Ending location of the clause.
-  OMPDeviceClause(Expr *E, Stmt *HelperE, SourceLocation StartLoc,
-                  SourceLocation LParenLoc, SourceLocation EndLoc)
+  OMPDeviceClause(Expr *E, Stmt *HelperE, OpenMPDirectiveKind CaptureRegion,
+                  SourceLocation StartLoc, SourceLocation LParenLoc,
+                  SourceLocation EndLoc)
       : OMPClause(OMPC_device, StartLoc, EndLoc), OMPClauseWithPreInit(this),
         LParenLoc(LParenLoc), Device(E) {
-    setPreInitStmt(HelperE);
+    setPreInitStmt(HelperE, CaptureRegion);
   }
 
   /// \brief Build an empty clause.
@@ -3899,7 +3902,7 @@ class OMPMapClause final : public OMPMappableExprListClause<OMPMapClause>,
                               OpenMPMapClauseKind Type, bool TypeIsImplicit,
                               SourceLocation TypeLoc);
 
-  /// \brief Creates an empty clause with the place for for \a NumVars original
+  /// \brief Creates an empty clause with the place for \a NumVars original
   /// expressions, \a NumUniqueDeclarations declarations, \NumComponentLists
   /// lists, and \a NumComponents expression components.
   ///
diff --git a/include/clang/AST/StmtOpenMP.h b/include/clang/AST/StmtOpenMP.h
index b6fd1a2c28cb..88052097c2f6 100644
--- a/include/clang/AST/StmtOpenMP.h
+++ b/include/clang/AST/StmtOpenMP.h
@@ -222,6 +222,25 @@ class OMPExecutableDirective : public Stmt {
     llvm_unreachable("Incorrect RegionKind specified for directive.");
   }
 
+  /// Get innermost captured statement for the construct.
+  CapturedStmt *getInnermostCapturedStmt() {
+    assert(hasAssociatedStmt() && getAssociatedStmt() &&
+           "Must have associated statement.");
+    SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
+    getOpenMPCaptureRegions(CaptureRegions, getDirectiveKind());
+    assert(!CaptureRegions.empty() &&
+           "At least one captured statement must be provided.");
+    auto *CS = cast<CapturedStmt>(getAssociatedStmt());
+    for (unsigned Level = CaptureRegions.size(); Level > 1; --Level)
+      CS = cast<CapturedStmt>(CS->getCapturedStmt());
+    return CS;
+  }
+
+  const CapturedStmt *getInnermostCapturedStmt() const {
+    return const_cast<OMPExecutableDirective *>(this)
+        ->getInnermostCapturedStmt();
+  }
+
   OpenMPDirectiveKind getDirectiveKind() const { return Kind; }
 
   static bool classof(const Stmt *S) {
@@ -903,9 +922,8 @@ class OMPLoopDirective : public OMPExecutableDirective {
   }
   const Stmt *getBody() const {
     // This relies on the loop form is already checked by Sema.
-    const Stmt *Body = getAssociatedStmt()->IgnoreContainers(true);
-    while(const auto *CS = dyn_cast<CapturedStmt>(Body))
-      Body = CS->getCapturedStmt();
+    const Stmt *Body =
+        getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers();
     Body = cast<ForStmt>(Body)->getBody();
     for (unsigned Cnt = 1; Cnt < CollapsedNum; ++Cnt) {
       Body = Body->IgnoreContainers();
diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h
index 882878bb7e1e..41b2ed445ce4 100644
--- a/include/clang/AST/Type.h
+++ b/include/clang/AST/Type.h
@@ -1728,6 +1728,7 @@ class Type : public ExtQualsTypeCommonBase {
   bool isAnyComplexType() const;   // C99 6.2.5p11 (complex) + Complex Int.
   bool isFloatingType() const;     // C99 6.2.5p11 (real floating + complex)
   bool isHalfType() const;         // OpenCL 6.1.1.1, NEON (IEEE 754-2008 half)
+  bool isFloat16Type() const;      // C11 extension ISO/IEC TS 18661
   bool isRealType() const;         // C99 6.2.5p17 (real floating + integer)
   bool isArithmeticType() const;   // C99 6.2.5p18 (integer + floating)
   bool isVoidType() const;         // C99 6.2.5p19
@@ -6179,6 +6180,12 @@ inline bool Type::isHalfType() const {
   return false;
 }
 
+inline bool Type::isFloat16Type() const {
+  if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
+    return BT->getKind() == BuiltinType::Float16;
+  return false;
+}
+
 inline bool Type::isNullPtrType() const {
   if (const BuiltinType *BT = getAs<BuiltinType>())
     return BT->getKind() == BuiltinType::NullPtr;
diff --git a/include/clang/ASTMatchers/ASTMatchers.h b/include/clang/ASTMatchers/ASTMatchers.h
index 64e7e908d51e..e51ce4175e75 100644
--- a/include/clang/ASTMatchers/ASTMatchers.h
+++ b/include/clang/ASTMatchers/ASTMatchers.h
@@ -800,7 +800,7 @@ AST_MATCHER_P(QualType, ignoringParens,
 ///   A<bool, int> b;
 ///   A<int, bool> c;
 ///
-///   template<typename T> f() {};
+///   template<typename T> void f() {}
 ///   void func() { f<int>(); };
 /// \endcode
 /// classTemplateSpecializationDecl(hasTemplateArgument(
@@ -880,12 +880,12 @@ AST_MATCHER_P(TemplateArgument, refersToTemplate,
 ///
 /// Given
 /// \code
-///   template<typename T> struct A {};
-///   struct B { B* next; };
+///   struct B { int next; };
+///   template<int(B::*next_ptr)> struct A {};
 ///   A<&B::next> a;
 /// \endcode
 /// classTemplateSpecializationDecl(hasAnyTemplateArgument(
-///     refersToDeclaration(fieldDecl(hasName("next"))))
+///     refersToDeclaration(fieldDecl(hasName("next")))))
 ///   matches the specialization \c A<&B::next> with \c fieldDecl(...) matching
 ///     \c B::next
 AST_MATCHER_P(TemplateArgument, refersToDeclaration,
@@ -899,8 +899,8 @@ AST_MATCHER_P(TemplateArgument, refersToDeclaration,
 ///
 /// Given
 /// \code
-///   template<typename T> struct A {};
-///   struct B { B* next; };
+///   struct B { int next; };
+///   template<int(B::*next_ptr)> struct A {};
 ///   A<&B::next> a;
 /// \endcode
 /// templateSpecializationType(hasAnyTemplateArgument(
@@ -917,7 +917,7 @@ AST_MATCHER_P(TemplateArgument, isExpr, internal::Matcher<Expr>, InnerMatcher) {
 ///
 /// Given
 /// \code
-///   template<int T> struct A {};
+///   template<int T> struct C {};
 ///   C<42> c;
 /// \endcode
 /// classTemplateSpecializationDecl(
@@ -932,7 +932,7 @@ AST_MATCHER(TemplateArgument, isIntegral) {
 ///
 /// Given
 /// \code
-///   template<int T> struct A {};
+///   template<int T> struct C {};
 ///   C<42> c;
 /// \endcode
 /// classTemplateSpecializationDecl(
@@ -953,7 +953,7 @@ AST_MATCHER_P(TemplateArgument, refersToIntegralType,
 ///
 /// Given
 /// \code
-///   template<int T> struct A {};
+///   template<int T> struct C {};
 ///   C<42> c;
 /// \endcode
 /// classTemplateSpecializationDecl(
@@ -1523,11 +1523,11 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, CXXBindTemporaryExpr>
 /// \code
 ///   T u(f());
 ///   g(f());
+///   f().func();
 /// \endcode
 /// but does not match
 /// \code
 ///   f();
-///   f().func();
 /// \endcode
 extern const internal::VariadicDynCastAllOfMatcher<Stmt,
                                                    MaterializeTemporaryExpr>
@@ -1799,7 +1799,7 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, SwitchStmt> switchStmt;
 ///   switch(a) { case 42: break; default: break; }
 /// \endcode
 /// switchCase()
-///   matches 'case 42: break;' and 'default: break;'.
+///   matches 'case 42:' and 'default:'.
 extern const internal::VariadicDynCastAllOfMatcher<Stmt, SwitchCase> switchCase;
 
 /// \brief Matches case statements inside switch statements.
@@ -1809,7 +1809,7 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, SwitchCase> switchCase;
 ///   switch(a) { case 42: break; default: break; }
 /// \endcode
 /// caseStmt()
-///   matches 'case 42: break;'.
+///   matches 'case 42:'.
 extern const internal::VariadicDynCastAllOfMatcher<Stmt, CaseStmt> caseStmt;
 
 /// \brief Matches default statements inside switch statements.
@@ -1819,13 +1819,13 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, CaseStmt> caseStmt;
 ///   switch(a) { case 42: break; default: break; }
 /// \endcode
 /// defaultStmt()
-///   matches 'default: break;'.
+///   matches 'default:'.
 extern const internal::VariadicDynCastAllOfMatcher<Stmt, DefaultStmt>
     defaultStmt;
 
 /// \brief Matches compound statements.
 ///
-/// Example matches '{}' and '{{}}'in 'for (;;) {{}}'
+/// Example matches '{}' and '{{}}' in 'for (;;) {{}}'
 /// \code
 ///   for (;;) {{}}
 /// \endcode
@@ -2502,11 +2502,12 @@ extern const internal::ArgumentAdaptingMatcherFunc<
 /// \brief Matches AST nodes that have child AST nodes that match the
 /// provided matcher.
 ///
-/// Example matches X, Y
+/// Example matches X, Y, Y::X, Z::Y, Z::Y::X
 ///   (matcher = cxxRecordDecl(forEach(cxxRecordDecl(hasName("X")))
 /// \code
-///   class X {};  // Matches X, because X::X is a class of name X inside X.
-///   class Y { class X {}; };
+///   class X {};
+///   class Y { class X {}; };  // Matches Y, because Y::X is a class of name X
+///                             // inside Y.
 ///   class Z { class Y { class X {}; }; };  // Does not match Z.
 /// \endcode
 ///
@@ -2522,11 +2523,12 @@ extern const internal::ArgumentAdaptingMatcherFunc<internal::ForEachMatcher>
 /// \brief Matches AST nodes that have descendant AST nodes that match the
 /// provided matcher.
 ///
-/// Example matches X, A, B, C
+/// Example matches X, A, A::X, B, B::C, B::C::X
 ///   (matcher = cxxRecordDecl(forEachDescendant(cxxRecordDecl(hasName("X")))))
 /// \code
-///   class X {};  // Matches X, because X::X is a class of name X inside X.
-///   class A { class X {}; };
+///   class X {};
+///   class A { class X {}; };  // Matches A, because A::X is a class of name
+///                             // X inside A.
 ///   class B { class C { class X {}; }; };
 /// \endcode
 ///
@@ -2681,7 +2683,7 @@ AST_MATCHER_P(NamedDecl, hasUnderlyingDecl, internal::Matcher<NamedDecl>,
 ///   (matcher = cxxMemberCallExpr(on(hasType(cxxRecordDecl(hasName("Y"))))))
 /// \code
 ///   class Y { public: void x(); };
-///   void z() { Y y; y.x(); }",
+///   void z() { Y y; y.x(); }
 /// \endcode
 ///
 /// FIXME: Overload to allow directly matching types?
@@ -3555,11 +3557,11 @@ AST_MATCHER_P(FunctionDecl, hasAnyParameter,
 ///   void k(int x, int y, int z, ...);
 /// \endcode
 /// functionDecl(parameterCountIs(2))
-///   matches void g(int i, int j) {}
+///   matches \c g and \c h
 /// functionProtoType(parameterCountIs(2))
-///   matches void h(int i, int j)
+///   matches \c g and \c h
 /// functionProtoType(parameterCountIs(3))
-///   matches void k(int x, int y, int z, ...);
+///   matches \c k
 AST_POLYMORPHIC_MATCHER_P(parameterCountIs,
                           AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
                                                           FunctionProtoType),
@@ -3567,6 +3569,22 @@ AST_POLYMORPHIC_MATCHER_P(parameterCountIs,
   return Node.getNumParams() == N;
 }
 
+/// \brief Matches \c FunctionDecls that have a noreturn attribute.
+///
+/// Given
+/// \code
+///   void nope();
+///   [[noreturn]] void a();
+///   __attribute__((noreturn)) void b();
+///   struct c { [[noreturn]] c(); };
+/// \endcode
+/// functionDecl(isNoReturn())
+///   matches all of those except
+/// \code
+///   void nope();
+/// \endcode
+AST_MATCHER(FunctionDecl, isNoReturn) { return Node.isNoReturn(); }
+
 /// \brief Matches the return type of a function declaration.
 ///
 /// Given:
@@ -5864,6 +5882,30 @@ AST_MATCHER(CXXRecordDecl, hasDefinition) {
   return Node.hasDefinition();
 }
 
+/// \brief Matches C++11 scoped enum declaration.
+///
+/// Example matches Y (matcher = enumDecl(isScoped()))
+/// \code
+/// enum X {};
+/// enum class Y {};
+/// \endcode
+AST_MATCHER(EnumDecl, isScoped) {
+  return Node.isScoped();
+}
+
+/// \brief Matches a function declared with a trailing return type.
+///
+/// Example matches Y (matcher = functionDecl(hasTrailingReturn()))
+/// \code
+/// int X() {}
+/// auto Y() -> int {}
+/// \endcode
+AST_MATCHER(FunctionDecl, hasTrailingReturn) {
+  if (const auto *F = Node.getType()->getAs<FunctionProtoType>())
+    return F->hasTrailingReturn();
+  return false;
+}
+
 } // namespace ast_matchers
 } // namespace clang
 
diff --git a/include/clang/ASTMatchers/ASTMatchersMacros.h b/include/clang/ASTMatchers/ASTMatchersMacros.h
index 6a48da821a53..19333244f058 100644
--- a/include/clang/ASTMatchers/ASTMatchersMacros.h
+++ b/include/clang/ASTMatchers/ASTMatchersMacros.h
@@ -359,11 +359,6 @@
               ::clang::ast_matchers::internal::BoundNodesTreeBuilder *Builder) \
           const
 
-/// \brief Creates a variadic matcher for both a specific \c Type as well as
-/// the corresponding \c TypeLoc.
-#define AST_TYPE_MATCHER(NodeType, MatcherName)                                \
-  const ::clang::ast_matchers::internal::VariadicDynCastAllOfMatcher<          \
-      Type, NodeType> MatcherName
 // FIXME: add a matcher for TypeLoc derived classes using its custom casting
 // API (no longer dyn_cast) if/when we need such matching
 
diff --git a/include/clang/Analysis/CloneDetection.h b/include/clang/Analysis/CloneDetection.h
index 051b9236658c..5ef44570186b 100644
--- a/include/clang/Analysis/CloneDetection.h
+++ b/include/clang/Analysis/CloneDetection.h
@@ -351,7 +351,7 @@ struct FilenamePatternConstraint {
 /// Analyzes the pattern of the referenced variables in a statement.
 class VariablePattern {
 
-  /// Describes an occurence of a variable reference in a statement.
+  /// Describes an occurrence of a variable reference in a statement.
   struct VariableOccurence {
     /// The index of the associated VarDecl in the Variables vector.
     size_t KindID;
@@ -362,7 +362,7 @@ class VariablePattern {
         : KindID(KindID), Mention(Mention) {}
   };
 
-  /// All occurences of referenced variables in the order of appearance.
+  /// All occurrences of referenced variables in the order of appearance.
   std::vector<VariableOccurence> Occurences;
   /// List of referenced variables in the order of appearance.
   /// Every item in this list is unique.
diff --git a/include/clang/Analysis/ProgramPoint.h b/include/clang/Analysis/ProgramPoint.h
index 2d59dec48a88..8a72399069e8 100644
--- a/include/clang/Analysis/ProgramPoint.h
+++ b/include/clang/Analysis/ProgramPoint.h
@@ -73,8 +73,9 @@ class ProgramPoint {
               PostStoreKind,
               PostConditionKind,
               PostLValueKind,
+              PostAllocatorCallKind,
               MinPostStmtKind = PostStmtKind,
-              MaxPostStmtKind = PostLValueKind,
+              MaxPostStmtKind = PostAllocatorCallKind,
               PostInitializerKind,
               CallEnterKind,
               CallExitBeginKind,
@@ -97,7 +98,7 @@ class ProgramPoint {
   llvm::PointerIntPair<const ProgramPointTag *, 2, unsigned> Tag;
 
 protected:
-  ProgramPoint() {}
+  ProgramPoint() = default;
   ProgramPoint(const void *P,
                Kind k,
                const LocationContext *l,
@@ -233,7 +234,7 @@ class BlockEntrance : public ProgramPoint {
   
 private:
   friend class ProgramPoint;
-  BlockEntrance() {}
+  BlockEntrance() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == BlockEntranceKind;
   }
@@ -254,7 +255,7 @@ class BlockExit : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  BlockExit() {}
+  BlockExit() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == BlockExitKind;
   }
@@ -274,7 +275,7 @@ class StmtPoint : public ProgramPoint {
   const T* getStmtAs() const { return dyn_cast<T>(getStmt()); }
 
 protected:
-  StmtPoint() {}
+  StmtPoint() = default;
 private:
   friend class ProgramPoint;
   static bool isKind(const ProgramPoint &Location) {
@@ -294,7 +295,7 @@ class PreStmt : public StmtPoint {
 
 private:
   friend class ProgramPoint;
-  PreStmt() {}
+  PreStmt() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PreStmtKind;
   }
@@ -302,7 +303,7 @@ class PreStmt : public StmtPoint {
 
 class PostStmt : public StmtPoint {
 protected:
-  PostStmt() {}
+  PostStmt() = default;
   PostStmt(const Stmt *S, const void *data, Kind k, const LocationContext *L,
            const ProgramPointTag *tag = nullptr)
     : StmtPoint(S, data, k, L, tag) {}
@@ -333,7 +334,7 @@ class PostCondition : public PostStmt {
 
 private:
   friend class ProgramPoint;
-  PostCondition() {}
+  PostCondition() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostConditionKind;
   }
@@ -341,7 +342,7 @@ class PostCondition : public PostStmt {
 
 class LocationCheck : public StmtPoint {
 protected:
-  LocationCheck() {}
+  LocationCheck() = default;
   LocationCheck(const Stmt *S, const LocationContext *L,
                 ProgramPoint::Kind K, const ProgramPointTag *tag)
     : StmtPoint(S, nullptr, K, L, tag) {}
@@ -362,7 +363,7 @@ class PreLoad : public LocationCheck {
   
 private:
   friend class ProgramPoint;
-  PreLoad() {}
+  PreLoad() = default;
   static bool isKind(const ProgramPoint &location) {
     return location.getKind() == PreLoadKind;
   }
@@ -376,7 +377,7 @@ class PreStore : public LocationCheck {
   
 private:
   friend class ProgramPoint;
-  PreStore() {}
+  PreStore() = default;
   static bool isKind(const ProgramPoint &location) {
     return location.getKind() == PreStoreKind;
   }
@@ -390,7 +391,7 @@ class PostLoad : public PostStmt {
 
 private:
   friend class ProgramPoint;
-  PostLoad() {}
+  PostLoad() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostLoadKind;
   }
@@ -417,7 +418,7 @@ class PostStore : public PostStmt {
 
 private:
   friend class ProgramPoint;
-  PostStore() {}
+  PostStore() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostStoreKind;
   }
@@ -431,7 +432,7 @@ class PostLValue : public PostStmt {
 
 private:
   friend class ProgramPoint;
-  PostLValue() {}
+  PostLValue() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostLValueKind;
   }
@@ -447,7 +448,7 @@ class PreStmtPurgeDeadSymbols : public StmtPoint {
 
 private:
   friend class ProgramPoint;
-  PreStmtPurgeDeadSymbols() {}
+  PreStmtPurgeDeadSymbols() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PreStmtPurgeDeadSymbolsKind;
   }
@@ -463,7 +464,7 @@ class PostStmtPurgeDeadSymbols : public StmtPoint {
 
 private:
   friend class ProgramPoint;
-  PostStmtPurgeDeadSymbols() {}
+  PostStmtPurgeDeadSymbols() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostStmtPurgeDeadSymbolsKind;
   }
@@ -487,7 +488,7 @@ class BlockEdge : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  BlockEdge() {}
+  BlockEdge() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == BlockEdgeKind;
   }
@@ -516,7 +517,7 @@ class PostInitializer : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  PostInitializer() {}
+  PostInitializer() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostInitializerKind;
   }
@@ -537,7 +538,7 @@ class ImplicitCallPoint : public ProgramPoint {
   }
 
 protected:
-  ImplicitCallPoint() {}
+  ImplicitCallPoint() = default;
 private:
   friend class ProgramPoint;
   static bool isKind(const ProgramPoint &Location) {
@@ -557,7 +558,7 @@ class PreImplicitCall : public ImplicitCallPoint {
 
 private:
   friend class ProgramPoint;
-  PreImplicitCall() {}
+  PreImplicitCall() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PreImplicitCallKind;
   }
@@ -574,12 +575,26 @@ class PostImplicitCall : public ImplicitCallPoint {
 
 private:
   friend class ProgramPoint;
-  PostImplicitCall() {}
+  PostImplicitCall() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == PostImplicitCallKind;
   }
 };
 
+class PostAllocatorCall : public StmtPoint {
+public:
+  PostAllocatorCall(const Stmt *S, const LocationContext *L,
+                    const ProgramPointTag *Tag = nullptr)
+      : StmtPoint(S, nullptr, PostAllocatorCallKind, L, Tag) {}
+
+private:
+  friend class ProgramPoint;
+  PostAllocatorCall() = default;
+  static bool isKind(const ProgramPoint &Location) {
+    return Location.getKind() == PostAllocatorCallKind;
+  }
+};
+
 /// Represents a point when we begin processing an inlined call.
 /// CallEnter uses the caller's location context.
 class CallEnter : public ProgramPoint {
@@ -605,7 +620,7 @@ class CallEnter : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  CallEnter() {}
+  CallEnter() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == CallEnterKind;
   }
@@ -628,7 +643,7 @@ class CallExitBegin : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  CallExitBegin() {}
+  CallExitBegin() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == CallExitBeginKind;
   }
@@ -649,7 +664,7 @@ class CallExitEnd : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  CallExitEnd() {}
+  CallExitEnd() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == CallExitEndKind;
   }
@@ -672,7 +687,7 @@ class LoopExit : public ProgramPoint {
 
 private:
     friend class ProgramPoint;
-    LoopExit() {}
+    LoopExit() = default;
     static bool isKind(const ProgramPoint &Location) {
       return Location.getKind() == LoopExitKind;
     }
@@ -691,7 +706,7 @@ class EpsilonPoint : public ProgramPoint {
 
 private:
   friend class ProgramPoint;
-  EpsilonPoint() {}
+  EpsilonPoint() = default;
   static bool isKind(const ProgramPoint &Location) {
     return Location.getKind() == EpsilonKind;
   }
diff --git a/include/clang/Basic/AddressSpaces.h b/include/clang/Basic/AddressSpaces.h
index 6b0090813e9b..67019f69253f 100644
--- a/include/clang/Basic/AddressSpaces.h
+++ b/include/clang/Basic/AddressSpaces.h
@@ -24,7 +24,7 @@ namespace clang {
 /// of QualType.
 ///
 enum class LangAS : unsigned {
-  // The default value 0 is the value used in QualType for the the situation
+  // The default value 0 is the value used in QualType for the situation
   // where there is no address space qualifier.
   Default = 0,
 
diff --git a/include/clang/Basic/Attr.td b/include/clang/Basic/Attr.td
index 8b84c4b8b50d..5332bde157c9 100644
--- a/include/clang/Basic/Attr.td
+++ b/include/clang/Basic/Attr.td
@@ -231,9 +231,12 @@ class GCC<string name> : Spelling<name, "GCC"> {
   let KnownToGCC = 1;
 }
 
-// The Clang spelling implies GNU<name> and CXX11<"clang", name>. This spelling
-// should be used for any Clang-specific attributes.
-class Clang<string name> : Spelling<name, "Clang">;
+// The Clang spelling implies GNU<name>, CXX11<"clang", name>, and optionally,
+// C2x<"clang", name>. This spelling should be used for any Clang-specific
+// attributes.
+class Clang<string name, bit allowInC = 0> : Spelling<name, "Clang"> {
+  bit AllowInC = allowInC;
+}
 
 class Accessor<string name, list<Spelling> spellings> {
   string Name = name;
@@ -438,9 +441,6 @@ class Attr {
   // Set to true if all of the attribute's arguments should be parsed in an
   // unevaluated context.
   bit ParseArgumentsAsUnevaluated = 0;
-  // Set to true if this attribute can be duplicated on a subject when merging
-  // attributes. By default, attributes are not merged.
-  bit DuplicatesAllowedWhileMerging = 0;
   // Set to true if this attribute meaningful when applied to or inherited 
   // in a class template definition.
   bit MeaningfulToClassTemplateDefinition = 0;
@@ -475,7 +475,11 @@ class TypeAttr : Attr {
 class StmtAttr : Attr;
 
 /// An inheritable attribute is inherited by later redeclarations.
-class InheritableAttr : Attr;
+class InheritableAttr : Attr {
+  // Set to true if this attribute can be duplicated on a subject when inheriting
+  // attributes from prior declarations.
+  bit InheritEvenIfAlreadyPresent = 0;
+}
 
 /// A target-specific attribute.  This class is meant to be used as a mixin
 /// with InheritableAttr or Attr depending on the attribute's needs.
@@ -618,7 +622,7 @@ def AnalyzerNoReturn : InheritableAttr {
 }
 
 def Annotate : InheritableParamAttr {
-  let Spellings = [Clang<"annotate">];
+  let Spellings = [Clang<"annotate", 1>];
   let Args = [StringArgument<"Annotation">];
   // Ensure that the annotate attribute can be used with
   // '#pragma clang attribute' even though it has no subject list.
@@ -706,7 +710,7 @@ static llvm::StringRef canonicalizePlatformName(llvm::StringRef Platform) {
              .Default(Platform);
 } }];
   let HasCustomParsing = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Named]>;
   let Documentation = [AvailabilityDocs];
 }
@@ -1376,7 +1380,7 @@ def NonNull : InheritableParamAttr {
     return false;
   } }];
   // FIXME: We should merge duplicates into a single nonnull attribute.
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Documentation = [NonNullDocs];
 }
 
@@ -1840,12 +1844,27 @@ def Target : InheritableAttr {
       std::vector<std::string> Features;
       StringRef Architecture;
       bool DuplicateArchitecture = false;
+      bool operator ==(const ParsedTargetAttr &Other) const {
+        return DuplicateArchitecture == Other.DuplicateArchitecture &&
+               Architecture == Other.Architecture && Features == Other.Features;
+      }
     };
     ParsedTargetAttr parse() const {
       return parse(getFeaturesStr());
     }
+
+    template<class Compare>
+    ParsedTargetAttr parse(Compare cmp) const {
+      ParsedTargetAttr Attrs = parse();
+      std::sort(std::begin(Attrs.Features), std::end(Attrs.Features), cmp);
+      return Attrs;
+    }
+
+    bool isDefaultVersion() const { return getFeaturesStr() == "default"; }
+
     static ParsedTargetAttr parse(StringRef Features) {
       ParsedTargetAttr Ret;
+      if (Features == "default") return Ret;
       SmallVector<StringRef, 1> AttrFeatures;
       Features.split(AttrFeatures, ",");
 
@@ -1913,7 +1932,7 @@ def DiagnoseIf : InheritableAttr {
                            ["DT_Error", "DT_Warning"]>,
               BoolArgument<"ArgDependent", 0, /*fake*/ 1>,
               NamedArgument<"Parent", 0, /*fake*/ 1>];
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let LateParsed = 1;
   let AdditionalMembers = [{
     bool isError() const { return diagnosticType == DT_Error; }
@@ -2156,7 +2175,7 @@ def AssertCapability : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Args = [VariadicExprArgument<"Args">];
   let Accessors = [Accessor<"isShared",
                     [Clang<"assert_shared_capability">]>];
@@ -2172,7 +2191,7 @@ def AcquireCapability : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Args = [VariadicExprArgument<"Args">];
   let Accessors = [Accessor<"isShared",
                     [Clang<"acquire_shared_capability">,
@@ -2188,7 +2207,7 @@ def TryAcquireCapability : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Args = [ExprArgument<"SuccessValue">, VariadicExprArgument<"Args">];
   let Accessors = [Accessor<"isShared",
                     [Clang<"try_acquire_shared_capability">]>];
@@ -2204,7 +2223,7 @@ def ReleaseCapability : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Args = [VariadicExprArgument<"Args">];
   let Accessors = [Accessor<"isShared",
                     [Clang<"release_shared_capability">]>,
@@ -2223,7 +2242,7 @@ def RequiresCapability : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Accessors = [Accessor<"isShared", [Clang<"requires_shared_capability">,
                                          Clang<"shared_locks_required">]>];
@@ -2242,7 +2261,7 @@ def GuardedBy : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Field, SharedVar]>;
   let Documentation = [Undocumented];
 }
@@ -2253,7 +2272,7 @@ def PtGuardedBy : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Field, SharedVar]>;
   let Documentation = [Undocumented];
 }
@@ -2264,7 +2283,7 @@ def AcquiredAfter : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Field, SharedVar]>;
   let Documentation = [Undocumented];
 }
@@ -2275,7 +2294,7 @@ def AcquiredBefore : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Field, SharedVar]>;
   let Documentation = [Undocumented];
 }
@@ -2286,7 +2305,7 @@ def AssertExclusiveLock : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
@@ -2297,7 +2316,7 @@ def AssertSharedLock : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
@@ -2310,7 +2329,7 @@ def ExclusiveTrylockFunction : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
@@ -2323,7 +2342,7 @@ def SharedTrylockFunction : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
@@ -2344,7 +2363,7 @@ def LocksExcluded : InheritableAttr {
   let LateParsed = 1;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
-  let DuplicatesAllowedWhileMerging = 1;
+  let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Function]>;
   let Documentation = [Undocumented];
 }
diff --git a/include/clang/Basic/AttrDocs.td b/include/clang/Basic/AttrDocs.td
index ecff329c4ccb..7922a699a9a9 100644
--- a/include/clang/Basic/AttrDocs.td
+++ b/include/clang/Basic/AttrDocs.td
@@ -1474,6 +1474,24 @@ for the function.
 Example "subtarget features" from the x86 backend include: "mmx", "sse", "sse4.2",
 "avx", "xop" and largely correspond to the machine specific options handled by
 the front end.
+
+Additionally, this attribute supports function multiversioning for ELF based
+x86/x86-64 targets, which can be used to create multiple implementations of the
+same function that will be resolved at runtime based on the priority of their
+``target`` attribute strings. A function is considered a multiversioned function
+if either two declarations of the function have different ``target`` attribute
+  strings, or if it has a ``target`` attribute string of ``default``.  For
+  example:
+
+  .. code-block:: c++
+  __attribute__((target("arch=atom")))
+  void foo() {} // will be called on 'atom' processors.
+  __attribute__((target("default")))
+  void foo() {} // will be called on any other processors.
+
+All multiversioned functions must contain a ``default`` (fallback)
+implementation, otherwise usages of the function are considered invalid.
+Additionally, a function may not become multiversioned after its first use.
 }];
 }
 
diff --git a/include/clang/Basic/Builtins.def b/include/clang/Basic/Builtins.def
index 3d4deb5ed306..716f014fc3df 100644
--- a/include/clang/Basic/Builtins.def
+++ b/include/clang/Basic/Builtins.def
@@ -48,8 +48,8 @@
 //  . -> "...".  This may only occur at the end of the function list.
 //
 // Types may be prefixed with the following modifiers:
-//  L   -> long (e.g. Li for 'long int')
-//  LL  -> long long
+//  L   -> long (e.g. Li for 'long int', Ld for 'long double')
+//  LL  -> long long (e.g. LLi for 'long long int', LLd for __float128)
 //  LLL -> __int128_t (e.g. LLLi)
 //  W   -> int64_t
 //  N   -> 'int' size if target is LP64, 'L' otherwise.
@@ -110,9 +110,11 @@ BUILTIN(__builtin_abs  , "ii"  , "ncF")
 BUILTIN(__builtin_copysign, "ddd", "ncF")
 BUILTIN(__builtin_copysignf, "fff", "ncF")
 BUILTIN(__builtin_copysignl, "LdLdLd", "ncF")
+BUILTIN(__builtin_copysignf128, "LLdLLdLLd", "ncF")
 BUILTIN(__builtin_fabs , "dd"  , "ncF")
 BUILTIN(__builtin_fabsf, "ff"  , "ncF")
 BUILTIN(__builtin_fabsl, "LdLd", "ncF")
+BUILTIN(__builtin_fabsf128, "LLdLLd", "ncF")
 BUILTIN(__builtin_fmod , "ddd"  , "Fne")
 BUILTIN(__builtin_fmodf, "fff"  , "Fne")
 BUILTIN(__builtin_fmodl, "LdLdLd", "Fne")
@@ -122,9 +124,11 @@ BUILTIN(__builtin_frexpl, "LdLdi*", "Fn")
 BUILTIN(__builtin_huge_val, "d", "nc")
 BUILTIN(__builtin_huge_valf, "f", "nc")
 BUILTIN(__builtin_huge_vall, "Ld", "nc")
+BUILTIN(__builtin_huge_valf128, "LLd", "nc")
 BUILTIN(__builtin_inf  , "d"   , "nc")
 BUILTIN(__builtin_inff , "f"   , "nc")
 BUILTIN(__builtin_infl , "Ld"  , "nc")
+BUILTIN(__builtin_inff128 , "LLd"  , "nc")
 BUILTIN(__builtin_labs , "LiLi"  , "Fnc")
 BUILTIN(__builtin_llabs, "LLiLLi", "Fnc")
 BUILTIN(__builtin_ldexp , "ddi"  , "Fne")
@@ -136,9 +140,11 @@ BUILTIN(__builtin_modfl, "LdLdLd*", "Fn")
 BUILTIN(__builtin_nan,  "dcC*" , "ncF")
 BUILTIN(__builtin_nanf, "fcC*" , "ncF")
 BUILTIN(__builtin_nanl, "LdcC*", "ncF")
+BUILTIN(__builtin_nanf128, "LLdcC*", "ncF")
 BUILTIN(__builtin_nans,  "dcC*" , "ncF")
 BUILTIN(__builtin_nansf, "fcC*" , "ncF")
 BUILTIN(__builtin_nansl, "LdcC*", "ncF")
+BUILTIN(__builtin_nansf128, "LLdcC*", "ncF")
 BUILTIN(__builtin_powi , "ddi"  , "Fnc")
 BUILTIN(__builtin_powif, "ffi"  , "Fnc")
 BUILTIN(__builtin_powil, "LdLdi", "Fnc")
@@ -796,6 +802,10 @@ LANGBUILTIN(__fastfail, "vUi",    "nr", ALL_MS_LANGUAGES)
 LIBBUILTIN(_setjmpex, "iJ", "fj",   "setjmpex.h", ALL_MS_LANGUAGES)
 
 // C99 library functions
+// C99 stdarg.h
+LIBBUILTIN(va_start, "vA.",       "fnt",   "stdarg.h", ALL_LANGUAGES)
+LIBBUILTIN(va_end, "vA",          "fn",    "stdarg.h", ALL_LANGUAGES)
+LIBBUILTIN(va_copy, "vAA",        "fn",    "stdarg.h", ALL_LANGUAGES)
 // C99 stdlib.h
 LIBBUILTIN(abort, "v",            "fr",    "stdlib.h", ALL_LANGUAGES)
 LIBBUILTIN(calloc, "v*zz",        "f",     "stdlib.h", ALL_LANGUAGES)
diff --git a/include/clang/Basic/BuiltinsNEON.def b/include/clang/Basic/BuiltinsNEON.def
index 7800ae69c4c9..241b93a915a9 100644
--- a/include/clang/Basic/BuiltinsNEON.def
+++ b/include/clang/Basic/BuiltinsNEON.def
@@ -16,6 +16,7 @@
 
 #define GET_NEON_BUILTINS
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_BUILTINS
 
 #undef BUILTIN
diff --git a/include/clang/Basic/BuiltinsWebAssembly.def b/include/clang/Basic/BuiltinsWebAssembly.def
index 19318dcebb9e..d6981516d926 100644
--- a/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/include/clang/Basic/BuiltinsWebAssembly.def
@@ -16,8 +16,16 @@
 
 // The format of this database matches clang/Basic/Builtins.def.
 
-// Note that current_memory is not "c" (readnone) because it must be sequenced
+// Query the current memory size, and increase the current memory size.
+// Note that mem.size is not "c" (readnone) because it must be sequenced
 // with respect to grow_memory calls.
+// These are the new proposed names, which aren't yet official. Use at your own
+// risk.
+BUILTIN(__builtin_wasm_mem_size, "zIi", "n")
+BUILTIN(__builtin_wasm_mem_grow, "zIiz", "n")
+
+// These are the existing names, which are currently official, but expected
+// to be deprecated in the future. They also lack the immediate field.
 BUILTIN(__builtin_wasm_current_memory, "z", "n")
 BUILTIN(__builtin_wasm_grow_memory, "zz", "n")
 
diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index 465551be7742..b7a058182c30 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -894,6 +894,9 @@ BUILTIN(__builtin_ia32_rdpmc, "ULLii", "")
 BUILTIN(__builtin_ia32_rdtsc, "ULLi", "")
 BUILTIN(__rdtsc, "ULLi", "")
 BUILTIN(__builtin_ia32_rdtscp, "ULLiUi*", "")
+
+TARGET_BUILTIN(__builtin_ia32_rdpid, "Ui", "", "rdpid")
+
 // PKU
 TARGET_BUILTIN(__builtin_ia32_rdpkru, "Ui", "", "pku")
 TARGET_BUILTIN(__builtin_ia32_wrpkru, "vUi", "", "pku")
@@ -1357,15 +1360,15 @@ TARGET_BUILTIN(__builtin_ia32_vpshrdvw128_maskz, "V8sV8sV8sV8sUc", "", "avx512vl
 TARGET_BUILTIN(__builtin_ia32_vpshrdvw256_maskz, "V16sV16sV16sV16sUs", "", "avx512vl,avx512vbmi2")
 TARGET_BUILTIN(__builtin_ia32_vpshrdvw512_maskz, "V32sV32sV32sV32sUi", "", "avx512vbmi2")
 
-TARGET_BUILTIN(__builtin_ia32_vpshrdd128_mask, "V4iV4iV4iiV4iUc", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd256_mask, "V8iV8iV8iiV8iUc", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd512_mask, "V16iV16iV16iiV16iUs", "", "avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq128_mask, "V2LLiV2LLiV2LLiiV2LLiUc", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq256_mask, "V4LLiV4LLiV4LLiiV4LLiUc", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq512_mask, "V8LLiV8LLiV8LLiiV8LLiUc", "", "avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw128_mask, "V8sV8sV8siV8sUc", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw256_mask, "V16sV16sV16siV16sUs", "", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw512_mask, "V32sV32sV32siV32sUi", "", "avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdd128_mask, "V4iV4iV4iIiV4iUc", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdd256_mask, "V8iV8iV8iIiV8iUc", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdd512_mask, "V16iV16iV16iIiV16iUs", "", "avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdq128_mask, "V2LLiV2LLiV2LLiIiV2LLiUc", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdq256_mask, "V4LLiV4LLiV4LLiIiV4LLiUc", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdq512_mask, "V8LLiV8LLiV8LLiIiV8LLiUc", "", "avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdw128_mask, "V8sV8sV8sIiV8sUc", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdw256_mask, "V16sV16sV16sIiV16sUs", "", "avx512vl,avx512vbmi2")
+TARGET_BUILTIN(__builtin_ia32_vpshrdw512_mask, "V32sV32sV32sIiV32sUi", "", "avx512vbmi2")
 
 TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
diff --git a/include/clang/Basic/CMakeLists.txt b/include/clang/Basic/CMakeLists.txt
index 821c405913c8..be61ffa3bcca 100644
--- a/include/clang/Basic/CMakeLists.txt
+++ b/include/clang/Basic/CMakeLists.txt
@@ -46,3 +46,7 @@ clang_tablegen(arm_neon.inc -gen-arm-neon-sema
   -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
   SOURCE arm_neon.td
   TARGET ClangARMNeon)
+clang_tablegen(arm_fp16.inc -gen-arm-neon-sema
+  -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
+  SOURCE arm_fp16.td
+  TARGET ClangARMFP16)
diff --git a/include/clang/Basic/Cuda.h b/include/clang/Basic/Cuda.h
index 1a0731c37a35..1fa0b5280739 100644
--- a/include/clang/Basic/Cuda.h
+++ b/include/clang/Basic/Cuda.h
@@ -22,7 +22,8 @@ enum class CudaVersion {
   CUDA_75,
   CUDA_80,
   CUDA_90,
-  LATEST = CUDA_90,
+  CUDA_91,
+  LATEST = CUDA_91,
 };
 const char *CudaVersionToString(CudaVersion V);
 
@@ -44,6 +45,7 @@ enum class CudaArch {
   SM_61,
   SM_62,
   SM_70,
+  SM_72,
 };
 const char *CudaArchToString(CudaArch A);
 
@@ -64,6 +66,7 @@ enum class CudaVirtualArch {
   COMPUTE_61,
   COMPUTE_62,
   COMPUTE_70,
+  COMPUTE_72,
 };
 const char *CudaVirtualArchToString(CudaVirtualArch A);
 
diff --git a/include/clang/Basic/Diagnostic.h b/include/clang/Basic/Diagnostic.h
index a7458d45618e..d87d25f25433 100644
--- a/include/clang/Basic/Diagnostic.h
+++ b/include/clang/Basic/Diagnostic.h
@@ -402,7 +402,7 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
 
 public:
   explicit DiagnosticsEngine(IntrusiveRefCntPtr<DiagnosticIDs> Diags,
-                             DiagnosticOptions *DiagOpts,
+                             IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts,
                              DiagnosticConsumer *client = nullptr,
                              bool ShouldOwnClient = true);
   DiagnosticsEngine(const DiagnosticsEngine &) = delete;
diff --git a/include/clang/Basic/DiagnosticCommonKinds.td b/include/clang/Basic/DiagnosticCommonKinds.td
index 82ca27b7345e..33e9867a8f1e 100644
--- a/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/include/clang/Basic/DiagnosticCommonKinds.td
@@ -94,6 +94,9 @@ def remark_module_lock_failure : Remark<
   "could not acquire lock file for module '%0': %1">, InGroup<ModuleBuild>;
 def remark_module_lock_timeout : Remark<
   "timed out waiting to acquire lock file for module '%0'">, InGroup<ModuleBuild>;
+def err_module_shadowed : Error<"import of shadowed module '%0'">, DefaultFatal;
+def err_module_build_shadowed_submodule : Error<
+  "build a shadowed submodule '%0'">, DefaultFatal;
 def err_module_cycle : Error<"cyclic dependency in module '%0': %1">, 
   DefaultFatal;
 def err_module_prebuilt : Error<
@@ -200,6 +203,8 @@ def err_target_unsupported_execute_only : Error<
   "execute only is not supported for the %0 sub-architecture">;
 def err_opt_not_valid_with_opt : Error<
   "option '%0' cannot be specified with '%1'">;
+def err_opt_not_valid_without_opt : Error<
+  "option '%0' cannot be specified without '%1'">;
 
 // Source manager
 def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal;
diff --git a/include/clang/Basic/DiagnosticDriverKinds.td b/include/clang/Basic/DiagnosticDriverKinds.td
index 41b5e42b4432..ac1de1665b7c 100644
--- a/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/include/clang/Basic/DiagnosticDriverKinds.td
@@ -11,6 +11,8 @@ let Component = "Driver" in {
 
 def err_drv_no_such_file : Error<"no such file or directory: '%0'">;
 def err_drv_unsupported_opt : Error<"unsupported option '%0'">;
+def err_drv_unsupported_opt_with_suggestion
+  : Error<"unsupported option '%0', did you mean '%1'?">;
 def err_drv_unsupported_opt_for_target : Error<
   "unsupported option '%0' for target '%1'">;
 def err_drv_unsupported_option_argument : Error<
@@ -43,6 +45,10 @@ def err_drv_invalid_pgo_instrumentor : Error<
   "invalid PGO instrumentor in argument '%0'">;
 def err_drv_invalid_rtlib_name : Error<
   "invalid runtime library name in argument '%0'">;
+def err_drv_invalid_allocatable_mode : Error<
+  "invalid semantic mode for assignments to allocatables in argument '%0'">;
+def err_drv_unsupported_fixed_line_length : Error<
+  "unsupported fixed-format line length in argument '%0'">;
 def err_drv_unsupported_rtlib_for_platform : Error<
   "unsupported runtime library '%0' for platform '%1'">;
 def err_drv_invalid_stdlib_name : Error<
@@ -112,6 +118,18 @@ def err_drv_invalid_argument_to_fdebug_prefix_map : Error<
   "invalid argument '%0' to -fdebug-prefix-map">;
 def err_drv_malformed_sanitizer_blacklist : Error<
   "malformed sanitizer blacklist: '%0'">;
+def err_drv_duplicate_config : Error<
+  "no more than one option '--config' is allowed">;
+def err_drv_config_file_not_exist : Error<
+  "configuration file '%0' does not exist">;
+def err_drv_config_file_not_found : Error<
+  "configuration file '%0' cannot be found">;
+def note_drv_config_file_searched_in : Note<
+  "was searched for in the directory: %0">;
+def err_drv_cannot_read_config_file : Error<
+  "cannot read configuration file '%0'">;
+def err_drv_nested_config_file: Error<
+  "option '--config' is not allowed inside configuration file">;
 
 def err_target_unsupported_arch
   : Error<"the target architecture '%0' is not supported by the target '%1'">;
@@ -123,9 +141,14 @@ def err_arch_unsupported_isa
 def err_drv_I_dash_not_supported : Error<
   "'%0' not supported, please use -iquote instead">;
 def err_drv_unknown_argument : Error<"unknown argument: '%0'">;
+def err_drv_unknown_argument_with_suggestion
+  : Error<"unknown argument '%0', did you mean '%1'?">;
 def warn_drv_unknown_argument_clang_cl : Warning<
   "unknown argument ignored in clang-cl: '%0'">,
   InGroup<UnknownArgument>;
+def warn_drv_unknown_argument_clang_cl_with_suggestion : Warning<
+  "unknown argument ignored in clang-cl '%0' (did you mean '%1'?)">,
+  InGroup<UnknownArgument>;
 
 def warn_drv_ycyu_no_arg_clang_cl : Warning<
   "support for '%0' without a filename not implemented yet; flag ignored">,
@@ -342,4 +365,12 @@ def warn_drv_fine_grained_bitfield_accesses_ignored : Warning<
 def note_drv_verify_prefix_spelling : Note<
   "-verify prefixes must start with a letter and contain only alphanumeric"
   " characters, hyphens, and underscores">;
+
+def warn_drv_experimental_isel_incomplete : Warning<
+  "-fexperimental-isel support for the '%0' architecture is incomplete">,
+  InGroup<ExperimentalISel>;
+
+def warn_drv_experimental_isel_incomplete_opt : Warning<
+  "-fexperimental-isel support is incomplete for this architecture at the current optimization level">,
+  InGroup<ExperimentalISel>;
 }
diff --git a/include/clang/Basic/DiagnosticGroups.td b/include/clang/Basic/DiagnosticGroups.td
index c23183c81ac8..2d471f1fa5a4 100644
--- a/include/clang/Basic/DiagnosticGroups.td
+++ b/include/clang/Basic/DiagnosticGroups.td
@@ -435,13 +435,16 @@ def StringCompare : DiagGroup<"string-compare">;
 def StringPlusInt : DiagGroup<"string-plus-int">;
 def StringPlusChar : DiagGroup<"string-plus-char">;
 def StrncatSize : DiagGroup<"strncat-size">;
+def TautologicalTypeLimitCompare : DiagGroup<"tautological-type-limit-compare">;
 def TautologicalUnsignedZeroCompare : DiagGroup<"tautological-unsigned-zero-compare">;
 def TautologicalUnsignedEnumZeroCompare : DiagGroup<"tautological-unsigned-enum-zero-compare">;
+def TautologicalInRangeCompare : DiagGroup<"tautological-constant-in-range-compare",
+                                           [TautologicalTypeLimitCompare,
+                                            TautologicalUnsignedZeroCompare,
+                                            TautologicalUnsignedEnumZeroCompare]>;
 def TautologicalOutOfRangeCompare : DiagGroup<"tautological-constant-out-of-range-compare">;
 def TautologicalConstantCompare : DiagGroup<"tautological-constant-compare",
-                                            [TautologicalUnsignedZeroCompare,
-                                             TautologicalUnsignedEnumZeroCompare,
-                                             TautologicalOutOfRangeCompare]>;
+                                            [TautologicalOutOfRangeCompare]>;
 def TautologicalPointerCompare : DiagGroup<"tautological-pointer-compare">;
 def TautologicalOverlapCompare : DiagGroup<"tautological-overlap-compare">;
 def TautologicalUndefinedCompare : DiagGroup<"tautological-undefined-compare">;
@@ -982,3 +985,6 @@ def UnknownArgument : DiagGroup<"unknown-argument">;
 // A warning group for warnings about code that clang accepts when
 // compiling OpenCL C/C++ but which is not compatible with the SPIR spec.
 def SpirCompat : DiagGroup<"spir-compat">;
+
+// Warning for the experimental-isel options.
+def ExperimentalISel : DiagGroup<"experimental-isel">;
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index c391470cb1c8..bf13b5baef27 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -505,17 +505,12 @@ def warn_pragma_message : Warning<"%0">,
 def err_pragma_message : Error<"%0">;
 def warn_pragma_ignored : Warning<"unknown pragma ignored">,
    InGroup<UnknownPragmas>, DefaultIgnore;
-def ext_stdc_pragma_ignored : ExtWarn<"unknown pragma in STDC namespace">,
-   InGroup<UnknownPragmas>;
 def ext_on_off_switch_syntax :
    ExtWarn<"expected 'ON' or 'OFF' or 'DEFAULT' in pragma">,
    InGroup<UnknownPragmas>;
 def ext_pragma_syntax_eod :
    ExtWarn<"expected end of directive in pragma">,
    InGroup<UnknownPragmas>;
-def warn_stdc_fenv_access_not_supported :
-   Warning<"pragma STDC FENV_ACCESS ON is not supported, ignoring pragma">,
-   InGroup<UnknownPragmas>;
 def warn_pragma_diagnostic_invalid :
    ExtWarn<"pragma diagnostic expected 'error', 'warning', 'ignored', 'fatal',"
             " 'push', or 'pop'">,
diff --git a/include/clang/Basic/DiagnosticParseKinds.td b/include/clang/Basic/DiagnosticParseKinds.td
index 7936cdd96f80..620adc80bc03 100644
--- a/include/clang/Basic/DiagnosticParseKinds.td
+++ b/include/clang/Basic/DiagnosticParseKinds.td
@@ -973,6 +973,12 @@ def warn_pragma_init_seg_unsupported_target : Warning<
 def err_pragma_fp_contract_scope : Error<
   "'#pragma fp_contract' can only appear at file scope or at the start of a "
   "compound statement">; 
+// - #pragma stdc unknown
+def ext_stdc_pragma_ignored : ExtWarn<"unknown pragma in STDC namespace">,
+   InGroup<UnknownPragmas>;
+def warn_stdc_fenv_access_not_supported :
+   Warning<"pragma STDC FENV_ACCESS ON is not supported, ignoring pragma">,
+   InGroup<UnknownPragmas>;
 // - #pragma comment
 def err_pragma_comment_malformed : Error<
   "pragma comment requires parenthesized identifier and optional string">;
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 01e819942f68..d5b5fe3b4afa 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1589,12 +1589,20 @@ def err_non_virtual_pure : Error<
 def ext_pure_function_definition : ExtWarn<
   "function definition with pure-specifier is a Microsoft extension">,
   InGroup<MicrosoftPureDefinition>;
-def err_implicit_object_parameter_init : Error<
-  "cannot initialize object parameter of type %0 with an expression "
-  "of type %1">;
 def err_qualified_member_of_unrelated : Error<
   "%q0 is not a member of class %1">;
 
+def err_member_function_call_bad_cvr : Error<
+  "'this' argument to member function %0 has type %1, but function is not marked "
+  "%select{const|restrict|const or restrict|volatile|const or volatile|"
+  "volatile or restrict|const, volatile, or restrict}2">;
+def err_member_function_call_bad_ref : Error<
+  "'this' argument to member function %0 is an %select{lvalue|rvalue}1, "
+  "but function has %select{non-const lvalue|rvalue}2 ref-qualifier">;
+def err_member_function_call_bad_type : Error<
+  "cannot initialize object parameter of type %0 with an expression "
+  "of type %1">;
+
 def warn_call_to_pure_virtual_member_function_from_ctor_dtor : Warning<
   "call to pure virtual member function %0 has undefined behavior; "
   "overrides of %0 in subclasses are not available in the "
@@ -1815,10 +1823,6 @@ def warn_temporary_array_to_pointer_decay : Warning<
 def err_init_list_bad_dest_type : Error<
   "%select{|non-aggregate }0type %1 cannot be initialized with an initializer "
   "list">;
-def err_member_function_call_bad_cvr : Error<"member function %0 not viable: "
-    "'this' argument has type %1, but function is not marked "
-    "%select{const|restrict|const or restrict|volatile|const or volatile|"
-    "volatile or restrict|const, volatile, or restrict}2">;
 
 def err_reference_bind_to_bitfield : Error<
   "%select{non-const|volatile}0 reference cannot bind to "
@@ -5942,15 +5946,15 @@ def note_typecheck_assign_const : Note<
 def warn_unsigned_always_true_comparison : Warning<
   "result of comparison of %select{%3|unsigned expression}0 %2 "
   "%select{unsigned expression|%3}0 is always %4">,
-  InGroup<TautologicalUnsignedZeroCompare>;
+  InGroup<TautologicalUnsignedZeroCompare>, DefaultIgnore;
 def warn_unsigned_enum_always_true_comparison : Warning<
   "result of comparison of %select{%3|unsigned enum expression}0 %2 "
   "%select{unsigned enum expression|%3}0 is always %4">,
-  InGroup<TautologicalUnsignedEnumZeroCompare>;
+  InGroup<TautologicalUnsignedEnumZeroCompare>, DefaultIgnore;
 def warn_tautological_constant_compare : Warning<
   "result of comparison %select{%3|%1}0 %2 "
   "%select{%1|%3}0 is always %4">,
-  InGroup<TautologicalConstantCompare>;
+  InGroup<TautologicalTypeLimitCompare>, DefaultIgnore;
 
 def warn_mixed_sign_comparison : Warning<
   "comparison of integers of different signs: %0 and %1">,
@@ -7902,7 +7906,7 @@ def note_ref_subobject_of_member_declared_here : Note<
 // should result in a warning, since these always evaluate to a constant.
 // Array comparisons have similar warnings
 def warn_comparison_always : Warning<
-  "%select{self-|array }0comparison always evaluates to %select{false|true|a constant}1">,
+  "%select{self-|array }0comparison always evaluates to %select{a constant|%2}1">,
   InGroup<TautologicalCompare>;
 def warn_comparison_bitwise_always : Warning<
   "bitwise comparison always evaluates to %select{false|true}0">,
@@ -8296,12 +8300,16 @@ def err_c99_array_usage_cxx : Error<
   "feature, not permitted in C++">;
 def err_type_unsupported : Error<
   "%0 is not supported on this target">;
-def warn_nsconsumed_attribute_mismatch : Warning<
+def err_nsconsumed_attribute_mismatch : Error<
   "overriding method has mismatched ns_consumed attribute on its"
-  " parameter">, InGroup<NSConsumedMismatch>;
-def warn_nsreturns_retained_attribute_mismatch : Warning<
+  " parameter">;
+def err_nsreturns_retained_attribute_mismatch : Error<
   "overriding method has mismatched ns_returns_%select{not_retained|retained}0"
-  " attributes">, InGroup<NSReturnsMismatch>;
+  " attributes">;
+def warn_nsconsumed_attribute_mismatch : Warning<
+  err_nsconsumed_attribute_mismatch.Text>, InGroup<NSConsumedMismatch>;
+def warn_nsreturns_retained_attribute_mismatch : Warning<
+  err_nsreturns_retained_attribute_mismatch.Text>, InGroup<NSReturnsMismatch>;
   
 def note_getter_unavailable : Note<
   "or because setter is declared here, but no getter method %0 is found">;
@@ -9324,4 +9332,35 @@ def warn_shadow_field :
   InGroup<ShadowField>, DefaultIgnore;
 def note_shadow_field : Note<"declared here">;
 
+def err_target_required_in_redecl : Error<
+  "function declaration is missing 'target' attribute in a multiversioned "
+  "function">;
+def note_multiversioning_caused_here : Note<
+  "function multiversioning caused by this declaration">;
+def err_multiversion_after_used : Error<
+  "function declaration cannot become a multiversioned function after first "
+  "usage">;
+def err_bad_multiversion_option : Error<
+  "function multiversioning doesn't support %select{feature|architecture}0 "
+  "'%1'">;
+def err_multiversion_duplicate : Error<
+  "multiversioned function redeclarations require identical target attributes">;
+def err_multiversion_noproto : Error<
+  "multiversioned function must have a prototype">;
+def err_multiversion_no_other_attrs : Error<
+  "attribute 'target' multiversioning cannot be combined with other "
+  "attributes">;
+def err_multiversion_diff : Error<
+  "multiversioned function declaration has a different %select{calling convention"
+  "|return type|constexpr specification|inline specification|storage class|"
+  "linkage}0">;
+def err_multiversion_doesnt_support : Error<
+  "multiversioned functions do not yet support %select{function templates|"
+  "virtual functions|deduced return types|constructors|destructors|"
+  "deleted functions|defaulted functions}0">;
+def err_multiversion_not_allowed_on_main : Error<
+  "'main' cannot be a multiversioned function">;
+def err_multiversion_not_supported : Error<
+ "function multiversioning is not supported on the current target">;
+
 } // end of sema component.
diff --git a/include/clang/Basic/Module.h b/include/clang/Basic/Module.h
index 6631721e3531..3725fcb06db9 100644
--- a/include/clang/Basic/Module.h
+++ b/include/clang/Basic/Module.h
@@ -197,6 +197,9 @@ class Module {
   /// will be false to indicate that this (sub)module is not available.
   SmallVector<Requirement, 2> Requirements;
 
+  /// \brief A module with the same name that shadows this module.
+  Module *ShadowingModule = nullptr;
+
   /// \brief Whether this module is missing a feature from \c Requirements.
   unsigned IsMissingRequirement : 1;
 
@@ -375,13 +378,20 @@ class Module {
   ///
   /// \param Target The target options used for the current translation unit.
   ///
-  /// \param Req If this module is unavailable, this parameter
-  /// will be set to one of the requirements that is not met for use of
-  /// this module.
+  /// \param Req If this module is unavailable because of a missing requirement,
+  /// this parameter will be set to one of the requirements that is not met for
+  /// use of this module.
+  ///
+  /// \param MissingHeader If this module is unavailable because of a missing
+  /// header, this parameter will be set to one of the missing headers.
+  ///
+  /// \param ShadowingModule If this module is unavailable because it is
+  /// shadowed, this parameter will be set to the shadowing module.
   bool isAvailable(const LangOptions &LangOpts, 
                    const TargetInfo &Target,
                    Requirement &Req,
-                   UnresolvedHeaderDirective &MissingHeader) const;
+                   UnresolvedHeaderDirective &MissingHeader,
+                   Module *&ShadowingModule) const;
 
   /// \brief Determine whether this module is a submodule.
   bool isSubModule() const { return Parent != nullptr; }
diff --git a/include/clang/Basic/OpenCLExtensions.def b/include/clang/Basic/OpenCLExtensions.def
index c3319d2d808b..13cb12e7c581 100644
--- a/include/clang/Basic/OpenCLExtensions.def
+++ b/include/clang/Basic/OpenCLExtensions.def
@@ -53,6 +53,9 @@ OPENCLEXT_INTERNAL(cl_khr_icd, 100, ~0U)
 OPENCLEXT_INTERNAL(cl_khr_gl_event, 110, ~0U)
 OPENCLEXT_INTERNAL(cl_khr_d3d10_sharing, 110, ~0U)
 
+// EMBEDDED_PROFILE
+OPENCLEXT_INTERNAL(cles_khr_int64, 110, ~0U)
+
 // OpenCL 1.2.
 OPENCLEXT_INTERNAL(cl_khr_context_abort, 120, ~0U)
 OPENCLEXT_INTERNAL(cl_khr_d3d11_sharing, 120, ~0U)
diff --git a/include/clang/Basic/TargetInfo.h b/include/clang/Basic/TargetInfo.h
index 6ba58779114f..0fc622348c6f 100644
--- a/include/clang/Basic/TargetInfo.h
+++ b/include/clang/Basic/TargetInfo.h
@@ -920,10 +920,20 @@ class TargetInfo : public RefCountedBase<TargetInfo> {
     return false;
   }
 
+  /// \brief Identify whether this taret supports multiversioning of functions,
+  /// which requires support for cpu_supports and cpu_is functionality.
+  virtual bool supportsMultiVersioning() const { return false; }
+
   // \brief Validate the contents of the __builtin_cpu_supports(const char*)
   // argument.
   virtual bool validateCpuSupports(StringRef Name) const { return false; }
 
+  // \brief Return the target-specific priority for features/cpus/vendors so
+  // that they can be properly sorted for checking.
+  virtual unsigned multiVersionSortPriority(StringRef Name) const {
+    return 0;
+  }
+
   // \brief Validate the contents of the __builtin_cpu_is(const char*)
   // argument.
   virtual bool validateCpuIs(StringRef Name) const { return false; }
@@ -1041,6 +1051,18 @@ class TargetInfo : public RefCountedBase<TargetInfo> {
     return false;
   }
 
+  /// Check if the target supports CFProtection branch.
+  virtual bool
+  checkCFProtectionBranchSupported(DiagnosticsEngine &Diags) const {
+    return false;
+  }
+
+  /// Check if the target supports CFProtection branch.
+  virtual bool
+  checkCFProtectionReturnSupported(DiagnosticsEngine &Diags) const {
+    return false;
+  }
+
   /// \brief Whether target allows to overalign ABI-specified preferred alignment
   virtual bool allowsLargerPreferedTypeAlignment() const { return true; }
 
diff --git a/include/clang/Basic/TokenKinds.def b/include/clang/Basic/TokenKinds.def
index 6ae8821a834d..f18e06f5b7de 100644
--- a/include/clang/Basic/TokenKinds.def
+++ b/include/clang/Basic/TokenKinds.def
@@ -398,7 +398,6 @@ TYPE_TRAIT_2(__builtin_types_compatible_p, TypeCompatible, KEYNOCXX)
 KEYWORD(__builtin_va_arg            , KEYALL)
 KEYWORD(__extension__               , KEYALL)
 KEYWORD(__float128                  , KEYALL)
-ALIAS("_Float128", __float128       , KEYNOCXX)
 KEYWORD(__imag                      , KEYALL)
 KEYWORD(__int128                    , KEYALL)
 KEYWORD(__label__                   , KEYALL)
@@ -464,6 +463,7 @@ TYPE_TRAIT_1(__has_unique_object_representations,
 TYPE_TRAIT_N(__is_trivially_constructible, IsTriviallyConstructible, KEYCXX)
 TYPE_TRAIT_1(__is_trivially_copyable, IsTriviallyCopyable, KEYCXX)
 TYPE_TRAIT_2(__is_trivially_assignable, IsTriviallyAssignable, KEYCXX)
+TYPE_TRAIT_2(__reference_binds_to_temporary, ReferenceBindsToTemporary, KEYCXX)
 KEYWORD(__underlying_type           , KEYCXX)
 
 // Embarcadero Expression Traits
diff --git a/include/clang/Basic/TypeTraits.h b/include/clang/Basic/TypeTraits.h
index 8ecd63f9c3cb..509e8b43515e 100644
--- a/include/clang/Basic/TypeTraits.h
+++ b/include/clang/Basic/TypeTraits.h
@@ -80,7 +80,8 @@ namespace clang {
     BTT_IsAssignable,
     BTT_IsNothrowAssignable,
     BTT_IsTriviallyAssignable,
-    BTT_Last = BTT_IsTriviallyAssignable,
+    BTT_ReferenceBindsToTemporary,
+    BTT_Last = BTT_ReferenceBindsToTemporary,
     TT_IsConstructible,
     TT_IsNothrowConstructible,
     TT_IsTriviallyConstructible
diff --git a/include/clang/Basic/X86Target.def b/include/clang/Basic/X86Target.def
index 21550fe2dd55..0bb758978eb7 100644
--- a/include/clang/Basic/X86Target.def
+++ b/include/clang/Basic/X86Target.def
@@ -12,6 +12,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef PROC_WITH_FEAT
+#define PROC_WITH_FEAT(ENUM, STRING, IS64BIT, KEYFEATURE)                      \
+  PROC(ENUM, STRING, IS64BIT)
+#endif
 
 #ifndef PROC
 #define PROC(ENUM, STRING, IS64BIT)
@@ -21,6 +25,10 @@
 #define PROC_ALIAS(ENUM, ALIAS)
 #endif
 
+#ifndef FEATURE
+#define FEATURE(ENUM)
+#endif
+
 #define PROC_64_BIT true
 #define PROC_32_BIT false
 
@@ -77,7 +85,7 @@ PROC(Nocona, "nocona", PROC_64_BIT)
 /// \name Core
 /// Core microarchitecture based processors.
 //@{
-PROC(Core2, "core2", PROC_64_BIT)
+PROC_WITH_FEAT(Core2, "core2", PROC_64_BIT, FEATURE_SSSE3)
 
 /// This enumerator, like Yonah, is a bit odd. It is another
 /// codename which GCC no longer accepts as an option to -march, but Clang
@@ -89,10 +97,10 @@ PROC(Penryn, "penryn", PROC_64_BIT)
 /// \name Atom
 /// Atom processors
 //@{
-PROC(Bonnell, "bonnell", PROC_64_BIT)
+PROC_WITH_FEAT(Bonnell, "bonnell", PROC_64_BIT, FEATURE_SSSE3)
 PROC_ALIAS(Bonnell, "atom")
 
-PROC(Silvermont, "silvermont", PROC_64_BIT)
+PROC_WITH_FEAT(Silvermont, "silvermont", PROC_64_BIT, FEATURE_SSE4_2)
 PROC_ALIAS(Silvermont, "slm")
 
 PROC(Goldmont, "goldmont", PROC_64_BIT)
@@ -100,44 +108,44 @@ PROC(Goldmont, "goldmont", PROC_64_BIT)
 
 /// \name Nehalem
 /// Nehalem microarchitecture based processors.
-PROC(Nehalem, "nehalem", PROC_64_BIT)
+PROC_WITH_FEAT(Nehalem, "nehalem", PROC_64_BIT, FEATURE_SSE4_2)
 PROC_ALIAS(Nehalem, "corei7")
 
 /// \name Westmere
 /// Westmere microarchitecture based processors.
-PROC(Westmere, "westmere", PROC_64_BIT)
+PROC_WITH_FEAT(Westmere, "westmere", PROC_64_BIT, FEATURE_PCLMUL)
 
 /// \name Sandy Bridge
 /// Sandy Bridge microarchitecture based processors.
-PROC(SandyBridge, "sandybridge", PROC_64_BIT)
+PROC_WITH_FEAT(SandyBridge, "sandybridge", PROC_64_BIT, FEATURE_AVX)
 PROC_ALIAS(SandyBridge, "corei7-avx")
 
 /// \name Ivy Bridge
 /// Ivy Bridge microarchitecture based processors.
-PROC(IvyBridge, "ivybridge", PROC_64_BIT)
+PROC_WITH_FEAT(IvyBridge, "ivybridge", PROC_64_BIT, FEATURE_AVX)
 PROC_ALIAS(IvyBridge, "core-avx-i")
 
 /// \name Haswell
 /// Haswell microarchitecture based processors.
-PROC(Haswell, "haswell", PROC_64_BIT)
+PROC_WITH_FEAT(Haswell, "haswell", PROC_64_BIT, FEATURE_AVX2)
 PROC_ALIAS(Haswell, "core-avx2")
 
 /// \name Broadwell
 /// Broadwell microarchitecture based processors.
-PROC(Broadwell, "broadwell", PROC_64_BIT)
+PROC_WITH_FEAT(Broadwell, "broadwell", PROC_64_BIT, FEATURE_AVX2)
 
 /// \name Skylake Client
 /// Skylake client microarchitecture based processors.
-PROC(SkylakeClient, "skylake", PROC_64_BIT)
+PROC_WITH_FEAT(SkylakeClient, "skylake", PROC_64_BIT, FEATURE_AVX2)
 
 /// \name Skylake Server
 /// Skylake server microarchitecture based processors.
-PROC(SkylakeServer, "skylake-avx512", PROC_64_BIT)
+PROC_WITH_FEAT(SkylakeServer, "skylake-avx512", PROC_64_BIT, FEATURE_AVX512F)
 PROC_ALIAS(SkylakeServer, "skx")
 
 /// \name Cannonlake Client
 /// Cannonlake client microarchitecture based processors.
-PROC(Cannonlake, "cannonlake", PROC_64_BIT)
+PROC_WITH_FEAT(Cannonlake, "cannonlake", PROC_64_BIT, FEATURE_AVX512VBMI)
 
 /// \name Icelake Client
 /// Icelake client microarchitecture based processors.
@@ -145,11 +153,11 @@ PROC(Icelake, "icelake", PROC_64_BIT)
 
 /// \name Knights Landing
 /// Knights Landing processor.
-PROC(KNL, "knl", PROC_64_BIT)
+PROC_WITH_FEAT(KNL, "knl", PROC_64_BIT, FEATURE_AVX512F)
 
 /// \name Knights Mill
 /// Knights Mill processor.
-PROC(KNM, "knm", PROC_64_BIT)
+PROC_WITH_FEAT(KNM, "knm", PROC_64_BIT, FEATURE_AVX5124FMAPS)
 
 /// \name Lakemont
 /// Lakemont microarchitecture based processors.
@@ -186,30 +194,30 @@ PROC(K8SSE3, "k8-sse3", PROC_64_BIT)
 PROC_ALIAS(K8SSE3, "athlon64-sse3")
 PROC_ALIAS(K8SSE3, "opteron-sse3")
 
-PROC(AMDFAM10, "amdfam10", PROC_64_BIT)
+PROC_WITH_FEAT(AMDFAM10, "amdfam10", PROC_64_BIT, FEATURE_SSE4_A)
 PROC_ALIAS(AMDFAM10, "barcelona")
 //@}
 
 /// \name Bobcat
 /// Bobcat architecture processors.
 //@{
-PROC(BTVER1, "btver1", PROC_64_BIT)
-PROC(BTVER2, "btver2", PROC_64_BIT)
+PROC_WITH_FEAT(BTVER1, "btver1", PROC_64_BIT, FEATURE_SSE4_A)
+PROC_WITH_FEAT(BTVER2, "btver2", PROC_64_BIT, FEATURE_BMI)
 //@}
 
 /// \name Bulldozer
 /// Bulldozer architecture processors.
 //@{
-PROC(BDVER1, "bdver1", PROC_64_BIT)
-PROC(BDVER2, "bdver2", PROC_64_BIT)
-PROC(BDVER3, "bdver3", PROC_64_BIT)
-PROC(BDVER4, "bdver4", PROC_64_BIT)
+PROC_WITH_FEAT(BDVER1, "bdver1", PROC_64_BIT, FEATURE_XOP)
+PROC_WITH_FEAT(BDVER2, "bdver2", PROC_64_BIT, FEATURE_FMA)
+PROC_WITH_FEAT(BDVER3, "bdver3", PROC_64_BIT, FEATURE_FMA)
+PROC_WITH_FEAT(BDVER4, "bdver4", PROC_64_BIT, FEATURE_AVX2)
 //@}
 
 /// \name zen
 /// Zen architecture processors.
 //@{
-PROC(ZNVER1, "znver1", PROC_64_BIT)
+PROC_WITH_FEAT(ZNVER1, "znver1", PROC_64_BIT, FEATURE_AVX2)
 //@}
 
 /// This specification is deprecated and will be removed in the future.
@@ -225,8 +233,45 @@ PROC(x86_64, "x86-64", PROC_64_BIT)
 PROC(Geode, "geode", PROC_32_BIT)
 //@}
 
+// List of CPU Supports features in order.  These need to remain in the order
+// required by attribute 'target' checking.  Note that not all are supported/
+// prioritized by GCC, so synchronization with GCC's implementation may require
+// changing some existing values.
+FEATURE(FEATURE_CMOV)
+FEATURE(FEATURE_MMX)
+FEATURE(FEATURE_SSE)
+FEATURE(FEATURE_SSE2)
+FEATURE(FEATURE_SSE3)
+FEATURE(FEATURE_SSSE3)
+FEATURE(FEATURE_SSE4_A)
+FEATURE(FEATURE_SSE4_1)
+FEATURE(FEATURE_SSE4_2)
+FEATURE(FEATURE_POPCNT)
+FEATURE(FEATURE_AES)
+FEATURE(FEATURE_PCLMUL)
+FEATURE(FEATURE_AVX)
+FEATURE(FEATURE_BMI)
+FEATURE(FEATURE_FMA4)
+FEATURE(FEATURE_XOP)
+FEATURE(FEATURE_FMA)
+FEATURE(FEATURE_BMI2)
+FEATURE(FEATURE_AVX2)
+FEATURE(FEATURE_AVX512F)
+FEATURE(FEATURE_AVX512VL)
+FEATURE(FEATURE_AVX512BW)
+FEATURE(FEATURE_AVX512DQ)
+FEATURE(FEATURE_AVX512CD)
+FEATURE(FEATURE_AVX512ER)
+FEATURE(FEATURE_AVX512PF)
+FEATURE(FEATURE_AVX512VBMI)
+FEATURE(FEATURE_AVX512IFMA)
+FEATURE(FEATURE_AVX5124VNNIW)
+FEATURE(FEATURE_AVX5124FMAPS)
+FEATURE(FEATURE_AVX512VPOPCNTDQ)
 
 #undef PROC_64_BIT
 #undef PROC_32_BIT
+#undef FEATURE
 #undef PROC
 #undef PROC_ALIAS
+#undef PROC_WITH_FEAT
diff --git a/include/clang/Basic/arm_fp16.td b/include/clang/Basic/arm_fp16.td
new file mode 100644
index 000000000000..5c7e437b72fe
--- /dev/null
+++ b/include/clang/Basic/arm_fp16.td
@@ -0,0 +1,131 @@
+//===--- arm_fp16.td - ARM FP16 compiler interface ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the TableGen definitions from which the ARM FP16 header
+//  file will be generated.
+//
+//===----------------------------------------------------------------------===//
+
+include "arm_neon_incl.td"
+
+// ARMv8.2-A FP16 intrinsics.
+let ArchGuard = "defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)" in {
+
+  // Negate
+  def VNEGSH          : SInst<"vneg", "ss", "Sh">;
+
+  // Reciprocal/Sqrt
+  def SCALAR_FRECPSH  : IInst<"vrecps", "sss", "Sh">;
+  def FSQRTSH         : SInst<"vsqrt", "ss", "Sh">;
+  def SCALAR_FRSQRTSH : IInst<"vrsqrts", "sss", "Sh">;
+
+  // Reciprocal Estimate
+  def SCALAR_FRECPEH  : IInst<"vrecpe", "ss", "Sh">;
+
+  // Reciprocal Exponent
+  def SCALAR_FRECPXH  : IInst<"vrecpx", "ss", "Sh">;
+
+  // Reciprocal Square Root Estimate
+  def SCALAR_FRSQRTEH : IInst<"vrsqrte", "ss", "Sh">;
+
+  // Rounding
+  def FRINTZ_S64H     : SInst<"vrnd", "ss", "Sh">;
+  def FRINTA_S64H     : SInst<"vrnda", "ss", "Sh">;
+  def FRINTI_S64H     : SInst<"vrndi", "ss", "Sh">;
+  def FRINTM_S64H     : SInst<"vrndm", "ss", "Sh">;
+  def FRINTN_S64H     : SInst<"vrndn", "ss", "Sh">;
+  def FRINTP_S64H     : SInst<"vrndp", "ss", "Sh">;
+  def FRINTX_S64H     : SInst<"vrndx", "ss", "Sh">;
+
+  // Conversion
+  def SCALAR_SCVTFSH  : SInst<"vcvth_f16", "Ys", "silUsUiUl">;
+  def SCALAR_FCVTZSH  : SInst<"vcvt_s16", "$s", "Sh">;
+  def SCALAR_FCVTZSH1 : SInst<"vcvt_s32", "Is", "Sh">;
+  def SCALAR_FCVTZSH2 : SInst<"vcvt_s64", "Ls", "Sh">;
+  def SCALAR_FCVTZUH  : SInst<"vcvt_u16", "bs", "Sh">;
+  def SCALAR_FCVTZUH1 : SInst<"vcvt_u32", "Us", "Sh">;
+  def SCALAR_FCVTZUH2 : SInst<"vcvt_u64", "Os", "Sh">;
+  def SCALAR_FCVTASH  : SInst<"vcvta_s16", "$s", "Sh">;
+  def SCALAR_FCVTASH1 : SInst<"vcvta_s32", "Is", "Sh">;
+  def SCALAR_FCVTASH2 : SInst<"vcvta_s64", "Ls", "Sh">;
+  def SCALAR_FCVTAUH  : SInst<"vcvta_u16", "bs", "Sh">;
+  def SCALAR_FCVTAUH1 : SInst<"vcvta_u32", "Us", "Sh">;
+  def SCALAR_FCVTAUH2 : SInst<"vcvta_u64", "Os", "Sh">;
+  def SCALAR_FCVTMSH  : SInst<"vcvtm_s16", "$s", "Sh">;
+  def SCALAR_FCVTMSH1 : SInst<"vcvtm_s32", "Is", "Sh">;
+  def SCALAR_FCVTMSH2 : SInst<"vcvtm_s64", "Ls", "Sh">;
+  def SCALAR_FCVTMUH  : SInst<"vcvtm_u16", "bs", "Sh">;
+  def SCALAR_FCVTMUH1 : SInst<"vcvtm_u32", "Us", "Sh">;
+  def SCALAR_FCVTMUH2 : SInst<"vcvtm_u64", "Os", "Sh">;
+  def SCALAR_FCVTNSH  : SInst<"vcvtn_s16", "$s", "Sh">;
+  def SCALAR_FCVTNSH1 : SInst<"vcvtn_s32", "Is", "Sh">;
+  def SCALAR_FCVTNSH2 : SInst<"vcvtn_s64", "Ls", "Sh">;
+  def SCALAR_FCVTNUH  : SInst<"vcvtn_u16", "bs", "Sh">;
+  def SCALAR_FCVTNUH1 : SInst<"vcvtn_u32", "Us", "Sh">;
+  def SCALAR_FCVTNUH2 : SInst<"vcvtn_u64", "Os", "Sh">;
+  def SCALAR_FCVTPSH  : SInst<"vcvtp_s16", "$s", "Sh">;
+  def SCALAR_FCVTPSH1 : SInst<"vcvtp_s32", "Is", "Sh">;
+  def SCALAR_FCVTPSH2 : SInst<"vcvtp_s64", "Ls", "Sh">;
+  def SCALAR_FCVTPUH  : SInst<"vcvtp_u16", "bs", "Sh">;
+  def SCALAR_FCVTPUH1 : SInst<"vcvtp_u32", "Us", "Sh">;
+  def SCALAR_FCVTPUH2 : SInst<"vcvtp_u64", "Os", "Sh">;
+
+  def SCALAR_SCVTFSHO : SInst<"vcvth_n_f16", "Ysi", "silUsUiUl">;
+  def SCALAR_FCVTZSHO : SInst<"vcvt_n_s16", "$si", "Sh">;
+  def SCALAR_FCVTZSH1O: SInst<"vcvt_n_s32", "Isi", "Sh">;
+  def SCALAR_FCVTZSH2O: SInst<"vcvt_n_s64", "Lsi", "Sh">;
+  def SCALAR_FCVTZUHO : SInst<"vcvt_n_u16", "bsi", "Sh">;
+  def SCALAR_FCVTZUH1O: SInst<"vcvt_n_u32", "Usi", "Sh">;
+  def SCALAR_FCVTZUH2O: SInst<"vcvt_n_u64", "Osi", "Sh">;
+
+  // Comparison
+  def SCALAR_CMEQRH   : SInst<"vceq", "bss", "Sh">;
+  def SCALAR_CMEQZH   : SInst<"vceqz", "bs", "Sh">;
+  def SCALAR_CMGERH   : SInst<"vcge", "bss", "Sh">;
+  def SCALAR_CMGEZH   : SInst<"vcgez", "bs", "Sh">;
+  def SCALAR_CMGTRH   : SInst<"vcgt", "bss", "Sh">;
+  def SCALAR_CMGTZH   : SInst<"vcgtz", "bs", "Sh">;
+  def SCALAR_CMLERH   : SInst<"vcle", "bss", "Sh">;
+  def SCALAR_CMLEZH   : SInst<"vclez", "bs", "Sh">;
+  def SCALAR_CMLTH    : SInst<"vclt", "bss", "Sh">;
+  def SCALAR_CMLTZH   : SInst<"vcltz", "bs", "Sh">;
+
+  // Absolute Compare Mask Greater Than Or Equal
+  def SCALAR_FACGEH   : IInst<"vcage", "bss", "Sh">;
+  def SCALAR_FACLEH   : IInst<"vcale", "bss", "Sh">;
+
+  // Absolute Compare Mask Greater Than
+  def SCALAR_FACGT    : IInst<"vcagt", "bss", "Sh">;
+  def SCALAR_FACLT    : IInst<"vcalt", "bss", "Sh">;
+
+  // Scalar Absolute Value
+  def SCALAR_ABSH     : SInst<"vabs", "ss", "Sh">;
+
+  // Scalar Absolute Difference
+  def SCALAR_ABDH: IInst<"vabd", "sss", "Sh">;
+
+  // Add/Sub
+  def VADDSH          : SInst<"vadd", "sss", "Sh">;
+  def VSUBHS          : SInst<"vsub", "sss", "Sh">;
+
+  // Max/Min
+  def VMAXHS          : SInst<"vmax", "sss", "Sh">;
+  def VMINHS          : SInst<"vmin", "sss", "Sh">;
+  def FMAXNMHS        : SInst<"vmaxnm", "sss", "Sh">;
+  def FMINNMHS        : SInst<"vminnm", "sss", "Sh">;
+
+  // Multiplication/Division
+  def VMULHS          : SInst<"vmul", "sss", "Sh">;
+  def MULXHS          : SInst<"vmulx", "sss", "Sh">;
+  def FDIVHS          : SInst<"vdiv", "sss",  "Sh">;
+
+  // Vector fused multiply-add operations
+  def VFMAHS          : SInst<"vfma", "ssss", "Sh">;
+  def VFMSHS          : SInst<"vfms", "ssss", "Sh">;
+}
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index d5c16a91a34f..37aac58a1130 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -11,309 +11,8 @@
 //  file will be generated.  See ARM document DUI0348B.
 //
 //===----------------------------------------------------------------------===//
-//
-// Each intrinsic is a subclass of the Inst class. An intrinsic can either
-// generate a __builtin_* call or it can expand to a set of generic operations.
-//
-// The operations are subclasses of Operation providing a list of DAGs, the
-// last of which is the return value. The available DAG nodes are documented
-// below.
-//
-//===----------------------------------------------------------------------===//
-
-// The base Operation class. All operations must subclass this.
-class Operation<list<dag> ops=[]> {
-  list<dag> Ops = ops;
-  bit Unavailable = 0;
-}
-// An operation that only contains a single DAG.
-class Op<dag op> : Operation<[op]>;
-// A shorter version of Operation - takes a list of DAGs. The last of these will
-// be the return value.
-class LOp<list<dag> ops> : Operation<ops>;
-
-// These defs and classes are used internally to implement the SetTheory
-// expansion and should be ignored.
-foreach Index = 0-63 in
-  def sv##Index;
-class MaskExpand;
-
-//===----------------------------------------------------------------------===//
-// Available operations
-//===----------------------------------------------------------------------===//
-
-// DAG arguments can either be operations (documented below) or variables.
-// Variables are prefixed with '$'. There are variables for each input argument,
-// with the name $pN, where N starts at zero. So the zero'th argument will be
-// $p0, the first $p1 etc.
-
-// op - Binary or unary operator, depending on the number of arguments. The
-//      operator itself is just treated as a raw string and is not checked.
-// example: (op "+", $p0, $p1) -> "__p0 + __p1".
-//          (op "-", $p0)      -> "-__p0"
-def op;
-// call - Invoke another intrinsic. The input types are type checked and
-//        disambiguated. If there is no intrinsic defined that takes
-//        the given types (or if there is a type ambiguity) an error is
-//        generated at tblgen time. The name of the intrinsic is the raw
-//        name as given to the Inst class (not mangled).
-// example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)"
-//            (assuming $p0 has type int16x8_t).
-def call;
-// cast - Perform a cast to a different type. This gets emitted as a static
-//        C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use
-//        "bitcast".
-//
-//        The syntax is (cast MOD* VAL). The last argument is the value to
-//        cast, preceded by a sequence of type modifiers. The target type
-//        starts off as the type of VAL, and is modified by MOD in sequence.
-//        The available modifiers are:
-//          - $X  - Take the type of parameter/variable X. For example:
-//                  (cast $p0, $p1) would cast $p1 to the type of $p0.
-//          - "R" - The type of the return type.
-//          - A typedef string - A NEON or stdint.h type that is then parsed.
-//                               for example: (cast "uint32x4_t", $p0).
-//          - "U" - Make the type unsigned.
-//          - "S" - Make the type signed.
-//          - "H" - Halve the number of lanes in the type.
-//          - "D" - Double the number of lanes in the type.
-//          - "8" - Convert type to an equivalent vector of 8-bit signed
-//                  integers.
-// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return
-//           value is of type "int32x4_t".
-//          (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0
-//           has type float64x1_t or any other vector type of 64 bits).
-//          (cast "int32_t", $p2) -> "(int32_t)__p2"
-def cast;
-// bitcast - Same as "cast", except a reinterpret-cast is produced:
-//             (bitcast "T", $p0) -> "*(T*)&__p0".
-//           The VAL argument is saved to a temporary so it can be used
-//           as an l-value.
-def bitcast;
-// dup - Take a scalar argument and create a vector by duplicating it into
-//       all lanes. The type of the vector is the base type of the intrinsic.
-// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
-//          is uint32x2_t).
-def dup;
-// splat - Take a vector and a lane index, and return a vector of the same type
-//         containing repeated instances of the source vector at the lane index.
-// example: (splat $p0, $p1) ->
-//            "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)"
-//          (assuming __p0 has four elements).
-def splat;
-// save_temp - Create a temporary (local) variable. The variable takes a name
-//             based on the zero'th parameter and can be referenced using
-//             using that name in subsequent DAGs in the same
-//             operation. The scope of a temp is the operation. If a variable
-//             with the given name already exists, an error will be given at
-//             tblgen time.
-// example: [(save_temp $var, (call "foo", $p0)),
-//           (op "+", $var, $p1)] ->
-//              "int32x2_t __var = foo(__p0); return __var + __p1;"
-def save_temp;
-// name_replace - Return the name of the current intrinsic with the first
-//                argument replaced by the second argument. Raises an error if
-//                the first argument does not exist in the intrinsic name.
-// example: (call (name_replace "_high_", "_"), $p0) (to call the non-high
-//            version of this intrinsic).
-def name_replace;
-// literal - Create a literal piece of code. The code is treated as a raw
-//           string, and must be given a type. The type is a stdint.h or
-//           NEON intrinsic type as given to (cast).
-// example: (literal "int32_t", "0")
-def literal;
-// shuffle - Create a vector shuffle. The syntax is (shuffle ARG0, ARG1, MASK).
-//           The MASK argument is a set of elements. The elements are generated
-//           from the two special defs "mask0" and "mask1". "mask0" expands to
-//           the lane indices in sequence for ARG0, and "mask1" expands to
-//           the lane indices in sequence for ARG1. They can be used as-is, e.g.
-//
-//             (shuffle $p0, $p1, mask0) -> $p0
-//             (shuffle $p0, $p1, mask1) -> $p1
-//
-//           or, more usefully, they can be manipulated using the SetTheory
-//           operators plus some extra operators defined in the NEON emitter.
-//           The operators are described below.
-// example: (shuffle $p0, $p1, (add (highhalf mask0), (highhalf mask1))) ->
-//            A concatenation of the high halves of the input vectors.
-def shuffle;
-
-// add, interleave, decimate: These set operators are vanilla SetTheory
-// operators and take their normal definition.
-def add;
-def interleave;
-def decimate;
-// rotl - Rotate set left by a number of elements.
-// example: (rotl mask0, 3) -> [3, 4, 5, 6, 0, 1, 2]
-def rotl;
-// rotl - Rotate set right by a number of elements.
-// example: (rotr mask0, 3) -> [4, 5, 6, 0, 1, 2, 3]
-def rotr;
-// highhalf - Take only the high half of the input.
-// example: (highhalf mask0) -> [4, 5, 6, 7] (assuming mask0 had 8 elements)
-def highhalf;
-// highhalf - Take only the low half of the input.
-// example: (lowhalf mask0) -> [0, 1, 2, 3] (assuming mask0 had 8 elements)
-def lowhalf;
-// rev - Perform a variable-width reversal of the elements. The zero'th argument
-//       is a width in bits to reverse. The lanes this maps to is determined
-//       based on the element width of the underlying type.
-// example: (rev 32, mask0) -> [3, 2, 1, 0, 7, 6, 5, 4] (if 8-bit elements)
-// example: (rev 32, mask0) -> [1, 0, 3, 2]             (if 16-bit elements)
-def rev;
-// mask0 - The initial sequence of lanes for shuffle ARG0
-def mask0 : MaskExpand;
-// mask0 - The initial sequence of lanes for shuffle ARG1
-def mask1 : MaskExpand;
-
-def OP_NONE  : Operation;
-def OP_UNAVAILABLE : Operation {
-  let Unavailable = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Instruction definitions
-//===----------------------------------------------------------------------===//
 
-// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and
-// a sequence of typespecs.
-//
-// The name is the base name of the intrinsic, for example "vget_lane". This is
-// then mangled by the tblgen backend to add type information ("vget_lane_s16").
-//
-// A typespec is a sequence of uppercase characters (modifiers) followed by one
-// lowercase character. A typespec encodes a particular "base type" of the
-// intrinsic.
-//
-// An example typespec is "Qs" - quad-size short - uint16x8_t. The available
-// typespec codes are given below.
-//
-// The string given to an Inst class is a sequence of typespecs. The intrinsic
-// is instantiated for every typespec in the sequence. For example "sdQsQd".
-//
-// The prototype is a string that defines the return type of the intrinsic
-// and the type of each argument. The return type and every argument gets a
-// "modifier" that can change in some way the "base type" of the intrinsic.
-//
-// The modifier 'd' means "default" and does not modify the base type in any
-// way. The available modifiers are given below.
-//
-// Typespecs
-// ---------
-// c: char
-// s: short
-// i: int
-// l: long
-// k: 128-bit long
-// f: float
-// h: half-float
-// d: double
-//
-// Typespec modifiers
-// ------------------
-// S: scalar, only used for function mangling.
-// U: unsigned
-// Q: 128b
-// H: 128b without mangling 'q'
-// P: polynomial
-//
-// Prototype modifiers
-// -------------------
-// prototype: return (arg, arg, ...)
-//
-// v: void
-// t: best-fit integer (int/poly args)
-// x: signed integer   (int/float args)
-// u: unsigned integer (int/float args)
-// f: float (int args)
-// F: double (int args)
-// H: half (int args)
-// d: default
-// g: default, ignore 'Q' size modifier.
-// j: default, force 'Q' size modifier.
-// w: double width elements, same num elts
-// n: double width elements, half num elts
-// h: half width elements, double num elts
-// q: half width elements, quad num elts
-// e: half width elements, double num elts, unsigned
-// m: half width elements, same num elts
-// i: constant int
-// l: constant uint64
-// s: scalar of element type
-// z: scalar of half width element type, signed
-// r: scalar of double width element type, signed
-// a: scalar of element type (splat to vector type)
-// b: scalar of unsigned integer/long type (int/float args)
-// $: scalar of signed integer/long type (int/float args)
-// y: scalar of float
-// o: scalar of double
-// k: default elt width, double num elts
-// 2,3,4: array of default vectors
-// B,C,D: array of default elts, force 'Q' size modifier.
-// p: pointer type
-// c: const pointer type
-
-// Every intrinsic subclasses Inst.
-class Inst <string n, string p, string t, Operation o> {
-  string Name = n;
-  string Prototype = p;
-  string Types = t;
-  string ArchGuard = "";
-
-  Operation Operation = o;
-  bit CartesianProductOfTypes = 0;
-  bit BigEndianSafe = 0;
-  bit isShift = 0;
-  bit isScalarShift = 0;
-  bit isScalarNarrowShift = 0;
-  bit isVCVT_N = 0;
-  // For immediate checks: the immediate will be assumed to specify the lane of
-  // a Q register. Only used for intrinsics which end up calling polymorphic
-  // builtins.
-  bit isLaneQ = 0;
-
-  // Certain intrinsics have different names than their representative
-  // instructions. This field allows us to handle this correctly when we
-  // are generating tests.
-  string InstName = "";
-
-  // Certain intrinsics even though they are not a WOpInst or LOpInst,
-  // generate a WOpInst/LOpInst instruction (see below for definition
-  // of a WOpInst/LOpInst). For testing purposes we need to know
-  // this. Ex: vset_lane which outputs vmov instructions.
-  bit isHiddenWInst = 0;
-  bit isHiddenLInst = 0;
-}
-
-// The following instruction classes are implemented via builtins.
-// These declarations are used to generate Builtins.def:
-//
-// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8")
-// IInst: Instruction with generic integer suffix (e.g., "i8")
-// WInst: Instruction with only bit size suffix (e.g., "8")
-class SInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class IInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class WInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-
-// The following instruction classes are implemented via operators
-// instead of builtins. As such these declarations are only used for
-// the purpose of generating tests.
-//
-// SOpInst:       Instruction with signed/unsigned suffix (e.g., "s8",
-//                "u8", "p8").
-// IOpInst:       Instruction with generic integer suffix (e.g., "i8").
-// WOpInst:       Instruction with bit size only suffix (e.g., "8").
-// LOpInst:       Logical instruction with no bit size suffix.
-// NoTestOpInst:  Intrinsic that has no corresponding instruction.
-class SOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class IOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class WOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class LOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class NoTestOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-
-//===----------------------------------------------------------------------===//
-// Operations
-//===----------------------------------------------------------------------===//
+include "arm_neon_incl.td"
 
 def OP_ADD      : Op<(op "+", $p0, $p1)>;
 def OP_ADDL     : Op<(op "+", (call "vmovl", $p0), (call "vmovl", $p1))>;
diff --git a/include/clang/Basic/arm_neon_incl.td b/include/clang/Basic/arm_neon_incl.td
new file mode 100644
index 000000000000..e6a6d258ddbf
--- /dev/null
+++ b/include/clang/Basic/arm_neon_incl.td
@@ -0,0 +1,313 @@
+//===--- arm_neon_incl.td - ARM NEON compiler interface ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines data structures shared by arm_neon.td and arm_fp16.td.
+//  It constains base operation classes, operations, instructions, instruction
+//  modifiers, etc.  
+//
+//===----------------------------------------------------------------------===//
+//
+// Each intrinsic is a subclass of the Inst class. An intrinsic can either
+// generate a __builtin_* call or it can expand to a set of generic operations.
+//
+// The operations are subclasses of Operation providing a list of DAGs, the
+// last of which is the return value. The available DAG nodes are documented
+// below.
+//
+//===----------------------------------------------------------------------===//
+
+// The base Operation class. All operations must subclass this.
+class Operation<list<dag> ops=[]> {
+  list<dag> Ops = ops;
+  bit Unavailable = 0;
+}
+// An operation that only contains a single DAG.
+class Op<dag op> : Operation<[op]>;
+// A shorter version of Operation - takes a list of DAGs. The last of these will
+// be the return value.
+class LOp<list<dag> ops> : Operation<ops>;
+
+// These defs and classes are used internally to implement the SetTheory
+// expansion and should be ignored.
+foreach Index = 0-63 in
+  def sv##Index;
+class MaskExpand;
+
+//===----------------------------------------------------------------------===//
+// Available operations
+//===----------------------------------------------------------------------===//
+
+// DAG arguments can either be operations (documented below) or variables.
+// Variables are prefixed with '$'. There are variables for each input argument,
+// with the name $pN, where N starts at zero. So the zero'th argument will be
+// $p0, the first $p1 etc.
+
+// op - Binary or unary operator, depending on the number of arguments. The
+//      operator itself is just treated as a raw string and is not checked.
+// example: (op "+", $p0, $p1) -> "__p0 + __p1".
+//          (op "-", $p0)      -> "-__p0"
+def op;
+// call - Invoke another intrinsic. The input types are type checked and
+//        disambiguated. If there is no intrinsic defined that takes
+//        the given types (or if there is a type ambiguity) an error is
+//        generated at tblgen time. The name of the intrinsic is the raw
+//        name as given to the Inst class (not mangled).
+// example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)"
+//            (assuming $p0 has type int16x8_t).
+def call;
+// cast - Perform a cast to a different type. This gets emitted as a static
+//        C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use
+//        "bitcast".
+//
+//        The syntax is (cast MOD* VAL). The last argument is the value to
+//        cast, preceded by a sequence of type modifiers. The target type
+//        starts off as the type of VAL, and is modified by MOD in sequence.
+//        The available modifiers are:
+//          - $X  - Take the type of parameter/variable X. For example:
+//                  (cast $p0, $p1) would cast $p1 to the type of $p0.
+//          - "R" - The type of the return type.
+//          - A typedef string - A NEON or stdint.h type that is then parsed.
+//                               for example: (cast "uint32x4_t", $p0).
+//          - "U" - Make the type unsigned.
+//          - "S" - Make the type signed.
+//          - "H" - Halve the number of lanes in the type.
+//          - "D" - Double the number of lanes in the type.
+//          - "8" - Convert type to an equivalent vector of 8-bit signed
+//                  integers.
+// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return
+//           value is of type "int32x4_t".
+//          (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0
+//           has type float64x1_t or any other vector type of 64 bits).
+//          (cast "int32_t", $p2) -> "(int32_t)__p2"
+def cast;
+// bitcast - Same as "cast", except a reinterpret-cast is produced:
+//             (bitcast "T", $p0) -> "*(T*)&__p0".
+//           The VAL argument is saved to a temporary so it can be used
+//           as an l-value.
+def bitcast;
+// dup - Take a scalar argument and create a vector by duplicating it into
+//       all lanes. The type of the vector is the base type of the intrinsic.
+// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
+//          is uint32x2_t).
+def dup;
+// splat - Take a vector and a lane index, and return a vector of the same type
+//         containing repeated instances of the source vector at the lane index.
+// example: (splat $p0, $p1) ->
+//            "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)"
+//          (assuming __p0 has four elements).
+def splat;
+// save_temp - Create a temporary (local) variable. The variable takes a name
+//             based on the zero'th parameter and can be referenced using
+//             using that name in subsequent DAGs in the same
+//             operation. The scope of a temp is the operation. If a variable
+//             with the given name already exists, an error will be given at
+//             tblgen time.
+// example: [(save_temp $var, (call "foo", $p0)),
+//           (op "+", $var, $p1)] ->
+//              "int32x2_t __var = foo(__p0); return __var + __p1;"
+def save_temp;
+// name_replace - Return the name of the current intrinsic with the first
+//                argument replaced by the second argument. Raises an error if
+//                the first argument does not exist in the intrinsic name.
+// example: (call (name_replace "_high_", "_"), $p0) (to call the non-high
+//            version of this intrinsic).
+def name_replace;
+// literal - Create a literal piece of code. The code is treated as a raw
+//           string, and must be given a type. The type is a stdint.h or
+//           NEON intrinsic type as given to (cast).
+// example: (literal "int32_t", "0")
+def literal;
+// shuffle - Create a vector shuffle. The syntax is (shuffle ARG0, ARG1, MASK).
+//           The MASK argument is a set of elements. The elements are generated
+//           from the two special defs "mask0" and "mask1". "mask0" expands to
+//           the lane indices in sequence for ARG0, and "mask1" expands to
+//           the lane indices in sequence for ARG1. They can be used as-is, e.g.
+//
+//             (shuffle $p0, $p1, mask0) -> $p0
+//             (shuffle $p0, $p1, mask1) -> $p1
+//
+//           or, more usefully, they can be manipulated using the SetTheory
+//           operators plus some extra operators defined in the NEON emitter.
+//           The operators are described below.
+// example: (shuffle $p0, $p1, (add (highhalf mask0), (highhalf mask1))) ->
+//            A concatenation of the high halves of the input vectors.
+def shuffle;
+
+// add, interleave, decimate: These set operators are vanilla SetTheory
+// operators and take their normal definition.
+def add;
+def interleave;
+def decimate;
+// rotl - Rotate set left by a number of elements.
+// example: (rotl mask0, 3) -> [3, 4, 5, 6, 0, 1, 2]
+def rotl;
+// rotl - Rotate set right by a number of elements.
+// example: (rotr mask0, 3) -> [4, 5, 6, 0, 1, 2, 3]
+def rotr;
+// highhalf - Take only the high half of the input.
+// example: (highhalf mask0) -> [4, 5, 6, 7] (assuming mask0 had 8 elements)
+def highhalf;
+// highhalf - Take only the low half of the input.
+// example: (lowhalf mask0) -> [0, 1, 2, 3] (assuming mask0 had 8 elements)
+def lowhalf;
+// rev - Perform a variable-width reversal of the elements. The zero'th argument
+//       is a width in bits to reverse. The lanes this maps to is determined
+//       based on the element width of the underlying type.
+// example: (rev 32, mask0) -> [3, 2, 1, 0, 7, 6, 5, 4] (if 8-bit elements)
+// example: (rev 32, mask0) -> [1, 0, 3, 2]             (if 16-bit elements)
+def rev;
+// mask0 - The initial sequence of lanes for shuffle ARG0
+def mask0 : MaskExpand;
+// mask0 - The initial sequence of lanes for shuffle ARG1
+def mask1 : MaskExpand;
+
+def OP_NONE  : Operation;
+def OP_UNAVAILABLE : Operation {
+  let Unavailable = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions
+//===----------------------------------------------------------------------===//
+
+// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and
+// a sequence of typespecs.
+//
+// The name is the base name of the intrinsic, for example "vget_lane". This is
+// then mangled by the tblgen backend to add type information ("vget_lane_s16").
+//
+// A typespec is a sequence of uppercase characters (modifiers) followed by one
+// lowercase character. A typespec encodes a particular "base type" of the
+// intrinsic.
+//
+// An example typespec is "Qs" - quad-size short - uint16x8_t. The available
+// typespec codes are given below.
+//
+// The string given to an Inst class is a sequence of typespecs. The intrinsic
+// is instantiated for every typespec in the sequence. For example "sdQsQd".
+//
+// The prototype is a string that defines the return type of the intrinsic
+// and the type of each argument. The return type and every argument gets a
+// "modifier" that can change in some way the "base type" of the intrinsic.
+//
+// The modifier 'd' means "default" and does not modify the base type in any
+// way. The available modifiers are given below.
+//
+// Typespecs
+// ---------
+// c: char
+// s: short
+// i: int
+// l: long
+// k: 128-bit long
+// f: float
+// h: half-float
+// d: double
+//
+// Typespec modifiers
+// ------------------
+// S: scalar, only used for function mangling.
+// U: unsigned
+// Q: 128b
+// H: 128b without mangling 'q'
+// P: polynomial
+//
+// Prototype modifiers
+// -------------------
+// prototype: return (arg, arg, ...)
+//
+// v: void
+// t: best-fit integer (int/poly args)
+// x: signed integer   (int/float args)
+// u: unsigned integer (int/float args)
+// f: float (int args)
+// F: double (int args)
+// H: half (int args)
+// d: default
+// g: default, ignore 'Q' size modifier.
+// j: default, force 'Q' size modifier.
+// w: double width elements, same num elts
+// n: double width elements, half num elts
+// h: half width elements, double num elts
+// q: half width elements, quad num elts
+// e: half width elements, double num elts, unsigned
+// m: half width elements, same num elts
+// i: constant int
+// l: constant uint64
+// s: scalar of element type
+// z: scalar of half width element type, signed
+// r: scalar of double width element type, signed
+// a: scalar of element type (splat to vector type)
+// b: scalar of unsigned integer/long type (int/float args)
+// $: scalar of signed integer/long type (int/float args)
+// y: scalar of float
+// o: scalar of double
+// k: default elt width, double num elts
+// 2,3,4: array of default vectors
+// B,C,D: array of default elts, force 'Q' size modifier.
+// p: pointer type
+// c: const pointer type
+
+// Every intrinsic subclasses Inst.
+class Inst <string n, string p, string t, Operation o> {
+  string Name = n;
+  string Prototype = p;
+  string Types = t;
+  string ArchGuard = "";
+
+  Operation Operation = o;
+  bit CartesianProductOfTypes = 0;
+  bit BigEndianSafe = 0;
+  bit isShift = 0;
+  bit isScalarShift = 0;
+  bit isScalarNarrowShift = 0;
+  bit isVCVT_N = 0;
+  // For immediate checks: the immediate will be assumed to specify the lane of
+  // a Q register. Only used for intrinsics which end up calling polymorphic
+  // builtins.
+  bit isLaneQ = 0;
+
+  // Certain intrinsics have different names than their representative
+  // instructions. This field allows us to handle this correctly when we
+  // are generating tests.
+  string InstName = "";
+
+  // Certain intrinsics even though they are not a WOpInst or LOpInst,
+  // generate a WOpInst/LOpInst instruction (see below for definition
+  // of a WOpInst/LOpInst). For testing purposes we need to know
+  // this. Ex: vset_lane which outputs vmov instructions.
+  bit isHiddenWInst = 0;
+  bit isHiddenLInst = 0;
+}
+
+// The following instruction classes are implemented via builtins.
+// These declarations are used to generate Builtins.def:
+//
+// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8")
+// IInst: Instruction with generic integer suffix (e.g., "i8")
+// WInst: Instruction with only bit size suffix (e.g., "8")
+class SInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class IInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class WInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+
+// The following instruction classes are implemented via operators
+// instead of builtins. As such these declarations are only used for
+// the purpose of generating tests.
+//
+// SOpInst:       Instruction with signed/unsigned suffix (e.g., "s8",
+//                "u8", "p8").
+// IOpInst:       Instruction with generic integer suffix (e.g., "i8").
+// WOpInst:       Instruction with bit size only suffix (e.g., "8").
+// LOpInst:       Logical instruction with no bit size suffix.
+// NoTestOpInst:  Intrinsic that has no corresponding instruction.
+class SOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class IOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class WOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class LOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class NoTestOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
diff --git a/include/clang/CodeGen/CGFunctionInfo.h b/include/clang/CodeGen/CGFunctionInfo.h
index d1f05124d5e5..42f963bad05a 100644
--- a/include/clang/CodeGen/CGFunctionInfo.h
+++ b/include/clang/CodeGen/CGFunctionInfo.h
@@ -95,6 +95,7 @@ class ABIArgInfo {
   bool SRetAfterThis : 1;   // isIndirect()
   bool InReg : 1;           // isDirect() || isExtend() || isIndirect()
   bool CanBeFlattened: 1;   // isDirect()
+  bool SignExt : 1;         // isExtend()
 
   bool canHavePaddingType() const {
     return isDirect() || isExtend() || isIndirect() || isExpand();
@@ -133,15 +134,38 @@ class ABIArgInfo {
     AI.setInReg(true);
     return AI;
   }
-  static ABIArgInfo getExtend(llvm::Type *T = nullptr) {
+
+  static ABIArgInfo getSignExtend(QualType Ty, llvm::Type *T = nullptr) {
+    assert(Ty->isIntegralOrEnumerationType() && "Unexpected QualType");
+    auto AI = ABIArgInfo(Extend);
+    AI.setCoerceToType(T);
+    AI.setPaddingType(nullptr);
+    AI.setDirectOffset(0);
+    AI.setSignExt(true);
+    return AI;
+  }
+
+  static ABIArgInfo getZeroExtend(QualType Ty, llvm::Type *T = nullptr) {
+    assert(Ty->isIntegralOrEnumerationType() && "Unexpected QualType");
     auto AI = ABIArgInfo(Extend);
     AI.setCoerceToType(T);
     AI.setPaddingType(nullptr);
     AI.setDirectOffset(0);
+    AI.setSignExt(false);
     return AI;
   }
-  static ABIArgInfo getExtendInReg(llvm::Type *T = nullptr) {
-    auto AI = getExtend(T);
+
+  // ABIArgInfo will record the argument as being extended based on the sign
+  // of its type.
+  static ABIArgInfo getExtend(QualType Ty, llvm::Type *T = nullptr) {
+    assert(Ty->isIntegralOrEnumerationType() && "Unexpected QualType");
+    if (Ty->hasSignedIntegerRepresentation())
+      return getSignExtend(Ty, T);
+    return getZeroExtend(Ty, T);
+  }
+
+  static ABIArgInfo getExtendInReg(QualType Ty, llvm::Type *T = nullptr) {
+    auto AI = getExtend(Ty, T);
     AI.setInReg(true);
     return AI;
   }
@@ -254,6 +278,15 @@ class ABIArgInfo {
     DirectOffset = Offset;
   }
 
+  bool isSignExt() const {
+    assert(isExtend() && "Invalid kind!");
+    return SignExt;
+  }
+  void setSignExt(bool SExt) {
+    assert(isExtend() && "Invalid kind!");
+    SignExt = SExt;
+  }
+
   llvm::Type *getPaddingType() const {
     return (canHavePaddingType() ? PaddingType : nullptr);
   }
diff --git a/include/clang/CodeGen/ConstantInitBuilder.h b/include/clang/CodeGen/ConstantInitBuilder.h
index 113d86d82c10..d1388f0c51b9 100644
--- a/include/clang/CodeGen/ConstantInitBuilder.h
+++ b/include/clang/CodeGen/ConstantInitBuilder.h
@@ -295,7 +295,7 @@ class ConstantAggregateBuilderBase {
     slot = value;
   }
 
-  /// Produce an address which will eventually point to the the next
+  /// Produce an address which will eventually point to the next
   /// position to be filled.  This is computed with an indexed
   /// getelementptr rather than by computing offsets.
   ///
diff --git a/include/clang/CodeGen/SwiftCallingConv.h b/include/clang/CodeGen/SwiftCallingConv.h
index 23db43e6739c..8ea2b9d35db3 100644
--- a/include/clang/CodeGen/SwiftCallingConv.h
+++ b/include/clang/CodeGen/SwiftCallingConv.h
@@ -116,6 +116,12 @@ class SwiftAggLowering {
   void splitVectorEntry(unsigned index);
 };
 
+/// Should an aggregate which expands to the given type sequence
+/// be passed/returned indirectly under swiftcall?
+bool shouldPassIndirectly(CodeGenModule &CGM,
+                          ArrayRef<llvm::Type*> types,
+                          bool asReturnValue);
+
 /// Return the maximum voluntary integer size for the current target.
 CharUnits getMaximumVoluntaryIntegerSize(CodeGenModule &CGM);
 
diff --git a/include/clang/Config/config.h.cmake b/include/clang/Config/config.h.cmake
index daac9b7b45d1..5f420195e85c 100644
--- a/include/clang/Config/config.h.cmake
+++ b/include/clang/Config/config.h.cmake
@@ -35,6 +35,10 @@
 /* Directories clang will search for headers */
 #define C_INCLUDE_DIRS "${C_INCLUDE_DIRS}"
 
+/* Directories clang will search for configuration files */
+#cmakedefine CLANG_CONFIG_FILE_SYSTEM_DIR "${CLANG_CONFIG_FILE_SYSTEM_DIR}"
+#cmakedefine CLANG_CONFIG_FILE_USER_DIR "${CLANG_CONFIG_FILE_USER_DIR}"
+
 /* Default <path> to all compiler invocations for --sysroot=<path>. */
 #define DEFAULT_SYSROOT "${DEFAULT_SYSROOT}"
 
diff --git a/include/clang/Driver/Action.h b/include/clang/Driver/Action.h
index 72456d34a0d9..1e182d88f11a 100644
--- a/include/clang/Driver/Action.h
+++ b/include/clang/Driver/Action.h
@@ -59,6 +59,7 @@ class Action {
     AnalyzeJobClass,
     MigrateJobClass,
     CompileJobClass,
+    FortranFrontendJobClass,
     BackendJobClass,
     AssembleJobClass,
     LinkJobClass,
@@ -400,6 +401,16 @@ class MigrateJobAction : public JobAction {
   }
 };
 
+class FortranFrontendJobAction : public JobAction {
+  void anchor() override;
+public:
+  FortranFrontendJobAction(Action *Input, types::ID OutputType);
+
+  static bool classof(const Action *A) {
+    return A->getKind() == FortranFrontendJobClass;
+  }
+};
+
 class CompileJobAction : public JobAction {
   void anchor() override;
 public:
diff --git a/include/clang/Driver/CLCompatOptions.td b/include/clang/Driver/CLCompatOptions.td
index c1f0a89b5dc8..34a1218b25e4 100644
--- a/include/clang/Driver/CLCompatOptions.td
+++ b/include/clang/Driver/CLCompatOptions.td
@@ -235,6 +235,8 @@ def _SLASH_Fi : CLCompileJoined<"Fi">,
 def _SLASH_Fo : CLCompileJoined<"Fo">,
   HelpText<"Set output object file, or directory (ends in / or \\) (with /c)">,
   MetaVarName<"<file or directory>">;
+def _SLASH_Guard : CLJoined<"guard:">,
+  HelpText<"Enable Control Flow Guard with /guard:cf">;
 def _SLASH_GX : CLFlag<"GX">,
   HelpText<"Enable exception handling">;
 def _SLASH_GX_ : CLFlag<"GX-">,
@@ -324,6 +326,7 @@ def _SLASH_kernel_ : CLIgnoredFlag<"kernel-">;
 def _SLASH_nologo : CLIgnoredFlag<"nologo">;
 def _SLASH_Og : CLIgnoredFlag<"Og">;
 def _SLASH_openmp_ : CLIgnoredFlag<"openmp-">;
+def _SLASH_permissive_ : CLIgnoredFlag<"permissive-">;
 def _SLASH_RTC : CLIgnoredJoined<"RTC">;
 def _SLASH_sdl : CLIgnoredFlag<"sdl">;
 def _SLASH_sdl_ : CLIgnoredFlag<"sdl-">;
@@ -344,6 +347,8 @@ def _SLASH_Zo_ : CLIgnoredFlag<"Zo-">;
 // Unsupported:
 
 def _SLASH_AI : CLJoined<"AI">;
+def _SLASH_Bt : CLFlag<"Bt">;
+def _SLASH_Bt_plus : CLFlag<"Bt+">;
 def _SLASH_clr : CLJoined<"clr">;
 def _SLASH_doc : CLJoined<"doc">;
 def _SLASH_FA_joined : CLJoined<"FA">;
@@ -364,7 +369,6 @@ def _SLASH_GL_ : CLFlag<"GL-">;
 def _SLASH_Gm : CLFlag<"Gm">;
 def _SLASH_Gm_ : CLFlag<"Gm-">;
 def _SLASH_GT : CLFlag<"GT">;
-def _SLASH_Guard : CLJoined<"guard:">;
 def _SLASH_GZ : CLFlag<"GZ">;
 def _SLASH_H : CLFlag<"H">;
 def _SLASH_homeparams : CLFlag<"homeparams">;
diff --git a/include/clang/Driver/Driver.h b/include/clang/Driver/Driver.h
index a3662872a953..8328b871a31a 100644
--- a/include/clang/Driver/Driver.h
+++ b/include/clang/Driver/Driver.h
@@ -19,6 +19,8 @@
 #include "clang/Driver/Util.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/StringSaver.h"
 
 #include <list>
 #include <map>
@@ -26,14 +28,6 @@
 
 namespace llvm {
 class Triple;
-
-namespace opt {
-  class Arg;
-  class ArgList;
-  class DerivedArgList;
-  class InputArgList;
-  class OptTable;
-}
 }
 
 namespace clang {
@@ -73,7 +67,8 @@ class Driver {
     GCCMode,
     GXXMode,
     CPPMode,
-    CLMode
+    CLMode,
+    FortranMode
   } Mode;
 
   enum SaveTempsMode {
@@ -138,6 +133,12 @@ class Driver {
   /// The path to the compiler resource directory.
   std::string ResourceDir;
 
+  /// System directory for config files.
+  std::string SystemConfigDir;
+
+  /// User directory for config files.
+  std::string UserConfigDir;
+
   /// A prefix directory used to emulate a limited subset of GCC's '-Bprefix'
   /// functionality.
   /// FIXME: This type of customization should be removed in favor of the
@@ -182,6 +183,9 @@ class Driver {
   /// Whether the driver should follow cl.exe like behavior.
   bool IsCLMode() const { return Mode == CLMode; }
 
+  /// Whether the driver should follow gfortran like behavior.
+  bool IsFortranMode() const { return Mode == FortranMode; }
+
   /// Only print tool bindings, don't build any jobs.
   unsigned CCCPrintBindings : 1;
 
@@ -208,6 +212,21 @@ class Driver {
   /// Name to use when invoking gcc/g++.
   std::string CCCGenericGCCName;
 
+  /// Name of configuration file if used.
+  std::string ConfigFile;
+
+  /// Allocator for string saver.
+  llvm::BumpPtrAllocator Alloc;
+
+  /// Object that stores strings read from configuration file.
+  llvm::StringSaver Saver;
+
+  /// Arguments originated from configuration file.
+  std::unique_ptr<llvm::opt::InputArgList> CfgOptions;
+
+  /// Arguments originated from command line.
+  std::unique_ptr<llvm::opt::InputArgList> CLOptions;
+
   /// Whether to check that input files exist when constructing compilation
   /// jobs.
   unsigned CheckInputsExist : 1;
@@ -277,6 +296,8 @@ class Driver {
   /// Name to use when invoking gcc/g++.
   const std::string &getCCCGenericGCCName() const { return CCCGenericGCCName; }
 
+  const std::string &getConfigFile() const { return ConfigFile; }
+
   const llvm::opt::OptTable &getOpts() const { return *Opts; }
 
   const DiagnosticsEngine &getDiags() const { return Diags; }
@@ -493,6 +514,18 @@ class Driver {
   LTOKind getLTOMode() const { return LTOMode; }
 
 private:
+
+  /// Tries to load options from configuration file.
+  ///
+  /// \returns true if error occurred.
+  bool loadConfigFile();
+
+  /// Read options from the specified file.
+  ///
+  /// \param [in] FileName File to read.
+  /// \returns true, if error occurred while reading.
+  bool readConfigFile(StringRef FileName);
+
   /// Set the driver mode (cl, gcc, etc) from an option string of the form
   /// --driver-mode=<mode>.
   void setDriverModeFromOption(StringRef Opt);
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index 7dc6632e8228..4e26fd82bb4a 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -247,7 +247,7 @@ class InternalDriverOpt : Group<internal_driver_Group>,
   Flags<[DriverOption, HelpHidden]>;
 def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
   Flags<[CoreOption, DriverOption, HelpHidden]>,
-  HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
+  HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', 'cl', or 'fortran'">;
 def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
   Flags<[CoreOption, DriverOption, HelpHidden]>,
   HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
@@ -492,6 +492,8 @@ def bind__at__load : Flag<["-"], "bind_at_load">;
 def bundle__loader : Separate<["-"], "bundle_loader">;
 def bundle : Flag<["-"], "bundle">;
 def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
+def cfguard : Flag<["-"], "cfguard">, Flags<[CC1Option]>,
+  HelpText<"Emit tables required for Windows Control Flow Guard.">;
 def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
   HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
 def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
@@ -519,6 +521,12 @@ def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-round
 def client__name : JoinedOrSeparate<["-"], "client_name">;
 def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
 def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
+def config : Separate<["--"], "config">, Flags<[DriverOption]>,
+  HelpText<"Specifies configuration file">;
+def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOption, HelpHidden]>,
+  HelpText<"System directory for configuration files">;
+def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>,
+  HelpText<"User directory for configuration files">;
 def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>;
 def cpp_precomp : Flag<["-"], "cpp-precomp">, Group<clang_ignored_f_Group>;
 def current__version : JoinedOrSeparate<["-"], "current_version">;
@@ -548,6 +556,8 @@ def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
 def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
 def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
   HelpText<"CUDA installation path">;
+def cuda_path_ignore_env : Joined<["--"], "cuda-path-ignore-env">, Group<i_Group>,
+  HelpText<"Ignore environment variables to detect CUDA installation">;
 def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
   HelpText<"Path to ptxas (used for compiling CUDA code)">;
 def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
@@ -1025,6 +1035,8 @@ def finline_functions : Flag<["-"], "finline-functions">, Group<f_clang_Group>,
 def finline_hint_functions: Flag<["-"], "finline-hint-functions">, Group<f_clang_Group>, Flags<[CC1Option]>,
   HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
 def finline : Flag<["-"], "finline">, Group<clang_ignored_f_Group>;
+def fexperimental_isel : Flag<["-"], "fexperimental-isel">, Group<f_clang_Group>,
+  HelpText<"Enables the experimental global instruction selector">;
 def fexperimental_new_pass_manager : Flag<["-"], "fexperimental-new-pass-manager">,
   Group<f_clang_Group>, Flags<[CC1Option]>,
   HelpText<"Enables an experimental new pass manager in LLVM.">;
@@ -1036,6 +1048,11 @@ def finstrument_functions_after_inlining : Flag<["-"], "finstrument-functions-af
   HelpText<"Like -finstrument-functions, but insert the calls after inlining">;
 def finstrument_function_entry_bare : Flag<["-"], "finstrument-function-entry-bare">, Group<f_Group>, Flags<[CC1Option]>,
   HelpText<"Instrument function entry only, after inlining, without arguments to the instrumentation call">;
+def fcf_protection_EQ : Joined<["-"], "fcf-protection=">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
+  HelpText<"Instrument control-flow architecture protection. Options: return, branch, full, none.">, Values<"return,branch,full,none">;
+def fcf_protection : Flag<["-"], "fcf-protection">, Group<f_Group>, Flags<[CoreOption, CC1Option]>,
+  Alias<fcf_protection_EQ>, AliasArgs<["full"]>,
+  HelpText<"Enable cf-protection in 'full' mode">;
 
 def fxray_instrument : Flag<["-"], "fxray-instrument">, Group<f_Group>,
   Flags<[CC1Option]>,
@@ -1231,6 +1248,8 @@ def fno_exceptions : Flag<["-"], "fno-exceptions">, Group<f_Group>;
 def fno_gnu_keywords : Flag<["-"], "fno-gnu-keywords">, Group<f_Group>, Flags<[CC1Option]>;
 def fno_inline_functions : Flag<["-"], "fno-inline-functions">, Group<f_clang_Group>, Flags<[CC1Option]>;
 def fno_inline : Flag<["-"], "fno-inline">, Group<f_clang_Group>, Flags<[CC1Option]>;
+def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_Group>,
+  HelpText<"Disables the experimental global instruction selector">;
 def fno_experimental_new_pass_manager : Flag<["-"], "fno-experimental-new-pass-manager">,
   Group<f_clang_Group>, Flags<[CC1Option]>,
   HelpText<"Disables an experimental new pass manager in LLVM.">;
@@ -1568,6 +1587,10 @@ def fdata_sections : Flag <["-"], "fdata-sections">, Group<f_Group>,
  Flags<[CC1Option]>, HelpText<"Place each data in its own section (ELF Only)">;
 def fno_data_sections : Flag <["-"], "fno-data-sections">, Group<f_Group>,
   Flags<[CC1Option]>;
+def fstack_size_section : Flag<["-"], "fstack-size-section">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Emit section containing metadata on function stack sizes">;
+def fno_stack_size_section : Flag<["-"], "fno-stack-size-section">, Group<f_Group>, Flags<[CC1Option]>,
+  HelpText<"Don't emit section containing metadata on function stack sizes">;
 
 def funique_section_names : Flag <["-"], "funique-section-names">,
   Group<f_Group>, Flags<[CC1Option]>,
@@ -1853,6 +1876,8 @@ def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
 def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
 def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
 def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
+def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
+def mno_sign_ext : Flag<["-"], "mno-sign-ext">, Group<m_wasm_Features_Group>;
 
 def mamdgpu_debugger_abi : Joined<["-"], "mamdgpu-debugger-abi=">,
   Flags<[HelpHidden]>,
@@ -2543,6 +2568,8 @@ def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
 def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
 def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
 def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
+def mrdpid : Flag<["-"], "mrdpid">, Group<m_x86_Features_Group>;
+def mno_rdpid : Flag<["-"], "mno-rdpid">, Group<m_x86_Features_Group>;
 def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
 def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
 def mrtm : Flag<["-"], "mrtm">, Group<m_x86_Features_Group>;
@@ -2573,6 +2600,10 @@ def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
 def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
 def mibt : Flag<["-"], "mibt">, Group<m_x86_Features_Group>;
 def mno_ibt : Flag<["-"], "mno-ibt">, Group<m_x86_Features_Group>;
+def mretpoline : Flag<["-"], "mretpoline">, Group<m_x86_Features_Group>;
+def mno_retpoline : Flag<["-"], "mno-retpoline">, Group<m_x86_Features_Group>;
+def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
+def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
 
 // These are legacy user-facing driver-level option spellings. They are always
 // aliases for options that are spelled using the more common Unix / GNU flag
@@ -2702,11 +2733,19 @@ defm devirtualize : BooleanFFlag<"devirtualize">, Group<clang_ignored_gcc_optimi
 defm devirtualize_speculatively : BooleanFFlag<"devirtualize-speculatively">,
     Group<clang_ignored_gcc_optimization_f_Group>;
 
+// gfortran options that we recognize in the driver and pass along when
+// invoking GCC to compile Fortran code.
+def flang_rt_Group : OptionGroup<"Flang runtime library Group">;
+def pgi_fortran_Group : OptionGroup<"PGI Fortran compatibility Group">,
+  Flags<[HelpHidden]>;
+
 // Generic gfortran options.
 def A_DASH : Joined<["-"], "A-">, Group<gfortran_Group>;
 def J : JoinedOrSeparate<["-"], "J">, Flags<[RenderJoined]>, Group<gfortran_Group>;
-def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>;
-def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>;
+def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>,
+  HelpText<"Preprocess Fortran files">;
+def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>,
+  HelpText<"Don't preprocess Fortran files">;
 def static_libgfortran : Flag<["-"], "static-libgfortran">, Group<gfortran_Group>;
 
 // "f" options with values for gfortran.
@@ -2714,7 +2753,8 @@ def fblas_matmul_limit_EQ : Joined<["-"], "fblas-matmul-limit=">, Group<gfortran
 def fcheck_EQ : Joined<["-"], "fcheck=">, Group<gfortran_Group>;
 def fcoarray_EQ : Joined<["-"], "fcoarray=">, Group<gfortran_Group>;
 def fconvert_EQ : Joined<["-"], "fconvert=">, Group<gfortran_Group>;
-def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>;
+def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>,
+  HelpText<"Set line length in fixed-form format Fortran, current supporting only 72 and 132 characters">;
 def ffpe_trap_EQ : Joined<["-"], "ffpe-trap=">, Group<gfortran_Group>;
 def ffree_line_length_VALUE : Joined<["-"], "ffree-line-length-">, Group<gfortran_Group>;
 def finit_character_EQ : Joined<["-"], "finit-character=">, Group<gfortran_Group>;
@@ -2727,12 +2767,17 @@ def fmax_stack_var_size_EQ : Joined<["-"], "fmax-stack-var-size=">, Group<gfortr
 def fmax_subrecord_length_EQ : Joined<["-"], "fmax-subrecord-length=">, Group<gfortran_Group>;
 def frecord_marker_EQ : Joined<["-"], "frecord-marker=">, Group<gfortran_Group>;
 
+// Define a group for Fortran source format
+def fortran_format_Group : OptionGroup<"Fortran format Group">, Group<gfortran_Group>;
 // "f" flags for gfortran.
 defm aggressive_function_elimination : BooleanFFlag<"aggressive-function-elimination">, Group<gfortran_Group>;
 defm align_commons : BooleanFFlag<"align-commons">, Group<gfortran_Group>;
 defm all_intrinsics : BooleanFFlag<"all-intrinsics">, Group<gfortran_Group>;
 defm automatic : BooleanFFlag<"automatic">, Group<gfortran_Group>;
-defm backslash : BooleanFFlag<"backslash">, Group<gfortran_Group>;
+def fbackslash: Flag<["-"], "fbackslash">, Group<gfortran_Group>,
+  HelpText<"Treat backslash as C-style escape character">;
+def fnobackslash: Flag<["-"], "fno-backslash">, Group<gfortran_Group>,
+  HelpText<"Treat backslash like any other character in character strings">;
 defm backtrace : BooleanFFlag<"backtrace">, Group<gfortran_Group>;
 defm bounds_check : BooleanFFlag<"bounds-check">, Group<gfortran_Group>;
 defm check_array_temporaries : BooleanFFlag<"check-array-temporaries">, Group<gfortran_Group>;
@@ -2740,17 +2785,27 @@ defm cray_pointer : BooleanFFlag<"cray-pointer">, Group<gfortran_Group>;
 defm d_lines_as_code : BooleanFFlag<"d-lines-as-code">, Group<gfortran_Group>;
 defm d_lines_as_comments : BooleanFFlag<"d-lines-as-comments">, Group<gfortran_Group>;
 defm default_double_8 : BooleanFFlag<"default-double-8">, Group<gfortran_Group>;
-defm default_integer_8 : BooleanFFlag<"default-integer-8">, Group<gfortran_Group>;
-defm default_real_8 : BooleanFFlag<"default-real-8">, Group<gfortran_Group>;
+def default_integer_8_f : Flag<["-"], "fdefault-integer-8">, Group<gfortran_Group>,
+  HelpText<"Treat INTEGER and LOGICAL as INTEGER*8 and LOGICAL*8">;
+def default_integer_8_fno : Flag<["-"], "fno-default-integer-8">, Group<gfortran_Group>;
+def default_real_8_f : Flag<["-"], "fdefault-real-8">, Group<gfortran_Group>,
+  HelpText<"Treat REAL as REAL*8">;
+def default_real_8_fno : Flag<["-"], "fno-default-real-8">, Group<gfortran_Group>;
 defm dollar_ok : BooleanFFlag<"dollar-ok">, Group<gfortran_Group>;
 defm dump_fortran_optimized : BooleanFFlag<"dump-fortran-optimized">, Group<gfortran_Group>;
 defm dump_fortran_original : BooleanFFlag<"dump-fortran-original">, Group<gfortran_Group>;
 defm dump_parse_tree : BooleanFFlag<"dump-parse-tree">, Group<gfortran_Group>;
 defm external_blas : BooleanFFlag<"external-blas">, Group<gfortran_Group>;
 defm f2c : BooleanFFlag<"f2c">, Group<gfortran_Group>;
-defm fixed_form : BooleanFFlag<"fixed-form">, Group<gfortran_Group>;
-defm free_form : BooleanFFlag<"free-form">, Group<gfortran_Group>;
-defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<gfortran_Group>;
+def fixed_form_on : Flag<["-"], "ffixed-form">, Group<fortran_format_Group>,
+  HelpText<"Enable fixed-form format for Fortran">;
+def fixed_form_off : Flag<["-"], "fno-fixed-form">, Group<fortran_format_Group>,
+  HelpText<"Disable fixed-form format for Fortran">;
+def free_form_on : Flag<["-"], "ffree-form">, Group<fortran_format_Group>,
+  HelpText<"Enable free-form format for Fortran">;
+def free_form_off : Flag<["-"], "fno-free-form">, Group<fortran_format_Group>,
+  HelpText<"Disable free-form format for Fortran">;
+defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<fortran_format_Group>;
 defm implicit_none : BooleanFFlag<"implicit-none">, Group<gfortran_Group>;
 defm init_local_zero : BooleanFFlag<"init-local-zero">, Group<gfortran_Group>;
 defm integer_4_integer_8 : BooleanFFlag<"integer-4-integer-8">, Group<gfortran_Group>;
@@ -2775,6 +2830,119 @@ defm stack_arrays : BooleanFFlag<"stack-arrays">, Group<gfortran_Group>;
 defm underscoring : BooleanFFlag<"underscoring">, Group<gfortran_Group>;
 defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;
 
+// Flang-specific options
+multiclass BooleanKFlag<string name> {
+  def _on : Flag<["-"], "K"#name>;
+  def _off : Flag<["-"], "Kno"#name>;
+}
+
+multiclass BooleanMFlag<string name> {
+  def _on : Flag<["-"], "M"#name>;
+  def _off : Flag<["-"], "Mno"#name>;
+}
+
+def Mfixed : Flag<["-"], "Mfixed">, Group<fortran_format_Group>,
+  HelpText<"Force fixed-form format Fortran">,
+  Flags<[HelpHidden]>;
+def Mfree_on: Flag<["-"], "Mfree">, Group<fortran_format_Group>,
+  HelpText<"Enable free-form format for Fortran">,
+  Flags<[HelpHidden]>;
+def Mfree_off: Flag<["-"], "Mnofree">, Group<fortran_format_Group>,
+  HelpText<"Disable free-form format for Fortran">,
+  Flags<[HelpHidden]>;
+def Mfreeform_on: Flag<["-"], "Mfreeform">, Group<fortran_format_Group>,
+  HelpText<"Enable free-form format for Fortran">,
+  Flags<[HelpHidden]>;
+def Mfreeform_off: Flag<["-"], "Mnofreeform">, Group<fortran_format_Group>,
+  HelpText<"Disable free-form format for Fortran">,
+  Flags<[HelpHidden]>;
+
+def Mipa: Joined<["-"], "Mipa">, Group<pgi_fortran_Group>;
+def Mstackarrays: Joined<["-"], "Mstack_arrays">, Group<pgi_fortran_Group>;
+def pc: JoinedOrSeparate<["-"], "pc">, Group<pgi_fortran_Group>;
+def Mfprelaxed: Joined<["-"], "Mfprelaxed">, Group<pgi_fortran_Group>;
+def Mnofprelaxed: Joined<["-"], "Mnofprelaxed">, Group<pgi_fortran_Group>;
+defm Mstride0: BooleanMFlag<"stride0">, Group<pgi_fortran_Group>;
+defm Mrecursive: BooleanMFlag<"recursive">, Group<pgi_fortran_Group>;
+defm Mreentrant: BooleanMFlag<"reentrant">, Group<pgi_fortran_Group>;
+defm Mbounds: BooleanMFlag<"bounds">, Group<pgi_fortran_Group>;
+def Mdaz_on: Flag<["-"], "Mdaz">, Group<pgi_fortran_Group>,
+  HelpText<"Treat denormalized numbers as zero">;
+def Mdaz_off: Flag<["-"], "Mnodaz">, Group<pgi_fortran_Group>,
+  HelpText<"Disable treating denormalized numbers as zero">;
+def Kieee_on : Flag<["-"], "Kieee">, Group<pgi_fortran_Group>,
+  HelpText<"Enable IEEE division">;
+def Kieee_off : Flag<["-"], "Knoieee">, Group<pgi_fortran_Group>,
+  HelpText<"Disable IEEE division">;
+def Mextend : Flag<["-"], "Mextend">, Group<pgi_fortran_Group>,
+  HelpText<"Allow lines up to 132 characters in Fortran sources">;
+def Mpreprocess : Flag<["-"], "Mpreprocess">, Group<pgi_fortran_Group>,
+  HelpText<"Preprocess Fortran files">;
+def Mstandard: Flag<["-"], "Mstandard">, Group<pgi_fortran_Group>,
+  HelpText<"Check Fortran standard conformance">;
+def Mchkptr: Flag<["-"], "Mchkptr">, Group<pgi_fortran_Group>;
+defm Minline: BooleanMFlag<"inline">, Group<pgi_fortran_Group>;
+def fma: Flag<["-"], "fma">, Group<pgi_fortran_Group>,
+  HelpText<"Enable generation of FMA instructions">;
+def nofma: Flag<["-"], "nofma">, Group<pgi_fortran_Group>,
+  HelpText<"Disable generation of FMA instructions">;
+defm Mfma: BooleanMFlag<"fma">, Group<pgi_fortran_Group>,
+  HelpText<"Enable generation of FMA instructions">;
+def mp: Flag<["-"], "mp">, Group<pgi_fortran_Group>,
+  HelpText<"Enable OpenMP">;
+def nomp: Flag<["-"], "nomp">, Group<pgi_fortran_Group>,
+  HelpText<"Do not link with OpenMP library libomp">;
+def Mflushz_on: Flag<["-"], "Mflushz">, Group<pgi_fortran_Group>,
+  HelpText<"Set SSE to flush-to-zero mode">;
+def Mflushz_off: Flag<["-"], "Mnoflushz">, Group<pgi_fortran_Group>,
+  HelpText<"Disabling setting SSE to flush-to-zero mode">;
+def Msave_on: Flag<["-"], "Msave">, Group<pgi_fortran_Group>,
+  HelpText<"Assume all Fortran variables have SAVE attribute">;
+def Msave_off: Flag<["-"], "Mnosave">, Group<pgi_fortran_Group>,
+  HelpText<"Assume no Fortran variables have SAVE attribute">;
+def Mcache_align_on: Flag<["-"], "Mcache_align">, Group<pgi_fortran_Group>,
+  HelpText<"Align large objects on cache-line boundaries">;
+def Mcache_align_off: Flag<["-"], "Mnocache_align">, Group<pgi_fortran_Group>,
+  HelpText<"Disable aligning large objects on cache-line boundaries">;
+def ModuleDir : Separate<["-"], "module">, Group<pgi_fortran_Group>,
+  HelpText<"Fortran module path">;
+def Minform_EQ : Joined<["-"], "Minform=">,
+  HelpText<"Set error level of messages to display">;
+def Mallocatable_EQ : Joined<["-"], "Mallocatable=">,
+  HelpText<"Select semantics for assignments to allocatables (F03 or F95)">;
+def Mbyteswapio: Flag<["-"], "Mbyteswapio">, Group<pgi_fortran_Group>,
+  HelpText<"Swap byte-order for unformatted input/output">;
+def byteswapio: Flag<["-"], "byteswapio">, Group<gfortran_Group>,
+  HelpText<"Swap byte-order for unformatted input/output">;
+def Mbackslash: Flag<["-"], "Mbackslash">, Group<pgi_fortran_Group>,
+  HelpText<"Treat backslash like any other character in character strings">;
+def Mnobackslash: Flag<["-"], "Mnobackslash">, Group<pgi_fortran_Group>,
+  HelpText<"Treat backslash as C-style escape character">;
+def staticFlangLibs: Flag<["-"], "static-flang-libs">, Group<flang_rt_Group>,
+  HelpText<"Link using static Flang libraries">;
+def noFlangLibs: Flag<["-"], "no-flang-libs">, Group<flang_rt_Group>,
+  HelpText<"Do not link against Flang libraries">;
+def r8: Flag<["-"], "r8">, Group<pgi_fortran_Group>,
+  HelpText<"Treat REAL as REAL*8">;
+def i8: Flag<["-"], "i8">, Group<pgi_fortran_Group>,
+  HelpText<"Treat INTEGER and LOGICAL as INTEGER*8 and LOGICAL*8">;
+def no_fortran_main: Flag<["-"], "fno-fortran-main">, Group<gfortran_Group>,
+  HelpText<"Don't link in Fortran main">;
+def Mnomain: Flag<["-"], "Mnomain">, Group<pgi_fortran_Group>,
+  HelpText<"Don't link in Fortran main">;
+
+// Flang internal debug options
+def Mx_EQ : Joined<["-"], "Mx,">, Group<pgi_fortran_Group>;
+def My_EQ : Joined<["-"], "My,">, Group<pgi_fortran_Group>;
+def Hx_EQ : Joined<["-"], "Hx,">, Group<pgi_fortran_Group>;
+def Hy_EQ : Joined<["-"], "Hy,">, Group<pgi_fortran_Group>;
+def Wm_EQ : Joined<["-"], "Wm,">, Group<pgi_fortran_Group>;
+
+def Mq_EQ : Joined<["-"], "Mq,">, Group<pgi_fortran_Group>;
+def Hq_EQ : Joined<["-"], "Hq,">, Group<pgi_fortran_Group>;
+def Mqq_EQ : Joined<["-"], "Mqq,">, Group<pgi_fortran_Group>;
+def Hqq_EQ : Joined<["-"], "Hqq,">, Group<pgi_fortran_Group>;
+def Wh_EQ : Joined<["-"], "Wh,">, Group<pgi_fortran_Group>;
 
 include "CC1Options.td"
 
diff --git a/include/clang/Driver/Phases.h b/include/clang/Driver/Phases.h
index cd6b5b5c9f05..fe96caaaf800 100644
--- a/include/clang/Driver/Phases.h
+++ b/include/clang/Driver/Phases.h
@@ -18,6 +18,7 @@ namespace phases {
   enum ID {
     Preprocess,
     Precompile,
+    FortranFrontend,
     Compile,
     Backend,
     Assemble,
diff --git a/include/clang/Driver/SanitizerArgs.h b/include/clang/Driver/SanitizerArgs.h
index a31ac05afc1b..d144e488b56b 100644
--- a/include/clang/Driver/SanitizerArgs.h
+++ b/include/clang/Driver/SanitizerArgs.h
@@ -30,7 +30,7 @@ class SanitizerArgs {
   std::vector<std::string> ExtraDeps;
   int CoverageFeatures = 0;
   int MsanTrackOrigins = 0;
-  bool MsanUseAfterDtor = false;
+  bool MsanUseAfterDtor = true;
   bool CfiCrossDso = false;
   bool CfiICallGeneralizePointers = false;
   int AsanFieldPadding = 0;
diff --git a/include/clang/Driver/ToolChain.h b/include/clang/Driver/ToolChain.h
index 13f54d3718b4..9bcec45e6c90 100644
--- a/include/clang/Driver/ToolChain.h
+++ b/include/clang/Driver/ToolChain.h
@@ -107,11 +107,13 @@ class ToolChain {
   /// programs.
   path_list ProgramPaths;
 
+  mutable std::unique_ptr<Tool> FlangFrontend;
   mutable std::unique_ptr<Tool> Clang;
   mutable std::unique_ptr<Tool> Assemble;
   mutable std::unique_ptr<Tool> Link;
   mutable std::unique_ptr<Tool> OffloadBundler;
   Tool *getClang() const;
+  Tool *getFlangFrontend() const;
   Tool *getAssemble() const;
   Tool *getLink() const;
   Tool *getClangAs() const;
@@ -450,6 +452,14 @@ class ToolChain {
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const;
 
+  /// \brief Add the flang arguments for system include paths.
+  ///
+  /// This routine is responsible for adding the -stdinc argument to
+  /// include headers and module files from standard system header directories.
+  virtual void
+  AddFlangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &Flang1Args) const { }
+
   /// \brief Add options that need to be passed to cc1 for this target.
   virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                                      llvm::opt::ArgStringList &CC1Args,
@@ -514,6 +524,11 @@ class ToolChain {
   virtual VersionTuple computeMSVCVersion(const Driver *D,
                                           const llvm::opt::ArgList &Args) const;
 
+  /// AddFortranStdlibLibArgs - Add the system specific linker arguments to use
+  /// for the given Fortran runtime library type.
+  virtual void AddFortranStdlibLibArgs(const llvm::opt::ArgList &Args,
+                                       llvm::opt::ArgStringList &CmdArgs) const;
+
   /// \brief Return sanitizers which are available in this toolchain.
   virtual SanitizerMask getSupportedSanitizers() const;
 
diff --git a/include/clang/Driver/Types.def b/include/clang/Driver/Types.def
index 2430b5b924c3..0fd9c2779cf4 100644
--- a/include/clang/Driver/Types.def
+++ b/include/clang/Driver/Types.def
@@ -69,12 +69,16 @@ TYPE("objective-c++-header",     ObjCXXHeader, PP_ObjCXXHeader, "h",     "pu")
 TYPE("c++-module",               CXXModule,    PP_CXXModule,    "cppm",  "mu")
 TYPE("c++-module-cpp-output",    PP_CXXModule, INVALID,         "iim",   "m")
 
+// Fortran
+TYPE("f77",                      PP_F_FixedForm, INVALID,       "f",   "u")
+TYPE("f77-cpp-input",            F_FixedForm,  PP_F_FixedForm,  "F",   "u")
+TYPE("f95",                      PP_F_FreeForm, INVALID,        "f95", "u")
+TYPE("f95-cpp-input",            F_FreeForm,   PP_F_FreeForm,   "F95", "u")
+
 // Other languages.
 TYPE("ada",                      Ada,          INVALID,         nullptr, "u")
 TYPE("assembler",                PP_Asm,       INVALID,         "s",     "au")
 TYPE("assembler-with-cpp",       Asm,          PP_Asm,          "S",     "au")
-TYPE("f95",                      PP_Fortran,   INVALID,         nullptr, "u")
-TYPE("f95-cpp-input",            Fortran,      PP_Fortran,      nullptr, "u")
 TYPE("java",                     Java,         INVALID,         nullptr, "u")
 
 // LLVM IR/LTO types. We define separate types for IR and LTO because LTO
diff --git a/include/clang/Driver/Types.h b/include/clang/Driver/Types.h
index 22a26ae46a0b..ae3fa36231b5 100644
--- a/include/clang/Driver/Types.h
+++ b/include/clang/Driver/Types.h
@@ -107,6 +107,15 @@ namespace types {
   /// source file type (used for clang-cl emulation of \Yc).
   ID lookupHeaderTypeForSourceType(ID Id);
 
+  /// isFortran -- is it a Fortran input
+  bool isFortran(ID Id);
+
+  /// isFreeFormFortran -- is it a free form layout Fortran input
+  bool isFreeFormFortran(ID Id);
+
+  /// isFixedFormFortran -- is it a fixed form layout Fortran input
+  bool isFixedFormFortran(ID Id);
+
 } // end namespace types
 } // end namespace driver
 } // end namespace clang
diff --git a/include/clang/Format/Format.h b/include/clang/Format/Format.h
index d27d934f7679..bffa6ddac222 100644
--- a/include/clang/Format/Format.h
+++ b/include/clang/Format/Format.h
@@ -1363,36 +1363,60 @@ struct FormatStyle {
 
   /// See documentation of ``RawStringFormats``.
   struct RawStringFormat {
-    /// \brief The delimiter that this raw string format matches.
-    std::string Delimiter;
     /// \brief The language of this raw string.
     LanguageKind Language;
+    /// \brief A list of raw string delimiters that match this language.
+    std::vector<std::string> Delimiters;
+    /// \brief A list of enclosing function names that match this language.
+    std::vector<std::string> EnclosingFunctions;
+    /// \brief The canonical delimiter for this language.
+    std::string CanonicalDelimiter;
     /// \brief The style name on which this raw string format is based on.
     /// If not specified, the raw string format is based on the style that this
     /// format is based on.
     std::string BasedOnStyle;
     bool operator==(const RawStringFormat &Other) const {
-      return Delimiter == Other.Delimiter && Language == Other.Language &&
+      return Language == Other.Language && Delimiters == Other.Delimiters &&
+             EnclosingFunctions == Other.EnclosingFunctions &&
+             CanonicalDelimiter == Other.CanonicalDelimiter &&
              BasedOnStyle == Other.BasedOnStyle;
     }
   };
 
-  /// \brief Raw string delimiters denoting that the raw string contents are
-  /// code in a particular language and can be reformatted.
+  /// \brief Defines hints for detecting supported languages code blocks in raw
+  /// strings.
+  ///
+  /// A raw string with a matching delimiter or a matching enclosing function
+  /// name will be reformatted assuming the specified language based on the
+  /// style for that language defined in the .clang-format file. If no style has
+  /// been defined in the .clang-format file for the specific language, a
+  /// predefined style given by 'BasedOnStyle' is used. If 'BasedOnStyle' is not
+  /// found, the formatting is based on llvm style. A matching delimiter takes
+  /// precedence over a matching enclosing function name for determining the
+  /// language of the raw string contents.
   ///
-  /// A raw string with a matching delimiter will be reformatted assuming the
-  /// specified language based on a predefined style given by 'BasedOnStyle'.
-  /// If 'BasedOnStyle' is not found, the formatting is based on llvm style.
+  /// If a canonical delimiter is specified, occurrences of other delimiters for
+  /// the same language will be updated to the canonical if possible.
+  ///
+  /// There should be at most one specification per language and each delimiter
+  /// and enclosing function should not occur in multiple specifications.
   ///
   /// To configure this in the .clang-format file, use:
   /// \code{.yaml}
   ///   RawStringFormats:
-  ///     - Delimiter: 'pb'
-  ///       Language:  TextProto
-  ///       BasedOnStyle: llvm
-  ///     - Delimiter: 'proto'
-  ///       Language:  TextProto
-  ///       BasedOnStyle: google
+  ///     - Language: TextProto
+  ///         Delimiters:
+  ///           - 'pb'
+  ///           - 'proto'
+  ///         EnclosingFunctions:
+  ///           - 'PARSE_TEXT_PROTO'
+  ///         BasedOnStyle: google
+  ///     - Language: Cpp
+  ///         Delimiters:
+  ///           - 'cc'
+  ///           - 'cpp'
+  ///         BasedOnStyle: llvm
+  ///         CanonicalDelimiter: 'cc'
   /// \endcode
   std::vector<RawStringFormat> RawStringFormats;
 
@@ -1685,6 +1709,43 @@ struct FormatStyle {
            Standard == R.Standard && TabWidth == R.TabWidth &&
            UseTab == R.UseTab;
   }
+
+  llvm::Optional<FormatStyle> GetLanguageStyle(LanguageKind Language) const;
+
+  // Stores per-language styles. A FormatStyle instance inside has an empty
+  // StyleSet. A FormatStyle instance returned by the Get method has its
+  // StyleSet set to a copy of the originating StyleSet, effectively keeping the
+  // internal representation of that StyleSet alive.
+  //
+  // The memory management and ownership reminds of a birds nest: chicks
+  // leaving the nest take photos of the nest with them.
+  struct FormatStyleSet {
+    typedef std::map<FormatStyle::LanguageKind, FormatStyle> MapType;
+
+    llvm::Optional<FormatStyle> Get(FormatStyle::LanguageKind Language) const;
+
+    // Adds \p Style to this FormatStyleSet. Style must not have an associated
+    // FormatStyleSet.
+    // Style.Language should be different than LK_None. If this FormatStyleSet
+    // already contains an entry for Style.Language, that gets replaced with the
+    // passed Style.
+    void Add(FormatStyle Style);
+
+    // Clears this FormatStyleSet.
+    void Clear();
+
+  private:
+    std::shared_ptr<MapType> Styles;
+  };
+
+  static FormatStyleSet BuildStyleSetFromConfiguration(
+      const FormatStyle &MainStyle,
+      const std::vector<FormatStyle> &ConfigurationStyles);
+
+private:
+  FormatStyleSet StyleSet;
+
+  friend std::error_code parseConfiguration(StringRef Text, FormatStyle *Style);
 };
 
 /// \brief Returns a format style complying with the LLVM coding standards:
@@ -1730,6 +1791,8 @@ bool getPredefinedStyle(StringRef Name, FormatStyle::LanguageKind Language,
 /// Style->Language is used to get the base style, if the ``BasedOnStyle``
 /// option is present.
 ///
+/// The FormatStyleSet of Style is reset.
+///
 /// When ``BasedOnStyle`` is not present, options not present in the YAML
 /// document, are retained in \p Style.
 std::error_code parseConfiguration(StringRef Text, FormatStyle *Style);
diff --git a/include/clang/Frontend/CodeGenOptions.def b/include/clang/Frontend/CodeGenOptions.def
index bb91cf5f742b..7db368ccf4fe 100644
--- a/include/clang/Frontend/CodeGenOptions.def
+++ b/include/clang/Frontend/CodeGenOptions.def
@@ -38,6 +38,7 @@ CODEGENOPT(AssumeSaneOperatorNew , 1, 1) ///< implicit __attribute__((malloc)) o
 CODEGENOPT(Autolink          , 1, 1) ///< -fno-autolink
 CODEGENOPT(ObjCAutoRefCountExceptions , 1, 0) ///< Whether ARC should be EH-safe.
 CODEGENOPT(Backchain         , 1, 0) ///< -mbackchain
+CODEGENOPT(ControlFlowGuard  , 1, 0) ///< -cfguard
 CODEGENOPT(CoverageExtraChecksum, 1, 0) ///< Whether we need a second checksum for functions in GCNO files.
 CODEGENOPT(CoverageNoFunctionNamesInData, 1, 0) ///< Do not include function names in GCDA files.
 CODEGENOPT(CoverageExitBlockBeforeBody, 1, 0) ///< Whether to emit the exit block before the body blocks in GCNO files.
@@ -80,9 +81,13 @@ CODEGENOPT(InstrumentFunctionsAfterInlining , 1, 0) ///< Set when
                           ///< -finstrument-functions-after-inlining is enabled.
 CODEGENOPT(InstrumentFunctionEntryBare , 1, 0) ///< Set when
                                ///< -finstrument-function-entry-bare is enabled.
-
+CODEGENOPT(CFProtectionReturn , 1, 0) ///< if -fcf-protection is
+                                      ///< set to full or return.
+CODEGENOPT(CFProtectionBranch , 1, 0) ///< if -fcf-protection is
+                                      ///< set to full or branch.
 CODEGENOPT(XRayInstrumentFunctions , 1, 0) ///< Set when -fxray-instrument is
                                            ///< enabled.
+CODEGENOPT(StackSizeSection  , 1, 0) ///< Set when -fstack-size-section is enabled.
 
 ///< Set when -fxray-always-emit-customevents is enabled.
 CODEGENOPT(XRayAlwaysEmitCustomEvents , 1, 0)
diff --git a/include/clang/Frontend/CodeGenOptions.h b/include/clang/Frontend/CodeGenOptions.h
index 6b8d2b935fdd..183e6ac71240 100644
--- a/include/clang/Frontend/CodeGenOptions.h
+++ b/include/clang/Frontend/CodeGenOptions.h
@@ -16,6 +16,7 @@
 
 #include "clang/Basic/DebugInfoOptions.h"
 #include "clang/Basic/Sanitizers.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetOptions.h"
 #include <map>
@@ -167,7 +168,7 @@ class CodeGenOptions : public CodeGenOptionsBase {
   std::string SplitDwarfFile;
 
   /// The name of the relocation model to use.
-  std::string RelocationModel;
+  llvm::Reloc::Model RelocationModel;
 
   /// The thread model to use
   std::string ThreadModel;
diff --git a/include/clang/Frontend/PrecompiledPreamble.h b/include/clang/Frontend/PrecompiledPreamble.h
index 130fe60704a7..f1898f573ff6 100644
--- a/include/clang/Frontend/PrecompiledPreamble.h
+++ b/include/clang/Frontend/PrecompiledPreamble.h
@@ -104,14 +104,22 @@ class PrecompiledPreamble {
   /// Changes options inside \p CI to use PCH from this preamble. Also remaps
   /// main file to \p MainFileBuffer and updates \p VFS to ensure the preamble
   /// is accessible.
-  /// For in-memory preambles, PrecompiledPreamble instance continues to own
-  /// the MemoryBuffer with the Preamble after this method returns. The caller
-  /// is reponsible for making sure the PrecompiledPreamble instance outlives
-  /// the compiler run and the AST that will be using the PCH.
+  /// Requires that CanReuse() is true.
+  /// For in-memory preambles, PrecompiledPreamble instance continues to own the
+  /// MemoryBuffer with the Preamble after this method returns. The caller is
+  /// reponsible for making sure the PrecompiledPreamble instance outlives the
+  /// compiler run and the AST that will be using the PCH.
   void AddImplicitPreamble(CompilerInvocation &CI,
                            IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
                            llvm::MemoryBuffer *MainFileBuffer) const;
 
+  /// Configure \p CI to use this preamble.
+  /// Like AddImplicitPreamble, but doesn't assume CanReuse() is true.
+  /// If this preamble does not match the file, it may parse differently.
+  void OverridePreamble(CompilerInvocation &CI,
+                        IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
+                        llvm::MemoryBuffer *MainFileBuffer) const;
+
 private:
   PrecompiledPreamble(PCHStorage Storage, std::vector<char> PreambleBytes,
                       bool PreambleEndsAtStartOfLine,
@@ -222,6 +230,12 @@ class PrecompiledPreamble {
     }
   };
 
+  /// Helper function to set up PCH for the preamble into \p CI and \p VFS to
+  /// with the specified \p Bounds.
+  void configurePreamble(PreambleBounds Bounds, CompilerInvocation &CI,
+                         IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
+                         llvm::MemoryBuffer *MainFileBuffer) const;
+
   /// Sets up the PreprocessorOptions and changes VFS, so that PCH stored in \p
   /// Storage is accessible to clang. This method is an implementation detail of
   /// AddImplicitPreamble.
diff --git a/include/clang/Frontend/SerializedDiagnosticPrinter.h b/include/clang/Frontend/SerializedDiagnosticPrinter.h
index 4c57e9d404f0..36ef01488ffe 100644
--- a/include/clang/Frontend/SerializedDiagnosticPrinter.h
+++ b/include/clang/Frontend/SerializedDiagnosticPrinter.h
@@ -29,7 +29,7 @@ namespace serialized_diags {
 ///  a bitcode file.
 ///
 /// The created DiagnosticConsumer is designed for quick and lightweight
-/// transfer of of diagnostics to the enclosing build system (e.g., an IDE).
+/// transfer of diagnostics to the enclosing build system (e.g., an IDE).
 /// This allows wrapper tools for Clang to get diagnostics from Clang
 /// (via libclang) without needing to parse Clang's command line output.
 ///
diff --git a/include/clang/Lex/HeaderSearch.h b/include/clang/Lex/HeaderSearch.h
index 6b9dbfcd1e93..4e8d9cba7053 100644
--- a/include/clang/Lex/HeaderSearch.h
+++ b/include/clang/Lex/HeaderSearch.h
@@ -416,7 +416,7 @@ class HeaderSearch {
     return FrameworkMap[FWName];
   }
 
-  /// \brief Mark the specified file as a target of of a \#include,
+  /// \brief Mark the specified file as a target of a \#include,
   /// \#include_next, or \#import directive.
   ///
   /// \return false if \#including the file will have no effect or true
@@ -693,7 +693,7 @@ class HeaderSearch {
 
   /// \brief Retrieve a uniqued framework name.
   StringRef getUniqueFrameworkName(StringRef Framework);
-  
+
   /// \brief Suggest a path by which the specified file could be found, for
   /// use in diagnostics to suggest a #include.
   ///
@@ -702,6 +702,15 @@ class HeaderSearch {
   std::string suggestPathToFileForDiagnostics(const FileEntry *File,
                                               bool *IsSystem = nullptr);
 
+  /// \brief Suggest a path by which the specified file could be found, for
+  /// use in diagnostics to suggest a #include.
+  ///
+  /// \param WorkingDir If non-empty, this will be prepended to search directory
+  /// paths that are relative.
+  std::string suggestPathToFileForDiagnostics(llvm::StringRef File,
+                                              llvm::StringRef WorkingDir,
+                                              bool *IsSystem = nullptr);
+
   void PrintStats();
   
   size_t getTotalMemory() const;
diff --git a/include/clang/Lex/ModuleMap.h b/include/clang/Lex/ModuleMap.h
index 41ed8e49b6c1..b4ca32096273 100644
--- a/include/clang/Lex/ModuleMap.h
+++ b/include/clang/Lex/ModuleMap.h
@@ -98,6 +98,9 @@ class ModuleMap {
   /// \brief The top-level modules that are known.
   llvm::StringMap<Module *> Modules;
 
+  /// Shadow modules created while building this module map.
+  llvm::SmallVector<Module*, 2> ShadowModules;
+
   /// \brief The number of modules we have created in total.
   unsigned NumCreatedModules = 0;
 
@@ -195,6 +198,15 @@ class ModuleMap {
   /// header.
   llvm::DenseMap<const DirectoryEntry *, Module *> UmbrellaDirs;
 
+  /// \brief A generation counter that is used to test whether modules of the
+  /// same name may shadow or are illegal redefintions.
+  ///
+  /// Modules from earlier scopes may shadow modules from later ones.
+  /// Modules from the same scope may not have the same name.
+  unsigned CurrentModuleScopeID = 0;
+
+  llvm::DenseMap<Module *, unsigned> ModuleScopeIDs;
+
   /// \brief The set of attributes that can be attached to a module.
   struct Attributes {
     /// \brief Whether this is a system module.
@@ -502,6 +514,24 @@ class ModuleMap {
   Module *inferFrameworkModule(const DirectoryEntry *FrameworkDir,
                                bool IsSystem, Module *Parent);
 
+  /// \brief Create a new top-level module that is shadowed by
+  /// \p ShadowingModule.
+  Module *createShadowedModule(StringRef Name, bool IsFramework,
+                               Module *ShadowingModule);
+
+  /// \brief Creates a new declaration scope for module names, allowing
+  /// previously defined modules to shadow definitions from the new scope.
+  ///
+  /// \note Module names from earlier scopes will shadow names from the new
+  /// scope, which is the opposite of how shadowing works for variables.
+  void finishModuleDeclarationScope() { CurrentModuleScopeID += 1; }
+
+  bool mayShadowNewModule(Module *ExistingModule) {
+    assert(!ExistingModule->Parent && "expected top-level module");
+    assert(ModuleScopeIDs.count(ExistingModule) && "unknown module");
+    return ModuleScopeIDs[ExistingModule] < CurrentModuleScopeID;
+  }
+
   /// \brief Retrieve the module map file containing the definition of the given
   /// module.
   ///
@@ -608,8 +638,8 @@ class ModuleMap {
   ///
   /// \returns true if an error occurred, false otherwise.
   bool parseModuleMapFile(const FileEntry *File, bool IsSystem,
-                          const DirectoryEntry *HomeDir, FileID ID = FileID(),
-                          unsigned *Offset = nullptr,
+                          const DirectoryEntry *HomeDir,
+                          FileID ID = FileID(), unsigned *Offset = nullptr,
                           SourceLocation ExternModuleLoc = SourceLocation());
 
   /// \brief Dump the contents of the module map, for debugging purposes.
diff --git a/include/clang/Lex/MultipleIncludeOpt.h b/include/clang/Lex/MultipleIncludeOpt.h
index 3967f8688966..00d3047e536e 100644
--- a/include/clang/Lex/MultipleIncludeOpt.h
+++ b/include/clang/Lex/MultipleIncludeOpt.h
@@ -119,7 +119,7 @@ class MultipleIncludeOpt {
   /// Note, we don't care about the input value of 'ReadAnyTokens'.  The caller
   /// ensures that this is only called if there are no tokens read before the
   /// \#ifndef.  The caller is required to do this, because reading the \#if
-  /// line obviously reads in in tokens.
+  /// line obviously reads in tokens.
   void EnterTopLevelIfndef(const IdentifierInfo *M, SourceLocation Loc) {
     // If the macro is already set, this is after the top-level #endif.
     if (TheMacro)
diff --git a/include/clang/Lex/PreprocessingRecord.h b/include/clang/Lex/PreprocessingRecord.h
index 5f7a6efcef10..77da1eb26f2c 100644
--- a/include/clang/Lex/PreprocessingRecord.h
+++ b/include/clang/Lex/PreprocessingRecord.h
@@ -297,6 +297,9 @@ class Token;
                                                         FileID FID) {
       return None;
     }
+
+    /// \brief Read a preallocated skipped range from the external source.
+    virtual SourceRange ReadSkippedRange(unsigned Index) = 0;
   };
   
   /// \brief A record of the steps taken while preprocessing a source file,
@@ -322,6 +325,8 @@ class Token;
     /// \brief The set of ranges that were skipped by the preprocessor,
     std::vector<SourceRange> SkippedRanges;
 
+    bool SkippedRangesAllLoaded = true;
+
     /// \brief Global (loaded or local) ID for a preprocessed entity.
     /// Negative values are used to indicate preprocessed entities
     /// loaded from the external source while non-negative values are used to
@@ -377,6 +382,16 @@ class Token;
     /// corresponds to the first newly-allocated entity.
     unsigned allocateLoadedEntities(unsigned NumEntities);
 
+    /// \brief Allocate space for a new set of loaded preprocessed skipped
+    /// ranges.
+    ///
+    /// \returns The index into the set of loaded preprocessed ranges, which
+    /// corresponds to the first newly-allocated range.
+    unsigned allocateSkippedRanges(unsigned NumRanges);
+
+    /// \brief Ensures that all external skipped ranges have been loaded.
+    void ensureSkippedRangesLoaded();
+
     /// \brief Register a new macro definition.
     void RegisterMacroDefinition(MacroInfo *Macro, MacroDefinitionRecord *Def);
 
@@ -499,7 +514,8 @@ class Token;
     MacroDefinitionRecord *findMacroDefinition(const MacroInfo *MI);
 
     /// \brief Retrieve all ranges that got skipped while preprocessing.
-    const std::vector<SourceRange> &getSkippedRanges() const {
+    const std::vector<SourceRange> &getSkippedRanges() {
+      ensureSkippedRangesLoaded();
       return SkippedRanges;
     }
         
diff --git a/include/clang/Lex/VariadicMacroSupport.h b/include/clang/Lex/VariadicMacroSupport.h
index cebaf15187de..db1ce0e6be1e 100644
--- a/include/clang/Lex/VariadicMacroSupport.h
+++ b/include/clang/Lex/VariadicMacroSupport.h
@@ -55,7 +55,7 @@ namespace clang {
 
     /// Client code should call this function as soon as the Preprocessor has
     /// either completed lexing the macro's definition tokens, or an error
-    /// occured and the context is being exited.  This function is idempotent
+    /// occurred and the context is being exited.  This function is idempotent
     /// (might be explicitly called, and then reinvoked via the destructor).
     void exitScope() {
       Ident__VA_ARGS__->setIsPoisoned(true);
diff --git a/include/clang/Parse/Parser.h b/include/clang/Parse/Parser.h
index 8f136f4f4438..6b722929f60c 100644
--- a/include/clang/Parse/Parser.h
+++ b/include/clang/Parse/Parser.h
@@ -185,6 +185,9 @@ class Parser : public CodeCompletionHandler {
   std::unique_ptr<PragmaHandler> UnrollHintHandler;
   std::unique_ptr<PragmaHandler> NoUnrollHintHandler;
   std::unique_ptr<PragmaHandler> FPHandler;
+  std::unique_ptr<PragmaHandler> STDCFENVHandler;
+  std::unique_ptr<PragmaHandler> STDCCXLIMITHandler;
+  std::unique_ptr<PragmaHandler> STDCUnknownHandler;
   std::unique_ptr<PragmaHandler> AttributePragmaHandler;
 
   std::unique_ptr<CommentHandler> CommentSemaHandler;
@@ -1856,7 +1859,7 @@ class Parser : public CodeCompletionHandler {
   /// A context for parsing declaration specifiers.  TODO: flesh this
   /// out, there are other significant restrictions on specifiers than
   /// would be best implemented in the parser.
-  enum DeclSpecContext {
+  enum class DeclSpecContext {
     DSC_normal, // normal context
     DSC_class,  // class context, enables 'friend'
     DSC_type_specifier, // C++ type-specifier-seq or C specifier-qualifier-list
@@ -1873,18 +1876,18 @@ class Parser : public CodeCompletionHandler {
   /// trailing-type-specifier)?
   static bool isTypeSpecifier(DeclSpecContext DSC) {
     switch (DSC) {
-    case DSC_normal:
-    case DSC_template_param:
-    case DSC_class:
-    case DSC_top_level:
-    case DSC_objc_method_result:
-    case DSC_condition:
+    case DeclSpecContext::DSC_normal:
+    case DeclSpecContext::DSC_template_param:
+    case DeclSpecContext::DSC_class:
+    case DeclSpecContext::DSC_top_level:
+    case DeclSpecContext::DSC_objc_method_result:
+    case DeclSpecContext::DSC_condition:
       return false;
 
-    case DSC_template_type_arg:
-    case DSC_type_specifier:
-    case DSC_trailing:
-    case DSC_alias_declaration:
+    case DeclSpecContext::DSC_template_type_arg:
+    case DeclSpecContext::DSC_type_specifier:
+    case DeclSpecContext::DSC_trailing:
+    case DeclSpecContext::DSC_alias_declaration:
       return true;
     }
     llvm_unreachable("Missing DeclSpecContext case");
@@ -1894,18 +1897,18 @@ class Parser : public CodeCompletionHandler {
   /// deduction?
   static bool isClassTemplateDeductionContext(DeclSpecContext DSC) {
     switch (DSC) {
-    case DSC_normal:
-    case DSC_template_param:
-    case DSC_class:
-    case DSC_top_level:
-    case DSC_condition:
-    case DSC_type_specifier:
+    case DeclSpecContext::DSC_normal:
+    case DeclSpecContext::DSC_template_param:
+    case DeclSpecContext::DSC_class:
+    case DeclSpecContext::DSC_top_level:
+    case DeclSpecContext::DSC_condition:
+    case DeclSpecContext::DSC_type_specifier:
       return true;
 
-    case DSC_objc_method_result:
-    case DSC_template_type_arg:
-    case DSC_trailing:
-    case DSC_alias_declaration:
+    case DeclSpecContext::DSC_objc_method_result:
+    case DeclSpecContext::DSC_template_type_arg:
+    case DeclSpecContext::DSC_trailing:
+    case DeclSpecContext::DSC_alias_declaration:
       return false;
     }
     llvm_unreachable("Missing DeclSpecContext case");
@@ -1954,17 +1957,19 @@ class Parser : public CodeCompletionHandler {
                         ParsedAttributesWithRange &Attrs);
   DeclSpecContext
   getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context);
-  void ParseDeclarationSpecifiers(DeclSpec &DS,
-                const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(),
-                                  AccessSpecifier AS = AS_none,
-                                  DeclSpecContext DSC = DSC_normal,
-                                  LateParsedAttrList *LateAttrs = nullptr);
-  bool DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS,
-                                       DeclSpecContext DSContext,
-                                       LateParsedAttrList *LateAttrs = nullptr);
-
-  void ParseSpecifierQualifierList(DeclSpec &DS, AccessSpecifier AS = AS_none,
-                                   DeclSpecContext DSC = DSC_normal);
+  void ParseDeclarationSpecifiers(
+      DeclSpec &DS,
+      const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(),
+      AccessSpecifier AS = AS_none,
+      DeclSpecContext DSC = DeclSpecContext::DSC_normal,
+      LateParsedAttrList *LateAttrs = nullptr);
+  bool DiagnoseMissingSemiAfterTagDefinition(
+      DeclSpec &DS, AccessSpecifier AS, DeclSpecContext DSContext,
+      LateParsedAttrList *LateAttrs = nullptr);
+
+  void ParseSpecifierQualifierList(
+      DeclSpec &DS, AccessSpecifier AS = AS_none,
+      DeclSpecContext DSC = DeclSpecContext::DSC_normal);
 
   void ParseObjCTypeQualifierList(ObjCDeclSpec &DS,
                                   DeclaratorContext Context);
@@ -2670,30 +2675,42 @@ class Parser : public CodeCompletionHandler {
   /// \brief Parses clause with a single expression of a kind \a Kind.
   ///
   /// \param Kind Kind of current clause.
+  /// \param ParseOnly true to skip the clause's semantic actions and return
+  /// nullptr.
   ///
-  OMPClause *ParseOpenMPSingleExprClause(OpenMPClauseKind Kind);
+  OMPClause *ParseOpenMPSingleExprClause(OpenMPClauseKind Kind,
+                                         bool ParseOnly);
   /// \brief Parses simple clause of a kind \a Kind.
   ///
   /// \param Kind Kind of current clause.
+  /// \param ParseOnly true to skip the clause's semantic actions and return
+  /// nullptr.
   ///
-  OMPClause *ParseOpenMPSimpleClause(OpenMPClauseKind Kind);
+  OMPClause *ParseOpenMPSimpleClause(OpenMPClauseKind Kind, bool ParseOnly);
   /// \brief Parses clause with a single expression and an additional argument
   /// of a kind \a Kind.
   ///
   /// \param Kind Kind of current clause.
+  /// \param ParseOnly true to skip the clause's semantic actions and return
+  /// nullptr.
   ///
-  OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind);
+  OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind,
+                                                bool ParseOnly);
   /// \brief Parses clause without any additional arguments.
   ///
   /// \param Kind Kind of current clause.
+  /// \param ParseOnly true to skip the clause's semantic actions and return
+  /// nullptr.
   ///
-  OMPClause *ParseOpenMPClause(OpenMPClauseKind Kind);
+  OMPClause *ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly = false);
   /// \brief Parses clause with the list of variables of a kind \a Kind.
   ///
   /// \param Kind Kind of current clause.
+  /// \param ParseOnly true to skip the clause's semantic actions and return
+  /// nullptr.
   ///
   OMPClause *ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
-                                      OpenMPClauseKind Kind);
+                                      OpenMPClauseKind Kind, bool ParseOnly);
 
 public:
   /// Parses simple expression in parens for single-expression clauses of OpenMP
diff --git a/include/clang/Sema/CodeCompleteConsumer.h b/include/clang/Sema/CodeCompleteConsumer.h
index 5d280b5608e7..2fea262a44fc 100644
--- a/include/clang/Sema/CodeCompleteConsumer.h
+++ b/include/clang/Sema/CodeCompleteConsumer.h
@@ -268,6 +268,8 @@ class CodeCompletionContext {
     CCC_Recovery
   };
 
+  using VisitedContextSet = llvm::SmallPtrSet<DeclContext*, 8>;
+
 private:
   enum Kind Kind;
 
@@ -285,6 +287,10 @@ class CodeCompletionContext {
   /// "a::b::"
   llvm::Optional<CXXScopeSpec> ScopeSpecifier;
 
+  /// \brief A set of declaration contexts visited by Sema when doing lookup for
+  /// code completion.
+  VisitedContextSet VisitedContexts;
+
 public:
   /// \brief Construct a new code-completion context of the given kind.
   CodeCompletionContext(enum Kind Kind) : Kind(Kind), SelIdents(None) { }
@@ -328,6 +334,16 @@ class CodeCompletionContext {
     this->ScopeSpecifier = std::move(SS);
   }
 
+  /// \brief Adds a visited context.
+  void addVisitedContext(DeclContext* Ctx) {
+    VisitedContexts.insert(Ctx);
+  }
+
+  /// \brief Retrieves all visited contexts.
+  const VisitedContextSet &getVisitedContexts() const {
+    return VisitedContexts;
+  }
+
   llvm::Optional<const CXXScopeSpec *> getCXXScopeSpecifier() {
     if (ScopeSpecifier)
       return ScopeSpecifier.getPointer();
@@ -934,6 +950,12 @@ class CodeCompleteConsumer {
     return CodeCompleteOpts.IncludeBriefComments;
   }
 
+  /// \brief Hint whether to load data from the external AST in order to provide
+  /// full results. If false, declarations from the preamble may be omitted.
+  bool loadExternal() const {
+    return CodeCompleteOpts.LoadExternal;
+  }
+
   /// \brief Determine whether the output of this consumer is binary.
   bool isOutputBinary() const { return OutputIsBinary; }
 
diff --git a/include/clang/Sema/CodeCompleteOptions.h b/include/clang/Sema/CodeCompleteOptions.h
index 091d8ca60505..bdd4732fd773 100644
--- a/include/clang/Sema/CodeCompleteOptions.h
+++ b/include/clang/Sema/CodeCompleteOptions.h
@@ -35,9 +35,14 @@ class CodeCompleteOptions {
   /// Show brief documentation comments in code completion results.
   unsigned IncludeBriefComments : 1;
 
+  /// Hint whether to load data from the external AST to provide full results.
+  /// If false, namespace-level declarations from the preamble may be omitted.
+  unsigned LoadExternal : 1;
+
   CodeCompleteOptions()
       : IncludeMacros(0), IncludeCodePatterns(0), IncludeGlobals(1),
-        IncludeNamespaceLevelDecls(1), IncludeBriefComments(0) {}
+        IncludeNamespaceLevelDecls(1), IncludeBriefComments(0),
+        LoadExternal(1) {}
 };
 
 } // namespace clang
diff --git a/include/clang/Sema/DeclSpec.h b/include/clang/Sema/DeclSpec.h
index 39aa2f24ad76..2e7411337bfc 100644
--- a/include/clang/Sema/DeclSpec.h
+++ b/include/clang/Sema/DeclSpec.h
@@ -882,6 +882,30 @@ class ObjCDeclSpec {
 
 };
 
+/// \brief Describes the kind of unqualified-id parsed.
+enum class UnqualifiedIdKind {
+  /// \brief An identifier.
+  IK_Identifier,
+  /// \brief An overloaded operator name, e.g., operator+.
+  IK_OperatorFunctionId,
+  /// \brief A conversion function name, e.g., operator int.
+  IK_ConversionFunctionId,
+  /// \brief A user-defined literal name, e.g., operator "" _i.
+  IK_LiteralOperatorId,
+  /// \brief A constructor name.
+  IK_ConstructorName,
+  /// \brief A constructor named via a template-id.
+  IK_ConstructorTemplateId,
+  /// \brief A destructor name.
+  IK_DestructorName,
+  /// \brief A template-id, e.g., f<int>.
+  IK_TemplateId,
+  /// \brief An implicit 'self' parameter
+  IK_ImplicitSelfParam,
+  /// \brief A deduction-guide name (a template-name)
+  IK_DeductionGuideName
+};
+
 /// \brief Represents a C++ unqualified-id that has been parsed. 
 class UnqualifiedId {
 private:
@@ -890,28 +914,7 @@ class UnqualifiedId {
 
 public:
   /// \brief Describes the kind of unqualified-id parsed.
-  enum IdKind {
-    /// \brief An identifier.
-    IK_Identifier,
-    /// \brief An overloaded operator name, e.g., operator+.
-    IK_OperatorFunctionId,
-    /// \brief A conversion function name, e.g., operator int.
-    IK_ConversionFunctionId,
-    /// \brief A user-defined literal name, e.g., operator "" _i.
-    IK_LiteralOperatorId,
-    /// \brief A constructor name.
-    IK_ConstructorName,
-    /// \brief A constructor named via a template-id.
-    IK_ConstructorTemplateId,
-    /// \brief A destructor name.
-    IK_DestructorName,
-    /// \brief A template-id, e.g., f<int>.
-    IK_TemplateId,
-    /// \brief An implicit 'self' parameter
-    IK_ImplicitSelfParam,
-    /// \brief A deduction-guide name (a template-name)
-    IK_DeductionGuideName
-  } Kind;
+  UnqualifiedIdKind Kind;
 
   struct OFI {
     /// \brief The kind of overloaded operator.
@@ -966,13 +969,14 @@ class UnqualifiedId {
   
   /// \brief The location of the last token that describes this unqualified-id.
   SourceLocation EndLocation;
-  
-  UnqualifiedId() : Kind(IK_Identifier), Identifier(nullptr) { }
+
+  UnqualifiedId()
+      : Kind(UnqualifiedIdKind::IK_Identifier), Identifier(nullptr) {}
 
   /// \brief Clear out this unqualified-id, setting it to default (invalid) 
   /// state.
   void clear() {
-    Kind = IK_Identifier;
+    Kind = UnqualifiedIdKind::IK_Identifier;
     Identifier = nullptr;
     StartLocation = SourceLocation();
     EndLocation = SourceLocation();
@@ -985,15 +989,15 @@ class UnqualifiedId {
   bool isInvalid() const { return !isValid(); }
   
   /// \brief Determine what kind of name we have.
-  IdKind getKind() const { return Kind; }
-  void setKind(IdKind kind) { Kind = kind; } 
+  UnqualifiedIdKind getKind() const { return Kind; }
+  void setKind(UnqualifiedIdKind kind) { Kind = kind; } 
   
   /// \brief Specify that this unqualified-id was parsed as an identifier.
   ///
   /// \param Id the parsed identifier.
   /// \param IdLoc the location of the parsed identifier.
   void setIdentifier(const IdentifierInfo *Id, SourceLocation IdLoc) {
-    Kind = IK_Identifier;
+    Kind = UnqualifiedIdKind::IK_Identifier;
     Identifier = const_cast<IdentifierInfo *>(Id);
     StartLocation = EndLocation = IdLoc;
   }
@@ -1022,7 +1026,7 @@ class UnqualifiedId {
   void setConversionFunctionId(SourceLocation OperatorLoc, 
                                ParsedType Ty,
                                SourceLocation EndLoc) {
-    Kind = IK_ConversionFunctionId;
+    Kind = UnqualifiedIdKind::IK_ConversionFunctionId;
     StartLocation = OperatorLoc;
     EndLocation = EndLoc;
     ConversionFunctionId = Ty;
@@ -1038,7 +1042,7 @@ class UnqualifiedId {
   /// \param IdLoc the location of the identifier.
   void setLiteralOperatorId(const IdentifierInfo *Id, SourceLocation OpLoc,
                               SourceLocation IdLoc) {
-    Kind = IK_LiteralOperatorId;
+    Kind = UnqualifiedIdKind::IK_LiteralOperatorId;
     Identifier = const_cast<IdentifierInfo *>(Id);
     StartLocation = OpLoc;
     EndLocation = IdLoc;
@@ -1054,7 +1058,7 @@ class UnqualifiedId {
   void setConstructorName(ParsedType ClassType, 
                           SourceLocation ClassNameLoc,
                           SourceLocation EndLoc) {
-    Kind = IK_ConstructorName;
+    Kind = UnqualifiedIdKind::IK_ConstructorName;
     StartLocation = ClassNameLoc;
     EndLocation = EndLoc;
     ConstructorName = ClassType;
@@ -1077,7 +1081,7 @@ class UnqualifiedId {
   void setDestructorName(SourceLocation TildeLoc,
                          ParsedType ClassType,
                          SourceLocation EndLoc) {
-    Kind = IK_DestructorName;
+    Kind = UnqualifiedIdKind::IK_DestructorName;
     StartLocation = TildeLoc;
     EndLocation = EndLoc;
     DestructorName = ClassType;
@@ -1097,7 +1101,7 @@ class UnqualifiedId {
   /// \param TemplateLoc The location of the parsed template-name.
   void setDeductionGuideName(ParsedTemplateTy Template,
                              SourceLocation TemplateLoc) {
-    Kind = IK_DeductionGuideName;
+    Kind = UnqualifiedIdKind::IK_DeductionGuideName;
     TemplateName = Template;
     StartLocation = EndLocation = TemplateLoc;
   }
@@ -2040,7 +2044,7 @@ class Declarator {
       return false;
 
     // Special names can't have direct initializers.
-    if (Name.getKind() != UnqualifiedId::IK_Identifier)
+    if (Name.getKind() != UnqualifiedIdKind::IK_Identifier)
       return false;
 
     switch (Context) {
@@ -2090,8 +2094,8 @@ class Declarator {
   /// special C++ name (constructor, destructor, etc.), or a structured
   /// binding (which is not exactly a name, but occupies the same position).
   bool hasName() const {
-    return Name.getKind() != UnqualifiedId::IK_Identifier || Name.Identifier ||
-           isDecompositionDeclarator();
+    return Name.getKind() != UnqualifiedIdKind::IK_Identifier ||
+           Name.Identifier || isDecompositionDeclarator();
   }
 
   /// Return whether this declarator is a decomposition declarator.
@@ -2100,7 +2104,7 @@ class Declarator {
   }
 
   IdentifierInfo *getIdentifier() const { 
-    if (Name.getKind() == UnqualifiedId::IK_Identifier)
+    if (Name.getKind() == UnqualifiedIdKind::IK_Identifier)
       return Name.Identifier;
     
     return nullptr;
diff --git a/include/clang/Sema/Initialization.h b/include/clang/Sema/Initialization.h
index bd07b9ea9aee..122df47f6ecc 100644
--- a/include/clang/Sema/Initialization.h
+++ b/include/clang/Sema/Initialization.h
@@ -539,8 +539,15 @@ class InitializationKind {
   }
 
   static InitializationKind CreateDirectList(SourceLocation InitLoc) {
-    return InitializationKind(IK_DirectList, IC_Normal,
-                              InitLoc, InitLoc, InitLoc);
+    return InitializationKind(IK_DirectList, IC_Normal, InitLoc, InitLoc,
+                              InitLoc);
+  }
+
+  static InitializationKind CreateDirectList(SourceLocation InitLoc,
+                                             SourceLocation LBraceLoc,
+                                             SourceLocation RBraceLoc) {
+    return InitializationKind(IK_DirectList, IC_Normal, InitLoc, LBraceLoc,
+                              RBraceLoc);
   }
 
   /// \brief Create a direct initialization due to a cast that isn't a C-style 
@@ -598,7 +605,8 @@ class InitializationKind {
                                           Expr *Init) {
     if (!Init) return CreateDefault(Loc);
     if (!DirectInit) return CreateCopy(Loc, Init->getLocStart());
-    if (isa<InitListExpr>(Init)) return CreateDirectList(Loc);
+    if (isa<InitListExpr>(Init))
+      return CreateDirectList(Loc, Init->getLocStart(), Init->getLocEnd());
     return CreateDirect(Loc, Init->getLocStart(), Init->getLocEnd());
   }
   
@@ -660,12 +668,20 @@ class InitializationKind {
   bool allowExplicitConversionFunctionsInRefBinding() const {
     return !isCopyInit() || Context == IC_ExplicitConvs;
   }
+
+  /// Determine whether this initialization has a source range containing the
+  /// locations of open and closing parentheses or braces.
+  bool hasParenOrBraceRange() const {
+    return Kind == IK_Direct || Kind == IK_Value || Kind == IK_DirectList;
+  }
   
   /// \brief Retrieve the source range containing the locations of the open
-  /// and closing parentheses for value and direct initializations.
-  SourceRange getParenRange() const {
-    assert((Kind == IK_Direct || Kind == IK_Value) &&
-           "Only direct- and value-initialization have parentheses");
+  /// and closing parentheses or braces for value, direct, and direct list
+  /// initializations.
+  SourceRange getParenOrBraceRange() const {
+    assert(hasParenOrBraceRange() && "Only direct, value, and direct-list "
+                                     "initialization have parentheses or "
+                                     "braces");
     return SourceRange(Locations[1], Locations[2]);
   }
 };
diff --git a/include/clang/Sema/Lookup.h b/include/clang/Sema/Lookup.h
index 546df8842a35..0e8dcada7429 100644
--- a/include/clang/Sema/Lookup.h
+++ b/include/clang/Sema/Lookup.h
@@ -784,6 +784,12 @@ class VisibleDeclConsumer {
   /// class of the context we searched.
   virtual void FoundDecl(NamedDecl *ND, NamedDecl *Hiding, DeclContext *Ctx,
                          bool InBaseClass) = 0;
+
+  /// \brief Callback to inform the client that Sema entered into a new context
+  /// to find a visible declaration.
+  //
+  /// \param Ctx the context which Sema entered.
+  virtual void EnteredContext(DeclContext *Ctx) {}
 };
 
 /// \brief A class for storing results from argument-dependent lookup.
diff --git a/include/clang/Sema/MultiplexExternalSemaSource.h b/include/clang/Sema/MultiplexExternalSemaSource.h
index 1d681a00552f..4de36afbcb97 100644
--- a/include/clang/Sema/MultiplexExternalSemaSource.h
+++ b/include/clang/Sema/MultiplexExternalSemaSource.h
@@ -148,8 +148,10 @@ class MultiplexExternalSemaSource : public ExternalSemaSource {
   /// \brief Print any statistics that have been gathered regarding
   /// the external AST source.
   void PrintStats() override;
-  
-  
+
+  /// \brief Retrieve the module that corresponds to the given module ID.
+  Module *getModule(unsigned ID) override;
+
   /// \brief Perform layout on the given record.
   ///
   /// This routine allows the external AST source to provide an specific 
diff --git a/include/clang/Sema/Overload.h b/include/clang/Sema/Overload.h
index 05cfe53666ca..eb32b8fffd97 100644
--- a/include/clang/Sema/Overload.h
+++ b/include/clang/Sema/Overload.h
@@ -613,6 +613,10 @@ namespace clang {
     /// This inherited constructor is not viable because it would slice the
     /// argument.
     ovl_fail_inhctor_slice,
+
+    /// This candidate was not viable because it is a non-default multiversioned
+    /// function.
+    ovl_non_default_multiversion_function,
   };
 
   /// A list of implicit conversion sequences for the arguments of an
diff --git a/include/clang/Sema/ScopeInfo.h b/include/clang/Sema/ScopeInfo.h
index c707a3e8ef07..3aa34ae0e011 100644
--- a/include/clang/Sema/ScopeInfo.h
+++ b/include/clang/Sema/ScopeInfo.h
@@ -22,6 +22,7 @@
 #include "clang/Sema/CleanupInfo.h"
 #include "clang/Sema/Ownership.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -172,6 +173,10 @@ class FunctionScopeInfo {
   /// \brief The promise object for this coroutine, if any.
   VarDecl *CoroutinePromise = nullptr;
 
+  /// \brief A mapping between the coroutine function parameters that were moved
+  /// to the coroutine frame, and their move statements.
+  llvm::SmallMapVector<ParmVarDecl *, Stmt *, 4> CoroutineParameterMoves;
+
   /// \brief The initial and final coroutine suspend points.
   std::pair<Stmt *, Stmt *> CoroutineSuspends;
 
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 9cbe8e5cd63e..7349edff6177 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -1459,6 +1459,7 @@ class Sema {
       const PartialDiagnostic &DiagID, const PartialDiagnostic & NoteID,
       const FunctionProtoType *Old, SourceLocation OldLoc,
       const FunctionProtoType *New, SourceLocation NewLoc);
+  bool handlerCanCatch(QualType HandlerType, QualType ExceptionType);
   bool CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
                                 const PartialDiagnostic &NestedDiagID,
                                 const PartialDiagnostic &NoteID,
@@ -3199,11 +3200,13 @@ class Sema {
 
   void LookupVisibleDecls(Scope *S, LookupNameKind Kind,
                           VisibleDeclConsumer &Consumer,
-                          bool IncludeGlobalScope = true);
+                          bool IncludeGlobalScope = true,
+                          bool LoadExternal = true);
   void LookupVisibleDecls(DeclContext *Ctx, LookupNameKind Kind,
                           VisibleDeclConsumer &Consumer,
                           bool IncludeGlobalScope = true,
-                          bool IncludeDependentBases = false);
+                          bool IncludeDependentBases = false,
+                          bool LoadExternal = true);
 
   enum CorrectTypoKind {
     CTK_NonError,     // CorrectTypo used in a non error recovery situation.
@@ -5103,14 +5106,16 @@ class Sema {
   /// or class type construction ("ClassType(x,y,z)")
   /// or creation of a value-initialized type ("int()").
   ExprResult ActOnCXXTypeConstructExpr(ParsedType TypeRep,
-                                       SourceLocation LParenLoc,
+                                       SourceLocation LParenOrBraceLoc,
                                        MultiExprArg Exprs,
-                                       SourceLocation RParenLoc);
+                                       SourceLocation RParenOrBraceLoc,
+                                       bool ListInitialization);
 
   ExprResult BuildCXXTypeConstructExpr(TypeSourceInfo *Type,
                                        SourceLocation LParenLoc,
                                        MultiExprArg Exprs,
-                                       SourceLocation RParenLoc);
+                                       SourceLocation RParenLoc,
+                                       bool ListInitialization);
 
   /// ActOnCXXNew - Parsed a C++ 'new' expression.
   ExprResult ActOnCXXNew(SourceLocation StartLoc, bool UseGlobal,
@@ -5547,6 +5552,11 @@ class Sema {
   ExprResult BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,
                              sema::LambdaScopeInfo *LSI);
 
+  /// Get the return type to use for a lambda's conversion function(s) to
+  /// function pointer type, given the type of the call operator.
+  QualType
+  getLambdaConversionFunctionResultType(const FunctionProtoType *CallOpType);
+
   /// \brief Define the "body" of the conversion from a lambda object to a
   /// function pointer.
   ///
@@ -7760,6 +7770,9 @@ class Sema {
 
   void InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
                                 FunctionDecl *Function);
+  FunctionDecl *InstantiateFunctionDeclaration(FunctionTemplateDecl *FTD,
+                                               const TemplateArgumentList *Args,
+                                               SourceLocation Loc);
   void InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
                                      FunctionDecl *Function,
                                      bool Recursive = false,
@@ -8465,6 +8478,7 @@ class Sema {
   StmtResult BuildCoreturnStmt(SourceLocation KwLoc, Expr *E,
                                bool IsImplicit = false);
   StmtResult BuildCoroutineBodyStmt(CoroutineBodyStmt::CtorArgs);
+  bool buildCoroutineParameterMoves(SourceLocation Loc);
   VarDecl *buildCoroutinePromise(SourceLocation Loc);
   void CheckCompletedCoroutineBody(FunctionDecl *FD, Stmt *&Body);
 
@@ -9572,7 +9586,8 @@ class Sema {
                                bool AllowBothBool, bool AllowBoolConversion);
   QualType GetSignedVectorType(QualType V);
   QualType CheckVectorCompareOperands(ExprResult &LHS, ExprResult &RHS,
-                                      SourceLocation Loc, bool isRelational);
+                                      SourceLocation Loc,
+                                      BinaryOperatorKind Opc);
   QualType CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                       SourceLocation Loc);
 
@@ -10396,7 +10411,10 @@ class Sema {
                           const AttrVec *Attrs = nullptr,
                           const FunctionDecl *FD = nullptr);
 
-  void CheckFloatComparison(SourceLocation Loc, Expr* LHS, Expr* RHS);
+public:
+  void CheckFloatComparison(SourceLocation Loc, Expr *LHS, Expr *RHS);
+
+private:
   void CheckImplicitConversions(Expr *E, SourceLocation CC = SourceLocation());
   void CheckBoolLikeConversion(Expr *E, SourceLocation CC);
   void CheckForIntOverflow(Expr *E);
diff --git a/include/clang/Serialization/ASTBitCodes.h b/include/clang/Serialization/ASTBitCodes.h
index 34a7bb330174..1f4e0347bc48 100644
--- a/include/clang/Serialization/ASTBitCodes.h
+++ b/include/clang/Serialization/ASTBitCodes.h
@@ -198,6 +198,25 @@ namespace serialization {
       }
     };
 
+    /// \brief Source range of a skipped preprocessor region
+    struct PPSkippedRange {
+      /// \brief Raw source location of beginning of range.
+      unsigned Begin;
+      /// \brief Raw source location of end of range.
+      unsigned End;
+
+      PPSkippedRange(SourceRange R)
+        : Begin(R.getBegin().getRawEncoding()),
+          End(R.getEnd().getRawEncoding()) { }
+
+      SourceLocation getBegin() const {
+        return SourceLocation::getFromRawEncoding(Begin);
+      }
+      SourceLocation getEnd() const {
+        return SourceLocation::getFromRawEncoding(End);
+      }
+    };
+
     /// \brief Source range/offset of a preprocessed entity.
     struct DeclOffset {
       /// \brief Raw source location.
@@ -627,6 +646,9 @@ namespace serialization {
 
       /// \brief The stack of open #ifs/#ifdefs recorded in a preamble.
       PP_CONDITIONAL_STACK = 62,
+
+      /// \brief A table of skipped ranges within the preprocessing record.
+      PPD_SKIPPED_RANGES = 63
     };
 
     /// \brief Record types used within a source manager block.
diff --git a/include/clang/Serialization/ASTReader.h b/include/clang/Serialization/ASTReader.h
index 37920fc143ea..ed1f4cbc681f 100644
--- a/include/clang/Serialization/ASTReader.h
+++ b/include/clang/Serialization/ASTReader.h
@@ -753,6 +753,13 @@ class ASTReader
   /// added to the global preprocessing entity ID to produce a local ID.
   GlobalPreprocessedEntityMapType GlobalPreprocessedEntityMap;
 
+  typedef ContinuousRangeMap<unsigned, ModuleFile *, 4>
+    GlobalSkippedRangeMapType;
+
+  /// \brief Mapping from global skipped range base IDs to the module in which
+  /// the skipped ranges reside.
+  GlobalSkippedRangeMapType GlobalSkippedRangeMap;
+
   /// \name CodeGen-relevant special data
   /// \brief Fields containing data that is relevant to CodeGen.
   //@{
@@ -1693,6 +1700,9 @@ class ASTReader
   Optional<bool> isPreprocessedEntityInFileID(unsigned Index,
                                               FileID FID) override;
 
+  /// \brief Read a preallocated skipped range from the external source.
+  SourceRange ReadSkippedRange(unsigned Index) override;
+
   /// \brief Read the header file information for the given file entry.
   HeaderFileInfo GetHeaderFileInfo(const FileEntry *FE) override;
 
diff --git a/include/clang/Serialization/Module.h b/include/clang/Serialization/Module.h
index 4e4bf44f3492..26b3bce593d6 100644
--- a/include/clang/Serialization/Module.h
+++ b/include/clang/Serialization/Module.h
@@ -324,6 +324,12 @@ class ModuleFile {
   const PPEntityOffset *PreprocessedEntityOffsets = nullptr;
   unsigned NumPreprocessedEntities = 0;
 
+  /// \brief Base ID for preprocessed skipped ranges local to this module.
+  unsigned BasePreprocessedSkippedRangeID = 0;
+
+  const PPSkippedRange *PreprocessedSkippedRangeOffsets = nullptr;
+  unsigned NumPreprocessedSkippedRanges = 0;
+
   // === Header search information ===
 
   /// \brief The number of local HeaderFileInfo structures.
diff --git a/include/clang/StaticAnalyzer/Checkers/Checkers.td b/include/clang/StaticAnalyzer/Checkers/Checkers.td
index e510e84e938a..77820aeea30c 100644
--- a/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -86,7 +86,7 @@ def LLVM : Package<"llvm">;
 
 // The APIModeling package is for checkers that model APIs and don't perform
 // any diagnostics. These checkers are always turned on; this package is
-// intended for API modeling that is not controlled by the the target triple.
+// intended for API modeling that is not controlled by the target triple.
 def APIModeling : Package<"apiModeling">, Hidden;
 def GoogleAPIModeling : Package<"google">, InPackage<APIModeling>;
 
diff --git a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index ce50cc582d1e..2f28addb96ec 100644
--- a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -139,6 +139,10 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> {
   AnalysisPurgeMode AnalysisPurgeOpt;
   
   std::string AnalyzeSpecificFunction;
+
+  /// Store full compiler invocation for reproducible instructions in the
+  /// generated report.
+  std::string FullCompilerInvocation;
   
   /// \brief The maximum number of times the analyzer visits a block.
   unsigned maxBlockVisitOnPath;
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
index cd1355d03b63..a5febd8b78fa 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h
@@ -470,8 +470,6 @@ class BugReporter {
     return true;
   }
 
-  bool RemoveUnneededCalls(PathPieces &pieces, BugReport *R);
-
   void Register(BugType *BT);
 
   /// \brief Add the given report to the set of reports tracked by BugReporter.
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
index 18fa85c9657f..d3163ef3e576 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugType.h
@@ -32,27 +32,39 @@ class BugType {
   const CheckName Check;
   const std::string Name;
   const std::string Category;
-  bool SuppressonSink;
+  const CheckerBase *Checker;
+  bool SuppressOnSink;
 
   virtual void anchor();
+
 public:
-  BugType(class CheckName check, StringRef name, StringRef cat)
-      : Check(check), Name(name), Category(cat), SuppressonSink(false) {}
-  BugType(const CheckerBase *checker, StringRef name, StringRef cat)
-      : Check(checker->getCheckName()), Name(name), Category(cat),
-        SuppressonSink(false) {}
-  virtual ~BugType() {}
-
-  // FIXME: Should these be made strings as well?
+  BugType(CheckName Check, StringRef Name, StringRef Cat)
+      : Check(Check), Name(Name), Category(Cat), Checker(nullptr),
+        SuppressOnSink(false) {}
+  BugType(const CheckerBase *Checker, StringRef Name, StringRef Cat)
+      : Check(Checker->getCheckName()), Name(Name), Category(Cat),
+        Checker(Checker), SuppressOnSink(false) {}
+  virtual ~BugType() = default;
+
   StringRef getName() const { return Name; }
   StringRef getCategory() const { return Category; }
-  StringRef getCheckName() const { return Check.getName(); }
+  StringRef getCheckName() const {
+    // FIXME: This is a workaround to ensure that the correct check name is used
+    // The check names are set after the constructors are run.
+    // In case the BugType object is initialized in the checker's ctor
+    // the Check field will be empty. To circumvent this problem we use 
+    // CheckerBase whenever it is possible.
+    StringRef CheckName =
+        Checker ? Checker->getCheckName().getName() : Check.getName();
+    assert(!CheckName.empty() && "Check name is not set properly.");
+    return CheckName;
+  }
 
   /// isSuppressOnSink - Returns true if bug reports associated with this bug
   ///  type should be suppressed if the end node of the report is post-dominated
   ///  by a sink node.
-  bool isSuppressOnSink() const { return SuppressonSink; }
-  void setSuppressOnSink(bool x) { SuppressonSink = x; }
+  bool isSuppressOnSink() const { return SuppressOnSink; }
+  void setSuppressOnSink(bool x) { SuppressOnSink = x; }
 
   virtual void FlushReports(BugReporter& BR);
 };
@@ -74,7 +86,7 @@ class BuiltinBug : public BugType {
   StringRef getDescription() const { return desc; }
 };
 
-} // end GR namespace
+} // end ento namespace
 
 } // end clang namespace
 #endif
diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
index f31ab2cd81cd..2adc09cd8db3 100644
--- a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
+++ b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h
@@ -23,6 +23,8 @@
 #include <deque>
 #include <iterator>
 #include <list>
+#include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -733,6 +735,9 @@ class PathDiagnosticNotePiece: public PathDiagnosticSpotPiece {
   void Profile(llvm::FoldingSetNodeID &ID) const override;
 };
 
+/// File IDs mapped to sets of line numbers.
+typedef std::map<unsigned, std::set<unsigned>> FilesToLineNumsMap;
+
 /// PathDiagnostic - PathDiagnostic objects represent a single path-sensitive
 ///  diagnostic.  It represents an ordered-collection of PathDiagnosticPieces,
 ///  each which represent the pieces of the path.
@@ -756,12 +761,16 @@ class PathDiagnostic : public llvm::FoldingSetNode {
   PathDiagnosticLocation UniqueingLoc;
   const Decl *UniqueingDecl;
 
+  /// Lines executed in the path.
+  std::unique_ptr<FilesToLineNumsMap> ExecutedLines;
+
   PathDiagnostic() = delete;
 public:
   PathDiagnostic(StringRef CheckName, const Decl *DeclWithIssue,
                  StringRef bugtype, StringRef verboseDesc, StringRef shortDesc,
                  StringRef category, PathDiagnosticLocation LocationToUnique,
-                 const Decl *DeclToUnique);
+                 const Decl *DeclToUnique,
+                 std::unique_ptr<FilesToLineNumsMap> ExecutedLines);
 
   ~PathDiagnostic();
   
@@ -830,6 +839,12 @@ class PathDiagnostic : public llvm::FoldingSetNode {
   meta_iterator meta_end() const { return OtherDesc.end(); }
   void addMeta(StringRef s) { OtherDesc.push_back(s); }
 
+  typedef FilesToLineNumsMap::const_iterator filesmap_iterator;
+  filesmap_iterator executedLines_begin() const {
+    return ExecutedLines->begin();
+  }
+  filesmap_iterator executedLines_end() const { return ExecutedLines->end(); }
+
   PathDiagnosticLocation getLocation() const {
     assert(Loc.isValid() && "No report location set yet!");
     return Loc;
diff --git a/include/clang/StaticAnalyzer/Core/Checker.h b/include/clang/StaticAnalyzer/Core/Checker.h
index f9477762c758..305abea1fab3 100644
--- a/include/clang/StaticAnalyzer/Core/Checker.h
+++ b/include/clang/StaticAnalyzer/Core/Checker.h
@@ -283,6 +283,22 @@ class BranchCondition {
   }
 };
 
+class NewAllocator {
+  template <typename CHECKER>
+  static void _checkNewAllocator(void *checker, const CXXNewExpr *NE,
+                                 SVal Target, CheckerContext &C) {
+    ((const CHECKER *)checker)->checkNewAllocator(NE, Target, C);
+  }
+
+public:
+  template <typename CHECKER>
+  static void _register(CHECKER *checker, CheckerManager &mgr) {
+    mgr._registerForNewAllocator(
+        CheckerManager::CheckNewAllocatorFunc(checker,
+                                              _checkNewAllocator<CHECKER>));
+  }
+};
+
 class LiveSymbols {
   template <typename CHECKER>
   static void _checkLiveSymbols(void *checker, ProgramStateRef state,
diff --git a/include/clang/StaticAnalyzer/Core/CheckerManager.h b/include/clang/StaticAnalyzer/Core/CheckerManager.h
index 88cb08a4b647..49d159e37fe7 100644
--- a/include/clang/StaticAnalyzer/Core/CheckerManager.h
+++ b/include/clang/StaticAnalyzer/Core/CheckerManager.h
@@ -303,6 +303,13 @@ class CheckerManager {
                                      ExplodedNodeSet &Dst, ExplodedNode *Pred,
                                      ExprEngine &Eng);
 
+  /// \brief Run checkers between C++ operator new and constructor calls.
+  void runCheckersForNewAllocator(const CXXNewExpr *NE, SVal Target,
+                                  ExplodedNodeSet &Dst,
+                                  ExplodedNode *Pred,
+                                  ExprEngine &Eng,
+                                  bool wasInlined = false);
+
   /// \brief Run checkers for live symbols.
   ///
   /// Allows modifying SymbolReaper object. For example, checkers can explicitly
@@ -437,6 +444,9 @@ class CheckerManager {
   
   typedef CheckerFn<void (const Stmt *, CheckerContext &)>
       CheckBranchConditionFunc;
+
+  typedef CheckerFn<void (const CXXNewExpr *, SVal, CheckerContext &)>
+      CheckNewAllocatorFunc;
   
   typedef CheckerFn<void (SymbolReaper &, CheckerContext &)>
       CheckDeadSymbolsFunc;
@@ -494,6 +504,8 @@ class CheckerManager {
 
   void _registerForBranchCondition(CheckBranchConditionFunc checkfn);
 
+  void _registerForNewAllocator(CheckNewAllocatorFunc checkfn);
+
   void _registerForLiveSymbols(CheckLiveSymbolsFunc checkfn);
 
   void _registerForDeadSymbols(CheckDeadSymbolsFunc checkfn);
@@ -603,6 +615,8 @@ class CheckerManager {
 
   std::vector<CheckBranchConditionFunc> BranchConditionCheckers;
 
+  std::vector<CheckNewAllocatorFunc> NewAllocatorCheckers;
+
   std::vector<CheckLiveSymbolsFunc> LiveSymbolsCheckers;
 
   std::vector<CheckDeadSymbolsFunc> DeadSymbolsCheckers;
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h b/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
index 78d38a3d598d..2ca564dd543f 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
@@ -193,7 +193,7 @@ class CheckerContext {
 
   /// \brief Get the value of arbitrary expressions at this point in the path.
   SVal getSVal(const Stmt *S) const {
-    return getState()->getSVal(S, getLocationContext());
+    return Pred->getSVal(S);
   }
 
   /// \brief Returns true if the value of \p E is greater than or equal to \p
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
index c01600d5c969..1e1bbc876b20 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
@@ -37,6 +37,12 @@ class ConditionTruthVal {
   /// Construct a ConstraintVal indicating the constraint is underconstrained.
   ConditionTruthVal() {}
 
+  /// \return Stored value, assuming that the value is known.
+  /// Crashes otherwise.
+  bool getValue() const {
+    return *Val;
+  }
+
   /// Return true if the constraint is perfectly constrained to 'true'.
   bool isConstrainedTrue() const {
     return Val.hasValue() && Val.getValue();
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
index dcea5e461d27..b82b404e5c59 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h
@@ -156,6 +156,11 @@ class ExplodedNode : public llvm::FoldingSetNode {
     return Location.getAs<T>();
   }
 
+  /// Get the value of an arbitrary expression at this node.
+  SVal getSVal(const Stmt *S) const {
+    return getState()->getSVal(S, getLocationContext());
+  }
+
   static void Profile(llvm::FoldingSetNodeID &ID,
                       const ProgramPoint &Loc,
                       const ProgramStateRef &state,
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index 712cd6361e11..1d36244020bc 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -642,8 +642,8 @@ class ExprEngine : public SubEngine {
   const CXXConstructExpr *findDirectConstructorForCurrentCFGElement();
 
   /// For a CXXConstructExpr, walk forward in the current CFG block to find the
-  /// CFGElement for the DeclStmt or CXXInitCtorInitializer for which is
-  /// directly constructed by this constructor. Returns None if the current
+  /// CFGElement for the DeclStmt or CXXInitCtorInitializer or CXXNewExpr which
+  /// is directly constructed by this constructor. Returns None if the current
   /// constructor expression did not directly construct into an existing
   /// region.
   Optional<CFGElement> findElementDirectlyInitializedByCurrentConstructor();
@@ -655,6 +655,30 @@ class ExprEngine : public SubEngine {
   /// if not.
   const MemRegion *getRegionForConstructedObject(const CXXConstructExpr *CE,
                                                  ExplodedNode *Pred);
+
+  /// Store the region returned by operator new() so that the constructor
+  /// that follows it knew what location to initialize. The value should be
+  /// cleared once the respective CXXNewExpr CFGStmt element is processed.
+  static ProgramStateRef
+  setCXXNewAllocatorValue(ProgramStateRef State, const CXXNewExpr *CNE,
+                          const LocationContext *CallerLC, SVal V);
+
+  /// Retrieve the location returned by the current operator new().
+  static SVal
+  getCXXNewAllocatorValue(ProgramStateRef State, const CXXNewExpr *CNE,
+                          const LocationContext *CallerLC);
+
+  /// Clear the location returned by the respective operator new(). This needs
+  /// to be done as soon as CXXNewExpr CFG block is evaluated.
+  static ProgramStateRef
+  clearCXXNewAllocatorValue(ProgramStateRef State, const CXXNewExpr *CNE,
+                            const LocationContext *CallerLC);
+
+  /// Check if all allocator values are clear for the given context range
+  /// (including FromLC, not including ToLC). This is useful for assertions.
+  static bool areCXXNewAllocatorValuesClear(ProgramStateRef State,
+                                            const LocationContext *FromLC,
+                                            const LocationContext *ToLC);
 };
 
 /// Traits for storing the call processing policy inside GDM.
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h b/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
index 8ab665623088..14909e3160ed 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
@@ -103,6 +103,7 @@ class MemRegion : public llvm::FoldingSetNode {
   const MemRegion *getBaseRegion() const;
 
   /// Check if the region is a subregion of the given region.
+  /// Each region is a subregion of itself.
   virtual bool isSubRegionOf(const MemRegion *R) const;
 
   const MemRegion *StripCasts(bool StripBaseCasts = true) const;
@@ -961,7 +962,10 @@ class CXXThisRegion : public TypedValueRegion {
   CXXThisRegion(const PointerType *thisPointerTy,
                 const StackArgumentsSpaceRegion *sReg)
       : TypedValueRegion(sReg, CXXThisRegionKind),
-        ThisPointerTy(thisPointerTy) {}
+        ThisPointerTy(thisPointerTy) {
+    assert(ThisPointerTy->getPointeeType()->getAsCXXRecordDecl() &&
+           "Invalid region type!");
+  }
 
   static void ProfileRegion(llvm::FoldingSetNodeID &ID,
                             const PointerType *PT,
@@ -1075,6 +1079,8 @@ class ElementRegion : public TypedValueRegion {
     assert((!Idx.getAs<nonloc::ConcreteInt>() ||
             Idx.castAs<nonloc::ConcreteInt>().getValue().isSigned()) &&
            "The index must be signed");
+    assert(!elementType.isNull() && !elementType->isVoidType() &&
+           "Invalid region type!");
   }
 
   static void ProfileRegion(llvm::FoldingSetNodeID& ID, QualType elementType,
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index dd2564b0a3c3..f1283996e567 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -212,10 +212,17 @@ class ProgramState : public llvm::FoldingSetNode {
   assumeInclusiveRange(DefinedOrUnknownSVal Val, const llvm::APSInt &From,
                        const llvm::APSInt &To) const;
 
+  /// \brief Check if the given SVal is not constrained to zero and is not
+  ///        a zero constant.
+  ConditionTruthVal isNonNull(SVal V) const;
+
   /// \brief Check if the given SVal is constrained to zero or is a zero
   ///        constant.
   ConditionTruthVal isNull(SVal V) const;
 
+  /// \return Whether values \p Lhs and \p Rhs are equal.
+  ConditionTruthVal areEqual(SVal Lhs, SVal Rhs) const;
+
   /// Utility method for getting regions.
   const VarRegion* getRegion(const VarDecl *D, const LocationContext *LC) const;
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h b/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h
index d58d0a690c88..dae5b5495b27 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h
@@ -29,6 +29,8 @@ class CXXBoolLiteralExpr;
 
 namespace ento {
 
+class ConditionTruthVal;
+
 class SValBuilder {
   virtual void anchor();
 protected:
@@ -124,7 +126,12 @@ class SValBuilder {
 
   SVal evalBinOp(ProgramStateRef state, BinaryOperator::Opcode op,
                  SVal lhs, SVal rhs, QualType type);
-  
+
+  /// \return Whether values in \p lhs and \p rhs are equal at \p state.
+  ConditionTruthVal areEqual(ProgramStateRef state, SVal lhs, SVal rhs);
+
+  SVal evalEQ(ProgramStateRef state, SVal lhs, SVal rhs);
+
   DefinedOrUnknownSVal evalEQ(ProgramStateRef state, DefinedOrUnknownSVal lhs,
                               DefinedOrUnknownSVal rhs);
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/WorkList.h b/include/clang/StaticAnalyzer/Core/PathSensitive/WorkList.h
index 4f1a60e67556..2528aa1b316c 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/WorkList.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/WorkList.h
@@ -80,14 +80,6 @@ class WorkList {
   void setBlockCounter(BlockCounter C) { CurrentCounter = C; }
   BlockCounter getBlockCounter() const { return CurrentCounter; }
 
-  class Visitor {
-  public:
-    Visitor() {}
-    virtual ~Visitor();
-    virtual bool visit(const WorkListUnit &U) = 0;
-  };
-  virtual bool visitItemsInWorkList(Visitor &V) = 0;
-  
   static WorkList *makeDFS();
   static WorkList *makeBFS();
   static WorkList *makeBFSBlockDFSContents();
diff --git a/include/clang/Tooling/AllTUsExecution.h b/include/clang/Tooling/AllTUsExecution.h
new file mode 100644
index 000000000000..0417aea6e526
--- /dev/null
+++ b/include/clang/Tooling/AllTUsExecution.h
@@ -0,0 +1,76 @@
+//===--- AllTUsExecution.h - Execute actions on all TUs. -*- C++ --------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines a tool executor that runs given actions on all TUs in the
+//  compilation database. Tool results are deuplicated by the result key.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_ALLTUSEXECUTION_H
+#define LLVM_CLANG_TOOLING_ALLTUSEXECUTION_H
+
+#include "clang/Tooling/ArgumentsAdjusters.h"
+#include "clang/Tooling/Execution.h"
+
+namespace clang {
+namespace tooling {
+
+/// \brief Executes given frontend actions on all files/TUs in the compilation
+/// database.
+class AllTUsToolExecutor : public ToolExecutor {
+public:
+  static const char *ExecutorName;
+
+  /// \brief Init with \p CompilationDatabase.
+  /// This uses \p ThreadCount threads to exececute the actions on all files in
+  /// parallel. If \p ThreadCount is 0, this uses `llvm::hardware_concurrency`.
+  AllTUsToolExecutor(const CompilationDatabase &Compilations,
+                     unsigned ThreadCount,
+                     std::shared_ptr<PCHContainerOperations> PCHContainerOps =
+                         std::make_shared<PCHContainerOperations>());
+
+  /// \brief Init with \p CommonOptionsParser. This is expected to be used by
+  /// `createExecutorFromCommandLineArgs` based on commandline options.
+  ///
+  /// The executor takes ownership of \p Options.
+  AllTUsToolExecutor(CommonOptionsParser Options, unsigned ThreadCount,
+                     std::shared_ptr<PCHContainerOperations> PCHContainerOps =
+                         std::make_shared<PCHContainerOperations>());
+
+  StringRef getExecutorName() const override { return ExecutorName; }
+
+  using ToolExecutor::execute;
+
+  llvm::Error
+  execute(llvm::ArrayRef<
+          std::pair<std::unique_ptr<FrontendActionFactory>, ArgumentsAdjuster>>
+              Actions) override;
+
+  ExecutionContext *getExecutionContext() override { return &Context; };
+
+  ToolResults *getToolResults() override { return Results.get(); }
+
+  void mapVirtualFile(StringRef FilePath, StringRef Content) override {
+    OverlayFiles[FilePath] = Content;
+  }
+
+private:
+  // Used to store the parser when the executor is initialized with parser.
+  llvm::Optional<CommonOptionsParser> OptionsParser;
+  const CompilationDatabase &Compilations;
+  std::unique_ptr<ToolResults> Results;
+  ExecutionContext Context;
+  llvm::StringMap<std::string> OverlayFiles;
+  unsigned ThreadCount;
+};
+
+} // end namespace tooling
+} // end namespace clang
+
+#endif // LLVM_CLANG_TOOLING_ALLTUSEXECUTION_H
diff --git a/include/clang/Tooling/Core/Diagnostic.h b/include/clang/Tooling/Core/Diagnostic.h
index b4920d4fe456..c8307c88df83 100644
--- a/include/clang/Tooling/Core/Diagnostic.h
+++ b/include/clang/Tooling/Core/Diagnostic.h
@@ -33,7 +33,7 @@ struct DiagnosticMessage {
   DiagnosticMessage(llvm::StringRef Message = "");
 
   /// \brief Constructs a diagnostic message with anoffset to the diagnostic
-  /// within the file where the problem occured.
+  /// within the file where the problem occurred.
   ///
   /// \param Loc Should be a file location, it is not meaningful for a macro
   /// location.
diff --git a/include/clang/Tooling/Refactoring/Rename/RenamingAction.h b/include/clang/Tooling/Refactoring/Rename/RenamingAction.h
index 734b624d777b..d32ed642b087 100644
--- a/include/clang/Tooling/Refactoring/Rename/RenamingAction.h
+++ b/include/clang/Tooling/Refactoring/Rename/RenamingAction.h
@@ -82,7 +82,7 @@ class QualifiedRenameRule final : public SourceChangeRefactoringRule {
   Expected<AtomicChanges>
   createSourceReplacements(RefactoringRuleContext &Context) override;
 
-  // A NamedDecl which indentifies the the symbol being renamed.
+  // A NamedDecl which indentifies the symbol being renamed.
   const NamedDecl *ND;
   // The new qualified name to change the symbol to.
   std::string NewQualifiedName;
diff --git a/include/clang/Tooling/Tooling.h b/include/clang/Tooling/Tooling.h
index e64be07d9ab4..ea3b0e042140 100644
--- a/include/clang/Tooling/Tooling.h
+++ b/include/clang/Tooling/Tooling.h
@@ -296,10 +296,14 @@ class ClangTool {
   ///        not found in Compilations, it is skipped.
   /// \param PCHContainerOps The PCHContainerOperations for loading and creating
   /// clang modules.
+  /// \param BaseFS VFS used for all underlying file accesses when running the
+  /// tool.
   ClangTool(const CompilationDatabase &Compilations,
             ArrayRef<std::string> SourcePaths,
             std::shared_ptr<PCHContainerOperations> PCHContainerOps =
-                std::make_shared<PCHContainerOperations>());
+                std::make_shared<PCHContainerOperations>(),
+            IntrusiveRefCntPtr<vfs::FileSystem> BaseFS =
+                vfs::getRealFileSystem());
 
   ~ClangTool();
 
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index 3dc961d4f12b..c81d57864020 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -2582,26 +2582,24 @@ void ASTContext::adjustDeducedFunctionResultType(FunctionDecl *FD,
 /// specified exception specification. Type sugar that can be present on a
 /// declaration of a function with an exception specification is permitted
 /// and preserved. Other type sugar (for instance, typedefs) is not.
-static QualType getFunctionTypeWithExceptionSpec(
-    ASTContext &Context, QualType Orig,
-    const FunctionProtoType::ExceptionSpecInfo &ESI) {
+QualType ASTContext::getFunctionTypeWithExceptionSpec(
+    QualType Orig, const FunctionProtoType::ExceptionSpecInfo &ESI) {
   // Might have some parens.
   if (auto *PT = dyn_cast<ParenType>(Orig))
-    return Context.getParenType(
-        getFunctionTypeWithExceptionSpec(Context, PT->getInnerType(), ESI));
+    return getParenType(
+        getFunctionTypeWithExceptionSpec(PT->getInnerType(), ESI));
 
   // Might have a calling-convention attribute.
   if (auto *AT = dyn_cast<AttributedType>(Orig))
-    return Context.getAttributedType(
+    return getAttributedType(
         AT->getAttrKind(),
-        getFunctionTypeWithExceptionSpec(Context, AT->getModifiedType(), ESI),
-        getFunctionTypeWithExceptionSpec(Context, AT->getEquivalentType(),
-                                         ESI));
+        getFunctionTypeWithExceptionSpec(AT->getModifiedType(), ESI),
+        getFunctionTypeWithExceptionSpec(AT->getEquivalentType(), ESI));
 
   // Anything else must be a function type. Rebuild it with the new exception
   // specification.
   const FunctionProtoType *Proto = cast<FunctionProtoType>(Orig);
-  return Context.getFunctionType(
+  return getFunctionType(
       Proto->getReturnType(), Proto->getParamTypes(),
       Proto->getExtProtoInfo().withExceptionSpec(ESI));
 }
@@ -2610,8 +2608,8 @@ bool ASTContext::hasSameFunctionTypeIgnoringExceptionSpec(QualType T,
                                                           QualType U) {
   return hasSameType(T, U) ||
          (getLangOpts().CPlusPlus17 &&
-          hasSameType(getFunctionTypeWithExceptionSpec(*this, T, EST_None),
-                      getFunctionTypeWithExceptionSpec(*this, U, EST_None)));
+          hasSameType(getFunctionTypeWithExceptionSpec(T, EST_None),
+                      getFunctionTypeWithExceptionSpec(U, EST_None)));
 }
 
 void ASTContext::adjustExceptionSpec(
@@ -2619,7 +2617,7 @@ void ASTContext::adjustExceptionSpec(
     bool AsWritten) {
   // Update the type.
   QualType Updated =
-      getFunctionTypeWithExceptionSpec(*this, FD->getType(), ESI);
+      getFunctionTypeWithExceptionSpec(FD->getType(), ESI);
   FD->setType(Updated);
 
   if (!AsWritten)
@@ -2630,7 +2628,7 @@ void ASTContext::adjustExceptionSpec(
     // If the type and the type-as-written differ, we may need to update
     // the type-as-written too.
     if (TSInfo->getType() != FD->getType())
-      Updated = getFunctionTypeWithExceptionSpec(*this, TSInfo->getType(), ESI);
+      Updated = getFunctionTypeWithExceptionSpec(TSInfo->getType(), ESI);
 
     // FIXME: When we get proper type location information for exceptions,
     // we'll also have to rebuild the TypeSourceInfo. For now, we just patch
@@ -8931,10 +8929,12 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
     Type = Context.FloatTy;
     break;
   case 'd':
-    assert(HowLong < 2 && !Signed && !Unsigned &&
+    assert(HowLong < 3 && !Signed && !Unsigned &&
            "Bad modifiers used with 'd'!");
-    if (HowLong)
+    if (HowLong == 1)
       Type = Context.LongDoubleTy;
+    else if (HowLong == 2)
+      Type = Context.Float128Ty;
     else
       Type = Context.DoubleTy;
     break;
@@ -9483,6 +9483,23 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) {
   return false;
 }
 
+void ASTContext::forEachMultiversionedFunctionVersion(
+    const FunctionDecl *FD,
+    llvm::function_ref<void(const FunctionDecl *)> Pred) const {
+  assert(FD->isMultiVersion() && "Only valid for multiversioned functions");
+  llvm::SmallDenseSet<const FunctionDecl*, 4> SeenDecls;
+  FD = FD->getCanonicalDecl();
+  for (auto *CurDecl :
+       FD->getDeclContext()->getRedeclContext()->lookup(FD->getDeclName())) {
+    FunctionDecl *CurFD = CurDecl->getAsFunction()->getCanonicalDecl();
+    if (CurFD && hasSameType(CurFD->getType(), FD->getType()) &&
+        std::end(SeenDecls) == llvm::find(SeenDecls, CurFD)) {
+      SeenDecls.insert(CurFD);
+      Pred(CurFD);
+    }
+  }
+}
+
 CallingConv ASTContext::getDefaultCallingConvention(bool IsVariadic,
                                                     bool IsCXXMethod) const {
   // Pass through to the C++ ABI object
diff --git a/lib/AST/ASTDumper.cpp b/lib/AST/ASTDumper.cpp
index 92be6d95e898..e264d056e49f 100644
--- a/lib/AST/ASTDumper.cpp
+++ b/lib/AST/ASTDumper.cpp
@@ -539,6 +539,7 @@ namespace  {
     void VisitAddrLabelExpr(const AddrLabelExpr *Node);
     void VisitBlockExpr(const BlockExpr *Node);
     void VisitOpaqueValueExpr(const OpaqueValueExpr *Node);
+    void VisitGenericSelectionExpr(const GenericSelectionExpr *E);
 
     // C++
     void VisitCXXNamedCastExpr(const CXXNamedCastExpr *Node);
@@ -808,11 +809,10 @@ void ASTDumper::dumpLookups(const DeclContext *DC, bool DumpDecls) {
 
     bool HasUndeserializedLookups = Primary->hasExternalVisibleStorage();
 
-    for (auto I = Deserialize ? Primary->lookups_begin()
-                              : Primary->noload_lookups_begin(),
-              E = Deserialize ? Primary->lookups_end()
-                              : Primary->noload_lookups_end();
-         I != E; ++I) {
+    auto Range = Deserialize
+                     ? Primary->lookups()
+                     : Primary->noload_lookups(/*PreserveInternalState=*/true);
+    for (auto I = Range.begin(), E = Range.end(); I != E; ++I) {
       DeclarationName Name = I.getLookupName();
       DeclContextLookupResult R = *I;
 
@@ -1946,10 +1946,15 @@ void ASTDumper::dumpStmt(const Stmt *S) {
       return;
     }
 
+    // Some statements have custom mechanisms for dumping their children.
     if (const DeclStmt *DS = dyn_cast<DeclStmt>(S)) {
       VisitDeclStmt(DS);
       return;
     }
+    if (const GenericSelectionExpr *GSE = dyn_cast<GenericSelectionExpr>(S)) {
+      VisitGenericSelectionExpr(GSE);
+      return;
+    }
 
     ConstStmtVisitor<ASTDumper>::Visit(S);
 
@@ -2208,12 +2213,14 @@ void ASTDumper::VisitArrayInitIndexExpr(const ArrayInitIndexExpr *E) {
 }
 
 void ASTDumper::VisitUnaryOperator(const UnaryOperator *Node) {
-  VisitExpr(Node);
-  OS << " " << (Node->isPostfix() ? "postfix" : "prefix")
-     << " '" << UnaryOperator::getOpcodeStr(Node->getOpcode()) << "'";
-}
-
-void ASTDumper::VisitUnaryExprOrTypeTraitExpr(
+  VisitExpr(Node);
+  OS << " " << (Node->isPostfix() ? "postfix" : "prefix")
+     << " '" << UnaryOperator::getOpcodeStr(Node->getOpcode()) << "'";
+  if (!Node->canOverflow())
+    OS << " cannot overflow";
+}
+
+void ASTDumper::VisitUnaryExprOrTypeTraitExpr(
     const UnaryExprOrTypeTraitExpr *Node) {
   VisitExpr(Node);
   switch(Node->getKind()) {
@@ -2272,6 +2279,32 @@ void ASTDumper::VisitOpaqueValueExpr(const OpaqueValueExpr *Node) {
     dumpStmt(Source);
 }
 
+void ASTDumper::VisitGenericSelectionExpr(const GenericSelectionExpr *E) {
+  VisitExpr(E);
+  if (E->isResultDependent())
+    OS << " result_dependent";
+  dumpStmt(E->getControllingExpr());
+  dumpTypeAsChild(E->getControllingExpr()->getType()); // FIXME: remove
+
+  for (unsigned I = 0, N = E->getNumAssocs(); I != N; ++I) {
+    dumpChild([=] {
+      if (const TypeSourceInfo *TSI = E->getAssocTypeSourceInfo(I)) {
+        OS << "case ";
+        dumpType(TSI->getType());
+      } else {
+        OS << "default";
+      }
+
+      if (!E->isResultDependent() && E->getResultIndex() == I)
+        OS << " selected";
+
+      if (const TypeSourceInfo *TSI = E->getAssocTypeSourceInfo(I))
+        dumpTypeAsChild(TSI->getType());
+      dumpStmt(E->getAssocExpr(I));
+    });
+  }
+}
+
 // GNU extensions.
 
 void ASTDumper::VisitAddrLabelExpr(const AddrLabelExpr *Node) {
diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp
index 0d1d9807549f..805c3871ded0 100644
--- a/lib/AST/ASTImporter.cpp
+++ b/lib/AST/ASTImporter.cpp
@@ -52,7 +52,7 @@ namespace clang {
     QualType VisitConstantArrayType(const ConstantArrayType *T);
     QualType VisitIncompleteArrayType(const IncompleteArrayType *T);
     QualType VisitVariableArrayType(const VariableArrayType *T);
-    // FIXME: DependentSizedArrayType
+    QualType VisitDependentSizedArrayType(const DependentSizedArrayType *T);
     // FIXME: DependentSizedExtVectorType
     QualType VisitVectorType(const VectorType *T);
     QualType VisitExtVectorType(const ExtVectorType *T);
@@ -78,7 +78,8 @@ namespace clang {
     QualType VisitElaboratedType(const ElaboratedType *T);
     // FIXME: DependentNameType
     QualType VisitPackExpansionType(const PackExpansionType *T);
-    // FIXME: DependentTemplateSpecializationType
+    QualType VisitDependentTemplateSpecializationType(
+        const DependentTemplateSpecializationType *T);
     QualType VisitObjCInterfaceType(const ObjCInterfaceType *T);
     QualType VisitObjCObjectType(const ObjCObjectType *T);
     QualType VisitObjCObjectPointerType(const ObjCObjectPointerType *T);
@@ -97,6 +98,8 @@ namespace clang {
     typedef DesignatedInitExpr::Designator Designator;
     Designator ImportDesignator(const Designator &D);
 
+    Optional<LambdaCapture> ImportLambdaCapture(const LambdaCapture &From);
+
                         
     /// \brief What we should import from the definition.
     enum ImportDefinitionKind { 
@@ -127,16 +130,26 @@ namespace clang {
     bool ImportDefinition(ObjCProtocolDecl *From, ObjCProtocolDecl *To,
                           ImportDefinitionKind Kind = IDK_Default);
     TemplateParameterList *ImportTemplateParameterList(
-                                                 TemplateParameterList *Params);
+        TemplateParameterList *Params);
     TemplateArgument ImportTemplateArgument(const TemplateArgument &From);
     Optional<TemplateArgumentLoc> ImportTemplateArgumentLoc(
         const TemplateArgumentLoc &TALoc);
     bool ImportTemplateArguments(const TemplateArgument *FromArgs,
                                  unsigned NumFromArgs,
-                               SmallVectorImpl<TemplateArgument> &ToArgs);
+                                 SmallVectorImpl<TemplateArgument> &ToArgs);
+
     template <typename InContainerTy>
     bool ImportTemplateArgumentListInfo(const InContainerTy &Container,
                                         TemplateArgumentListInfo &ToTAInfo);
+
+    template<typename InContainerTy>
+    bool ImportTemplateArgumentListInfo(SourceLocation FromLAngleLoc,
+                                        SourceLocation FromRAngleLoc,
+                                        const InContainerTy &Container,
+                                        TemplateArgumentListInfo &Result);
+
+    bool ImportTemplateInformation(FunctionDecl *FromFD, FunctionDecl *ToFD);
+
     bool IsStructuralMatch(RecordDecl *FromRecord, RecordDecl *ToRecord,
                            bool Complain = true);
     bool IsStructuralMatch(VarDecl *FromVar, VarDecl *ToVar,
@@ -282,17 +295,21 @@ namespace clang {
     Expr *VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *CE);
     Expr *VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E);
     Expr *VisitPackExpansionExpr(PackExpansionExpr *E);
+    Expr *VisitSizeOfPackExpr(SizeOfPackExpr *E);
     Expr *VisitCXXNewExpr(CXXNewExpr *CE);
     Expr *VisitCXXDeleteExpr(CXXDeleteExpr *E);
     Expr *VisitCXXConstructExpr(CXXConstructExpr *E);
     Expr *VisitCXXMemberCallExpr(CXXMemberCallExpr *E);
     Expr *VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *E);
+    Expr *VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *CE);
+    Expr *VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E);
     Expr *VisitExprWithCleanups(ExprWithCleanups *EWC);
     Expr *VisitCXXThisExpr(CXXThisExpr *E);
     Expr *VisitCXXBoolLiteralExpr(CXXBoolLiteralExpr *E);
     Expr *VisitCXXPseudoDestructorExpr(CXXPseudoDestructorExpr *E);
     Expr *VisitMemberExpr(MemberExpr *E);
     Expr *VisitCallExpr(CallExpr *E);
+    Expr *VisitLambdaExpr(LambdaExpr *LE);
     Expr *VisitInitListExpr(InitListExpr *E);
     Expr *VisitArrayInitLoopExpr(ArrayInitLoopExpr *E);
     Expr *VisitArrayInitIndexExpr(ArrayInitIndexExpr *E);
@@ -300,6 +317,7 @@ namespace clang {
     Expr *VisitCXXNamedCastExpr(CXXNamedCastExpr *E);
     Expr *VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *E);
     Expr *VisitTypeTraitExpr(TypeTraitExpr *E);
+    Expr *VisitCXXTypeidExpr(CXXTypeidExpr *E);
 
 
     template<typename IIter, typename OIter>
@@ -512,6 +530,24 @@ QualType ASTNodeImporter::VisitVariableArrayType(const VariableArrayType *T) {
                                                       Brackets);
 }
 
+QualType ASTNodeImporter::VisitDependentSizedArrayType(
+    const DependentSizedArrayType *T) {
+  QualType ToElementType = Importer.Import(T->getElementType());
+  if (ToElementType.isNull())
+    return QualType();
+
+  // SizeExpr may be null if size is not specified directly.
+  // For example, 'int a[]'.
+  Expr *Size = Importer.Import(T->getSizeExpr());
+  if (!Size && T->getSizeExpr())
+    return QualType();
+
+  SourceRange Brackets = Importer.Import(T->getBracketsRange());
+  return Importer.getToContext().getDependentSizedArrayType(
+      ToElementType, Size, T->getSizeModifier(), T->getIndexTypeCVRQualifiers(),
+      Brackets);
+}
+
 QualType ASTNodeImporter::VisitVectorType(const VectorType *T) {
   QualType ToElementType = Importer.Import(T->getElementType());
   if (ToElementType.isNull())
@@ -812,6 +848,25 @@ QualType ASTNodeImporter::VisitPackExpansionType(const PackExpansionType *T) {
                                                       T->getNumExpansions());
 }
 
+QualType ASTNodeImporter::VisitDependentTemplateSpecializationType(
+    const DependentTemplateSpecializationType *T) {
+  NestedNameSpecifier *Qualifier = Importer.Import(T->getQualifier());
+  if (!Qualifier && T->getQualifier())
+    return QualType();
+
+  IdentifierInfo *Name = Importer.Import(T->getIdentifier());
+  if (!Name && T->getIdentifier())
+    return QualType();
+
+  SmallVector<TemplateArgument, 2> ToPack;
+  ToPack.reserve(T->getNumArgs());
+  if (ImportTemplateArguments(T->getArgs(), T->getNumArgs(), ToPack))
+    return QualType();
+
+  return Importer.getToContext().getDependentTemplateSpecializationType(
+        T->getKeyword(), Qualifier, Name, ToPack);
+}
+
 QualType ASTNodeImporter::VisitObjCInterfaceType(const ObjCInterfaceType *T) {
   ObjCInterfaceDecl *Class
     = dyn_cast_or_null<ObjCInterfaceDecl>(Importer.Import(T->getDecl()));
@@ -1043,7 +1098,6 @@ bool ASTNodeImporter::ImportDefinition(RecordDecl *From, RecordDecl *To,
       = FromData.HasDeclaredCopyConstructorWithConstParam;
     ToData.HasDeclaredCopyAssignmentWithConstParam
       = FromData.HasDeclaredCopyAssignmentWithConstParam;
-    ToData.IsLambda = FromData.IsLambda;
 
     SmallVector<CXXBaseSpecifier *, 4> Bases;
     for (const auto &Base1 : FromCXX->bases()) {
@@ -1254,6 +1308,9 @@ bool ASTNodeImporter::ImportTemplateArguments(const TemplateArgument *FromArgs,
   return false;
 }
 
+// We cannot use Optional<> pattern here and below because
+// TemplateArgumentListInfo's operator new is declared as deleted so it cannot
+// be stored in Optional.
 template <typename InContainerTy>
 bool ASTNodeImporter::ImportTemplateArgumentListInfo(
     const InContainerTy &Container, TemplateArgumentListInfo &ToTAInfo) {
@@ -1266,6 +1323,18 @@ bool ASTNodeImporter::ImportTemplateArgumentListInfo(
   return false;
 }
 
+template <typename InContainerTy>
+bool ASTNodeImporter::ImportTemplateArgumentListInfo(
+    SourceLocation FromLAngleLoc, SourceLocation FromRAngleLoc,
+    const InContainerTy &Container, TemplateArgumentListInfo &Result) {
+  TemplateArgumentListInfo ToTAInfo(Importer.Import(FromLAngleLoc),
+                                    Importer.Import(FromRAngleLoc));
+  if (ImportTemplateArgumentListInfo(Container, ToTAInfo))
+    return true;
+  Result = ToTAInfo;
+  return false;
+}
+
 bool ASTNodeImporter::IsStructuralMatch(RecordDecl *FromRecord, 
                                         RecordDecl *ToRecord, bool Complain) {
   // Eliminate a potential failure point where we attempt to re-import
@@ -1916,16 +1985,16 @@ Decl *ASTNodeImporter::VisitRecordDecl(RecordDecl *D) {
         if (DCXX->getLambdaContextDecl() && !CDecl)
           return nullptr;
         D2CXX->setLambdaMangling(DCXX->getLambdaManglingNumber(), CDecl);
-      } else if (DCXX->isInjectedClassName()) {                                                 
-        // We have to be careful to do a similar dance to the one in                            
-        // Sema::ActOnStartCXXMemberDeclarations                                                
-        CXXRecordDecl *const PrevDecl = nullptr;                                                
-        const bool DelayTypeCreation = true;                                                    
-        D2CXX = CXXRecordDecl::Create(                                                          
-            Importer.getToContext(), D->getTagKind(), DC, StartLoc, Loc,                        
-            Name.getAsIdentifierInfo(), PrevDecl, DelayTypeCreation);                           
-        Importer.getToContext().getTypeDeclType(                                                
-            D2CXX, llvm::dyn_cast<CXXRecordDecl>(DC));                                          
+      } else if (DCXX->isInjectedClassName()) {
+        // We have to be careful to do a similar dance to the one in
+        // Sema::ActOnStartCXXMemberDeclarations
+        CXXRecordDecl *const PrevDecl = nullptr;
+        const bool DelayTypeCreation = true;
+        D2CXX = CXXRecordDecl::Create(
+            Importer.getToContext(), D->getTagKind(), DC, StartLoc, Loc,
+            Name.getAsIdentifierInfo(), PrevDecl, DelayTypeCreation);
+        Importer.getToContext().getTypeDeclType(
+            D2CXX, llvm::dyn_cast<CXXRecordDecl>(DC));
       } else {
         D2CXX = CXXRecordDecl::Create(Importer.getToContext(),
                                       D->getTagKind(),
@@ -1934,6 +2003,9 @@ Decl *ASTNodeImporter::VisitRecordDecl(RecordDecl *D) {
       }
       D2 = D2CXX;
       D2->setAccess(D->getAccess());
+      D2->setLexicalDeclContext(LexicalDC);
+      if (!DCXX->getDescribedClassTemplate())
+        LexicalDC->addDeclInternal(D2);
 
       Importer.Imported(D, D2);
 
@@ -1962,11 +2034,11 @@ Decl *ASTNodeImporter::VisitRecordDecl(RecordDecl *D) {
     } else {
       D2 = RecordDecl::Create(Importer.getToContext(), D->getTagKind(),
                               DC, StartLoc, Loc, Name.getAsIdentifierInfo());
+      D2->setLexicalDeclContext(LexicalDC);
+      LexicalDC->addDeclInternal(D2);
     }
     
     D2->setQualifierInfo(Importer.Import(D->getQualifierLoc()));
-    D2->setLexicalDeclContext(LexicalDC);
-    LexicalDC->addDeclInternal(D2);
     if (D->isAnonymousStructOrUnion())
       D2->setAnonymousStructOrUnion(true);
     if (PrevDecl) {
@@ -2042,6 +2114,91 @@ Decl *ASTNodeImporter::VisitEnumConstantDecl(EnumConstantDecl *D) {
   return ToEnumerator;
 }
 
+bool ASTNodeImporter::ImportTemplateInformation(FunctionDecl *FromFD,
+                                                FunctionDecl *ToFD) {
+  switch (FromFD->getTemplatedKind()) {
+  case FunctionDecl::TK_NonTemplate:
+  case FunctionDecl::TK_FunctionTemplate:
+    return false;
+
+  case FunctionDecl::TK_MemberSpecialization: {
+    auto *InstFD = cast_or_null<FunctionDecl>(
+          Importer.Import(FromFD->getInstantiatedFromMemberFunction()));
+    if (!InstFD)
+      return true;
+
+    TemplateSpecializationKind TSK = FromFD->getTemplateSpecializationKind();
+    SourceLocation POI = Importer.Import(
+          FromFD->getMemberSpecializationInfo()->getPointOfInstantiation());
+    ToFD->setInstantiationOfMemberFunction(InstFD, TSK);
+    ToFD->getMemberSpecializationInfo()->setPointOfInstantiation(POI);
+    return false;
+  }
+
+  case FunctionDecl::TK_FunctionTemplateSpecialization: {
+    auto *FTSInfo = FromFD->getTemplateSpecializationInfo();
+    auto *Template = cast_or_null<FunctionTemplateDecl>(
+        Importer.Import(FTSInfo->getTemplate()));
+    if (!Template)
+      return true;
+    TemplateSpecializationKind TSK = FTSInfo->getTemplateSpecializationKind();
+
+    // Import template arguments.
+    auto TemplArgs = FTSInfo->TemplateArguments->asArray();
+    SmallVector<TemplateArgument, 8> ToTemplArgs;
+    if (ImportTemplateArguments(TemplArgs.data(), TemplArgs.size(),
+                                ToTemplArgs))
+      return true;
+
+    TemplateArgumentList *ToTAList = TemplateArgumentList::CreateCopy(
+          Importer.getToContext(), ToTemplArgs);
+
+    TemplateArgumentListInfo ToTAInfo;
+    const auto *FromTAArgsAsWritten = FTSInfo->TemplateArgumentsAsWritten;
+    if (FromTAArgsAsWritten) {
+      if (ImportTemplateArgumentListInfo(
+              FromTAArgsAsWritten->LAngleLoc, FromTAArgsAsWritten->RAngleLoc,
+              FromTAArgsAsWritten->arguments(), ToTAInfo))
+        return true;
+    }
+
+    SourceLocation POI = Importer.Import(FTSInfo->getPointOfInstantiation());
+
+    ToFD->setFunctionTemplateSpecialization(
+        Template, ToTAList, /* InsertPos= */ nullptr,
+        TSK, FromTAArgsAsWritten ? &ToTAInfo : nullptr, POI);
+    return false;
+  }
+
+  case FunctionDecl::TK_DependentFunctionTemplateSpecialization: {
+    auto *FromInfo = FromFD->getDependentSpecializationInfo();
+    UnresolvedSet<8> TemplDecls;
+    unsigned NumTemplates = FromInfo->getNumTemplates();
+    for (unsigned I = 0; I < NumTemplates; I++) {
+      if (auto *ToFTD = cast_or_null<FunctionTemplateDecl>(
+              Importer.Import(FromInfo->getTemplate(I))))
+        TemplDecls.addDecl(ToFTD);
+      else
+        return true;
+    }
+
+    // Import TemplateArgumentListInfo.
+    TemplateArgumentListInfo ToTAInfo;
+    if (ImportTemplateArgumentListInfo(
+            FromInfo->getLAngleLoc(), FromInfo->getRAngleLoc(),
+            llvm::makeArrayRef(FromInfo->getTemplateArgs(),
+                               FromInfo->getNumTemplateArgs()),
+            ToTAInfo))
+      return true;
+
+    ToFD->setDependentTemplateSpecialization(Importer.getToContext(),
+                                             TemplDecls, ToTAInfo);
+    return false;
+  }
+  }
+  llvm_unreachable("All cases should be covered!");
+}
+
 Decl *ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
   // Import the major distinguishing characteristics of this function.
   DeclContext *DC, *LexicalDC;
@@ -2149,15 +2306,18 @@ Decl *ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
     Parameters.push_back(ToP);
   }
   
-  // Create the imported function.
   TypeSourceInfo *TInfo = Importer.Import(D->getTypeSourceInfo());
+  if (D->getTypeSourceInfo() && !TInfo)
+    return nullptr;
+
+  // Create the imported function.
   FunctionDecl *ToFunction = nullptr;
   SourceLocation InnerLocStart = Importer.Import(D->getInnerLocStart());
   if (CXXConstructorDecl *FromConstructor = dyn_cast<CXXConstructorDecl>(D)) {
     ToFunction = CXXConstructorDecl::Create(Importer.getToContext(),
                                             cast<CXXRecordDecl>(DC),
                                             InnerLocStart,
-                                            NameInfo, T, TInfo, 
+                                            NameInfo, T, TInfo,
                                             FromConstructor->isExplicit(),
                                             D->isInlineSpecified(), 
                                             D->isImplicit(),
@@ -2223,9 +2383,9 @@ Decl *ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
   Importer.Imported(D, ToFunction);
 
   // Set the parameters.
-  for (unsigned I = 0, N = Parameters.size(); I != N; ++I) {
-    Parameters[I]->setOwningFunction(ToFunction);
-    ToFunction->addDeclInternal(Parameters[I]);
+  for (ParmVarDecl *Param : Parameters) {
+    Param->setOwningFunction(ToFunction);
+    ToFunction->addDeclInternal(Param);
   }
   ToFunction->setParams(Parameters);
 
@@ -2235,6 +2395,16 @@ Decl *ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
     ToFunction->setPreviousDecl(Recent);
   }
 
+  // We need to complete creation of FunctionProtoTypeLoc manually with setting
+  // params it refers to.
+  if (TInfo) {
+    if (auto ProtoLoc =
+        TInfo->getTypeLoc().IgnoreParens().getAs<FunctionProtoTypeLoc>()) {
+      for (unsigned I = 0, N = Parameters.size(); I != N; ++I)
+        ProtoLoc.setParam(I, Parameters[I]);
+    }
+  }
+
   if (usedDifferentExceptionSpec) {
     // Update FunctionProtoType::ExtProtoInfo.
     QualType T = Importer.Import(D->getType());
@@ -2252,8 +2422,17 @@ Decl *ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
 
   // FIXME: Other bits to merge?
 
+  // If it is a template, import all related things.
+  if (ImportTemplateInformation(D, ToFunction))
+    return nullptr;
+
   // Add this function to the lexical context.
-  LexicalDC->addDeclInternal(ToFunction);
+  // NOTE: If the function is templated declaration, it should be not added into
+  // LexicalDC. But described template is imported during import of
+  // FunctionTemplateDecl (it happens later). So, we use source declaration
+  // to determine if we should add the result function.
+  if (!D->getDescribedFunctionTemplate())
+    LexicalDC->addDeclInternal(ToFunction);
 
   if (auto *FromCXXMethod = dyn_cast<CXXMethodDecl>(D))
     ImportOverrides(cast<CXXMethodDecl>(ToFunction), FromCXXMethod);
@@ -2747,6 +2926,14 @@ Decl *ASTNodeImporter::VisitParmVarDecl(ParmVarDecl *D) {
   if (FromDefArg && !ToDefArg)
     return nullptr;
 
+  if (D->isObjCMethodParameter()) {
+    ToParm->setObjCMethodScopeInfo(D->getFunctionScopeIndex());
+    ToParm->setObjCDeclQualifier(D->getObjCDeclQualifier());
+  } else {
+    ToParm->setScopeInfo(D->getFunctionScopeDepth(),
+                         D->getFunctionScopeIndex());
+  }
+
   if (D->isUsed())
     ToParm->setIsUsed();
 
@@ -2857,9 +3044,13 @@ Decl *ASTNodeImporter::VisitObjCMethodDecl(ObjCMethodDecl *D) {
     ToParams[I]->setOwningFunction(ToMethod);
     ToMethod->addDeclInternal(ToParams[I]);
   }
+
   SmallVector<SourceLocation, 12> SelLocs;
   D->getSelectorLocs(SelLocs);
-  ToMethod->setMethodParams(Importer.getToContext(), ToParams, SelLocs); 
+  for (SourceLocation &Loc : SelLocs)
+    Loc = Importer.Import(Loc);
+
+  ToMethod->setMethodParams(Importer.getToContext(), ToParams, SelLocs);
 
   ToMethod->setLexicalDeclContext(LexicalDC);
   Importer.Imported(D, ToMethod);
@@ -3844,12 +4035,12 @@ Decl *ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) {
       return nullptr;
   }
 
-  CXXRecordDecl *DTemplated = D->getTemplatedDecl();
-  
+  CXXRecordDecl *FromTemplated = D->getTemplatedDecl();
+
   // Create the declaration that is being templated.
-  CXXRecordDecl *D2Templated = cast_or_null<CXXRecordDecl>(
-        Importer.Import(DTemplated));
-  if (!D2Templated)
+  CXXRecordDecl *ToTemplated = cast_or_null<CXXRecordDecl>(
+        Importer.Import(FromTemplated));
+  if (!ToTemplated)
     return nullptr;
 
   // Resolve possible cyclic import.
@@ -3857,15 +4048,15 @@ Decl *ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) {
     return AlreadyImported;
 
   // Create the class template declaration itself.
-  TemplateParameterList *TemplateParams
-    = ImportTemplateParameterList(D->getTemplateParameters());
+  TemplateParameterList *TemplateParams =
+      ImportTemplateParameterList(D->getTemplateParameters());
   if (!TemplateParams)
     return nullptr;
 
   ClassTemplateDecl *D2 = ClassTemplateDecl::Create(Importer.getToContext(), DC, 
                                                     Loc, Name, TemplateParams, 
-                                                    D2Templated);
-  D2Templated->setDescribedClassTemplate(D2);    
+                                                    ToTemplated);
+  ToTemplated->setDescribedClassTemplate(D2);
   
   D2->setAccess(D->getAccess());
   D2->setLexicalDeclContext(LexicalDC);
@@ -3873,10 +4064,10 @@ Decl *ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) {
   
   // Note the relationship between the class templates.
   Importer.Imported(D, D2);
-  Importer.Imported(DTemplated, D2Templated);
+  Importer.Imported(FromTemplated, ToTemplated);
 
-  if (DTemplated->isCompleteDefinition() &&
-      !D2Templated->isCompleteDefinition()) {
+  if (FromTemplated->isCompleteDefinition() &&
+      !ToTemplated->isCompleteDefinition()) {
     // FIXME: Import definition!
   }
   
@@ -3952,12 +4143,8 @@ Decl *ASTNodeImporter::VisitClassTemplateSpecializationDecl(
       // Import TemplateArgumentListInfo
       TemplateArgumentListInfo ToTAInfo;
       auto &ASTTemplateArgs = *PartialSpec->getTemplateArgsAsWritten();
-      for (unsigned I = 0, E = ASTTemplateArgs.NumTemplateArgs; I < E; ++I) {
-        if (auto ToLoc = ImportTemplateArgumentLoc(ASTTemplateArgs[I]))
-          ToTAInfo.addArgument(*ToLoc);
-        else
-          return nullptr;
-      }
+      if (ImportTemplateArgumentListInfo(ASTTemplateArgs.arguments(), ToTAInfo))
+        return nullptr;
 
       QualType CanonInjType = Importer.Import(
             PartialSpec->getInjectedSpecializationType());
@@ -4895,12 +5082,8 @@ Expr *ASTNodeImporter::VisitDeclRefExpr(DeclRefExpr *E) {
   TemplateArgumentListInfo ToTAInfo;
   TemplateArgumentListInfo *ResInfo = nullptr;
   if (E->hasExplicitTemplateArgs()) {
-    for (const auto &FromLoc : E->template_arguments()) {
-      if (auto ToTALoc = ImportTemplateArgumentLoc(FromLoc))
-        ToTAInfo.addArgument(*ToTALoc);
-      else
-        return nullptr;
-    }
+    if (ImportTemplateArgumentListInfo(E->template_arguments(), ToTAInfo))
+      return nullptr;
     ResInfo = &ToTAInfo;
   }
 
@@ -5120,14 +5303,13 @@ Expr *ASTNodeImporter::VisitUnaryOperator(UnaryOperator *E) {
   if (!SubExpr)
     return nullptr;
 
-  return new (Importer.getToContext()) UnaryOperator(SubExpr, E->getOpcode(),
-                                                     T, E->getValueKind(),
-                                                     E->getObjectKind(),
-                                         Importer.Import(E->getOperatorLoc()));                                        
+  return new (Importer.getToContext()) UnaryOperator(
+      SubExpr, E->getOpcode(), T, E->getValueKind(), E->getObjectKind(),
+      Importer.Import(E->getOperatorLoc()), E->canOverflow());
 }
 
-Expr *ASTNodeImporter::VisitUnaryExprOrTypeTraitExpr(
-                                            UnaryExprOrTypeTraitExpr *E) {
+Expr *
+ASTNodeImporter::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) {
   QualType ResultType = Importer.Import(E->getType());
   
   if (E->isArgumentType()) {
@@ -5567,6 +5749,11 @@ Expr *ASTNodeImporter::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *CE) {
   if (T.isNull())
     return nullptr;
 
+
+  TypeSourceInfo *TInfo = Importer.Import(CE->getTypeSourceInfo());
+  if (!TInfo)
+    return nullptr;
+
   SmallVector<Expr *, 8> Args(CE->getNumArgs());
   if (ImportContainerChecked(CE->arguments(), Args))
     return nullptr;
@@ -5576,18 +5763,11 @@ Expr *ASTNodeImporter::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *CE) {
   if (!Ctor)
     return nullptr;
 
-  return CXXTemporaryObjectExpr::Create(
-        Importer.getToContext(), T,
-        Importer.Import(CE->getLocStart()),
-        Ctor,
-        CE->isElidable(),
-        Args,
-        CE->hadMultipleCandidates(),
-        CE->isListInitialization(),
-        CE->isStdInitListInitialization(),
-        CE->requiresZeroInitialization(),
-        CE->getConstructionKind(),
-        Importer.Import(CE->getParenOrBraceRange()));
+  return new (Importer.getToContext()) CXXTemporaryObjectExpr(
+      Importer.getToContext(), Ctor, T, TInfo, Args,
+      Importer.Import(CE->getParenOrBraceRange()), CE->hadMultipleCandidates(),
+      CE->isListInitialization(), CE->isStdInitListInitialization(),
+      CE->requiresZeroInitialization());
 }
 
 Expr *
@@ -5627,6 +5807,31 @@ Expr *ASTNodeImporter::VisitPackExpansionExpr(PackExpansionExpr *E) {
         E->getNumExpansions());
 }
 
+Expr *ASTNodeImporter::VisitSizeOfPackExpr(SizeOfPackExpr *E) {
+  auto *Pack = cast_or_null<NamedDecl>(Importer.Import(E->getPack()));
+  if (!Pack)
+    return nullptr;
+
+  Optional<unsigned> Length;
+
+  if (!E->isValueDependent())
+    Length = E->getPackLength();
+
+  SmallVector<TemplateArgument, 8> PartialArguments;
+  if (E->isPartiallySubstituted()) {
+    if (ImportTemplateArguments(E->getPartialArguments().data(),
+                                E->getPartialArguments().size(),
+                                PartialArguments))
+      return nullptr;
+  }
+
+  return SizeOfPackExpr::Create(
+      Importer.getToContext(), Importer.Import(E->getOperatorLoc()), Pack,
+      Importer.Import(E->getPackLoc()), Importer.Import(E->getRParenLoc()),
+      Length, PartialArguments);
+}
+
+
 Expr *ASTNodeImporter::VisitCXXNewExpr(CXXNewExpr *CE) {
   QualType T = Importer.Import(CE->getType());
   if (T.isNull())
@@ -5856,11 +6061,10 @@ Expr *ASTNodeImporter::VisitCXXDependentScopeMemberExpr(
   if (BaseType.isNull())
     return nullptr;
 
-  TemplateArgumentListInfo ToTAInfo(Importer.Import(E->getLAngleLoc()),
-                                    Importer.Import(E->getRAngleLoc()));
-  TemplateArgumentListInfo *ResInfo = nullptr;
+  TemplateArgumentListInfo ToTAInfo, *ResInfo = nullptr;
   if (E->hasExplicitTemplateArgs()) {
-    if (ImportTemplateArgumentListInfo(E->template_arguments(), ToTAInfo))
+    if (ImportTemplateArgumentListInfo(E->getLAngleLoc(), E->getRAngleLoc(),
+                                       E->template_arguments(), ToTAInfo))
       return nullptr;
     ResInfo = &ToTAInfo;
   }
@@ -5884,6 +6088,64 @@ Expr *ASTNodeImporter::VisitCXXDependentScopeMemberExpr(
       cast_or_null<NamedDecl>(ToFQ), MemberNameInfo, ResInfo);
 }
 
+Expr *ASTNodeImporter::VisitCXXUnresolvedConstructExpr(
+    CXXUnresolvedConstructExpr *CE) {
+
+  unsigned NumArgs = CE->arg_size();
+
+  llvm::SmallVector<Expr *, 8> ToArgs(NumArgs);
+  if (ImportArrayChecked(CE->arg_begin(), CE->arg_end(), ToArgs.begin()))
+    return nullptr;
+
+  return CXXUnresolvedConstructExpr::Create(
+      Importer.getToContext(), Importer.Import(CE->getTypeSourceInfo()),
+      Importer.Import(CE->getLParenLoc()), llvm::makeArrayRef(ToArgs),
+      Importer.Import(CE->getRParenLoc()));
+}
+
+Expr *ASTNodeImporter::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) {
+  CXXRecordDecl *NamingClass =
+      cast_or_null<CXXRecordDecl>(Importer.Import(E->getNamingClass()));
+  if (E->getNamingClass() && !NamingClass)
+    return nullptr;
+
+  DeclarationName Name = Importer.Import(E->getName());
+  if (E->getName() && !Name)
+    return nullptr;
+
+  DeclarationNameInfo NameInfo(Name, Importer.Import(E->getNameLoc()));
+  // Import additional name location/type info.
+  ImportDeclarationNameLoc(E->getNameInfo(), NameInfo);
+
+  UnresolvedSet<8> ToDecls;
+  for (Decl *D : E->decls()) {
+    if (NamedDecl *To = cast_or_null<NamedDecl>(Importer.Import(D)))
+      ToDecls.addDecl(To);
+    else
+      return nullptr;
+  }
+
+  TemplateArgumentListInfo ToTAInfo, *ResInfo = nullptr;
+  if (E->hasExplicitTemplateArgs()) {
+    if (ImportTemplateArgumentListInfo(E->getLAngleLoc(), E->getRAngleLoc(),
+                                       E->template_arguments(), ToTAInfo))
+      return nullptr;
+    ResInfo = &ToTAInfo;
+  }
+
+  if (ResInfo || E->getTemplateKeywordLoc().isValid())
+    return UnresolvedLookupExpr::Create(
+        Importer.getToContext(), NamingClass,
+        Importer.Import(E->getQualifierLoc()),
+        Importer.Import(E->getTemplateKeywordLoc()), NameInfo, E->requiresADL(),
+        ResInfo, ToDecls.begin(), ToDecls.end());
+
+  return UnresolvedLookupExpr::Create(
+      Importer.getToContext(), NamingClass,
+      Importer.Import(E->getQualifierLoc()), NameInfo, E->requiresADL(),
+      E->isOverloaded(), ToDecls.begin(), ToDecls.end());
+}
+
 Expr *ASTNodeImporter::VisitCallExpr(CallExpr *E) {
   QualType T = Importer.Import(E->getType());
   if (T.isNull())
@@ -5894,16 +6156,9 @@ Expr *ASTNodeImporter::VisitCallExpr(CallExpr *E) {
     return nullptr;
 
   unsigned NumArgs = E->getNumArgs();
-
   llvm::SmallVector<Expr *, 2> ToArgs(NumArgs);
-
-  for (unsigned ai = 0, ae = NumArgs; ai != ae; ++ai) {
-    Expr *FromArg = E->getArg(ai);
-    Expr *ToArg = Importer.Import(FromArg);
-    if (!ToArg)
-      return nullptr;
-    ToArgs[ai] = ToArg;
-  }
+  if (ImportContainerChecked(E->arguments(), ToArgs))
+     return nullptr;
 
   Expr **ToArgs_Copied = new (Importer.getToContext()) 
     Expr*[NumArgs];
@@ -5911,12 +6166,86 @@ Expr *ASTNodeImporter::VisitCallExpr(CallExpr *E) {
   for (unsigned ai = 0, ae = NumArgs; ai != ae; ++ai)
     ToArgs_Copied[ai] = ToArgs[ai];
 
+  if (const auto *OCE = dyn_cast<CXXOperatorCallExpr>(E)) {
+    return new (Importer.getToContext()) CXXOperatorCallExpr(
+          Importer.getToContext(), OCE->getOperator(), ToCallee, ToArgs, T,
+          OCE->getValueKind(), Importer.Import(OCE->getRParenLoc()),
+          OCE->getFPFeatures());
+  }
+
   return new (Importer.getToContext())
     CallExpr(Importer.getToContext(), ToCallee, 
              llvm::makeArrayRef(ToArgs_Copied, NumArgs), T, E->getValueKind(),
              Importer.Import(E->getRParenLoc()));
 }
 
+Optional<LambdaCapture>
+ASTNodeImporter::ImportLambdaCapture(const LambdaCapture &From) {
+  VarDecl *Var = nullptr;
+  if (From.capturesVariable()) {
+    Var = cast_or_null<VarDecl>(Importer.Import(From.getCapturedVar()));
+    if (!Var)
+      return None;
+  }
+
+  return LambdaCapture(Importer.Import(From.getLocation()), From.isImplicit(),
+                       From.getCaptureKind(), Var,
+                       From.isPackExpansion()
+                         ? Importer.Import(From.getEllipsisLoc())
+                         : SourceLocation());
+}
+
+Expr *ASTNodeImporter::VisitLambdaExpr(LambdaExpr *LE) {
+  CXXRecordDecl *FromClass = LE->getLambdaClass();
+  auto *ToClass = dyn_cast_or_null<CXXRecordDecl>(Importer.Import(FromClass));
+  if (!ToClass)
+    return nullptr;
+
+  // NOTE: lambda classes are created with BeingDefined flag set up.
+  // It means that ImportDefinition doesn't work for them and we should fill it
+  // manually.
+  if (ToClass->isBeingDefined()) {
+    for (auto FromField : FromClass->fields()) {
+      auto *ToField = cast_or_null<FieldDecl>(Importer.Import(FromField));
+      if (!ToField)
+        return nullptr;
+    }
+  }
+
+  auto *ToCallOp = dyn_cast_or_null<CXXMethodDecl>(
+        Importer.Import(LE->getCallOperator()));
+  if (!ToCallOp)
+    return nullptr;
+
+  ToClass->completeDefinition();
+
+  unsigned NumCaptures = LE->capture_size();
+  SmallVector<LambdaCapture, 8> Captures;
+  Captures.reserve(NumCaptures);
+  for (const auto &FromCapture : LE->captures()) {
+    if (auto ToCapture = ImportLambdaCapture(FromCapture))
+      Captures.push_back(*ToCapture);
+    else
+      return nullptr;
+  }
+
+  SmallVector<Expr *, 8> InitCaptures(NumCaptures);
+  if (ImportContainerChecked(LE->capture_inits(), InitCaptures))
+    return nullptr;
+
+  return LambdaExpr::Create(Importer.getToContext(), ToClass,
+                            Importer.Import(LE->getIntroducerRange()),
+                            LE->getCaptureDefault(),
+                            Importer.Import(LE->getCaptureDefaultLoc()),
+                            Captures,
+                            LE->hasExplicitParameters(),
+                            LE->hasExplicitResultType(),
+                            InitCaptures,
+                            Importer.Import(LE->getLocEnd()),
+                            LE->containsUnexpandedParameterPack());
+}
+
+
 Expr *ASTNodeImporter::VisitInitListExpr(InitListExpr *ILE) {
   QualType T = Importer.Import(ILE->getType());
   if (T.isNull())
@@ -6070,6 +6399,28 @@ Expr *ASTNodeImporter::VisitTypeTraitExpr(TypeTraitExpr *E) {
       E->getTrait(), ToArgs, Importer.Import(E->getLocEnd()), ToValue);
 }
 
+Expr *ASTNodeImporter::VisitCXXTypeidExpr(CXXTypeidExpr *E) {
+  QualType ToType = Importer.Import(E->getType());
+  if (ToType.isNull())
+    return nullptr;
+
+  if (E->isTypeOperand()) {
+    TypeSourceInfo *TSI = Importer.Import(E->getTypeOperandSourceInfo());
+    if (!TSI)
+      return nullptr;
+
+    return new (Importer.getToContext())
+        CXXTypeidExpr(ToType, TSI, Importer.Import(E->getSourceRange()));
+  }
+
+  Expr *Op = Importer.Import(E->getExprOperand());
+  if (!Op)
+    return nullptr;
+
+  return new (Importer.getToContext())
+      CXXTypeidExpr(ToType, Op, Importer.Import(E->getSourceRange()));
+}
+
 void ASTNodeImporter::ImportOverrides(CXXMethodDecl *ToMethod,
                                       CXXMethodDecl *FromMethod) {
   for (auto *FromOverriddenMethod : FromMethod->overridden_methods())
diff --git a/lib/AST/ASTStructuralEquivalence.cpp b/lib/AST/ASTStructuralEquivalence.cpp
index 0df8e5653f3b..a563c7e2fa06 100644
--- a/lib/AST/ASTStructuralEquivalence.cpp
+++ b/lib/AST/ASTStructuralEquivalence.cpp
@@ -1153,12 +1153,22 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
                                   D2->getTemplateParameters());
 }
 
+static bool IsTemplateDeclCommonStructurallyEquivalent(
+    StructuralEquivalenceContext &Ctx, TemplateDecl *D1, TemplateDecl *D2) {
+  if (!IsStructurallyEquivalent(D1->getIdentifier(), D2->getIdentifier()))
+    return false;
+  if (!D1->getIdentifier()) // Special name
+    if (D1->getNameAsString() != D2->getNameAsString())
+      return false;
+  return IsStructurallyEquivalent(Ctx, D1->getTemplateParameters(),
+                                  D2->getTemplateParameters());
+}
+
 static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
                                      ClassTemplateDecl *D1,
                                      ClassTemplateDecl *D2) {
   // Check template parameters.
-  if (!IsStructurallyEquivalent(Context, D1->getTemplateParameters(),
-                                D2->getTemplateParameters()))
+  if (!IsTemplateDeclCommonStructurallyEquivalent(Context, D1, D2))
     return false;
 
   // Check the templated declaration.
@@ -1166,6 +1176,18 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
                                           D2->getTemplatedDecl());
 }
 
+static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
+                                     FunctionTemplateDecl *D1,
+                                     FunctionTemplateDecl *D2) {
+  // Check template parameters.
+  if (!IsTemplateDeclCommonStructurallyEquivalent(Context, D1, D2))
+    return false;
+
+  // Check the templated declaration.
+  return Context.IsStructurallyEquivalent(D1->getTemplatedDecl()->getType(),
+                                          D2->getTemplatedDecl()->getType());
+}
+
 /// Determine structural equivalence of two declarations.
 static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
                                      Decl *D1, Decl *D2) {
@@ -1293,6 +1315,7 @@ bool StructuralEquivalenceContext::Finish() {
         // Record/non-record mismatch.
         Equivalent = false;
       }
+
     } else if (EnumDecl *Enum1 = dyn_cast<EnumDecl>(D1)) {
       if (EnumDecl *Enum2 = dyn_cast<EnumDecl>(D2)) {
         // Check for equivalent enum names.
@@ -1309,6 +1332,7 @@ bool StructuralEquivalenceContext::Finish() {
         // Enum/non-enum mismatch
         Equivalent = false;
       }
+
     } else if (TypedefNameDecl *Typedef1 = dyn_cast<TypedefNameDecl>(D1)) {
       if (TypedefNameDecl *Typedef2 = dyn_cast<TypedefNameDecl>(D2)) {
         if (!::IsStructurallyEquivalent(Typedef1->getIdentifier(),
@@ -1320,17 +1344,30 @@ bool StructuralEquivalenceContext::Finish() {
         // Typedef/non-typedef mismatch.
         Equivalent = false;
       }
+
     } else if (ClassTemplateDecl *ClassTemplate1 =
                    dyn_cast<ClassTemplateDecl>(D1)) {
       if (ClassTemplateDecl *ClassTemplate2 = dyn_cast<ClassTemplateDecl>(D2)) {
-        if (!::IsStructurallyEquivalent(ClassTemplate1->getIdentifier(),
-                                        ClassTemplate2->getIdentifier()) ||
-            !::IsStructurallyEquivalent(*this, ClassTemplate1, ClassTemplate2))
+        if (!::IsStructurallyEquivalent(*this, ClassTemplate1,
+                                        ClassTemplate2))
           Equivalent = false;
       } else {
         // Class template/non-class-template mismatch.
         Equivalent = false;
       }
+
+    } else if (FunctionTemplateDecl *FunctionTemplate1 =
+               dyn_cast<FunctionTemplateDecl>(D1)) {
+      if (FunctionTemplateDecl *FunctionTemplate2 =
+          dyn_cast<FunctionTemplateDecl>(D2)) {
+        if (!::IsStructurallyEquivalent(*this, FunctionTemplate1,
+                                        FunctionTemplate2))
+          Equivalent = false;
+      } else {
+        // Class template/non-class-template mismatch.
+        Equivalent = false;
+      }
+
     } else if (TemplateTypeParmDecl *TTP1 =
                    dyn_cast<TemplateTypeParmDecl>(D1)) {
       if (TemplateTypeParmDecl *TTP2 = dyn_cast<TemplateTypeParmDecl>(D2)) {
@@ -1350,6 +1387,7 @@ bool StructuralEquivalenceContext::Finish() {
         // Kind mismatch.
         Equivalent = false;
       }
+
     } else if (TemplateTemplateParmDecl *TTP1 =
                    dyn_cast<TemplateTemplateParmDecl>(D1)) {
       if (TemplateTemplateParmDecl *TTP2 =
diff --git a/lib/AST/Decl.cpp b/lib/AST/Decl.cpp
index 4c1d591b41e9..9c73ee7ede3b 100644
--- a/lib/AST/Decl.cpp
+++ b/lib/AST/Decl.cpp
@@ -779,7 +779,7 @@ LinkageComputer::getLVForNamespaceScopeDecl(const NamedDecl *D,
     // unique-external linkage, it's not legally usable from outside
     // this translation unit.  However, we should use the C linkage
     // rules instead for extern "C" declarations.
-    if (Context.getLangOpts().CPlusPlus && !Function->isInExternCContext()) {
+    if (Context.getLangOpts().CPlusPlus && !isFirstInExternCContext(Function)) {
       // Only look at the type-as-written. Otherwise, deducing the return type
       // of a function could change its linkage.
       QualType TypeAsWritten = Function->getType();
@@ -1165,7 +1165,7 @@ LinkageInfo LinkageComputer::getLVForLocalDecl(const NamedDecl *D,
                                                LVComputationKind computation) {
   if (const auto *Function = dyn_cast<FunctionDecl>(D)) {
     if (Function->isInAnonymousNamespace() &&
-        !Function->isInExternCContext())
+        !isFirstInExternCContext(Function))
       return getInternalLinkageFor(Function);
 
     // This is a "void f();" which got merged with a file static.
@@ -1188,7 +1188,7 @@ LinkageInfo LinkageComputer::getLVForLocalDecl(const NamedDecl *D,
 
   if (const auto *Var = dyn_cast<VarDecl>(D)) {
     if (Var->hasExternalStorage()) {
-      if (Var->isInAnonymousNamespace() && !Var->isInExternCContext())
+      if (Var->isInAnonymousNamespace() && !isFirstInExternCContext(Var))
         return getInternalLinkageFor(Var);
 
       LinkageInfo LV;
@@ -2901,6 +2901,13 @@ unsigned FunctionDecl::getBuiltinID() const {
       Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
     return 0;
 
+  // CUDA does not have device-side standard library. printf and malloc are the
+  // only special cases that are supported by device-side runtime.
+  if (Context.getLangOpts().CUDA && hasAttr<CUDADeviceAttr>() &&
+      !hasAttr<CUDAHostAttr>() &&
+      !(BuiltinID == Builtin::BIprintf || BuiltinID == Builtin::BImalloc))
+    return 0;
+
   return BuiltinID;
 }
 
@@ -4365,9 +4372,7 @@ bool TypedefNameDecl::isTransparentTagSlow() const {
   };
 
   bool isTransparent = determineIsTransparent();
-  CacheIsTransparentTag = 1;
-  if (isTransparent)
-    CacheIsTransparentTag |= 0x2;
+  MaybeModedTInfo.setInt((isTransparent << 1) | 1);
   return isTransparent;
 }
 
diff --git a/lib/AST/DeclBase.cpp b/lib/AST/DeclBase.cpp
index 29ce7ae034b5..18aa53b85770 100644
--- a/lib/AST/DeclBase.cpp
+++ b/lib/AST/DeclBase.cpp
@@ -891,12 +891,14 @@ bool Decl::AccessDeclContextSanity() const {
   // 4. the context is not a record
   // 5. it's invalid
   // 6. it's a C++0x static_assert.
+  // 7. it's a block literal declaration
   if (isa<TranslationUnitDecl>(this) ||
       isa<TemplateTypeParmDecl>(this) ||
       isa<NonTypeTemplateParmDecl>(this) ||
       !isa<CXXRecordDecl>(getDeclContext()) ||
       isInvalidDecl() ||
       isa<StaticAssertDecl>(this) ||
+      isa<BlockDecl>(this) ||
       // FIXME: a ParmVarDecl can have ClassTemplateSpecialization
       // as DeclContext (?).
       isa<ParmVarDecl>(this) ||
@@ -1588,17 +1590,7 @@ DeclContext::noload_lookup(DeclarationName Name) {
   if (PrimaryContext != this)
     return PrimaryContext->noload_lookup(Name);
 
-  // If we have any lazy lexical declarations not in our lookup map, add them
-  // now. Don't import any external declarations, not even if we know we have
-  // some missing from the external visible lookups.
-  if (HasLazyLocalLexicalLookups) {
-    SmallVector<DeclContext *, 2> Contexts;
-    collectAllContexts(Contexts);
-    for (unsigned I = 0, N = Contexts.size(); I != N; ++I)
-      buildLookupImpl(Contexts[I], hasExternalVisibleStorage());
-    HasLazyLocalLexicalLookups = false;
-  }
-
+  loadLazyLocalLexicalLookups();
   StoredDeclsMap *Map = LookupPtr;
   if (!Map)
     return lookup_result();
@@ -1608,6 +1600,19 @@ DeclContext::noload_lookup(DeclarationName Name) {
                          : lookup_result();
 }
 
+// If we have any lazy lexical declarations not in our lookup map, add them
+// now. Don't import any external declarations, not even if we know we have
+// some missing from the external visible lookups.
+void DeclContext::loadLazyLocalLexicalLookups() {
+  if (HasLazyLocalLexicalLookups) {
+    SmallVector<DeclContext *, 2> Contexts;
+    collectAllContexts(Contexts);
+    for (unsigned I = 0, N = Contexts.size(); I != N; ++I)
+      buildLookupImpl(Contexts[I], hasExternalVisibleStorage());
+    HasLazyLocalLexicalLookups = false;
+  }
+}
+
 void DeclContext::localUncachedLookup(DeclarationName Name,
                                       SmallVectorImpl<NamedDecl *> &Results) {
   Results.clear();
diff --git a/lib/AST/DeclCXX.cpp b/lib/AST/DeclCXX.cpp
index 41f2449a9d6a..afbc3f315f5d 100644
--- a/lib/AST/DeclCXX.cpp
+++ b/lib/AST/DeclCXX.cpp
@@ -2387,10 +2387,10 @@ UsingShadowDecl::UsingShadowDecl(Kind K, ASTContext &C, DeclContext *DC,
                                  SourceLocation Loc, UsingDecl *Using,
                                  NamedDecl *Target)
     : NamedDecl(K, DC, Loc, Using ? Using->getDeclName() : DeclarationName()),
-      redeclarable_base(C), Underlying(Target),
+      redeclarable_base(C), Underlying(),
       UsingOrNextShadow(cast<NamedDecl>(Using)) {
   if (Target)
-    IdentifierNamespace = Target->getIdentifierNamespace();
+    setTargetDecl(Target);
   setImplicit();
 }
 
diff --git a/lib/AST/DeclPrinter.cpp b/lib/AST/DeclPrinter.cpp
index b792c5920a55..e82144b0ae98 100644
--- a/lib/AST/DeclPrinter.cpp
+++ b/lib/AST/DeclPrinter.cpp
@@ -128,9 +128,7 @@ static QualType GetBaseType(QualType T) {
   // FIXME: This should be on the Type class!
   QualType BaseType = T;
   while (!BaseType->isSpecifierType()) {
-    if (isa<TypedefType>(BaseType))
-      break;
-    else if (const PointerType* PTy = BaseType->getAs<PointerType>())
+    if (const PointerType *PTy = BaseType->getAs<PointerType>())
       BaseType = PTy->getPointeeType();
     else if (const BlockPointerType *BPy = BaseType->getAs<BlockPointerType>())
       BaseType = BPy->getPointeeType();
@@ -144,8 +142,11 @@ static QualType GetBaseType(QualType T) {
       BaseType = RTy->getPointeeType();
     else if (const AutoType *ATy = BaseType->getAs<AutoType>())
       BaseType = ATy->getDeducedType();
+    else if (const ParenType *PTy = BaseType->getAs<ParenType>())
+      BaseType = PTy->desugar();
     else
-      llvm_unreachable("Unknown declarator!");
+      // This must be a syntax error.
+      break;
   }
   return BaseType;
 }
diff --git a/lib/AST/ExprConstant.cpp b/lib/AST/ExprConstant.cpp
index 8d9b3c3bebc0..61bd1e5f1298 100644
--- a/lib/AST/ExprConstant.cpp
+++ b/lib/AST/ExprConstant.cpp
@@ -3244,17 +3244,12 @@ static bool handleAssignment(EvalInfo &Info, const Expr *E, const LValue &LVal,
   }
 
   CompleteObject Obj = findCompleteObject(Info, E, AK_Assign, LVal, LValType);
-  return Obj && modifySubobject(Info, E, Obj, LVal.Designator, Val);
-}
-
-static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
-  return T->isSignedIntegerType() &&
-         Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
-}
-
-namespace {
-struct CompoundAssignSubobjectHandler {
-  EvalInfo &Info;
+  return Obj && modifySubobject(Info, E, Obj, LVal.Designator, Val);
+}
+
+namespace {
+struct CompoundAssignSubobjectHandler {
+  EvalInfo &Info;
   const Expr *E;
   QualType PromotedLHSType;
   BinaryOperatorKind Opcode;
@@ -3370,13 +3365,13 @@ static bool handleCompoundAssignment(
   return Obj && findSubobject(Info, E, Obj, LVal.Designator, Handler);
 }
 
-namespace {
-struct IncDecSubobjectHandler {
-  EvalInfo &Info;
-  const Expr *E;
-  AccessKinds AccessKind;
-  APValue *Old;
-
+namespace {
+struct IncDecSubobjectHandler {
+  EvalInfo &Info;
+  const UnaryOperator *E;
+  AccessKinds AccessKind;
+  APValue *Old;
+
   typedef bool result_type;
 
   bool checkConst(QualType QT) {
@@ -3442,22 +3437,20 @@ struct IncDecSubobjectHandler {
     }
 
     bool WasNegative = Value.isNegative();
-    if (AccessKind == AK_Increment) {
-      ++Value;
-
-      if (!WasNegative && Value.isNegative() &&
-          isOverflowingIntegerType(Info.Ctx, SubobjType)) {
-        APSInt ActualValue(Value, /*IsUnsigned*/true);
-        return HandleOverflow(Info, E, ActualValue, SubobjType);
-      }
-    } else {
-      --Value;
-
-      if (WasNegative && !Value.isNegative() &&
-          isOverflowingIntegerType(Info.Ctx, SubobjType)) {
-        unsigned BitWidth = Value.getBitWidth();
-        APSInt ActualValue(Value.sext(BitWidth + 1), /*IsUnsigned*/false);
-        ActualValue.setBit(BitWidth);
+    if (AccessKind == AK_Increment) {
+      ++Value;
+
+      if (!WasNegative && Value.isNegative() && E->canOverflow()) {
+        APSInt ActualValue(Value, /*IsUnsigned*/true);
+        return HandleOverflow(Info, E, ActualValue, SubobjType);
+      }
+    } else {
+      --Value;
+
+      if (WasNegative && !Value.isNegative() && E->canOverflow()) {
+        unsigned BitWidth = Value.getBitWidth();
+        APSInt ActualValue(Value.sext(BitWidth + 1), /*IsUnsigned*/false);
+        ActualValue.setBit(BitWidth);
         return HandleOverflow(Info, E, ActualValue, SubobjType);
       }
     }
@@ -3512,13 +3505,13 @@ static bool handleIncDec(EvalInfo &Info, const Expr *E, const LValue &LVal,
     Info.FFDiag(E);
     return false;
   }
-
-  AccessKinds AK = IsIncrement ? AK_Increment : AK_Decrement;
-  CompleteObject Obj = findCompleteObject(Info, E, AK, LVal, LValType);
-  IncDecSubobjectHandler Handler = { Info, E, AK, Old };
-  return Obj && findSubobject(Info, E, Obj, LVal.Designator, Handler);
-}
-
+
+  AccessKinds AK = IsIncrement ? AK_Increment : AK_Decrement;
+  CompleteObject Obj = findCompleteObject(Info, E, AK, LVal, LValType);
+  IncDecSubobjectHandler Handler = {Info, cast<UnaryOperator>(E), AK, Old};
+  return Obj && findSubobject(Info, E, Obj, LVal.Designator, Handler);
+}
+
 /// Build an lvalue for the object argument of a member function call.
 static bool EvaluateObjectArgument(EvalInfo &Info, const Expr *Object,
                                    LValue &This) {
@@ -8875,13 +8868,13 @@ bool IntExprEvaluator::VisitUnaryOperator(const UnaryOperator *E) {
     return Visit(E->getSubExpr());
   case UO_Minus: {
     if (!Visit(E->getSubExpr()))
-      return false;
-    if (!Result.isInt()) return Error(E);
-    const APSInt &Value = Result.getInt();
-    if (Value.isSigned() && Value.isMinSignedValue() &&
-        !HandleOverflow(Info, E, -Value.extend(Value.getBitWidth() + 1),
-                        E->getType()))
-      return false;
+      return false;
+    if (!Result.isInt()) return Error(E);
+    const APSInt &Value = Result.getInt();
+    if (Value.isSigned() && Value.isMinSignedValue() && E->canOverflow() &&
+        !HandleOverflow(Info, E, -Value.extend(Value.getBitWidth() + 1),
+                        E->getType()))
+      return false;
     return Success(-Value, E);
   }
   case UO_Not: {
@@ -9170,9 +9163,11 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case Builtin::BI__builtin_huge_val:
   case Builtin::BI__builtin_huge_valf:
   case Builtin::BI__builtin_huge_vall:
+  case Builtin::BI__builtin_huge_valf128:
   case Builtin::BI__builtin_inf:
   case Builtin::BI__builtin_inff:
-  case Builtin::BI__builtin_infl: {
+  case Builtin::BI__builtin_infl:
+  case Builtin::BI__builtin_inff128: {
     const llvm::fltSemantics &Sem =
       Info.Ctx.getFloatTypeSemantics(E->getType());
     Result = llvm::APFloat::getInf(Sem);
@@ -9182,6 +9177,7 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case Builtin::BI__builtin_nans:
   case Builtin::BI__builtin_nansf:
   case Builtin::BI__builtin_nansl:
+  case Builtin::BI__builtin_nansf128:
     if (!TryEvaluateBuiltinNaN(Info.Ctx, E->getType(), E->getArg(0),
                                true, Result))
       return Error(E);
@@ -9190,6 +9186,7 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case Builtin::BI__builtin_nan:
   case Builtin::BI__builtin_nanf:
   case Builtin::BI__builtin_nanl:
+  case Builtin::BI__builtin_nanf128:
     // If this is __builtin_nan() turn this into a nan, otherwise we
     // can't constant fold it.
     if (!TryEvaluateBuiltinNaN(Info.Ctx, E->getType(), E->getArg(0),
@@ -9200,6 +9197,7 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case Builtin::BI__builtin_fabs:
   case Builtin::BI__builtin_fabsf:
   case Builtin::BI__builtin_fabsl:
+  case Builtin::BI__builtin_fabsf128:
     if (!EvaluateFloat(E->getArg(0), Result, Info))
       return false;
 
@@ -9213,7 +9211,8 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
   case Builtin::BI__builtin_copysign:
   case Builtin::BI__builtin_copysignf:
-  case Builtin::BI__builtin_copysignl: {
+  case Builtin::BI__builtin_copysignl:
+  case Builtin::BI__builtin_copysignf128: {
     APFloat RHS(0.);
     if (!EvaluateFloat(E->getArg(0), Result, Info) ||
         !EvaluateFloat(E->getArg(1), RHS, Info))
diff --git a/lib/AST/ExternalASTMerger.cpp b/lib/AST/ExternalASTMerger.cpp
index 6b75c51c6420..edf1310d1740 100644
--- a/lib/AST/ExternalASTMerger.cpp
+++ b/lib/AST/ExternalASTMerger.cpp
@@ -16,6 +16,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclTemplate.h"
 #include "clang/AST/ExternalASTMerger.h"
 
 using namespace clang;
@@ -351,6 +352,27 @@ void ExternalASTMerger::RemoveSources(llvm::ArrayRef<ImporterSource> Sources) {
   }
 }
 
+template <typename DeclTy>
+static bool importSpecializations(DeclTy *D, ASTImporter *Importer) {
+  for (auto *Spec : D->specializations())
+    if (!Importer->Import(Spec))
+      return true;
+  return false;
+}
+
+/// Imports specializations from template declarations that can be specialized.
+static bool importSpecializationsIfNeeded(Decl *D, ASTImporter *Importer) {
+  if (!isa<TemplateDecl>(D))
+    return false;
+  if (auto *FunctionTD = dyn_cast<FunctionTemplateDecl>(D))
+    return importSpecializations(FunctionTD, Importer);
+  else if (auto *ClassTD = dyn_cast<ClassTemplateDecl>(D))
+    return importSpecializations(ClassTD, Importer);
+  else if (auto *VarTD = dyn_cast<VarTemplateDecl>(D))
+    return importSpecializations(VarTD, Importer);
+  return false;
+}
+
 bool ExternalASTMerger::FindExternalVisibleDeclsByName(const DeclContext *DC,
                                                        DeclarationName Name) {
   llvm::SmallVector<NamedDecl *, 1> Decls;
@@ -376,9 +398,18 @@ bool ExternalASTMerger::FindExternalVisibleDeclsByName(const DeclContext *DC,
 
   Decls.reserve(Candidates.size());
   for (const Candidate &C : Candidates) {
-    NamedDecl *d = cast<NamedDecl>(C.second->Import(C.first.get()));
-    assert(d);
-    Decls.push_back(d);
+    Decl *LookupRes = C.first.get();
+    ASTImporter *Importer = C.second;
+    NamedDecl *ND = cast_or_null<NamedDecl>(Importer->Import(LookupRes));
+    assert(ND);
+    // If we don't import specialization, they are not available via lookup
+    // because the lookup result is imported TemplateDecl and it does not
+    // reference its specializations until they are imported explicitly.
+    bool IsSpecImportFailed =
+        importSpecializationsIfNeeded(LookupRes, Importer);
+    assert(!IsSpecImportFailed);
+    (void)IsSpecImportFailed;
+    Decls.push_back(ND);
   }
   SetExternalVisibleDeclsForName(DC, Name, Decls);
   return true;
diff --git a/lib/AST/MicrosoftMangle.cpp b/lib/AST/MicrosoftMangle.cpp
index 0c55c1a92287..9b395a7a11ce 100644
--- a/lib/AST/MicrosoftMangle.cpp
+++ b/lib/AST/MicrosoftMangle.cpp
@@ -1559,12 +1559,11 @@ MicrosoftCXXNameMangler::mangleRefQualifier(RefQualifierKind RefQualifier) {
 
 void MicrosoftCXXNameMangler::manglePointerExtQualifiers(Qualifiers Quals,
                                                          QualType PointeeType) {
-  bool HasRestrict = Quals.hasRestrict();
   if (PointersAre64Bit &&
       (PointeeType.isNull() || !PointeeType->isFunctionType()))
     Out << 'E';
 
-  if (HasRestrict)
+  if (Quals.hasRestrict())
     Out << 'I';
 
   if (Quals.hasUnaligned() ||
@@ -1833,15 +1832,12 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
     llvm_unreachable("placeholder types shouldn't get to name mangling");
 
   case BuiltinType::ObjCId:
-    Out << "PA";
     mangleArtificalTagType(TTK_Struct, "objc_object");
     break;
   case BuiltinType::ObjCClass:
-    Out << "PA";
     mangleArtificalTagType(TTK_Struct, "objc_class");
     break;
   case BuiltinType::ObjCSel:
-    Out << "PA";
     mangleArtificalTagType(TTK_Struct, "objc_selector");
     break;
 
@@ -2337,9 +2333,6 @@ void MicrosoftCXXNameMangler::mangleType(const PointerType *T, Qualifiers Quals,
 
 void MicrosoftCXXNameMangler::mangleType(const ObjCObjectPointerType *T,
                                          Qualifiers Quals, SourceRange Range) {
-  if (T->isObjCIdType() || T->isObjCClassType())
-    return mangleType(T->getPointeeType(), Range, QMM_Drop);
-
   QualType PointeeType = T->getPointeeType();
   manglePointerCVQualifiers(Quals);
   manglePointerExtQualifiers(Quals, PointeeType);
@@ -2457,9 +2450,36 @@ void MicrosoftCXXNameMangler::mangleType(const ObjCInterfaceType *T, Qualifiers,
 
 void MicrosoftCXXNameMangler::mangleType(const ObjCObjectType *T, Qualifiers,
                                          SourceRange Range) {
-  // We don't allow overloading by different protocol qualification,
-  // so mangling them isn't necessary.
-  mangleType(T->getBaseType(), Range, QMM_Drop);
+  if (T->qual_empty())
+    return mangleType(T->getBaseType(), Range, QMM_Drop);
+
+  ArgBackRefMap OuterArgsContext;
+  BackRefVec OuterTemplateContext;
+
+  TypeBackReferences.swap(OuterArgsContext);
+  NameBackReferences.swap(OuterTemplateContext);
+
+  mangleTagTypeKind(TTK_Struct);
+
+  Out << "?$";
+  if (T->isObjCId())
+    mangleSourceName("objc_object");
+  else if (T->isObjCClass())
+    mangleSourceName("objc_class");
+  else
+    mangleSourceName(T->getInterface()->getName());
+
+  for (const auto &Q : T->quals()) {
+    Out << 'Y'; // cointerface
+    mangleSourceName(Q->getName());
+    Out << '@';
+  }
+  Out << '@';
+
+  Out << '@';
+
+  TypeBackReferences.swap(OuterArgsContext);
+  NameBackReferences.swap(OuterTemplateContext);
 }
 
 void MicrosoftCXXNameMangler::mangleType(const BlockPointerType *T,
diff --git a/lib/AST/ODRHash.cpp b/lib/AST/ODRHash.cpp
index 088d8bedd453..38e8d34135f9 100644
--- a/lib/AST/ODRHash.cpp
+++ b/lib/AST/ODRHash.cpp
@@ -478,6 +478,8 @@ void ODRHash::AddFunctionDecl(const FunctionDecl *Function) {
 
   // TODO: Fix hashing for class methods.
   if (isa<CXXMethodDecl>(Function)) return;
+  // And friend functions.
+  if (Function->getFriendObjectKind()) return;
 
   // Skip functions that are specializations or in specialization context.
   const DeclContext *DC = Function;
diff --git a/lib/AST/RecordLayoutBuilder.cpp b/lib/AST/RecordLayoutBuilder.cpp
index a9d43dfa80c5..dddacf2a5fed 100644
--- a/lib/AST/RecordLayoutBuilder.cpp
+++ b/lib/AST/RecordLayoutBuilder.cpp
@@ -1504,9 +1504,10 @@ void ItaniumRecordLayoutBuilder::LayoutBitField(const FieldDecl *D) {
     FieldAlign = TypeSize;
 
     // If the previous field was not a bitfield, or was a bitfield
-    // with a different storage unit size, we're done with that
-    // storage unit.
-    if (LastBitfieldTypeSize != TypeSize) {
+    // with a different storage unit size, or if this field doesn't fit into
+    // the current storage unit, we're done with that storage unit.
+    if (LastBitfieldTypeSize != TypeSize ||
+        UnfilledBitsInLastUnit < FieldSize) {
       // Also, ignore zero-length bitfields after non-bitfields.
       if (!LastBitfieldTypeSize && !FieldSize)
         FieldAlign = 1;
diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index d7e668a83280..de8f6a6481c3 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp
@@ -1034,12 +1034,8 @@ void StmtPrinter::PrintOMPExecutableDirective(OMPExecutableDirective *S,
       OS << ' ';
     }
   OS << "\n";
-  if (S->hasAssociatedStmt() && S->getAssociatedStmt() && !ForceNoStmt) {
-    assert(isa<CapturedStmt>(S->getAssociatedStmt()) &&
-           "Expected captured statement!");
-    Stmt *CS = cast<CapturedStmt>(S->getAssociatedStmt())->getCapturedStmt();
-    PrintStmt(CS);
-  }
+  if (!ForceNoStmt && S->hasAssociatedStmt())
+    PrintStmt(S->getInnermostCapturedStmt()->getCapturedStmt());
 }
 
 void StmtPrinter::VisitOMPParallelDirective(OMPParallelDirective *Node) {
@@ -1142,7 +1138,7 @@ void StmtPrinter::VisitOMPFlushDirective(OMPFlushDirective *Node) {
 
 void StmtPrinter::VisitOMPOrderedDirective(OMPOrderedDirective *Node) {
   Indent() << "#pragma omp ordered ";
-  PrintOMPExecutableDirective(Node);
+  PrintOMPExecutableDirective(Node, Node->hasClausesOfKind<OMPDependClause>());
 }
 
 void StmtPrinter::VisitOMPAtomicDirective(OMPAtomicDirective *Node) {
diff --git a/lib/ASTMatchers/Dynamic/Registry.cpp b/lib/ASTMatchers/Dynamic/Registry.cpp
index 2b7bb7a2120d..79aa4e667b34 100644
--- a/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -296,6 +296,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(hasTemplateArgument);
   REGISTER_MATCHER(hasThen);
   REGISTER_MATCHER(hasThreadStorageDuration);
+  REGISTER_MATCHER(hasTrailingReturn);
   REGISTER_MATCHER(hasTrueExpression);
   REGISTER_MATCHER(hasTypeLoc);
   REGISTER_MATCHER(hasUnaryOperand);
@@ -354,12 +355,14 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(isMemberInitializer);
   REGISTER_MATCHER(isMoveAssignmentOperator);
   REGISTER_MATCHER(isMoveConstructor);
+  REGISTER_MATCHER(isNoReturn);
   REGISTER_MATCHER(isNoThrow);
   REGISTER_MATCHER(isOverride);
   REGISTER_MATCHER(isPrivate);
   REGISTER_MATCHER(isProtected);
   REGISTER_MATCHER(isPublic);
   REGISTER_MATCHER(isPure);
+  REGISTER_MATCHER(isScoped);
   REGISTER_MATCHER(isSignedInteger);
   REGISTER_MATCHER(isStaticStorageClass);
   REGISTER_MATCHER(isStruct);
diff --git a/lib/Analysis/BodyFarm.cpp b/lib/Analysis/BodyFarm.cpp
index 89ca8484819d..492d01f6c98e 100644
--- a/lib/Analysis/BodyFarm.cpp
+++ b/lib/Analysis/BodyFarm.cpp
@@ -149,7 +149,8 @@ DeclRefExpr *ASTMaker::makeDeclRefExpr(
 
 UnaryOperator *ASTMaker::makeDereference(const Expr *Arg, QualType Ty) {
   return new (C) UnaryOperator(const_cast<Expr*>(Arg), UO_Deref, Ty,
-                               VK_LValue, OK_Ordinary, SourceLocation());
+                               VK_LValue, OK_Ordinary, SourceLocation(),
+                              /*CanOverflow*/ false);
 }
 
 ImplicitCastExpr *ASTMaker::makeLvalueToRvalue(const Expr *Arg, QualType Ty) {
@@ -441,7 +442,8 @@ static Stmt *create_call_once(ASTContext &C, const FunctionDecl *D) {
       /* opc=*/ UO_LNot,
       /* QualType=*/ C.IntTy,
       /* ExprValueKind=*/ VK_RValue,
-      /* ExprObjectKind=*/ OK_Ordinary, SourceLocation());
+      /* ExprObjectKind=*/ OK_Ordinary, SourceLocation(),
+      /* CanOverflow*/ false);
 
   // Create assignment.
   BinaryOperator *FlagAssignment = M.makeAssignment(
@@ -505,7 +507,8 @@ static Stmt *create_dispatch_once(ASTContext &C, const FunctionDecl *D) {
   // (2) Create the assignment to the predicate.
   Expr *DoneValue =
       new (C) UnaryOperator(M.makeIntegerLiteral(0, C.LongTy), UO_Not, C.LongTy,
-                            VK_RValue, OK_Ordinary, SourceLocation());
+                            VK_RValue, OK_Ordinary, SourceLocation(),
+                            /*CanOverflow*/false);
 
   BinaryOperator *B =
     M.makeAssignment(
diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp
index 098803f9a417..3c64f14387a5 100644
--- a/lib/Analysis/CloneDetection.cpp
+++ b/lib/Analysis/CloneDetection.cpp
@@ -534,14 +534,14 @@ void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,
   // First check if we already reference this variable
   for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
     if (Variables[KindIndex] == VarDecl) {
-      // If yes, add a new occurence that points to the existing entry in
+      // If yes, add a new occurrence that points to the existing entry in
       // the Variables vector.
       Occurences.emplace_back(KindIndex, Mention);
       return;
     }
   }
   // If this variable wasn't already referenced, add it to the list of
-  // referenced variables and add a occurence that points to this new entry.
+  // referenced variables and add a occurrence that points to this new entry.
   Occurences.emplace_back(Variables.size(), Mention);
   Variables.push_back(VarDecl);
 }
diff --git a/lib/Analysis/ThreadSafety.cpp b/lib/Analysis/ThreadSafety.cpp
index 6a9c9a04c55d..f81d916bcba8 100644
--- a/lib/Analysis/ThreadSafety.cpp
+++ b/lib/Analysis/ThreadSafety.cpp
@@ -1689,7 +1689,7 @@ void BuildLockset::handleCall(Expr *Exp, const NamedDecl *D, VarDecl *VD) {
   CapExprSet ScopedExclusiveReqs, ScopedSharedReqs;
   StringRef CapDiagKind = "mutex";
 
-  // Figure out if we're calling the constructor of scoped lockable class
+  // Figure out if we're constructing an object of scoped lockable class
   bool isScopedVar = false;
   if (VD) {
     if (const CXXConstructorDecl *CD = dyn_cast<const CXXConstructorDecl>(D)) {
@@ -1991,6 +1991,33 @@ void BuildLockset::VisitCXXConstructExpr(CXXConstructExpr *Exp) {
   // FIXME -- only handles constructors in DeclStmt below.
 }
 
+static CXXConstructorDecl *
+findConstructorForByValueReturn(const CXXRecordDecl *RD) {
+  // Prefer a move constructor over a copy constructor. If there's more than
+  // one copy constructor or more than one move constructor, we arbitrarily
+  // pick the first declared such constructor rather than trying to guess which
+  // one is more appropriate.
+  CXXConstructorDecl *CopyCtor = nullptr;
+  for (CXXConstructorDecl *Ctor : RD->ctors()) {
+    if (Ctor->isDeleted())
+      continue;
+    if (Ctor->isMoveConstructor())
+      return Ctor;
+    if (!CopyCtor && Ctor->isCopyConstructor())
+      CopyCtor = Ctor;
+  }
+  return CopyCtor;
+}
+
+static Expr *buildFakeCtorCall(CXXConstructorDecl *CD, ArrayRef<Expr *> Args,
+                               SourceLocation Loc) {
+  ASTContext &Ctx = CD->getASTContext();
+  return CXXConstructExpr::Create(Ctx, Ctx.getRecordType(CD->getParent()), Loc,
+                                  CD, true, Args, false, false, false, false,
+                                  CXXConstructExpr::CK_Complete,
+                                  SourceRange(Loc, Loc));
+}
+
 void BuildLockset::VisitDeclStmt(DeclStmt *S) {
   // adjust the context
   LVarCtx = Analyzer->LocalVarMap.getNextContext(CtxIndex, S, LVarCtx);
@@ -1998,15 +2025,34 @@ void BuildLockset::VisitDeclStmt(DeclStmt *S) {
   for (auto *D : S->getDeclGroup()) {
     if (VarDecl *VD = dyn_cast_or_null<VarDecl>(D)) {
       Expr *E = VD->getInit();
+      if (!E)
+        continue;
+      E = E->IgnoreParens();
+
       // handle constructors that involve temporaries
-      if (ExprWithCleanups *EWC = dyn_cast_or_null<ExprWithCleanups>(E))
+      if (auto *EWC = dyn_cast<ExprWithCleanups>(E))
         E = EWC->getSubExpr();
+      if (auto *BTE = dyn_cast<CXXBindTemporaryExpr>(E))
+        E = BTE->getSubExpr();
 
-      if (CXXConstructExpr *CE = dyn_cast_or_null<CXXConstructExpr>(E)) {
+      if (CXXConstructExpr *CE = dyn_cast<CXXConstructExpr>(E)) {
         NamedDecl *CtorD = dyn_cast_or_null<NamedDecl>(CE->getConstructor());
         if (!CtorD || !CtorD->hasAttrs())
-          return;
-        handleCall(CE, CtorD, VD);
+          continue;
+        handleCall(E, CtorD, VD);
+      } else if (isa<CallExpr>(E) && E->isRValue()) {
+        // If the object is initialized by a function call that returns a
+        // scoped lockable by value, use the attributes on the copy or move
+        // constructor to figure out what effect that should have on the
+        // lockset.
+        // FIXME: Is this really the best way to handle this situation?
+        auto *RD = E->getType()->getAsCXXRecordDecl();
+        if (!RD || !RD->hasAttr<ScopedLockableAttr>())
+          continue;
+        CXXConstructorDecl *CtorD = findConstructorForByValueReturn(RD);
+        if (!CtorD || !CtorD->hasAttrs())
+          continue;
+        handleCall(buildFakeCtorCall(CtorD, {E}, E->getLocStart()), CtorD, VD);
       }
     }
   }
diff --git a/lib/Basic/CMakeLists.txt b/lib/Basic/CMakeLists.txt
index d0c9b902f67e..823b908184c2 100644
--- a/lib/Basic/CMakeLists.txt
+++ b/lib/Basic/CMakeLists.txt
@@ -83,6 +83,7 @@ add_clang_library(clangBasic
   Targets/OSTargets.cpp
   Targets/PNaCl.cpp
   Targets/PPC.cpp
+  Targets/RISCV.cpp
   Targets/SPIR.cpp
   Targets/Sparc.cpp
   Targets/SystemZ.cpp
diff --git a/lib/Basic/Cuda.cpp b/lib/Basic/Cuda.cpp
index 58b99a3b58cb..4e61743ab0ca 100644
--- a/lib/Basic/Cuda.cpp
+++ b/lib/Basic/Cuda.cpp
@@ -18,6 +18,8 @@ const char *CudaVersionToString(CudaVersion V) {
     return "8.0";
   case CudaVersion::CUDA_90:
     return "9.0";
+  case CudaVersion::CUDA_91:
+    return "9.1";
   }
   llvm_unreachable("invalid enum");
 }
@@ -52,6 +54,8 @@ const char *CudaArchToString(CudaArch A) {
     return "sm_62";
   case CudaArch::SM_70:
     return "sm_70";
+  case CudaArch::SM_72:
+    return "sm_72";
   }
   llvm_unreachable("invalid enum");
 }
@@ -71,6 +75,7 @@ CudaArch StringToCudaArch(llvm::StringRef S) {
       .Case("sm_61", CudaArch::SM_61)
       .Case("sm_62", CudaArch::SM_62)
       .Case("sm_70", CudaArch::SM_70)
+      .Case("sm_72", CudaArch::SM_72)
       .Default(CudaArch::UNKNOWN);
 }
 
@@ -102,6 +107,8 @@ const char *CudaVirtualArchToString(CudaVirtualArch A) {
     return "compute_62";
   case CudaVirtualArch::COMPUTE_70:
     return "compute_70";
+  case CudaVirtualArch::COMPUTE_72:
+    return "compute_72";
   }
   llvm_unreachable("invalid enum");
 }
@@ -120,6 +127,7 @@ CudaVirtualArch StringToCudaVirtualArch(llvm::StringRef S) {
       .Case("compute_61", CudaVirtualArch::COMPUTE_61)
       .Case("compute_62", CudaVirtualArch::COMPUTE_62)
       .Case("compute_70", CudaVirtualArch::COMPUTE_70)
+      .Case("compute_72", CudaVirtualArch::COMPUTE_72)
       .Default(CudaVirtualArch::UNKNOWN);
 }
 
@@ -152,6 +160,8 @@ CudaVirtualArch VirtualArchForCudaArch(CudaArch A) {
     return CudaVirtualArch::COMPUTE_62;
   case CudaArch::SM_70:
     return CudaVirtualArch::COMPUTE_70;
+  case CudaArch::SM_72:
+    return CudaVirtualArch::COMPUTE_72;
   }
   llvm_unreachable("invalid enum");
 }
@@ -176,6 +186,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) {
     return CudaVersion::CUDA_80;
   case CudaArch::SM_70:
     return CudaVersion::CUDA_90;
+  case CudaArch::SM_72:
+    return CudaVersion::CUDA_91;
   }
   llvm_unreachable("invalid enum");
 }
diff --git a/lib/Basic/Diagnostic.cpp b/lib/Basic/Diagnostic.cpp
index 26baa838f8c6..5903d72ea5e0 100644
--- a/lib/Basic/Diagnostic.cpp
+++ b/lib/Basic/Diagnostic.cpp
@@ -57,11 +57,11 @@ static void DummyArgToStringFn(DiagnosticsEngine::ArgumentKind AK, intptr_t QT,
   Output.append(Str.begin(), Str.end());
 }
 
-DiagnosticsEngine::DiagnosticsEngine(IntrusiveRefCntPtr<DiagnosticIDs> diags,
-                                     DiagnosticOptions *DiagOpts,
-                                     DiagnosticConsumer *client,
-                                     bool ShouldOwnClient)
-    : Diags(std::move(diags)), DiagOpts(DiagOpts), Client(nullptr),
+DiagnosticsEngine::DiagnosticsEngine(
+    IntrusiveRefCntPtr<DiagnosticIDs> diags,
+    IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts, DiagnosticConsumer *client,
+    bool ShouldOwnClient)
+    : Diags(std::move(diags)), DiagOpts(std::move(DiagOpts)), Client(nullptr),
       SourceMgr(nullptr) {
   setClient(client, ShouldOwnClient);
   ArgToStringFn = DummyArgToStringFn;
diff --git a/lib/Basic/Module.cpp b/lib/Basic/Module.cpp
index 7124184865c6..692e138f6c15 100644
--- a/lib/Basic/Module.cpp
+++ b/lib/Basic/Module.cpp
@@ -95,11 +95,16 @@ static bool hasFeature(StringRef Feature, const LangOptions &LangOpts,
 
 bool Module::isAvailable(const LangOptions &LangOpts, const TargetInfo &Target,
                          Requirement &Req,
-                         UnresolvedHeaderDirective &MissingHeader) const {
+                         UnresolvedHeaderDirective &MissingHeader,
+                         Module *&ShadowingModule) const {
   if (IsAvailable)
     return true;
 
   for (const Module *Current = this; Current; Current = Current->Parent) {
+    if (Current->ShadowingModule) {
+      ShadowingModule = Current->ShadowingModule;
+      return false;
+    }
     for (unsigned I = 0, N = Current->Requirements.size(); I != N; ++I) {
       if (hasFeature(Current->Requirements[I].first, LangOpts, Target) !=
               Current->Requirements[I].second) {
diff --git a/lib/Basic/OpenMPKinds.cpp b/lib/Basic/OpenMPKinds.cpp
index 09c919e2b072..c4398d633544 100644
--- a/lib/Basic/OpenMPKinds.cpp
+++ b/lib/Basic/OpenMPKinds.cpp
@@ -891,6 +891,7 @@ void clang::getOpenMPCaptureRegions(
   case OMPD_target_teams:
   case OMPD_target_teams_distribute:
   case OMPD_target_teams_distribute_simd:
+    CaptureRegions.push_back(OMPD_task);
     CaptureRegions.push_back(OMPD_target);
     CaptureRegions.push_back(OMPD_teams);
     break;
@@ -901,6 +902,7 @@ void clang::getOpenMPCaptureRegions(
     break;
   case OMPD_target:
   case OMPD_target_simd:
+    CaptureRegions.push_back(OMPD_task);
     CaptureRegions.push_back(OMPD_target);
     break;
   case OMPD_teams_distribute_parallel_for:
@@ -911,6 +913,7 @@ void clang::getOpenMPCaptureRegions(
   case OMPD_target_parallel:
   case OMPD_target_parallel_for:
   case OMPD_target_parallel_for_simd:
+    CaptureRegions.push_back(OMPD_task);
     CaptureRegions.push_back(OMPD_target);
     CaptureRegions.push_back(OMPD_parallel);
     break;
@@ -924,6 +927,13 @@ void clang::getOpenMPCaptureRegions(
   case OMPD_taskloop_simd:
     CaptureRegions.push_back(OMPD_taskloop);
     break;
+  case OMPD_target_teams_distribute_parallel_for:
+  case OMPD_target_teams_distribute_parallel_for_simd:
+    CaptureRegions.push_back(OMPD_task);
+    CaptureRegions.push_back(OMPD_target);
+    CaptureRegions.push_back(OMPD_teams);
+    CaptureRegions.push_back(OMPD_parallel);
+    break;
   case OMPD_simd:
   case OMPD_for:
   case OMPD_for_simd:
@@ -938,8 +948,6 @@ void clang::getOpenMPCaptureRegions(
   case OMPD_atomic:
   case OMPD_target_data:
   case OMPD_distribute_simd:
-  case OMPD_target_teams_distribute_parallel_for:
-  case OMPD_target_teams_distribute_parallel_for_simd:
     CaptureRegions.push_back(OMPD_unknown);
     break;
   case OMPD_threadprivate:
diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp
index 7deebc06c3ef..931b8afb69a3 100644
--- a/lib/Basic/Targets.cpp
+++ b/lib/Basic/Targets.cpp
@@ -29,6 +29,7 @@
 #include "Targets/OSTargets.h"
 #include "Targets/PNaCl.h"
 #include "Targets/PPC.h"
+#include "Targets/RISCV.h"
 #include "Targets/SPIR.h"
 #include "Targets/Sparc.h"
 #include "Targets/SystemZ.h"
@@ -370,6 +371,11 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple,
   case llvm::Triple::r600:
     return new AMDGPUTargetInfo(Triple, Opts);
 
+  case llvm::Triple::riscv32:
+    return new RISCV32TargetInfo(Triple, Opts);
+  case llvm::Triple::riscv64:
+    return new RISCV64TargetInfo(Triple, Opts);
+
   case llvm::Triple::sparc:
     switch (os) {
     case llvm::Triple::Linux:
diff --git a/lib/Basic/Targets/AArch64.cpp b/lib/Basic/Targets/AArch64.cpp
index 4d3cd121f705..5eccf6a2cda0 100644
--- a/lib/Basic/Targets/AArch64.cpp
+++ b/lib/Basic/Targets/AArch64.cpp
@@ -183,6 +183,8 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
 
   if ((FPU & NeonMode) && HasFullFP16)
     Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1");
+  if (HasFullFP16)
+   Builder.defineMacro("__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", "1");
 
   switch (ArchKind) {
   default:
diff --git a/lib/Basic/Targets/NVPTX.cpp b/lib/Basic/Targets/NVPTX.cpp
index add3b318aeb6..762a6b7aa79a 100644
--- a/lib/Basic/Targets/NVPTX.cpp
+++ b/lib/Basic/Targets/NVPTX.cpp
@@ -186,6 +186,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
         return "620";
       case CudaArch::SM_70:
         return "700";
+      case CudaArch::SM_72:
+        return "720";
       }
       llvm_unreachable("unhandled CudaArch");
     }();
diff --git a/lib/Basic/Targets/OSTargets.h b/lib/Basic/Targets/OSTargets.h
index 5af63615dc5e..05fcd9367a1a 100644
--- a/lib/Basic/Targets/OSTargets.h
+++ b/lib/Basic/Targets/OSTargets.h
@@ -95,16 +95,22 @@ class LLVM_LIBRARY_VISIBILITY DarwinTargetInfo : public OSTargetInfo<Target> {
     if (Triple.isMacOSX())
       this->TLSSupported = !Triple.isMacOSXVersionLT(10, 7);
     else if (Triple.isiOS()) {
-      // 64-bit iOS supported it from 8 onwards, 32-bit from 9 onwards.
-      if (Triple.getArch() == llvm::Triple::x86_64 ||
-          Triple.getArch() == llvm::Triple::aarch64)
+      // 64-bit iOS supported it from 8 onwards, 32-bit device from 9 onwards,
+      // 32-bit simulator from 10 onwards.
+      if (Triple.isArch64Bit())
         this->TLSSupported = !Triple.isOSVersionLT(8);
-      else if (Triple.getArch() == llvm::Triple::x86 ||
-               Triple.getArch() == llvm::Triple::arm ||
-               Triple.getArch() == llvm::Triple::thumb)
-        this->TLSSupported = !Triple.isOSVersionLT(9);
-    } else if (Triple.isWatchOS())
-      this->TLSSupported = !Triple.isOSVersionLT(2);
+      else if (Triple.isArch32Bit()) {
+        if (!Triple.isSimulatorEnvironment())
+          this->TLSSupported = !Triple.isOSVersionLT(9);
+        else
+          this->TLSSupported = !Triple.isOSVersionLT(10);
+      }
+    } else if (Triple.isWatchOS()) {
+      if (!Triple.isSimulatorEnvironment())
+        this->TLSSupported = !Triple.isOSVersionLT(2);
+      else
+        this->TLSSupported = !Triple.isOSVersionLT(3);
+    }
 
     this->MCountName = "\01mcount";
   }
diff --git a/lib/Basic/Targets/RISCV.cpp b/lib/Basic/Targets/RISCV.cpp
new file mode 100644
index 000000000000..8b2338bb856e
--- /dev/null
+++ b/lib/Basic/Targets/RISCV.cpp
@@ -0,0 +1,52 @@
+//===--- RISCV.cpp - Implement RISCV target feature support ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements RISCV TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "llvm/ADT/StringSwitch.h"
+
+using namespace clang;
+using namespace clang::targets;
+
+ArrayRef<const char *> RISCVTargetInfo::getGCCRegNames() const {
+  static const char *const GCCRegNames[] = {
+      "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
+      "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
+      "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+      "x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31"};
+  return llvm::makeArrayRef(GCCRegNames);
+}
+
+ArrayRef<TargetInfo::GCCRegAlias> RISCVTargetInfo::getGCCRegAliases() const {
+  static const TargetInfo::GCCRegAlias GCCRegAliases[] = {
+      {{"zero"}, "x0"}, {{"ra"}, "x1"},  {{"sp"}, "x2"},   {{"gp"}, "x3"},
+      {{"tp"}, "x4"},   {{"t0"}, "x5"},  {{"t1"}, "x6"},   {{"t2"}, "x7"},
+      {{"s0"}, "x8"},   {{"s1"}, "x9"},  {{"a0"}, "x10"},  {{"a1"}, "x11"},
+      {{"a2"}, "x12"},  {{"a3"}, "x13"}, {{"a4"}, "x15"},  {{"a5"}, "x15"},
+      {{"a6"}, "x16"},  {{"a7"}, "x17"}, {{"s2"}, "x18"},  {{"s3"}, "x19"},
+      {{"s4"}, "x20"},  {{"s5"}, "x21"}, {{"s6"}, "x22"},  {{"s7"}, "x23"},
+      {{"s8"}, "x24"},  {{"s9"}, "x25"}, {{"s10"}, "x26"}, {{"s11"}, "x27"},
+      {{"t3"}, "x28"},  {{"t4"}, "x29"}, {{"t5"}, "x30"},  {{"t6"}, "x31"}};
+  return llvm::makeArrayRef(GCCRegAliases);
+}
+
+void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
+                                       MacroBuilder &Builder) const {
+  Builder.defineMacro("__ELF__");
+  Builder.defineMacro("__riscv");
+  bool Is64Bit = getTriple().getArch() == llvm::Triple::riscv64;
+  Builder.defineMacro("__riscv_xlen", Is64Bit ? "64" : "32");
+  // TODO: modify when more code models and ABIs are supported.
+  Builder.defineMacro("__riscv_cmodel_medlow");
+  Builder.defineMacro("__riscv_float_abi_soft");
+}
diff --git a/lib/Basic/Targets/RISCV.h b/lib/Basic/Targets/RISCV.h
new file mode 100644
index 000000000000..b35afd736df5
--- /dev/null
+++ b/lib/Basic/Targets/RISCV.h
@@ -0,0 +1,103 @@
+//===--- RISCV.h - Declare RISCV target feature support ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares RISCV TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
+#define LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
+
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace clang {
+namespace targets {
+
+// RISC-V Target
+class RISCVTargetInfo : public TargetInfo {
+protected:
+  std::string ABI;
+
+public:
+  RISCVTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
+    TLSSupported = false;
+    LongDoubleWidth = 128;
+    LongDoubleAlign = 128;
+    LongDoubleFormat = &llvm::APFloat::IEEEquad();
+    SuitableAlign = 128;
+    WCharType = SignedInt;
+    WIntType = UnsignedInt;
+  }
+
+  StringRef getABI() const override { return ABI; }
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override;
+
+  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
+
+  BuiltinVaListKind getBuiltinVaListKind() const override {
+    return TargetInfo::VoidPtrBuiltinVaList;
+  }
+
+  const char *getClobbers() const override { return ""; }
+
+  ArrayRef<const char *> getGCCRegNames() const override;
+
+  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
+
+  bool validateAsmConstraint(const char *&Name,
+                             TargetInfo::ConstraintInfo &Info) const override {
+    return false;
+  }
+};
+class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
+public:
+  RISCV32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : RISCVTargetInfo(Triple, Opts) {
+    IntPtrType = SignedInt;
+    PtrDiffType = SignedInt;
+    SizeType = UnsignedInt;
+    resetDataLayout("e-m:e-p:32:32-i64:64-n32-S128");
+  }
+
+  bool setABI(const std::string &Name) override {
+    // TODO: support ilp32f and ilp32d ABIs.
+    if (Name == "ilp32") {
+      ABI = Name;
+      return true;
+    }
+    return false;
+  }
+};
+class LLVM_LIBRARY_VISIBILITY RISCV64TargetInfo : public RISCVTargetInfo {
+public:
+  RISCV64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : RISCVTargetInfo(Triple, Opts) {
+    LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
+    IntMaxType = Int64Type = SignedLong;
+    resetDataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128");
+  }
+
+  bool setABI(const std::string &Name) override {
+    // TODO: support lp64f and lp64d ABIs.
+    if (Name == "lp64") {
+      ABI = Name;
+      return true;
+    }
+    return false;
+  }
+};
+} // namespace targets
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_BASIC_TARGETS_RISCV_H
diff --git a/lib/Basic/Targets/SystemZ.cpp b/lib/Basic/Targets/SystemZ.cpp
index 98f3ae2f72b4..2dc9cdf94f7f 100644
--- a/lib/Basic/Targets/SystemZ.cpp
+++ b/lib/Basic/Targets/SystemZ.cpp
@@ -30,15 +30,30 @@ const Builtin::Info SystemZTargetInfo::BuiltinInfo[] = {
 };
 
 const char *const SystemZTargetInfo::GCCRegNames[] = {
-    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6", "r7",  "r8",  "r9", "r10",
-    "r11", "r12", "r13", "r14", "r15", "f0",  "f2", "f4",  "f6",  "f1", "f3",
-    "f5",  "f7",  "f8",  "f10", "f12", "f14", "f9", "f11", "f13", "f15"
+    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
+    "f0",  "f2",  "f4",  "f6",  "f1",  "f3",  "f5",  "f7",
+    "f8",  "f10", "f12", "f14", "f9",  "f11", "f13", "f15",
+    /*ap*/"", "cc", /*fp*/"", /*rp*/"", "a0",  "a1",
+    "v16", "v18", "v20", "v22", "v17", "v19", "v21", "v23",
+    "v24", "v26", "v28", "v30", "v25", "v27", "v29", "v31"
+};
+
+const TargetInfo::AddlRegName GCCAddlRegNames[] = {
+    {{"v0"}, 16}, {{"v2"},  17}, {{"v4"},  18}, {{"v6"},  19},
+    {{"v1"}, 20}, {{"v3"},  21}, {{"v5"},  22}, {{"v7"},  23},
+    {{"v8"}, 24}, {{"v10"}, 25}, {{"v12"}, 26}, {{"v14"}, 27},
+    {{"v9"}, 28}, {{"v11"}, 29}, {{"v13"}, 30}, {{"v15"}, 31}
 };
 
 ArrayRef<const char *> SystemZTargetInfo::getGCCRegNames() const {
   return llvm::makeArrayRef(GCCRegNames);
 }
 
+ArrayRef<TargetInfo::AddlRegName> SystemZTargetInfo::getGCCAddlRegNames() const {
+  return llvm::makeArrayRef(GCCAddlRegNames);
+}
+
 bool SystemZTargetInfo::validateAsmConstraint(
     const char *&Name, TargetInfo::ConstraintInfo &Info) const {
   switch (*Name) {
@@ -48,6 +63,7 @@ bool SystemZTargetInfo::validateAsmConstraint(
   case 'a': // Address register
   case 'd': // Data register (equivalent to 'r')
   case 'f': // Floating-point register
+  case 'v': // Vector register
     Info.setAllowsRegister();
     return true;
 
diff --git a/lib/Basic/Targets/SystemZ.h b/lib/Basic/Targets/SystemZ.h
index 3023c1d2ea26..91a41bc426bb 100644
--- a/lib/Basic/Targets/SystemZ.h
+++ b/lib/Basic/Targets/SystemZ.h
@@ -62,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
     return None;
   }
 
+  ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
+
   bool validateAsmConstraint(const char *&Name,
                              TargetInfo::ConstraintInfo &info) const override;
 
diff --git a/lib/Basic/Targets/WebAssembly.cpp b/lib/Basic/Targets/WebAssembly.cpp
index 915aad4b563b..ecccb5886595 100644
--- a/lib/Basic/Targets/WebAssembly.cpp
+++ b/lib/Basic/Targets/WebAssembly.cpp
@@ -33,6 +33,7 @@ bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("simd128", SIMDLevel >= SIMD128)
       .Case("nontrapping-fptoint", HasNontrappingFPToInt)
+      .Case("sign-ext", HasSignExt)
       .Default(false);
 }
 
@@ -70,6 +71,14 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasNontrappingFPToInt = false;
       continue;
     }
+    if (Feature == "+sign-ext") {
+      HasSignExt = true;
+      continue;
+    }
+    if (Feature == "-sign-ext") {
+      HasSignExt = false;
+      continue;
+    }
 
     Diags.Report(diag::err_opt_not_valid_with_opt)
         << Feature << "-target-feature";
diff --git a/lib/Basic/Targets/WebAssembly.h b/lib/Basic/Targets/WebAssembly.h
index ee0073d081e0..b01613aa8a0f 100644
--- a/lib/Basic/Targets/WebAssembly.h
+++ b/lib/Basic/Targets/WebAssembly.h
@@ -31,10 +31,12 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
   } SIMDLevel;
 
   bool HasNontrappingFPToInt;
+  bool HasSignExt;
 
 public:
   explicit WebAssemblyTargetInfo(const llvm::Triple &T, const TargetOptions &)
-      : TargetInfo(T), SIMDLevel(NoSIMD), HasNontrappingFPToInt(false) {
+      : TargetInfo(T), SIMDLevel(NoSIMD), HasNontrappingFPToInt(false),
+        HasSignExt(false) {
     NoAsmVariants = true;
     SuitableAlign = 128;
     LargeArrayMinWidth = 128;
@@ -43,6 +45,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
     SigAtomicType = SignedLong;
     LongDoubleWidth = LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
+    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
     SizeType = UnsignedInt;
     PtrDiffType = SignedInt;
     IntPtrType = SignedInt;
@@ -60,6 +63,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
     if (CPU == "bleeding-edge") {
       Features["simd128"] = true;
       Features["nontrapping-fptoint"] = true;
+      Features["sign-ext"] = true;
     }
     return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
   }
@@ -115,7 +119,6 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly32TargetInfo
   explicit WebAssembly32TargetInfo(const llvm::Triple &T,
                                    const TargetOptions &Opts)
       : WebAssemblyTargetInfo(T, Opts) {
-    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
     resetDataLayout("e-m:e-p:32:32-i64:64-n32:64-S128");
   }
 
@@ -132,7 +135,6 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly64TargetInfo
       : WebAssemblyTargetInfo(T, Opts) {
     LongAlign = LongWidth = 64;
     PointerAlign = PointerWidth = 64;
-    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
     SizeType = UnsignedLong;
     PtrDiffType = SignedLong;
     IntPtrType = SignedLong;
diff --git a/lib/Basic/Targets/X86.cpp b/lib/Basic/Targets/X86.cpp
index 70433ff066fc..484a83a494ec 100644
--- a/lib/Basic/Targets/X86.cpp
+++ b/lib/Basic/Targets/X86.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/TargetParser.h"
 
 namespace clang {
 namespace targets {
@@ -100,6 +101,26 @@ bool X86TargetInfo::setFPMath(StringRef Name) {
   return false;
 }
 
+bool X86TargetInfo::checkCFProtectionReturnSupported(
+    DiagnosticsEngine &Diags) const {
+  if (HasSHSTK)
+    return true;
+
+  Diags.Report(diag::err_opt_not_valid_without_opt) << "cf-protection=return"
+                                                    << "-mshstk";
+  return false;
+}
+
+bool X86TargetInfo::checkCFProtectionBranchSupported(
+    DiagnosticsEngine &Diags) const {
+  if (HasIBT)
+    return true;
+
+  Diags.Report(diag::err_opt_not_valid_without_opt) << "cf-protection=branch"
+                                                    << "-mibt";
+  return false;
+}
+
 bool X86TargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
@@ -139,6 +160,7 @@ bool X86TargetInfo::initFeatureMap(
     setFeatureEnabledImpl(Features, "avx512vnni", true);
     setFeatureEnabledImpl(Features, "avx512vbmi2", true);
     setFeatureEnabledImpl(Features, "avx512vpopcntdq", true);
+    setFeatureEnabledImpl(Features, "rdpid", true);
     LLVM_FALLTHROUGH;
   case CK_Cannonlake:
     setFeatureEnabledImpl(Features, "avx512ifma", true);
@@ -217,7 +239,6 @@ bool X86TargetInfo::initFeatureMap(
 
   case CK_Goldmont:
     setFeatureEnabledImpl(Features, "sha", true);
-    setFeatureEnabledImpl(Features, "rdrnd", true);
     setFeatureEnabledImpl(Features, "rdseed", true);
     setFeatureEnabledImpl(Features, "xsave", true);
     setFeatureEnabledImpl(Features, "xsaveopt", true);
@@ -228,6 +249,7 @@ bool X86TargetInfo::initFeatureMap(
     setFeatureEnabledImpl(Features, "fsgsbase", true);
     LLVM_FALLTHROUGH;
   case CK_Silvermont:
+    setFeatureEnabledImpl(Features, "rdrnd", true);
     setFeatureEnabledImpl(Features, "aes", true);
     setFeatureEnabledImpl(Features, "pclmul", true);
     setFeatureEnabledImpl(Features, "sse4.2", true);
@@ -409,7 +431,7 @@ void X86TargetInfo::setSSELevel(llvm::StringMap<bool> &Features,
   if (Enabled) {
     switch (Level) {
     case AVX512F:
-      Features["avx512f"] = true;
+      Features["avx512f"] = Features["fma"] = Features["f16c"] = true;
       LLVM_FALLTHROUGH;
     case AVX2:
       Features["avx2"] = true;
@@ -623,6 +645,8 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
   } else if (Name == "fma") {
     if (Enabled)
       setSSELevel(Features, AVX, Enabled);
+    else
+      setSSELevel(Features, AVX512F, Enabled);
   } else if (Name == "fma4") {
     setXOPLevel(Features, FMA4, Enabled);
   } else if (Name == "xop") {
@@ -632,6 +656,8 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
   } else if (Name == "f16c") {
     if (Enabled)
       setSSELevel(Features, AVX, Enabled);
+    else
+      setSSELevel(Features, AVX512F, Enabled);
   } else if (Name == "sha") {
     if (Enabled)
       setSSELevel(Features, SSE2, Enabled);
@@ -759,6 +785,12 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasPREFETCHWT1 = true;
     } else if (Feature == "+clzero") {
       HasCLZERO = true;
+    } else if (Feature == "+rdpid") {
+      HasRDPID = true;
+    } else if (Feature == "+retpoline") {
+      HasRetpoline = true;
+    } else if (Feature == "+retpoline-external-thunk") {
+      HasRetpolineExternalThunk = true;
     }
 
     X86SSEEnum Level = llvm::StringSwitch<X86SSEEnum>(Feature)
@@ -1084,8 +1116,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__XSAVES__");
   if (HasPKU)
     Builder.defineMacro("__PKU__");
-  if (HasCX16)
-    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16");
   if (HasCLFLUSHOPT)
     Builder.defineMacro("__CLFLUSHOPT__");
   if (HasCLWB)
@@ -1094,12 +1124,16 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__MPX__");
   if (HasSHSTK)
     Builder.defineMacro("__SHSTK__");
+  if (HasIBT)
+    Builder.defineMacro("__IBT__");
   if (HasSGX)
     Builder.defineMacro("__SGX__");
   if (HasPREFETCHWT1)
     Builder.defineMacro("__PREFETCHWT1__");
   if (HasCLZERO)
     Builder.defineMacro("__CLZERO__");
+  if (HasRDPID)
+    Builder.defineMacro("__RDPID__");
 
   // Each case falls through to the previous one here.
   switch (SSELevel) {
@@ -1179,6 +1213,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   }
   if (CPU >= CK_i586)
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
+  if (HasCX16)
+    Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16");
 
   if (HasFloat128)
     Builder.defineMacro("__SIZEOF_FLOAT128__", "16");
@@ -1228,6 +1264,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("popcnt", true)
       .Case("prefetchwt1", true)
       .Case("prfchw", true)
+      .Case("rdpid", true)
       .Case("rdrnd", true)
       .Case("rdseed", true)
       .Case("rtm", true)
@@ -1299,8 +1336,11 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("popcnt", HasPOPCNT)
       .Case("prefetchwt1", HasPREFETCHWT1)
       .Case("prfchw", HasPRFCHW)
+      .Case("rdpid", HasRDPID)
       .Case("rdrnd", HasRDRND)
       .Case("rdseed", HasRDSEED)
+      .Case("retpoline", HasRetpoline)
+      .Case("retpoline-external-thunk", HasRetpolineExternalThunk)
       .Case("rtm", HasRTM)
       .Case("sgx", HasSGX)
       .Case("sha", HasSHA)
@@ -1338,6 +1378,63 @@ bool X86TargetInfo::validateCpuSupports(StringRef FeatureStr) const {
       .Default(false);
 }
 
+static llvm::X86::ProcessorFeatures getFeature(StringRef Name) {
+  return llvm::StringSwitch<llvm::X86::ProcessorFeatures>(Name)
+#define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, llvm::X86::ENUM)
+#include "llvm/Support/X86TargetParser.def"
+      ;
+  // Note, this function should only be used after ensuring the value is
+  // correct, so it asserts if the value is out of range.
+}
+
+static unsigned getFeaturePriority(llvm::X86::ProcessorFeatures Feat) {
+  enum class FeatPriority {
+#define FEATURE(FEAT) FEAT,
+#include "clang/Basic/X86Target.def"
+  };
+  switch (Feat) {
+#define FEATURE(FEAT)                                                          \
+  case llvm::X86::FEAT:                                                        \
+    return static_cast<unsigned>(FeatPriority::FEAT);
+#include "clang/Basic/X86Target.def"
+  default:
+    llvm_unreachable("No Feature Priority for non-CPUSupports Features");
+  }
+}
+
+unsigned X86TargetInfo::multiVersionSortPriority(StringRef Name) const {
+  // Valid CPUs have a 'key feature' that compares just better than its key
+  // feature.
+  CPUKind Kind = getCPUKind(Name);
+  if (Kind != CK_Generic) {
+    switch (Kind) {
+    default:
+      llvm_unreachable(
+          "CPU Type without a key feature used in 'target' attribute");
+#define PROC_WITH_FEAT(ENUM, STR, IS64, KEY_FEAT)                              \
+  case CK_##ENUM:                                                              \
+    return (getFeaturePriority(llvm::X86::KEY_FEAT) << 1) + 1;
+#include "clang/Basic/X86Target.def"
+    }
+  }
+
+  // Now we know we have a feature, so get its priority and shift it a few so
+  // that we have sufficient room for the CPUs (above).
+  return getFeaturePriority(getFeature(Name)) << 1;
+}
+
+std::string X86TargetInfo::getCPUKindCanonicalName(CPUKind Kind) const {
+  switch (Kind) {
+  case CK_Generic:
+    return "";
+#define PROC(ENUM, STRING, IS64BIT)                                            \
+  case CK_##ENUM:                                                              \
+    return STRING;
+#include "clang/Basic/X86Target.def"
+  }
+  llvm_unreachable("Invalid CPUKind");
+}
+
 // We can't use a generic validation scheme for the cpus accepted here
 // versus subtarget cpus accepted in the target attribute because the
 // variables intitialized by the runtime only support the below currently
diff --git a/lib/Basic/Targets/X86.h b/lib/Basic/Targets/X86.h
index cbd6a2d24fb5..abe10017ca26 100644
--- a/lib/Basic/Targets/X86.h
+++ b/lib/Basic/Targets/X86.h
@@ -96,7 +96,11 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasCLWB = false;
   bool HasMOVBE = false;
   bool HasPREFETCHWT1 = false;
+  bool HasRDPID = false;
+  bool HasRetpoline = false;
+  bool HasRetpolineExternalThunk = false;
 
+protected:
   /// \brief Enumeration of all of the X86 CPUs supported by Clang.
   ///
   /// Each enumeration represents a particular CPU supported by Clang. These
@@ -111,6 +115,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
 
   CPUKind getCPUKind(StringRef CPU) const;
 
+  std::string getCPUKindCanonicalName(CPUKind Kind) const;
+
   enum FPMathKind { FP_Default, FP_SSE, FP_387 } FPMath = FP_Default;
 
 public:
@@ -156,6 +162,12 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
 
   bool validateInputSize(StringRef Constraint, unsigned Size) const override;
 
+  virtual bool
+  checkCFProtectionReturnSupported(DiagnosticsEngine &Diags) const override;
+
+  virtual bool
+  checkCFProtectionBranchSupported(DiagnosticsEngine &Diags) const override;
+
   virtual bool validateOperandSize(StringRef Constraint, unsigned Size) const;
 
   std::string convertConstraint(const char *&Constraint) const override;
@@ -256,6 +268,11 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
     return checkCPUKind(CPU = getCPUKind(Name));
   }
 
+  bool supportsMultiVersioning() const override {
+    return getTriple().isOSBinFormatELF();
+  }
+  unsigned multiVersionSortPriority(StringRef Name) const override;
+
   bool setFPMath(StringRef Name) override;
 
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
@@ -309,9 +326,11 @@ class LLVM_LIBRARY_VISIBILITY X86_32TargetInfo : public X86TargetInfo {
          (1 << TargetInfo::LongDouble));
 
     // x86-32 has atomics up to 8 bytes
-    // FIXME: Check that we actually have cmpxchg8b before setting
-    // MaxAtomicInlineWidth. (cmpxchg8b is an i586 instruction.)
-    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
+    CPUKind Kind = getCPUKind(Opts.CPU);
+    if (Kind >= CK_i586 || Kind == CK_Generic)
+      MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
+    else if (Kind >= CK_i486)
+      MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
   }
 
   BuiltinVaListKind getBuiltinVaListKind() const override {
diff --git a/lib/CodeGen/ABIInfo.h b/lib/CodeGen/ABIInfo.h
index 575506da84d4..fa0461f5d90c 100644
--- a/lib/CodeGen/ABIInfo.h
+++ b/lib/CodeGen/ABIInfo.h
@@ -108,8 +108,6 @@ namespace swiftcall {
     virtual bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                                    uint64_t Members) const;
 
-    virtual bool shouldSignExtUnsignedType(QualType Ty) const;
-
     bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
                                 uint64_t &Members) const;
 
@@ -137,8 +135,7 @@ namespace swiftcall {
 
     bool supportsSwift() const final override { return true; }
 
-    virtual bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                              ArrayRef<llvm::Type*> types,
+    virtual bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> types,
                                               bool asReturnValue) const = 0;
 
     virtual bool isLegalVectorTypeForSwift(CharUnits totalSize,
diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp
index e2349da5f0a4..6d42aac90400 100644
--- a/lib/CodeGen/BackendUtil.cpp
+++ b/lib/CodeGen/BackendUtil.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
@@ -44,8 +45,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Coroutines.h"
+#include "llvm/Transforms/GCOVProfiler.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -361,21 +362,6 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) {
   return static_cast<llvm::CodeModel::Model>(CodeModel);
 }
 
-static llvm::Reloc::Model getRelocModel(const CodeGenOptions &CodeGenOpts) {
-  // Keep this synced with the equivalent code in
-  // lib/Frontend/CompilerInvocation.cpp
-  llvm::Optional<llvm::Reloc::Model> RM;
-  RM = llvm::StringSwitch<llvm::Reloc::Model>(CodeGenOpts.RelocationModel)
-      .Case("static", llvm::Reloc::Static)
-      .Case("pic", llvm::Reloc::PIC_)
-      .Case("ropi", llvm::Reloc::ROPI)
-      .Case("rwpi", llvm::Reloc::RWPI)
-      .Case("ropi-rwpi", llvm::Reloc::ROPI_RWPI)
-      .Case("dynamic-no-pic", llvm::Reloc::DynamicNoPIC);
-  assert(RM.hasValue() && "invalid PIC model!");
-  return *RM;
-}
-
 static TargetMachine::CodeGenFileType getCodeGenFileType(BackendAction Action) {
   if (Action == Backend_EmitObj)
     return TargetMachine::CGFT_ObjectFile;
@@ -448,6 +434,7 @@ static void initTargetOptions(llvm::TargetOptions &Options,
   Options.UniqueSectionNames = CodeGenOpts.UniqueSectionNames;
   Options.EmulatedTLS = CodeGenOpts.EmulatedTLS;
   Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning();
+  Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection;
 
   if (CodeGenOpts.EnableSplitDwarf)
     Options.MCOptions.SplitDwarfFile = CodeGenOpts.SplitDwarfFile;
@@ -470,6 +457,23 @@ static void initTargetOptions(llvm::TargetOptions &Options,
       Options.MCOptions.IASSearchPaths.push_back(
           Entry.IgnoreSysRoot ? Entry.Path : HSOpts.Sysroot + Entry.Path);
 }
+static Optional<GCOVOptions> getGCOVOptions(const CodeGenOptions &CodeGenOpts) {
+  if (CodeGenOpts.DisableGCov)
+    return None;
+  if (!CodeGenOpts.EmitGcovArcs && !CodeGenOpts.EmitGcovNotes)
+    return None;
+  // Not using 'GCOVOptions::getDefault' allows us to avoid exiting if
+  // LLVM's -default-gcov-version flag is set to something invalid.
+  GCOVOptions Options;
+  Options.EmitNotes = CodeGenOpts.EmitGcovNotes;
+  Options.EmitData = CodeGenOpts.EmitGcovArcs;
+  llvm::copy(CodeGenOpts.CoverageVersion, std::begin(Options.Version));
+  Options.UseCfgChecksum = CodeGenOpts.CoverageExtraChecksum;
+  Options.NoRedZone = CodeGenOpts.DisableRedZone;
+  Options.FunctionNamesInData = !CodeGenOpts.CoverageNoFunctionNamesInData;
+  Options.ExitBlockBeforeBody = CodeGenOpts.CoverageExitBlockBeforeBody;
+  return Options;
+}
 
 void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
                                       legacy::FunctionPassManager &FPM) {
@@ -612,20 +616,8 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
   if (!CodeGenOpts.RewriteMapFiles.empty())
     addSymbolRewriterPass(CodeGenOpts, &MPM);
 
-  if (!CodeGenOpts.DisableGCov &&
-      (CodeGenOpts.EmitGcovArcs || CodeGenOpts.EmitGcovNotes)) {
-    // Not using 'GCOVOptions::getDefault' allows us to avoid exiting if
-    // LLVM's -default-gcov-version flag is set to something invalid.
-    GCOVOptions Options;
-    Options.EmitNotes = CodeGenOpts.EmitGcovNotes;
-    Options.EmitData = CodeGenOpts.EmitGcovArcs;
-    memcpy(Options.Version, CodeGenOpts.CoverageVersion, 4);
-    Options.UseCfgChecksum = CodeGenOpts.CoverageExtraChecksum;
-    Options.NoRedZone = CodeGenOpts.DisableRedZone;
-    Options.FunctionNamesInData =
-        !CodeGenOpts.CoverageNoFunctionNamesInData;
-    Options.ExitBlockBeforeBody = CodeGenOpts.CoverageExitBlockBeforeBody;
-    MPM.add(createGCOVProfilerPass(Options));
+  if (Optional<GCOVOptions> Options = getGCOVOptions(CodeGenOpts)) {
+    MPM.add(createGCOVProfilerPass(*Options));
     if (CodeGenOpts.getDebugInfo() == codegenoptions::NoDebugInfo)
       MPM.add(createStripSymbolsPass(true));
   }
@@ -685,7 +677,7 @@ void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) {
   Optional<llvm::CodeModel::Model> CM = getCodeModel(CodeGenOpts);
   std::string FeaturesStr =
       llvm::join(TargetOpts.Features.begin(), TargetOpts.Features.end(), ",");
-  llvm::Reloc::Model RM = getRelocModel(CodeGenOpts);
+  llvm::Reloc::Model RM = CodeGenOpts.RelocationModel;
   CodeGenOpt::Level OptLevel = getCGOptLevel(CodeGenOpts);
 
   llvm::TargetOptions Options;
@@ -917,6 +909,9 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
     bool IsLTO = CodeGenOpts.PrepareForLTO;
 
     if (CodeGenOpts.OptimizationLevel == 0) {
+      if (Optional<GCOVOptions> Options = getGCOVOptions(CodeGenOpts))
+        MPM.addPass(GCOVProfilerPass(*Options));
+
       // Build a minimal pipeline based on the semantics required by Clang,
       // which is just that always inlining occurs.
       MPM.addPass(AlwaysInlinerPass());
@@ -940,6 +935,10 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
             [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
               FPM.addPass(BoundsCheckingPass());
             });
+      if (Optional<GCOVOptions> Options = getGCOVOptions(CodeGenOpts))
+        PB.registerPipelineStartEPCallback([Options](ModulePassManager &MPM) {
+          MPM.addPass(GCOVProfilerPass(*Options));
+        });
 
       if (IsThinLTO) {
         MPM = PB.buildThinLTOPreLinkDefaultPipeline(
@@ -1103,7 +1102,7 @@ static void runThinLTOBackend(ModuleSummaryIndex *CombinedIndex, Module *M,
   Conf.CPU = TOpts.CPU;
   Conf.CodeModel = getCodeModel(CGOpts);
   Conf.MAttrs = TOpts.Features;
-  Conf.RelocModel = getRelocModel(CGOpts);
+  Conf.RelocModel = CGOpts.RelocationModel;
   Conf.CGOptLevel = getCGOptLevel(CGOpts);
   initTargetOptions(Conf.Options, CGOpts, TOpts, LOpts, HeaderOpts);
   Conf.SampleProfile = std::move(SampleProfile);
diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp
index 6862fd811186..3d24e1fcad30 100644
--- a/lib/CodeGen/CGAtomic.cpp
+++ b/lib/CodeGen/CGAtomic.cpp
@@ -1508,11 +1508,12 @@ void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
   // which means that the caller is responsible for having zeroed
   // any padding.  Just do an aggregate copy of that type.
   if (rvalue.isAggregate()) {
-    CGF.EmitAggregateCopy(getAtomicAddress(),
-                          rvalue.getAggregateAddress(),
-                          getAtomicType(),
-                          (rvalue.isVolatileQualified()
-                           || LVal.isVolatileQualified()));
+    LValue Dest = CGF.MakeAddrLValue(getAtomicAddress(), getAtomicType());
+    LValue Src = CGF.MakeAddrLValue(rvalue.getAggregateAddress(),
+                                    getAtomicType());
+    bool IsVolatile = rvalue.isVolatileQualified() ||
+                      LVal.isVolatileQualified();
+    CGF.EmitAggregateCopy(Dest, Src, getAtomicType(), IsVolatile);
     return;
   }
 
diff --git a/lib/CodeGen/CGBuilder.h b/lib/CodeGen/CGBuilder.h
index 61fe4aac3afa..3a4224873f42 100644
--- a/lib/CodeGen/CGBuilder.h
+++ b/lib/CodeGen/CGBuilder.h
@@ -258,23 +258,23 @@ class CGBuilderTy : public CGBuilderBaseTy {
   using CGBuilderBaseTy::CreateMemCpy;
   llvm::CallInst *CreateMemCpy(Address Dest, Address Src, llvm::Value *Size,
                                bool IsVolatile = false) {
-    auto Align = std::min(Dest.getAlignment(), Src.getAlignment());
-    return CreateMemCpy(Dest.getPointer(), Src.getPointer(), Size,
-                        Align.getQuantity(), IsVolatile);
+    return CreateMemCpy(Dest.getPointer(), Dest.getAlignment().getQuantity(),
+                        Src.getPointer(), Src.getAlignment().getQuantity(),
+                        Size,IsVolatile);
   }
   llvm::CallInst *CreateMemCpy(Address Dest, Address Src, uint64_t Size,
                                bool IsVolatile = false) {
-    auto Align = std::min(Dest.getAlignment(), Src.getAlignment());
-    return CreateMemCpy(Dest.getPointer(), Src.getPointer(), Size,
-                        Align.getQuantity(), IsVolatile);
+    return CreateMemCpy(Dest.getPointer(), Dest.getAlignment().getQuantity(),
+                        Src.getPointer(), Src.getAlignment().getQuantity(),
+                        Size, IsVolatile);
   }
 
   using CGBuilderBaseTy::CreateMemMove;
   llvm::CallInst *CreateMemMove(Address Dest, Address Src, llvm::Value *Size,
                                 bool IsVolatile = false) {
-    auto Align = std::min(Dest.getAlignment(), Src.getAlignment());
-    return CreateMemMove(Dest.getPointer(), Src.getPointer(), Size,
-                         Align.getQuantity(), IsVolatile);
+    return CreateMemMove(Dest.getPointer(), Dest.getAlignment().getQuantity(),
+                         Src.getPointer(), Src.getAlignment().getQuantity(),
+                         Size, IsVolatile);
   }
 
   using CGBuilderBaseTy::CreateMemSet;
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index ba54f8342f1b..b4ae779bf519 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -915,7 +915,11 @@ EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
       Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
     }
 
-    Result = CGF.Builder.CreateTrunc(UnsignedResult, ResTy);
+    // Negate the product if it would be negative in infinite precision.
+    Result = CGF.Builder.CreateSelect(
+        IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);
+
+    Result = CGF.Builder.CreateTrunc(Result, ResTy);
   }
   assert(Overflow && Result && "Missing overflow or result");
 
@@ -962,6 +966,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     case Builtin::BI__builtin_copysign:
     case Builtin::BI__builtin_copysignf:
     case Builtin::BI__builtin_copysignl:
+    case Builtin::BI__builtin_copysignf128:
       return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
 
     case Builtin::BIcos:
@@ -994,6 +999,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     case Builtin::BI__builtin_fabs:
     case Builtin::BI__builtin_fabsf:
     case Builtin::BI__builtin_fabsl:
+    case Builtin::BI__builtin_fabsf128:
       return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
 
     case Builtin::BIfloor:
@@ -1735,6 +1741,63 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
     return RValue::get(Dest.getPointer());
   }
+  case Builtin::BI__builtin_wmemcmp: {
+    // The MSVC runtime library does not provide a definition of wmemcmp, so we
+    // need an inline implementation.
+    if (!getTarget().getTriple().isOSMSVCRT())
+      break;
+
+    llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
+
+    Value *Dst = EmitScalarExpr(E->getArg(0));
+    Value *Src = EmitScalarExpr(E->getArg(1));
+    Value *Size = EmitScalarExpr(E->getArg(2));
+
+    BasicBlock *Entry = Builder.GetInsertBlock();
+    BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
+    BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
+    BasicBlock *Next = createBasicBlock("wmemcmp.next");
+    BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
+    Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
+    Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
+
+    EmitBlock(CmpGT);
+    PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
+    DstPhi->addIncoming(Dst, Entry);
+    PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
+    SrcPhi->addIncoming(Src, Entry);
+    PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
+    SizePhi->addIncoming(Size, Entry);
+    CharUnits WCharAlign =
+        getContext().getTypeAlignInChars(getContext().WCharTy);
+    Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
+    Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
+    Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
+    Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
+
+    EmitBlock(CmpLT);
+    Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
+    Builder.CreateCondBr(DstLtSrc, Exit, Next);
+
+    EmitBlock(Next);
+    Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
+    Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
+    Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
+    Value *NextSizeEq0 =
+        Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
+    Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
+    DstPhi->addIncoming(NextDst, Next);
+    SrcPhi->addIncoming(NextSrc, Next);
+    SizePhi->addIncoming(NextSize, Next);
+
+    EmitBlock(Exit);
+    PHINode *Ret = Builder.CreatePHI(IntTy, 4);
+    Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
+    Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
+    Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
+    Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
+    return RValue::get(Ret);
+  }
   case Builtin::BI__builtin_dwarf_cfa: {
     // The offset in bytes from the first argument to the CFA.
     //
@@ -4095,6 +4158,54 @@ static const NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
+  // FP16 scalar intrinisics go here.
+  NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
+  NEONMAP1(vabsh_f16, aarch64_neon_abs, Add1ArgType),
+  NEONMAP1(vcageh_f16, aarch64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcagth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcaleh_f16, aarch64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcalth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s16_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u16_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s16, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u16, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s16_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u16_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s16_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u16_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s16_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u16_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s16_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u16_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
+  NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
+  NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
+  NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
+  NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
 };
 
 #undef NEONMAP0
@@ -6119,6 +6230,58 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
       return Builder.CreateUIToFP(Ops[0], FTy);
     return Builder.CreateSIToFP(Ops[0], FTy);
   }
+  case NEON::BI__builtin_neon_vcvth_f16_u16:
+  case NEON::BI__builtin_neon_vcvth_f16_u32:
+  case NEON::BI__builtin_neon_vcvth_f16_u64:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_f16_s16:
+  case NEON::BI__builtin_neon_vcvth_f16_s32:
+  case NEON::BI__builtin_neon_vcvth_f16_s64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *FTy = HalfTy;
+    llvm::Type *InTy;
+    if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
+      InTy = Int64Ty;
+    else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
+      InTy = Int32Ty;
+    else
+      InTy = Int16Ty;
+    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
+    if (usgn)
+      return Builder.CreateUIToFP(Ops[0], FTy);
+    return Builder.CreateSIToFP(Ops[0], FTy);
+  }
+  case NEON::BI__builtin_neon_vcvth_u16_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s16_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int16Ty);
+    return Builder.CreateFPToSI(Ops[0], Int16Ty);
+  }
+  case NEON::BI__builtin_neon_vcvth_u32_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s32_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int32Ty);
+    return Builder.CreateFPToSI(Ops[0], Int32Ty);
+  }
+  case NEON::BI__builtin_neon_vcvth_u64_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s64_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int64Ty);
+    return Builder.CreateFPToSI(Ops[0], Int64Ty);
+  }
   case NEON::BI__builtin_neon_vpaddd_s64: {
     llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
     Value *Vec = EmitScalarExpr(E->getArg(0));
@@ -6160,6 +6323,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vceqzd_s64:
   case NEON::BI__builtin_neon_vceqzd_f64:
   case NEON::BI__builtin_neon_vceqzs_f32:
+  case NEON::BI__builtin_neon_vceqzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6167,6 +6331,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vcgezd_s64:
   case NEON::BI__builtin_neon_vcgezd_f64:
   case NEON::BI__builtin_neon_vcgezs_f32:
+  case NEON::BI__builtin_neon_vcgezh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6174,6 +6339,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vclezd_s64:
   case NEON::BI__builtin_neon_vclezd_f64:
   case NEON::BI__builtin_neon_vclezs_f32:
+  case NEON::BI__builtin_neon_vclezh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6181,6 +6347,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vcgtzd_s64:
   case NEON::BI__builtin_neon_vcgtzd_f64:
   case NEON::BI__builtin_neon_vcgtzs_f32:
+  case NEON::BI__builtin_neon_vcgtzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6188,6 +6355,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vcltzd_s64:
   case NEON::BI__builtin_neon_vcltzd_f64:
   case NEON::BI__builtin_neon_vcltzs_f32:
+  case NEON::BI__builtin_neon_vcltzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6240,6 +6408,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
   }
+  case NEON::BI__builtin_neon_vceqh_f16:
+  case NEON::BI__builtin_neon_vcleh_f16:
+  case NEON::BI__builtin_neon_vclth_f16:
+  case NEON::BI__builtin_neon_vcgeh_f16:
+  case NEON::BI__builtin_neon_vcgth_f16: {
+    llvm::CmpInst::Predicate P;
+    switch (BuiltinID) {
+    default: llvm_unreachable("missing builtin ID in switch!");
+    case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
+    case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
+    case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
+    case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
+    case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
+    }
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
+    Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
+    return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
+  }
   case NEON::BI__builtin_neon_vceqd_s64:
   case NEON::BI__builtin_neon_vceqd_u64:
   case NEON::BI__builtin_neon_vcgtd_s64:
@@ -6377,6 +6565,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
         llvm::VectorType::get(DoubleTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
+  case NEON::BI__builtin_neon_vaddh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
+  case NEON::BI__builtin_neon_vsubh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
+  case NEON::BI__builtin_neon_vmulh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
+  case NEON::BI__builtin_neon_vdivh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
+  case NEON::BI__builtin_neon_vfmah_f16: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    return Builder.CreateCall(F,
+      {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
+  }
+  case NEON::BI__builtin_neon_vfmsh_f16: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+    Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
+    Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
+  }
   case NEON::BI__builtin_neon_vaddd_s64:
   case NEON::BI__builtin_neon_vaddd_u64:
     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
@@ -6651,12 +6864,22 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
+  case NEON::BI__builtin_neon_vmaxh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmax;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
+  }
   case NEON::BI__builtin_neon_vmin_v:
   case NEON::BI__builtin_neon_vminq_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
+  case NEON::BI__builtin_neon_vminh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmin;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
+  }
   case NEON::BI__builtin_neon_vabd_v:
   case NEON::BI__builtin_neon_vabdq_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
@@ -6695,20 +6918,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminnmq_v:
     Int = Intrinsic::aarch64_neon_fminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
+  case NEON::BI__builtin_neon_vminnmh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fminnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
   case NEON::BI__builtin_neon_vmaxnm_v:
   case NEON::BI__builtin_neon_vmaxnmq_v:
     Int = Intrinsic::aarch64_neon_fmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
+  case NEON::BI__builtin_neon_vmaxnmh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmaxnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
   case NEON::BI__builtin_neon_vrecpss_f32: {
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
                         Ops, "vrecps");
   }
-  case NEON::BI__builtin_neon_vrecpsd_f64: {
+  case NEON::BI__builtin_neon_vrecpsd_f64:
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
                         Ops, "vrecps");
-  }
+  case NEON::BI__builtin_neon_vrecpsh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
+                        Ops, "vrecps");
   case NEON::BI__builtin_neon_vqshrun_n_v:
     Int = Intrinsic::aarch64_neon_sqshrun;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
@@ -6724,36 +6958,71 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vqrshrn_n_v:
     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
+  case NEON::BI__builtin_neon_vrndah_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::round;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
+  }
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     Int = Intrinsic::round;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
   }
+  case NEON::BI__builtin_neon_vrndih_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::nearbyint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
+  }
   case NEON::BI__builtin_neon_vrndi_v:
   case NEON::BI__builtin_neon_vrndiq_v: {
     Int = Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
   }
+  case NEON::BI__builtin_neon_vrndmh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::floor;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
+  }
   case NEON::BI__builtin_neon_vrndm_v:
   case NEON::BI__builtin_neon_vrndmq_v: {
     Int = Intrinsic::floor;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
   }
+  case NEON::BI__builtin_neon_vrndnh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::aarch64_neon_frintn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
+  }
   case NEON::BI__builtin_neon_vrndn_v:
   case NEON::BI__builtin_neon_vrndnq_v: {
     Int = Intrinsic::aarch64_neon_frintn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
   }
+  case NEON::BI__builtin_neon_vrndph_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::ceil;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
+  }
   case NEON::BI__builtin_neon_vrndp_v:
   case NEON::BI__builtin_neon_vrndpq_v: {
     Int = Intrinsic::ceil;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
   }
+  case NEON::BI__builtin_neon_vrndxh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::rint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
+  }
   case NEON::BI__builtin_neon_vrndx_v:
   case NEON::BI__builtin_neon_vrndxq_v: {
     Int = Intrinsic::rint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
   }
+  case NEON::BI__builtin_neon_vrndh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::trunc;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
+  }
   case NEON::BI__builtin_neon_vrnd_v:
   case NEON::BI__builtin_neon_vrndq_v: {
     Int = Intrinsic::trunc;
@@ -6902,6 +7171,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
   case NEON::BI__builtin_neon_vnegd_s64:
     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
+  case NEON::BI__builtin_neon_vnegh_f16:
+    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
   case NEON::BI__builtin_neon_vpmaxnm_v:
   case NEON::BI__builtin_neon_vpmaxnmq_v: {
     Int = Intrinsic::aarch64_neon_fmaxnmp;
@@ -6912,6 +7183,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fminnmp;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
   }
+  case NEON::BI__builtin_neon_vsqrth_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::sqrt;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
+  }
   case NEON::BI__builtin_neon_vsqrt_v:
   case NEON::BI__builtin_neon_vsqrtq_v: {
     Int = Intrinsic::sqrt;
@@ -7785,7 +8061,9 @@ static Value *EmitX86Select(CodeGenFunction &CGF,
 }
 
 static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
-                                   bool Signed, SmallVectorImpl<Value *> &Ops) {
+                                   bool Signed, ArrayRef<Value *> Ops) {
+  assert((Ops.size() == 2 || Ops.size() == 4) &&
+         "Unexpected number of arguments");
   unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
   Value *Cmp;
 
@@ -7809,9 +8087,11 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
     Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
   }
 
-  const auto *C = dyn_cast<Constant>(Ops.back());
-  if (!C || !C->isAllOnesValue())
-    Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops.back(), NumElts));
+  if (Ops.size() == 4) {
+    const auto *C = dyn_cast<Constant>(Ops[3]);
+    if (!C || !C->isAllOnesValue())
+      Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops[3], NumElts));
+  }
 
   if (NumElts < 8) {
     uint32_t Indices[8];
@@ -7827,6 +8107,11 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
                                                     std::max(NumElts, 8U)));
 }
 
+static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
+  Value *Zero = Constant::getNullValue(In->getType());
+  return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
+}
+
 static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
 
   llvm::Type *Ty = Ops[0]->getType();
@@ -8173,6 +8458,20 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_cvtmask2q512:
     return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
 
+  case X86::BI__builtin_ia32_cvtb2mask128:
+  case X86::BI__builtin_ia32_cvtb2mask256:
+  case X86::BI__builtin_ia32_cvtb2mask512:
+  case X86::BI__builtin_ia32_cvtw2mask128:
+  case X86::BI__builtin_ia32_cvtw2mask256:
+  case X86::BI__builtin_ia32_cvtw2mask512:
+  case X86::BI__builtin_ia32_cvtd2mask128:
+  case X86::BI__builtin_ia32_cvtd2mask256:
+  case X86::BI__builtin_ia32_cvtd2mask512:
+  case X86::BI__builtin_ia32_cvtq2mask128:
+  case X86::BI__builtin_ia32_cvtq2mask256:
+  case X86::BI__builtin_ia32_cvtq2mask512:
+    return EmitX86ConvertToMask(*this, Ops[0]);
+
   case X86::BI__builtin_ia32_movdqa32store128_mask:
   case X86::BI__builtin_ia32_movdqa64store128_mask:
   case X86::BI__builtin_ia32_storeaps128_mask:
@@ -8427,6 +8726,28 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
                                  Builder.getInt16Ty());
   }
 
+  case X86::BI__builtin_ia32_kunpckdi:
+  case X86::BI__builtin_ia32_kunpcksi:
+  case X86::BI__builtin_ia32_kunpckhi: {
+    unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits();
+    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
+    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
+    uint32_t Indices[64];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i;
+
+    // First extract half of each vector. This gives better codegen than
+    // doing it in a single shuffle.
+    LHS = Builder.CreateShuffleVector(LHS, LHS,
+                                      makeArrayRef(Indices, NumElts / 2));
+    RHS = Builder.CreateShuffleVector(RHS, RHS,
+                                      makeArrayRef(Indices, NumElts / 2));
+    // Concat the vectors.
+    Value *Res = Builder.CreateShuffleVector(LHS, RHS,
+                                             makeArrayRef(Indices, NumElts));
+    return Builder.CreateBitCast(Res, Ops[0]->getType());
+  }
+
   case X86::BI__builtin_ia32_vplzcntd_128_mask:
   case X86::BI__builtin_ia32_vplzcntd_256_mask:
   case X86::BI__builtin_ia32_vplzcntd_512_mask:
@@ -10231,6 +10552,21 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
                                                    const CallExpr *E) {
   switch (BuiltinID) {
+  case WebAssembly::BI__builtin_wasm_mem_size: {
+    llvm::Type *ResultType = ConvertType(E->getType());
+    Value *I = EmitScalarExpr(E->getArg(0));
+    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_size, ResultType);
+    return Builder.CreateCall(Callee, I);
+  }
+  case WebAssembly::BI__builtin_wasm_mem_grow: {
+    llvm::Type *ResultType = ConvertType(E->getType());
+    Value *Args[] = {
+      EmitScalarExpr(E->getArg(0)),
+      EmitScalarExpr(E->getArg(1))
+    };
+    Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_grow, ResultType);
+    return Builder.CreateCall(Callee, Args);
+  }
   case WebAssembly::BI__builtin_wasm_current_memory: {
     llvm::Type *ResultType = ConvertType(E->getType());
     Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType);
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index 38d7344572d3..655bc6659a95 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -1925,9 +1925,9 @@ void CodeGenModule::ConstructAttributeList(
   const ABIArgInfo &RetAI = FI.getReturnInfo();
   switch (RetAI.getKind()) {
   case ABIArgInfo::Extend:
-    if (RetTy->hasSignedIntegerRepresentation())
+    if (RetAI.isSignExt())
       RetAttrs.addAttribute(llvm::Attribute::SExt);
-    else if (RetTy->hasUnsignedIntegerRepresentation())
+    else
       RetAttrs.addAttribute(llvm::Attribute::ZExt);
     LLVM_FALLTHROUGH;
   case ABIArgInfo::Direct:
@@ -2006,14 +2006,10 @@ void CodeGenModule::ConstructAttributeList(
     // sense to do it here because parameters are so messed up.
     switch (AI.getKind()) {
     case ABIArgInfo::Extend:
-      if (ParamType->isSignedIntegerOrEnumerationType())
+      if (AI.isSignExt())
         Attrs.addAttribute(llvm::Attribute::SExt);
-      else if (ParamType->isUnsignedIntegerOrEnumerationType()) {
-        if (getTypes().getABIInfo().shouldSignExtUnsignedType(ParamType))
-          Attrs.addAttribute(llvm::Attribute::SExt);
-        else
-          Attrs.addAttribute(llvm::Attribute::ZExt);
-      }
+      else
+        Attrs.addAttribute(llvm::Attribute::ZExt);
       LLVM_FALLTHROUGH;
     case ABIArgInfo::Direct:
       if (ArgNo == 0 && FI.isChainCall())
@@ -2255,11 +2251,16 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
   for (FunctionArgList::const_iterator i = Args.begin(), e = Args.end();
        i != e; ++i, ++info_it, ++ArgNo) {
     const VarDecl *Arg = *i;
-    QualType Ty = info_it->type;
     const ABIArgInfo &ArgI = info_it->info;
 
     bool isPromoted =
       isa<ParmVarDecl>(Arg) && cast<ParmVarDecl>(Arg)->isKNRPromoted();
+    // We are converting from ABIArgInfo type to VarDecl type directly, unless
+    // the parameter is promoted. In this case we convert to
+    // CGFunctionInfo::ArgInfo type with subsequent argument demotion.
+    QualType Ty = isPromoted ? info_it->type : Arg->getType();
+    assert(hasScalarEvaluationKind(Ty) ==
+           hasScalarEvaluationKind(Arg->getType()));
 
     unsigned FirstIRArg, NumIRArgs;
     std::tie(FirstIRArg, NumIRArgs) = IRFunctionArgs.getIRArgs(ArgNo);
@@ -3543,9 +3544,9 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
     } else {
       // We can't represent a misaligned lvalue in the CallArgList, so copy
       // to an aligned temporary now.
-      Address tmp = CreateMemTemp(type);
-      EmitAggregateCopy(tmp, L.getAddress(), type, L.isVolatile());
-      args.add(RValue::getAggregate(tmp), type);
+      LValue Dest = MakeAddrLValue(CreateMemTemp(type), type);
+      EmitAggregateCopy(Dest, L, type, L.isVolatile());
+      args.add(RValue::getAggregate(Dest.getAddress()), type);
     }
     return;
   }
@@ -3883,7 +3884,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
           Address AI = CreateMemTemp(I->Ty, ArgInfo.getIndirectAlign(),
                                      "byval-temp", false);
           IRCallArgs[FirstIRArg] = AI.getPointer();
-          EmitAggregateCopy(AI, Addr, I->Ty, RV.isVolatileQualified());
+          LValue Dest = MakeAddrLValue(AI, I->Ty);
+          LValue Src = MakeAddrLValue(Addr, I->Ty);
+          EmitAggregateCopy(Dest, Src, I->Ty, RV.isVolatileQualified());
         } else {
           // Skip the extra memcpy call.
           IRCallArgs[FirstIRArg] = Addr.getPointer();
diff --git a/lib/CodeGen/CGClass.cpp b/lib/CodeGen/CGClass.cpp
index a6915071ec17..56bd25025f53 100644
--- a/lib/CodeGen/CGClass.cpp
+++ b/lib/CodeGen/CGClass.cpp
@@ -406,8 +406,8 @@ CodeGenFunction::GetAddressOfDerivedClass(Address BaseAddr,
 
   // Apply the offset.
   llvm::Value *Value = Builder.CreateBitCast(BaseAddr.getPointer(), Int8PtrTy);
-  Value = Builder.CreateGEP(Value, Builder.CreateNeg(NonVirtualOffset),
-                            "sub.ptr");
+  Value = Builder.CreateInBoundsGEP(Value, Builder.CreateNeg(NonVirtualOffset),
+                                    "sub.ptr");
 
   // Just cast.
   Value = Builder.CreateBitCast(Value, DerivedPtrTy);
@@ -615,7 +615,14 @@ static void EmitMemberInitializer(CodeGenFunction &CGF,
 
   llvm::Value *ThisPtr = CGF.LoadCXXThis();
   QualType RecordTy = CGF.getContext().getTypeDeclType(ClassDecl);
-  LValue LHS = CGF.MakeNaturalAlignAddrLValue(ThisPtr, RecordTy);
+  LValue LHS;
+
+  // If a base constructor is being emitted, create an LValue that has the
+  // non-virtual alignment.
+  if (CGF.CurGD.getCtorType() == Ctor_Base)
+    LHS = CGF.MakeNaturalAlignPointeeAddrLValue(ThisPtr, RecordTy);
+  else
+    LHS = CGF.MakeNaturalAlignAddrLValue(ThisPtr, RecordTy);
 
   EmitLValueForAnyFieldInitialization(CGF, MemberInit, LHS);
 
@@ -640,8 +647,7 @@ static void EmitMemberInitializer(CodeGenFunction &CGF,
       LValue Src = CGF.EmitLValueForFieldInitialization(ThisRHSLV, Field);
 
       // Copy the aggregate.
-      CGF.EmitAggregateCopy(LHS.getAddress(), Src.getAddress(), FieldType,
-                            LHS.isVolatileQualified());
+      CGF.EmitAggregateCopy(LHS, Src, FieldType, LHS.isVolatileQualified());
       // Ensure that we destroy the objects if an exception is thrown later in
       // the constructor.
       QualType::DestructionKind dtorKind = FieldType.isDestructedType();
@@ -2002,10 +2008,10 @@ void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D,
     assert(E->getNumArgs() == 1 && "unexpected argcount for trivial ctor");
 
     const Expr *Arg = E->getArg(0);
-    QualType SrcTy = Arg->getType();
-    Address Src = EmitLValue(Arg).getAddress();
+    LValue Src = EmitLValue(Arg);
     QualType DestTy = getContext().getTypeDeclType(D->getParent());
-    EmitAggregateCopyCtor(This, Src, DestTy, SrcTy);
+    LValue Dest = MakeAddrLValue(This, DestTy);
+    EmitAggregateCopyCtor(Dest, Src);
     return;
   }
 
@@ -2072,8 +2078,10 @@ void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D,
 
     QualType SrcTy = D->getParamDecl(0)->getType().getNonReferenceType();
     Address Src(Args[1].RV.getScalarVal(), getNaturalTypeAlignment(SrcTy));
+    LValue SrcLVal = MakeAddrLValue(Src, SrcTy);
     QualType DestTy = getContext().getTypeDeclType(ClassDecl);
-    EmitAggregateCopyCtor(This, Src, DestTy, SrcTy);
+    LValue DestLVal = MakeAddrLValue(This, DestTy);
+    EmitAggregateCopyCtor(DestLVal, SrcLVal);
     return;
   }
 
diff --git a/lib/CodeGen/CGCleanup.cpp b/lib/CodeGen/CGCleanup.cpp
index 22055b2cb902..526def2cc29c 100644
--- a/lib/CodeGen/CGCleanup.cpp
+++ b/lib/CodeGen/CGCleanup.cpp
@@ -624,7 +624,7 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF,
       si->eraseFromParent();
 
       // Destroy the load.
-      assert(condition->getOperand(0) == CGF.NormalCleanupDest);
+      assert(condition->getOperand(0) == CGF.NormalCleanupDest.getPointer());
       assert(condition->use_empty());
       condition->eraseFromParent();
     }
@@ -833,7 +833,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
         if (NormalCleanupDestSlot->hasOneUse()) {
           NormalCleanupDestSlot->user_back()->eraseFromParent();
           NormalCleanupDestSlot->eraseFromParent();
-          NormalCleanupDest = nullptr;
+          NormalCleanupDest = Address::invalid();
         }
 
         llvm::BasicBlock *BranchAfter = Scope.getBranchAfterBlock(0);
@@ -1250,10 +1250,10 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C,
 }
 
 Address CodeGenFunction::getNormalCleanupDestSlot() {
-  if (!NormalCleanupDest)
+  if (!NormalCleanupDest.isValid())
     NormalCleanupDest =
-      CreateTempAlloca(Builder.getInt32Ty(), "cleanup.dest.slot");
-  return Address(NormalCleanupDest, CharUnits::fromQuantity(4));
+      CreateDefaultAlignTempAlloca(Builder.getInt32Ty(), "cleanup.dest.slot");
+  return NormalCleanupDest;
 }
 
 /// Emits all the code to cause the given temporary to be cleaned up.
diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp
index aeed4d658a4e..d29e0791fbb4 100644
--- a/lib/CodeGen/CGDebugInfo.cpp
+++ b/lib/CodeGen/CGDebugInfo.cpp
@@ -365,7 +365,8 @@ llvm::DIFile::ChecksumKind
 CGDebugInfo::computeChecksum(FileID FID, SmallString<32> &Checksum) const {
   Checksum.clear();
 
-  if (!CGM.getCodeGenOpts().EmitCodeView)
+  if (!CGM.getCodeGenOpts().EmitCodeView &&
+      CGM.getCodeGenOpts().DwarfVersion < 5)
     return llvm::DIFile::CSK_None;
 
   SourceManager &SM = CGM.getContext().getSourceManager();
@@ -2803,9 +2804,18 @@ llvm::DICompositeType *CGDebugInfo::CreateLimitedType(const RecordType *Ty) {
 
   SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU);
 
+  // Explicitly record the calling convention for C++ records.
+  auto Flags = llvm::DINode::FlagZero;
+  if (auto CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    if (CGM.getCXXABI().getRecordArgABI(CXXRD) == CGCXXABI::RAA_Indirect)
+      Flags |= llvm::DINode::FlagTypePassByReference;
+    else
+      Flags |= llvm::DINode::FlagTypePassByValue;
+  }
+
   llvm::DICompositeType *RealDecl = DBuilder.createReplaceableCompositeType(
       getTagForRecord(RD), RDName, RDContext, DefUnit, Line, 0, Size, Align,
-      llvm::DINode::FlagZero, FullName);
+      Flags, FullName);
 
   // Elements of composite types usually have back to the type, creating
   // uniquing cycles.  Distinct nodes are more efficient.
diff --git a/lib/CodeGen/CGException.cpp b/lib/CodeGen/CGException.cpp
index 1ec084ff3f5b..1978d2757ac5 100644
--- a/lib/CodeGen/CGException.cpp
+++ b/lib/CodeGen/CGException.cpp
@@ -646,7 +646,7 @@ CodeGenFunction::getMSVCDispatchBlock(EHScopeStack::stable_iterator SI) {
     return DispatchBlock;
 
   if (EHS.getKind() == EHScope::Terminate)
-    DispatchBlock = getTerminateHandler();
+    DispatchBlock = getTerminateFunclet();
   else
     DispatchBlock = createBasicBlock();
   CGBuilderTy Builder(*this, DispatchBlock);
@@ -1334,24 +1334,15 @@ llvm::BasicBlock *CodeGenFunction::getTerminateHandler() {
   if (TerminateHandler)
     return TerminateHandler;
 
-  CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
-
   // Set up the terminate handler.  This block is inserted at the very
   // end of the function by FinishFunction.
   TerminateHandler = createBasicBlock("terminate.handler");
+  CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
   Builder.SetInsertPoint(TerminateHandler);
+
   llvm::Value *Exn = nullptr;
-  SaveAndRestore<llvm::Instruction *> RestoreCurrentFuncletPad(
-      CurrentFuncletPad);
-  if (EHPersonality::get(*this).usesFuncletPads()) {
-    llvm::Value *ParentPad = CurrentFuncletPad;
-    if (!ParentPad)
-      ParentPad = llvm::ConstantTokenNone::get(CGM.getLLVMContext());
-    CurrentFuncletPad = Builder.CreateCleanupPad(ParentPad);
-  } else {
-    if (getLangOpts().CPlusPlus)
-      Exn = getExceptionFromSlot();
-  }
+  if (getLangOpts().CPlusPlus)
+    Exn = getExceptionFromSlot();
   llvm::CallInst *terminateCall =
       CGM.getCXXABI().emitTerminateForUnexpectedException(*this, Exn);
   terminateCall->setDoesNotReturn();
@@ -1363,6 +1354,42 @@ llvm::BasicBlock *CodeGenFunction::getTerminateHandler() {
   return TerminateHandler;
 }
 
+llvm::BasicBlock *CodeGenFunction::getTerminateFunclet() {
+  assert(EHPersonality::get(*this).usesFuncletPads() &&
+         "use getTerminateLandingPad for non-funclet EH");
+
+  llvm::BasicBlock *&TerminateFunclet = TerminateFunclets[CurrentFuncletPad];
+  if (TerminateFunclet)
+    return TerminateFunclet;
+
+  CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
+
+  // Set up the terminate handler.  This block is inserted at the very
+  // end of the function by FinishFunction.
+  TerminateFunclet = createBasicBlock("terminate.handler");
+  Builder.SetInsertPoint(TerminateFunclet);
+
+  // Create the cleanuppad using the current parent pad as its token. Use 'none'
+  // if this is a top-level terminate scope, which is the common case.
+  SaveAndRestore<llvm::Instruction *> RestoreCurrentFuncletPad(
+      CurrentFuncletPad);
+  llvm::Value *ParentPad = CurrentFuncletPad;
+  if (!ParentPad)
+    ParentPad = llvm::ConstantTokenNone::get(CGM.getLLVMContext());
+  CurrentFuncletPad = Builder.CreateCleanupPad(ParentPad);
+
+  // Emit the __std_terminate call.
+  llvm::CallInst *terminateCall =
+      CGM.getCXXABI().emitTerminateForUnexpectedException(*this, nullptr);
+  terminateCall->setDoesNotReturn();
+  Builder.CreateUnreachable();
+
+  // Restore the saved insertion state.
+  Builder.restoreIP(SavedIP);
+
+  return TerminateFunclet;
+}
+
 llvm::BasicBlock *CodeGenFunction::getEHResumeBlock(bool isCleanup) {
   if (EHResumeBlock) return EHResumeBlock;
 
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp
index 3f81be00f32d..73354cdf854c 100644
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -1034,8 +1034,12 @@ Address CodeGenFunction::EmitPointerWithAlignment(const Expr *E,
     // Derived-to-base conversions.
     case CK_UncheckedDerivedToBase:
     case CK_DerivedToBase: {
-      Address Addr = EmitPointerWithAlignment(CE->getSubExpr(), BaseInfo,
-                                              TBAAInfo);
+      // TODO: Support accesses to members of base classes in TBAA. For now, we
+      // conservatively pretend that the complete object is of the base class
+      // type.
+      if (TBAAInfo)
+        *TBAAInfo = CGM.getTBAAAccessInfo(E->getType());
+      Address Addr = EmitPointerWithAlignment(CE->getSubExpr(), BaseInfo);
       auto Derived = CE->getSubExpr()->getType()->getPointeeCXXRecordDecl();
       return GetAddressOfBaseClass(Addr, Derived,
                                    CE->path_begin(), CE->path_end(),
@@ -4479,8 +4483,7 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
 
   CalleeType = getContext().getCanonicalType(CalleeType);
 
-  const auto *FnType =
-      cast<FunctionType>(cast<PointerType>(CalleeType)->getPointeeType());
+  auto PointeeType = cast<PointerType>(CalleeType)->getPointeeType();
 
   CGCallee Callee = OrigCallee;
 
@@ -4489,8 +4492,12 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
     if (llvm::Constant *PrefixSig =
             CGM.getTargetCodeGenInfo().getUBSanFunctionSignature(CGM)) {
       SanitizerScope SanScope(this);
+      // Remove any (C++17) exception specifications, to allow calling e.g. a
+      // noexcept function through a non-noexcept pointer.
+      auto ProtoTy =
+        getContext().getFunctionTypeWithExceptionSpec(PointeeType, EST_None);
       llvm::Constant *FTRTTIConst =
-          CGM.GetAddrOfRTTIDescriptor(QualType(FnType, 0), /*ForEH=*/true);
+          CGM.GetAddrOfRTTIDescriptor(ProtoTy, /*ForEH=*/true);
       llvm::Type *PrefixStructTyElems[] = {PrefixSig->getType(), Int32Ty};
       llvm::StructType *PrefixStructTy = llvm::StructType::get(
           CGM.getLLVMContext(), PrefixStructTyElems, /*isPacked=*/true);
@@ -4530,6 +4537,8 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
     }
   }
 
+  const auto *FnType = cast<FunctionType>(PointeeType);
+
   // If we are checking indirect calls and this call is indirect, check that the
   // function pointer is a member of the bit set for the function type.
   if (SanOpts.has(SanitizerKind::CFIICall) &&
diff --git a/lib/CodeGen/CGExprAgg.cpp b/lib/CodeGen/CGExprAgg.cpp
index 0f05cab66d7e..d2b29e06ae3a 100644
--- a/lib/CodeGen/CGExprAgg.cpp
+++ b/lib/CodeGen/CGExprAgg.cpp
@@ -298,7 +298,9 @@ void AggExprEmitter::EmitCopy(QualType type, const AggValueSlot &dest,
   // If the result of the assignment is used, copy the LHS there also.
   // It's volatile if either side is.  Use the minimum alignment of
   // the two sides.
-  CGF.EmitAggregateCopy(dest.getAddress(), src.getAddress(), type,
+  LValue DestLV = CGF.MakeAddrLValue(dest.getAddress(), type);
+  LValue SrcLV = CGF.MakeAddrLValue(src.getAddress(), type);
+  CGF.EmitAggregateCopy(DestLV, SrcLV, type,
                         dest.isVolatile() || src.isVolatile());
 }
 
@@ -1541,12 +1543,14 @@ LValue CodeGenFunction::EmitAggExprToLValue(const Expr *E) {
   return LV;
 }
 
-void CodeGenFunction::EmitAggregateCopy(Address DestPtr,
-                                        Address SrcPtr, QualType Ty,
-                                        bool isVolatile,
+void CodeGenFunction::EmitAggregateCopy(LValue Dest, LValue Src,
+                                        QualType Ty, bool isVolatile,
                                         bool isAssignment) {
   assert(!Ty->isAnyComplexType() && "Shouldn't happen for complex");
 
+  Address DestPtr = Dest.getAddress();
+  Address SrcPtr = Src.getAddress();
+
   if (getLangOpts().CPlusPlus) {
     if (const RecordType *RT = Ty->getAs<RecordType>()) {
       CXXRecordDecl *Record = cast<CXXRecordDecl>(RT->getDecl());
@@ -1562,7 +1566,7 @@ void CodeGenFunction::EmitAggregateCopy(Address DestPtr,
         return;
     }
   }
-  
+
   // Aggregate assignment turns into llvm.memcpy.  This is almost valid per
   // C99 6.5.16.1p3, which states "If the value being stored in an object is
   // read from another object that overlaps in anyway the storage of the first
@@ -1657,4 +1661,10 @@ void CodeGenFunction::EmitAggregateCopy(Address DestPtr,
   // the optimizer wishes to expand it in to scalar memory operations.
   if (llvm::MDNode *TBAAStructTag = CGM.getTBAAStructInfo(Ty))
     Inst->setMetadata(llvm::LLVMContext::MD_tbaa_struct, TBAAStructTag);
+
+  if (CGM.getCodeGenOpts().NewStructPathTBAA) {
+    TBAAAccessInfo TBAAInfo = CGM.mergeTBAAInfoForMemoryTransfer(
+        Dest.getTBAAInfo(), Src.getTBAAInfo());
+    CGM.DecorateInstructionWithTBAA(Inst, TBAAInfo);
+  }
 }
diff --git a/lib/CodeGen/CGExprCXX.cpp b/lib/CodeGen/CGExprCXX.cpp
index c32f1e5415da..9a18751fc253 100644
--- a/lib/CodeGen/CGExprCXX.cpp
+++ b/lib/CodeGen/CGExprCXX.cpp
@@ -242,11 +242,15 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
     }
   }
 
-  Address This = Address::invalid();
-  if (IsArrow)
-    This = EmitPointerWithAlignment(Base);
-  else
-    This = EmitLValue(Base).getAddress();
+  LValue This;
+  if (IsArrow) {
+    LValueBaseInfo BaseInfo;
+    TBAAAccessInfo TBAAInfo;
+    Address ThisValue = EmitPointerWithAlignment(Base, &BaseInfo, &TBAAInfo);
+    This = MakeAddrLValue(ThisValue, Base->getType(), BaseInfo, TBAAInfo);
+  } else {
+    This = EmitLValue(Base);
+  }
 
 
   if (MD->isTrivial() || (MD->isDefaulted() && MD->getParent()->isUnion())) {
@@ -264,7 +268,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
                                (*RtlArgs)[0].RV.getScalarVal(),
                                (*(CE->arg_begin() + 1))->getType())
                          : EmitLValue(*CE->arg_begin());
-        EmitAggregateAssign(This, RHS.getAddress(), CE->getType());
+        EmitAggregateAssign(This, RHS, CE->getType());
         return RValue::get(This.getPointer());
       }
 
@@ -272,8 +276,10 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
           cast<CXXConstructorDecl>(MD)->isCopyOrMoveConstructor()) {
         // Trivial move and copy ctor are the same.
         assert(CE->getNumArgs() == 1 && "unexpected argcount for trivial ctor");
-        Address RHS = EmitLValue(*CE->arg_begin()).getAddress();
-        EmitAggregateCopy(This, RHS, (*CE->arg_begin())->getType());
+        const Expr *Arg = *CE->arg_begin();
+        LValue RHS = EmitLValue(Arg);
+        LValue Dest = MakeAddrLValue(This.getAddress(), Arg->getType());
+        EmitAggregateCopy(Dest, RHS, Arg->getType());
         return RValue::get(This.getPointer());
       }
       llvm_unreachable("unknown trivial member function");
@@ -335,7 +341,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
     assert(ReturnValue.isNull() && "Destructor shouldn't have return value");
     if (UseVirtualCall) {
       CGM.getCXXABI().EmitVirtualDestructorCall(
-          *this, Dtor, Dtor_Complete, This, cast<CXXMemberCallExpr>(CE));
+          *this, Dtor, Dtor_Complete, This.getAddress(),
+          cast<CXXMemberCallExpr>(CE));
     } else {
       CGCallee Callee;
       if (getLangOpts().AppleKext && MD->isVirtual() && HasQualifier)
@@ -364,7 +371,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
                   CGM.GetAddrOfFunction(GlobalDecl(Ctor, Ctor_Complete), Ty),
                                  Ctor);
   } else if (UseVirtualCall) {
-    Callee = CGM.getCXXABI().getVirtualFunctionPointer(*this, MD, This, Ty,
+    Callee = CGM.getCXXABI().getVirtualFunctionPointer(*this, MD,
+                                                       This.getAddress(), Ty,
                                                        CE->getLocStart());
   } else {
     if (SanOpts.has(SanitizerKind::CFINVCall) &&
@@ -372,7 +380,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
       llvm::Value *VTable;
       const CXXRecordDecl *RD;
       std::tie(VTable, RD) =
-          CGM.getCXXABI().LoadVTablePtr(*this, This, MD->getParent());
+          CGM.getCXXABI().LoadVTablePtr(*this, This.getAddress(),
+                                        MD->getParent());
       EmitVTablePtrCheckForCall(RD, VTable, CFITCK_NVCall, CE->getLocStart());
     }
 
@@ -388,8 +397,10 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(
   }
 
   if (MD->isVirtual()) {
-    This = CGM.getCXXABI().adjustThisArgumentForVirtualFunctionCall(
-        *this, CalleeDecl, This, UseVirtualCall);
+    Address NewThisAddr =
+        CGM.getCXXABI().adjustThisArgumentForVirtualFunctionCall(
+            *this, CalleeDecl, This.getAddress(), UseVirtualCall);
+    This.setAddress(NewThisAddr);
   }
 
   return EmitCXXMemberOrOperatorCall(
diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp
index c46215067a68..668c572942fd 100644
--- a/lib/CodeGen/CGExprScalar.cpp
+++ b/lib/CodeGen/CGExprScalar.cpp
@@ -162,13 +162,13 @@ static bool CanElideOverflowCheck(const ASTContext &Ctx, const BinOpInfo &Op) {
   // we can elide the overflow check.
   if (!Op.mayHaveIntegerOverflow())
     return true;
-
-  // If a unary op has a widened operand, the op cannot overflow.
-  if (const auto *UO = dyn_cast<UnaryOperator>(Op.E))
-    return IsWidenedIntegerOp(Ctx, UO->getSubExpr());
-
-  // We usually don't need overflow checks for binops with widened operands.
-  // Multiplication with promoted unsigned operands is a special case.
+
+  // If a unary op has a widened operand, the op cannot overflow.
+  if (const auto *UO = dyn_cast<UnaryOperator>(Op.E))
+    return !UO->canOverflow();
+
+  // We usually don't need overflow checks for binops with widened operands.
+  // Multiplication with promoted unsigned operands is a special case.
   const auto *BO = cast<BinaryOperator>(Op.E);
   auto OptionalLHSTy = getUnwidenedIntegerType(Ctx, BO->getLHS());
   if (!OptionalLHSTy)
@@ -1870,13 +1870,13 @@ llvm::Value *ScalarExprEmitter::EmitIncDecConsiderOverflowBehavior(
     return Builder.CreateAdd(InVal, Amount, Name);
   case LangOptions::SOB_Undefined:
     if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow))
-      return Builder.CreateNSWAdd(InVal, Amount, Name);
-    // Fall through.
-  case LangOptions::SOB_Trapping:
-    if (IsWidenedIntegerOp(CGF.getContext(), E->getSubExpr()))
-      return Builder.CreateNSWAdd(InVal, Amount, Name);
-    return EmitOverflowCheckedBinOp(createBinOpInfoFromIncDec(E, InVal, IsInc));
-  }
+      return Builder.CreateNSWAdd(InVal, Amount, Name);
+    // Fall through.
+  case LangOptions::SOB_Trapping:
+    if (!E->canOverflow())
+      return Builder.CreateNSWAdd(InVal, Amount, Name);
+    return EmitOverflowCheckedBinOp(createBinOpInfoFromIncDec(E, InVal, IsInc));
+  }
   llvm_unreachable("Unknown SignedOverflowBehaviorTy");
 }
 
@@ -1952,17 +1952,15 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
     value = Builder.getTrue();
 
   // Most common case by far: integer increment.
-  } else if (type->isIntegerType()) {
-    // Note that signed integer inc/dec with width less than int can't
-    // overflow because of promotion rules; we're just eliding a few steps here.
-    bool CanOverflow = value->getType()->getIntegerBitWidth() >=
-                       CGF.IntTy->getIntegerBitWidth();
-    if (CanOverflow && type->isSignedIntegerOrEnumerationType()) {
-      value = EmitIncDecConsiderOverflowBehavior(E, value, isInc);
-    } else if (CanOverflow && type->isUnsignedIntegerType() &&
-               CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow)) {
-      value =
-          EmitOverflowCheckedBinOp(createBinOpInfoFromIncDec(E, value, isInc));
+  } else if (type->isIntegerType()) {
+    // Note that signed integer inc/dec with width less than int can't
+    // overflow because of promotion rules; we're just eliding a few steps here.
+    if (E->canOverflow() && type->isSignedIntegerOrEnumerationType()) {
+      value = EmitIncDecConsiderOverflowBehavior(E, value, isInc);
+    } else if (E->canOverflow() && type->isUnsignedIntegerType() &&
+               CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow)) {
+      value =
+          EmitOverflowCheckedBinOp(createBinOpInfoFromIncDec(E, value, isInc));
     } else {
       llvm::Value *amt = llvm::ConstantInt::get(value->getType(), amount, true);
       value = Builder.CreateAdd(value, amt, isInc ? "inc" : "dec");
diff --git a/lib/CodeGen/CGObjC.cpp b/lib/CodeGen/CGObjC.cpp
index f26263d9472d..d2175f00b448 100644
--- a/lib/CodeGen/CGObjC.cpp
+++ b/lib/CodeGen/CGObjC.cpp
@@ -1008,12 +1008,13 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl,
                          /*init*/ true);
       return;
     }
-    case TEK_Aggregate:
+    case TEK_Aggregate: {
       // The return value slot is guaranteed to not be aliased, but
       // that's not necessarily the same as "on the stack", so
       // we still potentially need objc_memmove_collectable.
-      EmitAggregateCopy(ReturnValue, LV.getAddress(), ivarType);
-      return;
+      EmitAggregateCopy(/* Dest= */ MakeAddrLValue(ReturnValue, ivarType),
+                        /* Src= */ LV, ivarType);
+      return; }
     case TEK_Scalar: {
       llvm::Value *value;
       if (propType->isReferenceType()) {
@@ -1814,22 +1815,6 @@ void CodeGenFunction::EmitARCIntrinsicUse(ArrayRef<llvm::Value*> values) {
 }
 
 
-static bool IsForwarding(StringRef Name) {
-  return llvm::StringSwitch<bool>(Name)
-      .Cases("objc_autoreleaseReturnValue",             // ARCInstKind::AutoreleaseRV
-             "objc_autorelease",                        // ARCInstKind::Autorelease
-             "objc_retainAutoreleaseReturnValue",       // ARCInstKind::FusedRetainAutoreleaseRV
-             "objc_retainAutoreleasedReturnValue",      // ARCInstKind::RetainRV
-             "objc_retainAutorelease",                  // ARCInstKind::FusedRetainAutorelease
-             "objc_retainedObject",                     // ARCInstKind::NoopCast
-             "objc_retain",                             // ARCInstKind::Retain
-             "objc_unretainedObject",                   // ARCInstKind::NoopCast
-             "objc_unretainedPointer",                  // ARCInstKind::NoopCast
-             "objc_unsafeClaimAutoreleasedReturnValue", // ARCInstKind::ClaimRV
-             true)
-      .Default(false);
-}
-
 static llvm::Constant *createARCRuntimeFunction(CodeGenModule &CGM,
                                                 llvm::FunctionType *FTy,
                                                 StringRef Name) {
@@ -1847,9 +1832,6 @@ static llvm::Constant *createARCRuntimeFunction(CodeGenModule &CGM,
       // performance.
       F->addFnAttr(llvm::Attribute::NonLazyBind);
     }
-
-    if (IsForwarding(Name))
-      F->arg_begin()->addAttr(llvm::Attribute::Returned);
   }
 
   return RTF;
@@ -2070,7 +2052,7 @@ CodeGenFunction::EmitARCRetainAutoreleasedReturnValue(llvm::Value *value) {
 
 /// Claim a possibly-autoreleased return value at +0.  This is only
 /// valid to do in contexts which do not rely on the retain to keep
-/// the object valid for for all of its uses; for example, when
+/// the object valid for all of its uses; for example, when
 /// the value is ignored, or when it is being assigned to an
 /// __unsafe_unretained variable.
 ///
@@ -3268,12 +3250,12 @@ CodeGenFunction::GenerateObjCAtomicSetterCopyHelperFunction(
   DeclRefExpr DstExpr(&DstDecl, false, DestTy,
                       VK_RValue, SourceLocation());
   UnaryOperator DST(&DstExpr, UO_Deref, DestTy->getPointeeType(),
-                    VK_LValue, OK_Ordinary, SourceLocation());
+                    VK_LValue, OK_Ordinary, SourceLocation(), false);
   
   DeclRefExpr SrcExpr(&SrcDecl, false, SrcTy,
                       VK_RValue, SourceLocation());
   UnaryOperator SRC(&SrcExpr, UO_Deref, SrcTy->getPointeeType(),
-                    VK_LValue, OK_Ordinary, SourceLocation());
+                    VK_LValue, OK_Ordinary, SourceLocation(), false);
   
   Expr *Args[2] = { &DST, &SRC };
   CallExpr *CalleeExp = cast<CallExpr>(PID->getSetterCXXAssignment());
@@ -3351,7 +3333,7 @@ CodeGenFunction::GenerateObjCAtomicGetterCopyHelperFunction(
                       VK_RValue, SourceLocation());
   
   UnaryOperator SRC(&SrcExpr, UO_Deref, SrcTy->getPointeeType(),
-                    VK_LValue, OK_Ordinary, SourceLocation());
+                    VK_LValue, OK_Ordinary, SourceLocation(), false);
   
   CXXConstructExpr *CXXConstExpr = 
     cast<CXXConstructExpr>(PID->getGetterCXXConstructor());
diff --git a/lib/CodeGen/CGObjCMac.cpp b/lib/CodeGen/CGObjCMac.cpp
index ef4e6cd4f01b..4060b7196bdd 100644
--- a/lib/CodeGen/CGObjCMac.cpp
+++ b/lib/CodeGen/CGObjCMac.cpp
@@ -4179,10 +4179,6 @@ void FragileHazards::emitHazardsInNewBlocks() {
   }
 }
 
-static void addIfPresent(llvm::DenseSet<llvm::Value*> &S, llvm::Value *V) {
-  if (V) S.insert(V);
-}
-
 static void addIfPresent(llvm::DenseSet<llvm::Value*> &S, Address V) {
   if (V.isValid()) S.insert(V.getPointer());
 }
@@ -6403,7 +6399,7 @@ llvm::Value *CGObjCNonFragileABIMac::GenerateProtocolRef(CodeGenFunction &CGF,
   PTGV->setAlignment(Align.getQuantity());
   if (!CGM.getTriple().isOSBinFormatMachO())
     PTGV->setComdat(CGM.getModule().getOrInsertComdat(ProtocolName));
-  CGM.addCompilerUsedGlobal(PTGV);
+  CGM.addUsedGlobal(PTGV);
   return CGF.Builder.CreateAlignedLoad(PTGV, Align);
 }
 
@@ -6847,7 +6843,7 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol(
     Protocols[PD->getIdentifier()] = Entry;
   }
   Entry->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  CGM.addCompilerUsedGlobal(Entry);
+  CGM.addUsedGlobal(Entry);
 
   // Use this protocol meta-data to build protocol list table in section
   // __DATA, __objc_protolist
@@ -6866,7 +6862,7 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol(
   PTGV->setSection(GetSectionName("__objc_protolist",
                                   "coalesced,no_dead_strip"));
   PTGV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  CGM.addCompilerUsedGlobal(PTGV);
+  CGM.addUsedGlobal(PTGV);
   return Entry;
 }
 
diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 784c90a9eb91..e85590bb4299 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -147,7 +147,7 @@ class CGOpenMPTaskOutlinedRegionInfo final : public CGOpenMPRegionInfo {
         auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
             CGF.GetAddrOfLocalVar(PartIDVar),
             PartIDVar->getType()->castAs<PointerType>());
-        auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, SourceLocation());
+        auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, PartIDVar->getLocation());
         auto *DoneBB = CGF.createBasicBlock(".untied.done.");
         UntiedSwitch = CGF.Builder.CreateSwitch(Res, DoneBB);
         CGF.EmitBlock(DoneBB);
@@ -354,7 +354,7 @@ class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo {
       DeclRefExpr DRE(const_cast<VarDecl *>(VD),
                       /*RefersToEnclosingVariableOrCapture=*/false,
                       VD->getType().getNonReferenceType(), VK_LValue,
-                      SourceLocation());
+                      C.getLocation());
       PrivScope.addPrivate(VD, [&CGF, &DRE]() -> Address {
         return CGF.EmitLValue(&DRE).getAddress();
       });
@@ -787,17 +787,17 @@ static void emitInitWithReductionInitializer(CodeGenFunction &CGF,
     RValue InitRVal;
     switch (CGF.getEvaluationKind(Ty)) {
     case TEK_Scalar:
-      InitRVal = CGF.EmitLoadOfLValue(LV, SourceLocation());
+      InitRVal = CGF.EmitLoadOfLValue(LV, DRD->getLocation());
       break;
     case TEK_Complex:
       InitRVal =
-          RValue::getComplex(CGF.EmitLoadOfComplex(LV, SourceLocation()));
+          RValue::getComplex(CGF.EmitLoadOfComplex(LV, DRD->getLocation()));
       break;
     case TEK_Aggregate:
       InitRVal = RValue::getAggregate(LV.getAddress());
       break;
     }
-    OpaqueValueExpr OVE(SourceLocation(), Ty, VK_RValue);
+    OpaqueValueExpr OVE(DRD->getLocation(), Ty, VK_RValue);
     CodeGenFunction::OpaqueValueMapping OpaqueMap(CGF, &OVE, InitRVal);
     CGF.EmitAnyExprToMem(&OVE, Private, Ty.getQualifiers(),
                          /*IsInitializer=*/false);
@@ -1216,7 +1216,8 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty,
   CodeGenFunction CGF(CGM);
   // Map "T omp_in;" variable to "*omp_in_parm" value in all expressions.
   // Map "T omp_out;" variable to "*omp_out_parm" value in all expressions.
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, In->getLocation(),
+                    Out->getLocation());
   CodeGenFunction::OMPPrivateScope Scope(CGF);
   Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm);
   Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() -> Address {
@@ -1370,7 +1371,10 @@ llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction(
   CodeGen.setAction(Action);
   assert(!ThreadIDVar->getType()->isPointerType() &&
          "thread id variable must be of type kmp_int32 for tasks");
-  auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+  const OpenMPDirectiveKind Region =
+      isOpenMPTaskLoopDirective(D.getDirectiveKind()) ? OMPD_taskloop
+                                                      : OMPD_task;
+  auto *CS = D.getCapturedStmt(Region);
   auto *TD = dyn_cast<OMPTaskDirective>(&D);
   CodeGenFunction CGF(CGM, true);
   CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
@@ -1495,7 +1499,7 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF,
       if (OMPRegionInfo->getThreadIDVariable()) {
         // Check if this an outlined function with thread id passed as argument.
         auto LVal = OMPRegionInfo->getThreadIDVariableLValue(CGF);
-        ThreadID = CGF.EmitLoadOfLValue(LVal, Loc).getScalarVal();
+        ThreadID = CGF.EmitLoadOfScalar(LVal, Loc);
         // If value loaded in entry block, cache it and use it everywhere in
         // function.
         if (CGF.Builder.GetInsertBlock() == CGF.AllocaInsertPt->getParent()) {
@@ -2383,7 +2387,8 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
       // threadprivate copy of the variable VD
       CodeGenFunction CtorCGF(CGM);
       FunctionArgList Args;
-      ImplicitParamDecl Dst(CGM.getContext(), CGM.getContext().VoidPtrTy,
+      ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, Loc,
+                            /*Id=*/nullptr, CGM.getContext().VoidPtrTy,
                             ImplicitParamDecl::Other);
       Args.push_back(&Dst);
 
@@ -2393,13 +2398,13 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
       auto Fn = CGM.CreateGlobalInitOrDestructFunction(
           FTy, ".__kmpc_global_ctor_.", FI, Loc);
       CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI,
-                            Args, SourceLocation());
+                            Args, Loc, Loc);
       auto ArgVal = CtorCGF.EmitLoadOfScalar(
           CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
           CGM.getContext().VoidPtrTy, Dst.getLocation());
       Address Arg = Address(ArgVal, VDAddr.getAlignment());
-      Arg = CtorCGF.Builder.CreateElementBitCast(Arg,
-                                             CtorCGF.ConvertTypeForMem(ASTTy));
+      Arg = CtorCGF.Builder.CreateElementBitCast(
+          Arg, CtorCGF.ConvertTypeForMem(ASTTy));
       CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(),
                                /*IsInitializer=*/true);
       ArgVal = CtorCGF.EmitLoadOfScalar(
@@ -2414,7 +2419,8 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
       // of the variable VD
       CodeGenFunction DtorCGF(CGM);
       FunctionArgList Args;
-      ImplicitParamDecl Dst(CGM.getContext(), CGM.getContext().VoidPtrTy,
+      ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, Loc,
+                            /*Id=*/nullptr, CGM.getContext().VoidPtrTy,
                             ImplicitParamDecl::Other);
       Args.push_back(&Dst);
 
@@ -2425,7 +2431,7 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
           FTy, ".__kmpc_global_dtor_.", FI, Loc);
       auto NL = ApplyDebugLocation::CreateEmpty(DtorCGF);
       DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args,
-                            SourceLocation());
+                            Loc, Loc);
       // Create a scope with an artificial location for the body of this function.
       auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF);
       auto ArgVal = DtorCGF.EmitLoadOfScalar(
@@ -2469,7 +2475,7 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
       FunctionArgList ArgList;
       InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction,
                             CGM.getTypes().arrangeNullaryFunction(), ArgList,
-                            Loc);
+                            Loc, Loc);
       emitThreadPrivateVarInit(InitCGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
       InitCGF.FinishFunction();
       return InitFunction;
@@ -2783,12 +2789,15 @@ static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array,
 static llvm::Value *emitCopyprivateCopyFunction(
     CodeGenModule &CGM, llvm::Type *ArgsType,
     ArrayRef<const Expr *> CopyprivateVars, ArrayRef<const Expr *> DestExprs,
-    ArrayRef<const Expr *> SrcExprs, ArrayRef<const Expr *> AssignmentOps) {
+    ArrayRef<const Expr *> SrcExprs, ArrayRef<const Expr *> AssignmentOps,
+    SourceLocation Loc) {
   auto &C = CGM.getContext();
   // void copy_func(void *LHSArg, void *RHSArg);
   FunctionArgList Args;
-  ImplicitParamDecl LHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
-  ImplicitParamDecl RHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                           ImplicitParamDecl::Other);
+  ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                           ImplicitParamDecl::Other);
   Args.push_back(&LHSArg);
   Args.push_back(&RHSArg);
   auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
@@ -2797,7 +2806,7 @@ static llvm::Value *emitCopyprivateCopyFunction(
       ".omp.copyprivate.copy_func", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
   // Dest = (void*[n])(LHSArg);
   // Src = (void*[n])(RHSArg);
   Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
@@ -2888,7 +2897,7 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
     // threads in the corresponding parallel region.
     auto *CpyFn = emitCopyprivateCopyFunction(
         CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(),
-        CopyprivateVars, SrcExprs, DstExprs, AssignmentOps);
+        CopyprivateVars, SrcExprs, DstExprs, AssignmentOps, Loc);
     auto *BufSize = CGF.getTypeSize(CopyprivateArrayTy);
     Address CL =
       CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList,
@@ -3399,11 +3408,13 @@ createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name,
   Args.push_back(&DummyPtr);
 
   CodeGenFunction CGF(CGM);
+  // Disable debug info for global (de-)initializer because they are not part of
+  // some particular construct.
+  CGF.disableDebugInfo();
   auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
   auto FTy = CGM.getTypes().GetFunctionType(FI);
-  auto *Fn =
-      CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI, SourceLocation());
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args, SourceLocation());
+  auto *Fn = CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args);
   Codegen(CGF);
   CGF.FinishFunction();
   return Fn;
@@ -3411,7 +3422,6 @@ createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name,
 
 llvm::Function *
 CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() {
-
   // If we don't have entries or if we are emitting code for the device, we
   // don't need to do anything.
   if (CGM.getLangOpts().OpenMPIsDevice || OffloadEntriesInfoManager.empty())
@@ -3563,11 +3573,9 @@ void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID,
   EntryInit.addInt(CGM.SizeTy, Size);
   EntryInit.addInt(CGM.Int32Ty, Flags);
   EntryInit.addInt(CGM.Int32Ty, 0);
-  llvm::GlobalVariable *Entry =
-    EntryInit.finishAndCreateGlobal(".omp_offloading.entry",
-                                    Align,
-                                    /*constant*/ true,
-                                    llvm::GlobalValue::ExternalLinkage);
+  llvm::GlobalVariable *Entry = EntryInit.finishAndCreateGlobal(
+      Twine(".omp_offloading.entry.") + Name, Align,
+      /*constant*/ true, llvm::GlobalValue::ExternalLinkage);
 
   // The entry has to be created in the section the linker expects it to be.
   Entry->setSection(".omp_offloading.entries");
@@ -3750,6 +3758,7 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
     addFieldToRecordDecl(
         C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
     RD->completeDefinition();
+    RD->addAttr(PackedAttr::CreateImplicit(C));
     TgtOffloadEntryQTy = C.getRecordType(RD);
   }
   return TgtOffloadEntryQTy;
@@ -3945,7 +3954,8 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
                              ".omp_task_entry.", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskEntry, TaskEntryFnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args,
+                    Loc, Loc);
 
   // TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map,
   // tt,
@@ -3969,7 +3979,7 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
   auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds);
   auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI);
   auto *SharedsParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-      CGF.EmitLoadOfLValue(SharedsLVal, Loc).getScalarVal(),
+      CGF.EmitLoadOfScalar(SharedsLVal, Loc),
       CGF.ConvertTypeForMem(SharedsPtrTy));
 
   auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin(), 1);
@@ -3992,19 +4002,19 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
   if (isOpenMPTaskLoopDirective(Kind)) {
     auto LBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound);
     auto LBLVal = CGF.EmitLValueForField(Base, *LBFI);
-    auto *LBParam = CGF.EmitLoadOfLValue(LBLVal, Loc).getScalarVal();
+    auto *LBParam = CGF.EmitLoadOfScalar(LBLVal, Loc);
     auto UBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound);
     auto UBLVal = CGF.EmitLValueForField(Base, *UBFI);
-    auto *UBParam = CGF.EmitLoadOfLValue(UBLVal, Loc).getScalarVal();
+    auto *UBParam = CGF.EmitLoadOfScalar(UBLVal, Loc);
     auto StFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTStride);
     auto StLVal = CGF.EmitLValueForField(Base, *StFI);
-    auto *StParam = CGF.EmitLoadOfLValue(StLVal, Loc).getScalarVal();
+    auto *StParam = CGF.EmitLoadOfScalar(StLVal, Loc);
     auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
     auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
-    auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
+    auto *LIParam = CGF.EmitLoadOfScalar(LILVal, Loc);
     auto RFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTReductions);
     auto RLVal = CGF.EmitLValueForField(Base, *RFI);
-    auto *RParam = CGF.EmitLoadOfLValue(RLVal, Loc).getScalarVal();
+    auto *RParam = CGF.EmitLoadOfScalar(RLVal, Loc);
     CallArgs.push_back(LBParam);
     CallArgs.push_back(UBParam);
     CallArgs.push_back(StParam);
@@ -4045,9 +4055,8 @@ static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM,
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, DestructorFn,
                                     DestructorFnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
   CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo,
-                    Args);
+                    Args, Loc, Loc);
 
   LValue Base = CGF.EmitLoadOfPointerLValue(
       CGF.GetAddrOfLocalVar(&TaskTypeArg),
@@ -4139,9 +4148,8 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
   TaskPrivatesMap->removeFnAttr(llvm::Attribute::OptimizeNone);
   TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
   CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap,
-                    TaskPrivatesMapFnInfo, Args);
+                    TaskPrivatesMapFnInfo, Args, Loc, Loc);
 
   // *privi = &.privates.privi;
   LValue Base = CGF.EmitLoadOfPointerLValue(
@@ -4178,6 +4186,11 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
   auto &C = CGF.getContext();
   auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
   LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
+  OpenMPDirectiveKind Kind = isOpenMPTaskLoopDirective(D.getDirectiveKind())
+                                 ? OMPD_taskloop
+                                 : OMPD_task;
+  const CapturedStmt &CS = *D.getCapturedStmt(Kind);
+  CodeGenFunction::CGCapturedStmtInfo CapturesInfo(CS);
   LValue SrcBase;
   bool IsTargetTask =
       isOpenMPTargetDataManagementDirective(D.getDirectiveKind()) ||
@@ -4186,16 +4199,12 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
   // PointersArray and SizesArray. The original variables for these arrays are
   // not captured and we get their addresses explicitly.
   if ((!IsTargetTask && !Data.FirstprivateVars.empty()) ||
-      (IsTargetTask && Data.FirstprivateVars.size() > 3)) {
+      (IsTargetTask && KmpTaskSharedsPtr.isValid())) {
     SrcBase = CGF.MakeAddrLValue(
         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
             KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
         SharedsTy);
   }
-  OpenMPDirectiveKind Kind = isOpenMPTaskLoopDirective(D.getDirectiveKind())
-                                 ? OMPD_taskloop
-                                 : OMPD_task;
-  CodeGenFunction::CGCapturedStmtInfo CapturesInfo(*D.getCapturedStmt(Kind));
   FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
   for (auto &&Pair : Privates) {
     auto *VD = Pair.second.PrivateCopy;
@@ -4209,17 +4218,19 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
         // PointersArray or SizesArray.
         LValue SharedRefLValue;
         QualType Type = OriginalVD->getType();
-        if (IsTargetTask && isa<ImplicitParamDecl>(OriginalVD) &&
-            isa<CapturedDecl>(OriginalVD->getDeclContext()) &&
-            cast<CapturedDecl>(OriginalVD->getDeclContext())->getNumParams() ==
-                0 &&
-            isa<TranslationUnitDecl>(
-                cast<CapturedDecl>(OriginalVD->getDeclContext())
-                    ->getDeclContext())) {
+        auto *SharedField = CapturesInfo.lookup(OriginalVD);
+        if (IsTargetTask && !SharedField) {
+          assert(isa<ImplicitParamDecl>(OriginalVD) &&
+                 isa<CapturedDecl>(OriginalVD->getDeclContext()) &&
+                 cast<CapturedDecl>(OriginalVD->getDeclContext())
+                         ->getNumParams() == 0 &&
+                 isa<TranslationUnitDecl>(
+                     cast<CapturedDecl>(OriginalVD->getDeclContext())
+                         ->getDeclContext()) &&
+                 "Expected artificial target data variable.");
           SharedRefLValue =
               CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(OriginalVD), Type);
         } else {
-          auto *SharedField = CapturesInfo.lookup(OriginalVD);
           SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
           SharedRefLValue = CGF.MakeAddrLValue(
               Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
@@ -4230,8 +4241,7 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
           // Initialize firstprivate array.
           if (!isa<CXXConstructExpr>(Init) || CGF.isTrivialInitializer(Init)) {
             // Perform simple memcpy.
-            CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
-                                    SharedRefLValue.getAddress(), Type);
+            CGF.EmitAggregateAssign(PrivateLValue, SharedRefLValue, Type);
           } else {
             // Initialize firstprivate array using element-by-element
             // initialization.
@@ -4322,8 +4332,8 @@ emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
                              ".omp_task_dup.", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskDup, TaskDupFnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args, Loc,
+                    Loc);
 
   LValue TDBase = CGF.EmitLoadOfPointerLValue(
       CGF.GetAddrOfLocalVar(&DstArg),
@@ -4523,7 +4533,9 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
                                            KmpTaskTShareds)),
                     Loc),
                 CGF.getNaturalTypeAlignment(SharedsTy));
-    CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy);
+    LValue Dest = CGF.MakeAddrLValue(KmpTaskSharedsPtr, SharedsTy);
+    LValue Src = CGF.MakeAddrLValue(Shareds, SharedsTy);
+    CGF.EmitAggregateCopy(Dest, Src, SharedsTy);
   }
   // Emit initial values for private copies (if any).
   TaskResultTy Result;
@@ -4825,7 +4837,7 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
       IfVal,
       LBLVal.getPointer(),
       UBLVal.getPointer(),
-      CGF.EmitLoadOfScalar(StLVal, SourceLocation()),
+      CGF.EmitLoadOfScalar(StLVal, Loc),
       llvm::ConstantInt::getNullValue(
           CGF.IntTy), // Always 0 because taskgroup emitted by the compiler
       llvm::ConstantInt::getSigned(
@@ -4943,15 +4955,17 @@ static void emitReductionCombiner(CodeGenFunction &CGF,
 }
 
 llvm::Value *CGOpenMPRuntime::emitReductionFunction(
-    CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef<const Expr *> Privates,
-    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
-    ArrayRef<const Expr *> ReductionOps) {
+    CodeGenModule &CGM, SourceLocation Loc, llvm::Type *ArgsType,
+    ArrayRef<const Expr *> Privates, ArrayRef<const Expr *> LHSExprs,
+    ArrayRef<const Expr *> RHSExprs, ArrayRef<const Expr *> ReductionOps) {
   auto &C = CGM.getContext();
 
   // void reduction_func(void *LHSArg, void *RHSArg);
   FunctionArgList Args;
-  ImplicitParamDecl LHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
-  ImplicitParamDecl RHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                           ImplicitParamDecl::Other);
+  ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                           ImplicitParamDecl::Other);
   Args.push_back(&LHSArg);
   Args.push_back(&RHSArg);
   auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
@@ -4960,7 +4974,7 @@ llvm::Value *CGOpenMPRuntime::emitReductionFunction(
       ".omp.reduction.reduction_func", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   // Dst = (void*[n])(LHSArg);
   // Src = (void*[n])(RHSArg);
@@ -5149,8 +5163,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
 
   // 2. Emit reduce_func().
   auto *ReductionFn = emitReductionFunction(
-      CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
-      LHSExprs, RHSExprs, ReductionOps);
+      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
+      Privates, LHSExprs, RHSExprs, ReductionOps);
 
   // 3. Create static kmp_critical_name lock = { 0 };
   auto *Lock = getCriticalRegionLock(".reduction");
@@ -5365,7 +5379,8 @@ static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM,
                                            ReductionCodeGen &RCG, unsigned N) {
   auto &C = CGM.getContext();
   FunctionArgList Args;
-  ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl Param(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                          ImplicitParamDecl::Other);
   Args.emplace_back(&Param);
   auto &FnInfo =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
@@ -5374,8 +5389,7 @@ static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM,
                                     ".red_init.", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc);
   Address PrivateAddr = CGF.EmitLoadOfPointer(
       CGF.GetAddrOfLocalVar(&Param),
       C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
@@ -5386,9 +5400,8 @@ static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM,
     Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
         CGF, CGM.getContext().getSizeType(),
         generateUniqueName("reduction_size", Loc, N));
-    Size =
-        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
-                             CGM.getContext().getSizeType(), SourceLocation());
+    Size = CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                                CGM.getContext().getSizeType(), Loc);
   }
   RCG.emitAggregateType(CGF, N, Size);
   LValue SharedLVal;
@@ -5435,8 +5448,10 @@ static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM,
   auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(LHS)->getDecl());
   auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(RHS)->getDecl());
   FunctionArgList Args;
-  ImplicitParamDecl ParamInOut(C, C.VoidPtrTy, ImplicitParamDecl::Other);
-  ImplicitParamDecl ParamIn(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ParamInOut(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                               C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ParamIn(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                            ImplicitParamDecl::Other);
   Args.emplace_back(&ParamInOut);
   Args.emplace_back(&ParamIn);
   auto &FnInfo =
@@ -5446,8 +5461,7 @@ static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM,
                                     ".red_comb.", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc);
   llvm::Value *Size = nullptr;
   // If the size of the reduction item is non-constant, load it from global
   // threadprivate variable.
@@ -5455,9 +5469,8 @@ static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM,
     Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
         CGF, CGM.getContext().getSizeType(),
         generateUniqueName("reduction_size", Loc, N));
-    Size =
-        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
-                             CGM.getContext().getSizeType(), SourceLocation());
+    Size = CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                                CGM.getContext().getSizeType(), Loc);
   }
   RCG.emitAggregateType(CGF, N, Size);
   // Remap lhs and rhs variables to the addresses of the function arguments.
@@ -5506,7 +5519,8 @@ static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM,
     return nullptr;
   auto &C = CGM.getContext();
   FunctionArgList Args;
-  ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl Param(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy,
+                          ImplicitParamDecl::Other);
   Args.emplace_back(&Param);
   auto &FnInfo =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
@@ -5515,8 +5529,7 @@ static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM,
                                     ".red_fini.", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
   CodeGenFunction CGF(CGM);
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc);
   Address PrivateAddr = CGF.EmitLoadOfPointer(
       CGF.GetAddrOfLocalVar(&Param),
       C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
@@ -5527,9 +5540,8 @@ static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM,
     Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
         CGF, CGM.getContext().getSizeType(),
         generateUniqueName("reduction_size", Loc, N));
-    Size =
-        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
-                             CGM.getContext().getSizeType(), SourceLocation());
+    Size = CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                                CGM.getContext().getSizeType(), Loc);
   }
   RCG.emitAggregateType(CGF, N, Size);
   // Emit the finalizer body:
@@ -5876,7 +5888,7 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
        << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
   }
 
-  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
 
   CodeGenFunction CGF(CGM, true);
   CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
@@ -5903,6 +5915,7 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
   if (CGM.getLangOpts().OpenMPIsDevice) {
     OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
     OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage);
+    OutlinedFn->setDSOLocal(false);
   } else
     OutlinedFnID = new llvm::GlobalVariable(
         CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
@@ -5969,7 +5982,7 @@ emitNumTeamsForTargetDirective(CGOpenMPRuntime &OMPRuntime,
   // the expression is captured in the enclosing target environment when the
   // teams directive is not combined with target.
 
-  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
 
   if (auto *TeamsDir = dyn_cast_or_null<OMPExecutableDirective>(
           ignoreCompoundStmts(CS.getCapturedStmt()))) {
@@ -6072,7 +6085,7 @@ emitNumThreadsForTargetDirective(CGOpenMPRuntime &OMPRuntime,
   // the expression is captured in the enclosing target environment when the
   // teams directive is not combined with target.
 
-  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
 
   if (auto *TeamsDir = dyn_cast_or_null<OMPExecutableDirective>(
           ignoreCompoundStmts(CS.getCapturedStmt()))) {
@@ -6702,10 +6715,8 @@ class MappableExprsHandler {
         // We didn't find any match in our map information - generate a zero
         // size array section.
         // FIXME: MSVC 2013 seems to require this-> to find member CGF.
-        llvm::Value *Ptr =
-            this->CGF
-                .EmitLoadOfLValue(this->CGF.EmitLValue(IE), SourceLocation())
-                .getScalarVal();
+        llvm::Value *Ptr = this->CGF.EmitLoadOfScalar(this->CGF.EmitLValue(IE),
+                                                      IE->getExprLoc());
         BasePointers.push_back({Ptr, VD});
         Pointers.push_back(Ptr);
         Sizes.push_back(llvm::Constant::getNullValue(this->CGF.SizeTy));
@@ -7027,86 +7038,27 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
                                      const OMPExecutableDirective &D,
                                      llvm::Value *OutlinedFn,
                                      llvm::Value *OutlinedFnID,
-                                     const Expr *IfCond, const Expr *Device,
-                                     ArrayRef<llvm::Value *> CapturedVars) {
+                                     const Expr *IfCond, const Expr *Device) {
   if (!CGF.HaveInsertPoint())
     return;
 
   assert(OutlinedFn && "Invalid outlined function!");
 
-  // Fill up the arrays with all the captured variables.
-  MappableExprsHandler::MapValuesArrayTy KernelArgs;
-  MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
-  MappableExprsHandler::MapValuesArrayTy Pointers;
-  MappableExprsHandler::MapValuesArrayTy Sizes;
-  MappableExprsHandler::MapFlagsArrayTy MapTypes;
-
-  MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
-  MappableExprsHandler::MapValuesArrayTy CurPointers;
-  MappableExprsHandler::MapValuesArrayTy CurSizes;
-  MappableExprsHandler::MapFlagsArrayTy CurMapTypes;
-
-  // Get mappable expression information.
-  MappableExprsHandler MEHandler(D, CGF);
-
-  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
-  auto RI = CS.getCapturedRecordDecl()->field_begin();
-  auto CV = CapturedVars.begin();
-  for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
-                                            CE = CS.capture_end();
-       CI != CE; ++CI, ++RI, ++CV) {
-    CurBasePointers.clear();
-    CurPointers.clear();
-    CurSizes.clear();
-    CurMapTypes.clear();
-
-    // VLA sizes are passed to the outlined region by copy and do not have map
-    // information associated.
-    if (CI->capturesVariableArrayType()) {
-      CurBasePointers.push_back(*CV);
-      CurPointers.push_back(*CV);
-      CurSizes.push_back(CGF.getTypeSize(RI->getType()));
-      // Copy to the device as an argument. No need to retrieve it.
-      CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_LITERAL |
-                            MappableExprsHandler::OMP_MAP_TARGET_PARAM);
-    } else {
-      // If we have any information in the map clause, we use it, otherwise we
-      // just do a default mapping.
-      MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
-                                       CurSizes, CurMapTypes);
-      if (CurBasePointers.empty())
-        MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
-                                         CurPointers, CurSizes, CurMapTypes);
-    }
-    // We expect to have at least an element of information for this capture.
-    assert(!CurBasePointers.empty() && "Non-existing map pointer for capture!");
-    assert(CurBasePointers.size() == CurPointers.size() &&
-           CurBasePointers.size() == CurSizes.size() &&
-           CurBasePointers.size() == CurMapTypes.size() &&
-           "Inconsistent map information sizes!");
-
-    // The kernel args are always the first elements of the base pointers
-    // associated with a capture.
-    KernelArgs.push_back(*CurBasePointers.front());
-    // We need to append the results of this capture to what we already have.
-    BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
-    Pointers.append(CurPointers.begin(), CurPointers.end());
-    Sizes.append(CurSizes.begin(), CurSizes.end());
-    MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
-  }
+  const bool RequiresOuterTask = D.hasClausesOfKind<OMPDependClause>();
+  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
+  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
+  auto &&ArgsCodegen = [&CS, &CapturedVars](CodeGenFunction &CGF,
+                                            PrePostActionTy &) {
+    CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
+  };
+  emitInlinedDirective(CGF, OMPD_unknown, ArgsCodegen);
 
+  CodeGenFunction::OMPTargetDataInfo InputInfo;
+  llvm::Value *MapTypesArray = nullptr;
   // Fill up the pointer arrays and transfer execution to the device.
-  auto &&ThenGen = [this, &BasePointers, &Pointers, &Sizes, &MapTypes, Device,
-                    OutlinedFn, OutlinedFnID, &D,
-                    &KernelArgs](CodeGenFunction &CGF, PrePostActionTy &) {
-    auto &RT = CGF.CGM.getOpenMPRuntime();
-    // Emit the offloading arrays.
-    TargetDataInfo Info;
-    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
-    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
-                                 Info.PointersArray, Info.SizesArray,
-                                 Info.MapTypesArray, Info);
-
+  auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo,
+                    &MapTypesArray, &CS, RequiresOuterTask,
+                    &CapturedVars](CodeGenFunction &CGF, PrePostActionTy &) {
     // On top of the arrays that were filled up, the target offloading call
     // takes as arguments the device id as well as the host pointer. The host
     // pointer is used by the runtime library to identify the current target
@@ -7129,13 +7081,14 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
     }
 
     // Emit the number of elements in the offloading arrays.
-    llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());
+    llvm::Value *PointerNum =
+        CGF.Builder.getInt32(InputInfo.NumberOfTargetItems);
 
     // Return value of the runtime offloading call.
     llvm::Value *Return;
 
-    auto *NumTeams = emitNumTeamsForTargetDirective(RT, CGF, D);
-    auto *NumThreads = emitNumThreadsForTargetDirective(RT, CGF, D);
+    auto *NumTeams = emitNumTeamsForTargetDirective(*this, CGF, D);
+    auto *NumThreads = emitNumThreadsForTargetDirective(*this, CGF, D);
 
     bool HasNowait = D.hasClausesOfKind<OMPNowaitClause>();
     // The target region is an outlined function launched by the runtime
@@ -7173,25 +7126,30 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
       // passed to the runtime library - a 32-bit integer with the value zero.
       assert(NumThreads && "Thread limit expression should be available along "
                            "with number of teams.");
-      llvm::Value *OffloadingArgs[] = {
-          DeviceID,           OutlinedFnID,
-          PointerNum,         Info.BasePointersArray,
-          Info.PointersArray, Info.SizesArray,
-          Info.MapTypesArray, NumTeams,
-          NumThreads};
+      llvm::Value *OffloadingArgs[] = {DeviceID,
+                                       OutlinedFnID,
+                                       PointerNum,
+                                       InputInfo.BasePointersArray.getPointer(),
+                                       InputInfo.PointersArray.getPointer(),
+                                       InputInfo.SizesArray.getPointer(),
+                                       MapTypesArray,
+                                       NumTeams,
+                                       NumThreads};
       Return = CGF.EmitRuntimeCall(
-          RT.createRuntimeFunction(HasNowait ? OMPRTL__tgt_target_teams_nowait
-                                             : OMPRTL__tgt_target_teams),
+          createRuntimeFunction(HasNowait ? OMPRTL__tgt_target_teams_nowait
+                                          : OMPRTL__tgt_target_teams),
           OffloadingArgs);
     } else {
-      llvm::Value *OffloadingArgs[] = {
-          DeviceID,           OutlinedFnID,
-          PointerNum,         Info.BasePointersArray,
-          Info.PointersArray, Info.SizesArray,
-          Info.MapTypesArray};
+      llvm::Value *OffloadingArgs[] = {DeviceID,
+                                       OutlinedFnID,
+                                       PointerNum,
+                                       InputInfo.BasePointersArray.getPointer(),
+                                       InputInfo.PointersArray.getPointer(),
+                                       InputInfo.SizesArray.getPointer(),
+                                       MapTypesArray};
       Return = CGF.EmitRuntimeCall(
-          RT.createRuntimeFunction(HasNowait ? OMPRTL__tgt_target_nowait
-                                             : OMPRTL__tgt_target),
+          createRuntimeFunction(HasNowait ? OMPRTL__tgt_target_nowait
+                                          : OMPRTL__tgt_target),
           OffloadingArgs);
     }
 
@@ -7204,17 +7162,114 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
     CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
 
     CGF.EmitBlock(OffloadFailedBlock);
-    emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedFn, KernelArgs);
+    if (RequiresOuterTask) {
+      CapturedVars.clear();
+      CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
+    }
+    emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedFn, CapturedVars);
     CGF.EmitBranch(OffloadContBlock);
 
     CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
   };
 
   // Notify that the host version must be executed.
-  auto &&ElseGen = [this, &D, OutlinedFn, &KernelArgs](CodeGenFunction &CGF,
-                                                      PrePostActionTy &) {
-    emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedFn,
-                             KernelArgs);
+  auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars,
+                    RequiresOuterTask](CodeGenFunction &CGF,
+                                       PrePostActionTy &) {
+    if (RequiresOuterTask) {
+      CapturedVars.clear();
+      CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
+    }
+    emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedFn, CapturedVars);
+  };
+
+  auto &&TargetThenGen = [this, &ThenGen, &D, &InputInfo, &MapTypesArray,
+                          &CapturedVars, RequiresOuterTask,
+                          &CS](CodeGenFunction &CGF, PrePostActionTy &) {
+    // Fill up the arrays with all the captured variables.
+    MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
+    MappableExprsHandler::MapValuesArrayTy Pointers;
+    MappableExprsHandler::MapValuesArrayTy Sizes;
+    MappableExprsHandler::MapFlagsArrayTy MapTypes;
+
+    MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
+    MappableExprsHandler::MapValuesArrayTy CurPointers;
+    MappableExprsHandler::MapValuesArrayTy CurSizes;
+    MappableExprsHandler::MapFlagsArrayTy CurMapTypes;
+
+    // Get mappable expression information.
+    MappableExprsHandler MEHandler(D, CGF);
+
+    auto RI = CS.getCapturedRecordDecl()->field_begin();
+    auto CV = CapturedVars.begin();
+    for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
+                                              CE = CS.capture_end();
+         CI != CE; ++CI, ++RI, ++CV) {
+      CurBasePointers.clear();
+      CurPointers.clear();
+      CurSizes.clear();
+      CurMapTypes.clear();
+
+      // VLA sizes are passed to the outlined region by copy and do not have map
+      // information associated.
+      if (CI->capturesVariableArrayType()) {
+        CurBasePointers.push_back(*CV);
+        CurPointers.push_back(*CV);
+        CurSizes.push_back(CGF.getTypeSize(RI->getType()));
+        // Copy to the device as an argument. No need to retrieve it.
+        CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_LITERAL |
+                              MappableExprsHandler::OMP_MAP_TARGET_PARAM);
+      } else {
+        // If we have any information in the map clause, we use it, otherwise we
+        // just do a default mapping.
+        MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
+                                         CurSizes, CurMapTypes);
+        if (CurBasePointers.empty())
+          MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
+                                           CurPointers, CurSizes, CurMapTypes);
+      }
+      // We expect to have at least an element of information for this capture.
+      assert(!CurBasePointers.empty() &&
+             "Non-existing map pointer for capture!");
+      assert(CurBasePointers.size() == CurPointers.size() &&
+             CurBasePointers.size() == CurSizes.size() &&
+             CurBasePointers.size() == CurMapTypes.size() &&
+             "Inconsistent map information sizes!");
+
+      // We need to append the results of this capture to what we already have.
+      BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
+      Pointers.append(CurPointers.begin(), CurPointers.end());
+      Sizes.append(CurSizes.begin(), CurSizes.end());
+      MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
+    }
+
+    TargetDataInfo Info;
+    // Fill up the arrays and create the arguments.
+    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
+    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
+                                 Info.PointersArray, Info.SizesArray,
+                                 Info.MapTypesArray, Info);
+    InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
+    InputInfo.BasePointersArray =
+        Address(Info.BasePointersArray, CGM.getPointerAlign());
+    InputInfo.PointersArray =
+        Address(Info.PointersArray, CGM.getPointerAlign());
+    InputInfo.SizesArray = Address(Info.SizesArray, CGM.getPointerAlign());
+    MapTypesArray = Info.MapTypesArray;
+    if (RequiresOuterTask)
+      CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
+    else
+      emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
+  };
+
+  auto &&TargetElseGen = [this, &ElseGen, &D, RequiresOuterTask](
+                             CodeGenFunction &CGF, PrePostActionTy &) {
+    if (RequiresOuterTask) {
+      CodeGenFunction::OMPTargetDataInfo InputInfo;
+      CGF.EmitOMPTargetTaskBasedDirective(D, ElseGen, InputInfo);
+    } else {
+      emitInlinedDirective(CGF, D.getDirectiveKind(), ElseGen);
+    }
   };
 
   // If we have a target function ID it means that we need to support
@@ -7222,14 +7277,14 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
   // regardless of the conditional in the if clause if, e.g., the user do not
   // specify target triples.
   if (OutlinedFnID) {
-    if (IfCond)
-      emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
-    else {
-      RegionCodeGenTy ThenRCG(ThenGen);
+    if (IfCond) {
+      emitOMPIfClause(CGF, IfCond, TargetThenGen, TargetElseGen);
+    } else {
+      RegionCodeGenTy ThenRCG(TargetThenGen);
       ThenRCG(CGF);
     }
   } else {
-    RegionCodeGenTy ElseRCG(ElseGen);
+    RegionCodeGenTy ElseRCG(TargetElseGen);
     ElseRCG(CGF);
   }
 }
@@ -7292,6 +7347,17 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
       CodeGenFunction::EmitOMPTargetSimdDeviceFunction(
           CGM, ParentName, cast<OMPTargetSimdDirective>(*S));
       break;
+    case Stmt::OMPTargetTeamsDistributeParallelForDirectiveClass:
+      CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
+          CGM, ParentName,
+          cast<OMPTargetTeamsDistributeParallelForDirective>(*S));
+      break;
+    case Stmt::OMPTargetTeamsDistributeParallelForSimdDirectiveClass:
+      CodeGenFunction::
+          EmitOMPTargetTeamsDistributeParallelForSimdDeviceFunction(
+              CGM, ParentName,
+              cast<OMPTargetTeamsDistributeParallelForSimdDirective>(*S));
+      break;
     default:
       llvm_unreachable("Unknown target directive for OpenMP device codegen.");
     }
@@ -7299,12 +7365,11 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
   }
 
   if (const OMPExecutableDirective *E = dyn_cast<OMPExecutableDirective>(S)) {
-    if (!E->hasAssociatedStmt())
+    if (!E->hasAssociatedStmt() || !E->getAssociatedStmt())
       return;
 
     scanForTargetRegionsFunctions(
-        cast<CapturedStmt>(E->getAssociatedStmt())->getCapturedStmt(),
-        ParentName);
+        E->getInnermostCapturedStmt()->getCapturedStmt(), ParentName);
     return;
   }
 
@@ -7648,7 +7713,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
     if (D.hasClausesOfKind<OMPDependClause>())
       CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
     else
-      emitInlinedDirective(CGF, OMPD_target_update, ThenGen);
+      emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
   };
 
   if (IfCond)
@@ -8243,8 +8308,7 @@ void CGOpenMPSIMDRuntime::emitTargetCall(CodeGenFunction &CGF,
                                          const OMPExecutableDirective &D,
                                          llvm::Value *OutlinedFn,
                                          llvm::Value *OutlinedFnID,
-                                         const Expr *IfCond, const Expr *Device,
-                                         ArrayRef<llvm::Value *> CapturedVars) {
+                                         const Expr *IfCond, const Expr *Device) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h
index 98e09e09b5ec..e1ba8334368e 100644
--- a/lib/CodeGen/CGOpenMPRuntime.h
+++ b/lib/CodeGen/CGOpenMPRuntime.h
@@ -1048,7 +1048,8 @@ class CGOpenMPRuntime {
   /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
   /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
   /// or 'operator binop(LHS, RHS)'.
-  llvm::Value *emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType,
+  llvm::Value *emitReductionFunction(CodeGenModule &CGM, SourceLocation Loc,
+                                     llvm::Type *ArgsType,
                                      ArrayRef<const Expr *> Privates,
                                      ArrayRef<const Expr *> LHSExprs,
                                      ArrayRef<const Expr *> RHSExprs,
@@ -1206,13 +1207,11 @@ class CGOpenMPRuntime {
   /// directive, or null if no if clause is used.
   /// \param Device Expression evaluated in device clause associated with the
   /// target directive, or null if no device clause is used.
-  /// \param CapturedVars Values captured in the current region.
   virtual void emitTargetCall(CodeGenFunction &CGF,
                               const OMPExecutableDirective &D,
                               llvm::Value *OutlinedFn,
                               llvm::Value *OutlinedFnID, const Expr *IfCond,
-                              const Expr *Device,
-                              ArrayRef<llvm::Value *> CapturedVars);
+                              const Expr *Device);
 
   /// \brief Emit the target regions enclosed in \a GD function definition or
   /// the function itself in case it is a valid device function. Returns true if
@@ -1832,11 +1831,9 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
   /// directive, or null if no if clause is used.
   /// \param Device Expression evaluated in device clause associated with the
   /// target directive, or null if no device clause is used.
-  /// \param CapturedVars Values captured in the current region.
   void emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
                       llvm::Value *OutlinedFn, llvm::Value *OutlinedFnID,
-                      const Expr *IfCond, const Expr *Device,
-                      ArrayRef<llvm::Value *> CapturedVars) override;
+                      const Expr *IfCond, const Expr *Device) override;
 
   /// \brief Emit the target regions enclosed in \a GD function definition or
   /// the function itself in case it is a valid device function. Returns true if
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 7b2993cfd38d..7da8915c8440 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -250,8 +250,8 @@ static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
 }
 
 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
-    CodeGenModule &CGM)
-    : WorkerFn(nullptr), CGFI(nullptr) {
+    CodeGenModule &CGM, SourceLocation Loc)
+    : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
   createWorkerFunction(CGM);
 }
 
@@ -262,7 +262,7 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
 
   WorkerFn = llvm::Function::Create(
       CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
-      /* placeholder */ "_worker", &CGM.getModule());
+      /*placeholder=*/"_worker", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
 }
 
@@ -297,38 +297,38 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
   ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                              CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
   EntryFunctionState EST;
-  WorkerFunctionState WST(CGM);
+  WorkerFunctionState WST(CGM, D.getLocStart());
   Work.clear();
   WrapperFunctionsMap.clear();
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
-    CGOpenMPRuntimeNVPTX &RT;
     CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
     CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
 
   public:
-    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
-                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                          CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
-        : RT(RT), EST(EST), WST(WST) {}
+        : EST(EST), WST(WST) {}
     void Enter(CodeGenFunction &CGF) override {
-      RT.emitGenericEntryHeader(CGF, EST, WST);
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericEntryHeader(CGF, EST, WST);
     }
     void Exit(CodeGenFunction &CGF) override {
-      RT.emitGenericEntryFooter(CGF, EST);
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericEntryFooter(CGF, EST);
     }
-  } Action(*this, EST, WST);
+  } Action(EST, WST);
   CodeGen.setAction(Action);
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
 
-  // Create the worker function
-  emitWorkerFunction(WST);
-
   // Now change the name of the worker function to correspond to this target
   // region's entry function.
   WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
+
+  // Create the worker function
+  emitWorkerFunction(WST);
 }
 
 // Setup NVPTX threads for master-worker OpenMP scheme.
@@ -347,7 +347,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
   Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
 
   CGF.EmitBlock(WorkerBB);
-  emitCall(CGF, WST.WorkerFn);
+  emitOutlinedFunctionCall(CGF, WST.Loc, WST.WorkerFn);
   CGF.EmitBranch(EST.ExitBB);
 
   CGF.EmitBlock(MasterCheckBB);
@@ -479,8 +479,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
   ASTContext &Ctx = CGM.getContext();
 
   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {},
+                    WST.Loc, WST.Loc);
   emitWorkerLoop(CGF, WST);
   CGF.FinishFunction();
 }
@@ -571,8 +571,9 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
     Address Capture = CGF.EmitLoadOfPointer(SharedArgs,
        Ctx.getPointerType(
           Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>());
-    emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0),
-        getMasterThreadID(CGF), Capture.getPointer()});
+    emitOutlinedFunctionCall(CGF, WST.Loc, W,
+                             {Bld.getInt16(/*ParallelLevel=*/0),
+                              getMasterThreadID(CGF), Capture.getPointer()});
 
     // Go to end of parallel region.
     CGF.EmitBranch(TerminateBB);
@@ -1058,19 +1059,41 @@ void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
 }
 
+/// Cast value to the specified type.
+static llvm::Value *
+castValueToType(CodeGenFunction &CGF, llvm::Value *Val, llvm::Type *CastTy,
+                llvm::Optional<bool> IsSigned = llvm::None) {
+  if (Val->getType() == CastTy)
+    return Val;
+  if (Val->getType()->getPrimitiveSizeInBits() > 0 &&
+      CastTy->getPrimitiveSizeInBits() > 0 &&
+      Val->getType()->getPrimitiveSizeInBits() ==
+          CastTy->getPrimitiveSizeInBits())
+    return CGF.Builder.CreateBitCast(Val, CastTy);
+  if (IsSigned.hasValue() && CastTy->isIntegerTy() &&
+      Val->getType()->isIntegerTy())
+    return CGF.Builder.CreateIntCast(Val, CastTy, *IsSigned);
+  Address CastItem = CGF.CreateTempAlloca(
+      CastTy,
+      CharUnits::fromQuantity(
+          CGF.CGM.getDataLayout().getPrefTypeAlignment(Val->getType())));
+  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
+  CGF.Builder.CreateStore(Val, ValCastItem);
+  return CGF.Builder.CreateLoad(CastItem);
+}
+
 /// This function creates calls to one of two shuffle functions to copy
 /// variables between lanes in a warp.
 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
-                                                 QualType ElemTy,
                                                  llvm::Value *Elem,
                                                  llvm::Value *Offset) {
   auto &CGM = CGF.CGM;
-  auto &C = CGM.getContext();
   auto &Bld = CGF.Builder;
   CGOpenMPRuntimeNVPTX &RT =
       *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
 
-  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
+  unsigned Size = CGM.getDataLayout().getTypeStoreSize(Elem->getType());
   assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");
 
   OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
@@ -1078,17 +1101,16 @@ static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                          : OMPRTL_NVPTX__kmpc_shuffle_int64;
 
   // Cast all types to 32- or 64-bit values before calling shuffle routines.
-  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
-  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
-  auto *WarpSize = CGF.EmitScalarConversion(
-      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
-      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());
+  llvm::Type *CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
+  llvm::Value *ElemCast = castValueToType(CGF, Elem, CastTy, /*isSigned=*/true);
+  auto *WarpSize =
+      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
 
   auto *ShuffledVal =
       CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                           {ElemCast, Offset, WarpSize});
 
-  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
+  return castValueToType(CGF, ShuffledVal, Elem->getType(), /*isSigned=*/true);
 }
 
 namespace {
@@ -1150,10 +1172,9 @@ static void emitReductionListCopy(
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr =
           Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
-      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
-          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-      SrcElementAddr =
-          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+      SrcElementAddr = CGF.EmitLoadOfPointer(
+          SrcElementPtrAddr,
+          C.getPointerType(Private->getType())->castAs<PointerType>());
 
       // Step 1.2: Create a temporary to store the element in the destination
       // Reduce list.
@@ -1169,32 +1190,26 @@ static void emitReductionListCopy(
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr =
           Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
-      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
-          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-      SrcElementAddr =
-          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+      SrcElementAddr = CGF.EmitLoadOfPointer(
+          SrcElementPtrAddr,
+          C.getPointerType(Private->getType())->castAs<PointerType>());
 
       // Step 1.2: Get the address for dest element.  The destination
       // element has already been created on the thread's stack.
       DestElementPtrAddr =
           Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
-      llvm::Value *DestElementPtr =
-          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
-                               C.VoidPtrTy, SourceLocation());
-      Address DestElemAddr =
-          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
-      DestElementAddr = Bld.CreateElementBitCast(
-          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
+      DestElementAddr = CGF.EmitLoadOfPointer(
+          DestElementPtrAddr,
+          C.getPointerType(Private->getType())->castAs<PointerType>());
       break;
     }
     case ThreadToScratchpad: {
       // Step 1.1: Get the address for the src element in the Reduce list.
       Address SrcElementPtrAddr =
           Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
-      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
-          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-      SrcElementAddr =
-          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+      SrcElementAddr = CGF.EmitLoadOfPointer(
+          SrcElementPtrAddr,
+          C.getPointerType(Private->getType())->castAs<PointerType>());
 
       // Step 1.2: Get the address for dest element:
       // address = base + index * ElementSizeInChars.
@@ -1207,11 +1222,8 @@ static void emitReductionListCopy(
           Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
       ScratchPadElemAbsolutePtrVal =
           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
-      Address ScratchpadPtr =
-          Address(ScratchPadElemAbsolutePtrVal,
-                  C.getTypeAlignInChars(Private->getType()));
-      DestElementAddr = Bld.CreateElementBitCast(
-          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
+      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
+                                C.getTypeAlignInChars(Private->getType()));
       IncrScratchpadDest = true;
       break;
     }
@@ -1248,14 +1260,15 @@ static void emitReductionListCopy(
         SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
     llvm::Value *Elem =
         CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
-                             Private->getType(), SourceLocation());
+                             Private->getType(), Private->getExprLoc());
 
     // Now that all active lanes have read the element in the
     // Reduce list, shuffle over the value from the remote lane.
-    if (ShuffleInElement) {
-      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
-                                          RemoteLaneOffset);
-    }
+    if (ShuffleInElement)
+      Elem = createRuntimeShuffleFunction(CGF, Elem, RemoteLaneOffset);
+
+    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
+                                               SrcElementAddr.getElementType());
 
     // Store the source element value to the dest element address.
     CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
@@ -1319,27 +1332,31 @@ static void emitReductionListCopy(
 ///    local = local @ remote
 ///  else
 ///    local = remote
-static llvm::Value *
-emitReduceScratchpadFunction(CodeGenModule &CGM,
-                             ArrayRef<const Expr *> Privates,
-                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
+static llvm::Value *emitReduceScratchpadFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
   auto &C = CGM.getContext();
   auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
 
   // Destination of the copy.
-  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // Base address of the scratchpad array, with each element storing a
   // Reduce list per team.
-  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // A source index into the scratchpad array.
-  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
+  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
+                             ImplicitParamDecl::Other);
   // Row width of an element in the scratchpad array, typically
   // the number of teams.
-  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
+  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
+                             ImplicitParamDecl::Other);
   // If should_reduce == 1, then it's load AND reduce,
   // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
   // The latter case is used for initialization.
-  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);
+  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                    Int32Ty, ImplicitParamDecl::Other);
 
   FunctionArgList Args;
   Args.push_back(&ReduceListArg);
@@ -1354,10 +1371,7 @@ emitReduceScratchpadFunction(CodeGenModule &CGM,
       "_omp_reduction_load_and_reduce", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  // We don't need debug information in this function as nothing here refers to
-  // user code.
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   auto &Bld = CGF.Builder;
 
@@ -1366,29 +1380,27 @@ emitReduceScratchpadFunction(CodeGenModule &CGM,
   Address ReduceListAddr(
       Bld.CreatePointerBitCastOrAddrSpaceCast(
           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, SourceLocation()),
+                               C.VoidPtrTy, Loc),
           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
       CGF.getPointerAlign());
 
   Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
   llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
-      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
 
   Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
-  llvm::Value *IndexVal =
-      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
-                                             Int32Ty, SourceLocation()),
-                        CGM.SizeTy, /*isSigned=*/true);
+  llvm::Value *IndexVal = Bld.CreateIntCast(
+      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
+      CGM.SizeTy, /*isSigned=*/true);
 
   Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
-  llvm::Value *WidthVal =
-      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
-                                             Int32Ty, SourceLocation()),
-                        CGM.SizeTy, /*isSigned=*/true);
+  llvm::Value *WidthVal = Bld.CreateIntCast(
+      CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
+      CGM.SizeTy, /*isSigned=*/true);
 
   Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
   llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
-      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());
+      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
 
   // The absolute ptr address to the base addr of the next element to copy.
   llvm::Value *CumulativeElemBasePtr =
@@ -1421,7 +1433,8 @@ emitReduceScratchpadFunction(CodeGenModule &CGM,
       ReduceListAddr.getPointer(), CGF.VoidPtrTy);
   llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
       RemoteReduceList.getPointer(), CGF.VoidPtrTy);
-  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
+  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
+      CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
   Bld.CreateBr(MergeBB);
 
   CGF.EmitBlock(ElseBB);
@@ -1445,22 +1458,27 @@ emitReduceScratchpadFunction(CodeGenModule &CGM,
 ///
 static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                          ArrayRef<const Expr *> Privates,
-                                         QualType ReductionArrayTy) {
+                                         QualType ReductionArrayTy,
+                                         SourceLocation Loc) {
 
   auto &C = CGM.getContext();
   auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
 
   // Source of the copy.
-  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // Base address of the scratchpad array, with each element storing a
   // Reduce list per team.
-  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // A destination index into the scratchpad array, typically the team
   // identifier.
-  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
+  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
+                             ImplicitParamDecl::Other);
   // Row width of an element in the scratchpad array, typically
   // the number of teams.
-  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
+  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
+                             ImplicitParamDecl::Other);
 
   FunctionArgList Args;
   Args.push_back(&ReduceListArg);
@@ -1474,10 +1492,7 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
       "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  // We don't need debug information in this function as nothing here refers to
-  // user code.
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   auto &Bld = CGF.Builder;
 
@@ -1485,19 +1500,18 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
   Address SrcDataAddr(
       Bld.CreatePointerBitCastOrAddrSpaceCast(
           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, SourceLocation()),
+                               C.VoidPtrTy, Loc),
           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
       CGF.getPointerAlign());
 
   Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
   llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
-      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
 
   Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
-  llvm::Value *IndexVal =
-      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
-                                             Int32Ty, SourceLocation()),
-                        CGF.SizeTy, /*isSigned=*/true);
+  llvm::Value *IndexVal = Bld.CreateIntCast(
+      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
+      CGF.SizeTy, /*isSigned=*/true);
 
   Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
   llvm::Value *WidthVal =
@@ -1534,17 +1548,19 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
 ///     sync
 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                               ArrayRef<const Expr *> Privates,
-                                              QualType ReductionArrayTy) {
+                                              QualType ReductionArrayTy,
+                                              SourceLocation Loc) {
   auto &C = CGM.getContext();
   auto &M = CGM.getModule();
 
   // ReduceList: thread local Reduce list.
   // At the stage of the computation when this function is called, partially
   // aggregated values reside in the first lane of every active warp.
-  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // NumWarps: number of warps active in the parallel region.  This could
   // be smaller than 32 (max warps in a CTA) for partial block reduction.
-  ImplicitParamDecl NumWarpsArg(C,
+  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                 C.getIntTypeForBitwidth(32, /* Signed */ true),
                                 ImplicitParamDecl::Other);
   FunctionArgList Args;
@@ -1557,10 +1573,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
       "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  // We don't need debug information in this function as nothing here refers to
-  // user code.
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   auto &Bld = CGF.Builder;
 
@@ -1781,21 +1794,23 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
 ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
 ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
 ///   that the contiguity assumption still holds.
-static llvm::Value *
-emitShuffleAndReduceFunction(CodeGenModule &CGM,
-                             ArrayRef<const Expr *> Privates,
-                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
+static llvm::Value *emitShuffleAndReduceFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
   auto &C = CGM.getContext();
 
   // Thread local Reduce list used to host the values of data to be reduced.
-  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
   // Current lane id; could be logical.
-  ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
+                              ImplicitParamDecl::Other);
   // Offset of the remote source lane relative to the current lane.
-  ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
-                                        ImplicitParamDecl::Other);
+  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                        C.ShortTy, ImplicitParamDecl::Other);
   // Algorithm version.  This is expected to be known at compile time.
-  ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                               C.ShortTy, ImplicitParamDecl::Other);
   FunctionArgList Args;
   Args.push_back(&ReduceListArg);
   Args.push_back(&LaneIDArg);
@@ -1808,10 +1823,7 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
       "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
-  // We don't need debug information in this function as nothing here refers to
-  // user code.
-  CGF.disableDebugInfo();
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   auto &Bld = CGF.Builder;
 
@@ -1898,7 +1910,8 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
       LocalReduceList.getPointer(), CGF.VoidPtrTy);
   llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
       RemoteReduceList.getPointer(), CGF.VoidPtrTy);
-  CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
+  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
+      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
   Bld.CreateBr(MergeBB);
 
   CGF.EmitBlock(ElseBB);
@@ -2228,8 +2241,8 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
 
   // 2. Emit reduce_func().
   auto *ReductionFn = emitReductionFunction(
-      CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
-      LHSExprs, RHSExprs, ReductionOps);
+      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
+      Privates, LHSExprs, RHSExprs, ReductionOps);
 
   // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
   // RedList, shuffle_reduce_func, interwarp_copy_func);
@@ -2239,9 +2252,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
       ReductionList.getPointer(), CGF.VoidPtrTy);
 
   auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
-      CGM, Privates, ReductionArrayTy, ReductionFn);
+      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
   auto *InterWarpCopyFn =
-      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
+      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
 
   llvm::Value *Res = nullptr;
   if (ParallelReduction) {
@@ -2259,9 +2272,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
 
   if (TeamsReduction) {
     auto *ScratchPadCopyFn =
-        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
+        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
     auto *LoadAndReduceFn = emitReduceScratchpadFunction(
-        CGM, Privates, ReductionArrayTy, ReductionFn);
+        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
 
     llvm::Value *Args[] = {ThreadId,
                            CGF.Builder.getInt32(RHSExprs.size()),
@@ -2412,7 +2425,7 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
 llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
   ASTContext &Ctx = CGM.getContext();
-  const auto &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
+  const CapturedStmt &CS = *D.getCapturedStmt(OMPD_parallel);
 
   // Create a function that takes as argument the source thread.
   FunctionArgList WrapperArgs;
@@ -2422,10 +2435,15 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
   QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
   QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
-  ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other);
-  ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other);
-  ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy,
-      ImplicitParamDecl::Other);
+  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
+                                     /*Id=*/nullptr, Int16QTy,
+                                     ImplicitParamDecl::Other);
+  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
+                               /*Id=*/nullptr, Int32QTy,
+                               ImplicitParamDecl::Other);
+  ImplicitParamDecl SharedArgsList(Ctx, /*DC=*/nullptr, D.getLocStart(),
+                                   /*Id=*/nullptr, VoidPtrPtrQTy,
+                                   ImplicitParamDecl::Other);
   WrapperArgs.emplace_back(&ParallelLevelArg);
   WrapperArgs.emplace_back(&WrapperArg);
   WrapperArgs.emplace_back(&SharedArgsList);
@@ -2440,7 +2458,8 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
 
   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
-  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
+                    D.getLocStart(), D.getLocStart());
 
   const auto *RD = CS.getCapturedRecordDecl();
   auto CurField = RD->field_begin();
@@ -2489,7 +2508,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
     Args.emplace_back(Arg);
   }
 
-  emitCall(CGF, OutlinedParallelFn, Args);
+  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
   CGF.FinishFunction();
   return Fn;
 }
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 5d13408318a5..6ee2b7afdfe0 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -36,8 +36,9 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
   public:
     llvm::Function *WorkerFn;
     const CGFunctionInfo *CGFI;
+    SourceLocation Loc;
 
-    WorkerFunctionState(CodeGenModule &CGM);
+    WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);
 
   private:
     void createWorkerFunction(CodeGenModule &CGM);
diff --git a/lib/CodeGen/CGStmt.cpp b/lib/CodeGen/CGStmt.cpp
index 97e35e2b6e8d..12d136b0b9b2 100644
--- a/lib/CodeGen/CGStmt.cpp
+++ b/lib/CodeGen/CGStmt.cpp
@@ -1005,7 +1005,9 @@ void CodeGenFunction::EmitReturnOfRValue(RValue RV, QualType Ty) {
   if (RV.isScalar()) {
     Builder.CreateStore(RV.getScalarVal(), ReturnValue);
   } else if (RV.isAggregate()) {
-    EmitAggregateCopy(ReturnValue, RV.getAggregateAddress(), Ty);
+    LValue Dest = MakeAddrLValue(ReturnValue, Ty);
+    LValue Src = MakeAddrLValue(RV.getAggregateAddress(), Ty);
+    EmitAggregateCopy(Dest, Src, Ty);
   } else {
     EmitStoreOfComplex(RV.getComplexVal(), MakeAddrLValue(ReturnValue, Ty),
                        /*init*/ true);
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index 26817ac330ba..fb919d7b0283 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -53,34 +53,35 @@ class OMPLexicalScope : public CodeGenFunction::LexicalScope {
   }
 
 public:
-  OMPLexicalScope(CodeGenFunction &CGF, const OMPExecutableDirective &S,
-                  bool AsInlined = false, bool EmitPreInitStmt = true)
+  OMPLexicalScope(
+      CodeGenFunction &CGF, const OMPExecutableDirective &S,
+      const llvm::Optional<OpenMPDirectiveKind> CapturedRegion = llvm::None,
+      const bool EmitPreInitStmt = true)
       : CodeGenFunction::LexicalScope(CGF, S.getSourceRange()),
         InlinedShareds(CGF) {
     if (EmitPreInitStmt)
       emitPreInitStmt(CGF, S);
-    if (AsInlined) {
-      if (S.hasAssociatedStmt()) {
-        auto *CS = cast<CapturedStmt>(S.getAssociatedStmt());
-        for (auto &C : CS->captures()) {
-          if (C.capturesVariable() || C.capturesVariableByCopy()) {
-            auto *VD = C.getCapturedVar();
-            assert(VD == VD->getCanonicalDecl() &&
-                        "Canonical decl must be captured.");
-            DeclRefExpr DRE(const_cast<VarDecl *>(VD),
-                            isCapturedVar(CGF, VD) ||
-                                (CGF.CapturedStmtInfo &&
-                                 InlinedShareds.isGlobalVarCaptured(VD)),
-                            VD->getType().getNonReferenceType(), VK_LValue,
-                            SourceLocation());
-            InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address {
-              return CGF.EmitLValue(&DRE).getAddress();
-            });
-          }
-        }
-        (void)InlinedShareds.Privatize();
+    if (!CapturedRegion.hasValue())
+      return;
+    assert(S.hasAssociatedStmt() &&
+           "Expected associated statement for inlined directive.");
+    const CapturedStmt *CS = S.getCapturedStmt(*CapturedRegion);
+    for (auto &C : CS->captures()) {
+      if (C.capturesVariable() || C.capturesVariableByCopy()) {
+        auto *VD = C.getCapturedVar();
+        assert(VD == VD->getCanonicalDecl() &&
+               "Canonical decl must be captured.");
+        DeclRefExpr DRE(
+            const_cast<VarDecl *>(VD),
+            isCapturedVar(CGF, VD) || (CGF.CapturedStmtInfo &&
+                                       InlinedShareds.isGlobalVarCaptured(VD)),
+            VD->getType().getNonReferenceType(), VK_LValue, C.getLocation());
+        InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address {
+          return CGF.EmitLValue(&DRE).getAddress();
+        });
       }
     }
+    (void)InlinedShareds.Privatize();
   }
 };
 
@@ -96,9 +97,8 @@ class OMPParallelScope final : public OMPLexicalScope {
 
 public:
   OMPParallelScope(CodeGenFunction &CGF, const OMPExecutableDirective &S)
-      : OMPLexicalScope(CGF, S,
-                        /*AsInlined=*/false,
-                        /*EmitPreInitStmt=*/EmitPreInitStmt(S)) {}
+      : OMPLexicalScope(CGF, S, /*CapturedRegion=*/llvm::None,
+                        EmitPreInitStmt(S)) {}
 };
 
 /// Lexical scope for OpenMP teams construct, that handles correct codegen
@@ -112,9 +112,8 @@ class OMPTeamsScope final : public OMPLexicalScope {
 
 public:
   OMPTeamsScope(CodeGenFunction &CGF, const OMPExecutableDirective &S)
-      : OMPLexicalScope(CGF, S,
-                        /*AsInlined=*/false,
-                        /*EmitPreInitStmt=*/EmitPreInitStmt(S)) {}
+      : OMPLexicalScope(CGF, S, /*CapturedRegion=*/llvm::None,
+                        EmitPreInitStmt(S)) {}
 };
 
 /// Private scope for OpenMP loop-based directives, that supports capturing
@@ -197,7 +196,7 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
                               (CGF.CapturedStmtInfo &&
                                InlinedShareds.isGlobalVarCaptured(VD)),
                           VD->getType().getNonReferenceType(), VK_LValue,
-                          SourceLocation());
+                          C.getLocation());
           InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address {
             return CGF.EmitLValue(&DRE).getAddress();
           });
@@ -266,8 +265,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
     } else if (CurCap->capturesThis())
       CapturedVars.push_back(CXXThisValue);
     else if (CurCap->capturesVariableByCopy()) {
-      llvm::Value *CV =
-          EmitLoadOfLValue(EmitLValue(*I), SourceLocation()).getScalarVal();
+      llvm::Value *CV = EmitLoadOfScalar(EmitLValue(*I), CurCap->getLocation());
 
       // If the field is not a pointer, we need to save the actual value
       // and load it as a void pointer.
@@ -280,7 +278,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
 
         auto *SrcAddrVal = EmitScalarConversion(
             DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()),
-            Ctx.getPointerType(CurField->getType()), SourceLocation());
+            Ctx.getPointerType(CurField->getType()), CurCap->getLocation());
         LValue SrcLV =
             MakeNaturalAlignAddrLValue(SrcAddrVal, CurField->getType());
 
@@ -288,7 +286,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
         EmitStoreThroughLValue(RValue::get(CV), SrcLV);
 
         // Load the value using the destination type pointer.
-        CV = EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal();
+        CV = EmitLoadOfScalar(DstLV, CurCap->getLocation());
       }
       CapturedVars.push_back(CV);
     } else {
@@ -298,14 +296,15 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
   }
 }
 
-static Address castValueFromUintptr(CodeGenFunction &CGF, QualType DstType,
-                                    StringRef Name, LValue AddrLV,
+static Address castValueFromUintptr(CodeGenFunction &CGF, SourceLocation Loc,
+                                    QualType DstType, StringRef Name,
+                                    LValue AddrLV,
                                     bool isReferenceType = false) {
   ASTContext &Ctx = CGF.getContext();
 
-  auto *CastedPtr = CGF.EmitScalarConversion(
-      AddrLV.getAddress().getPointer(), Ctx.getUIntPtrType(),
-      Ctx.getPointerType(DstType), SourceLocation());
+  auto *CastedPtr = CGF.EmitScalarConversion(AddrLV.getAddress().getPointer(),
+                                             Ctx.getUIntPtrType(),
+                                             Ctx.getPointerType(DstType), Loc);
   auto TmpAddr =
       CGF.MakeNaturalAlignAddrLValue(CastedPtr, Ctx.getPointerType(DstType))
           .getAddress();
@@ -496,13 +495,12 @@ static llvm::Function *emitOutlinedFunctionPrologue(
                                         AlignmentSource::Decl);
     if (FD->hasCapturedVLAType()) {
       if (FO.UIntPtrCastRequired) {
-        ArgLVal = CGF.MakeAddrLValue(castValueFromUintptr(CGF, FD->getType(),
-                                                          Args[Cnt]->getName(),
-                                                          ArgLVal),
-                                     FD->getType(), AlignmentSource::Decl);
+        ArgLVal = CGF.MakeAddrLValue(
+            castValueFromUintptr(CGF, I->getLocation(), FD->getType(),
+                                 Args[Cnt]->getName(), ArgLVal),
+            FD->getType(), AlignmentSource::Decl);
       }
-      auto *ExprArg =
-          CGF.EmitLoadOfLValue(ArgLVal, SourceLocation()).getScalarVal();
+      auto *ExprArg = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
       auto VAT = FD->getCapturedVLAType();
       VLASizes.insert({Args[Cnt], {VAT->getSizeExpr(), ExprArg}});
     } else if (I->capturesVariable()) {
@@ -530,16 +528,15 @@ static llvm::Function *emitOutlinedFunctionPrologue(
       QualType VarTy = Var->getType();
       LocalAddrs.insert(
           {Args[Cnt],
-           {Var,
-            FO.UIntPtrCastRequired
-                ? castValueFromUintptr(CGF, FD->getType(), Args[Cnt]->getName(),
-                                       ArgLVal, VarTy->isReferenceType())
-                : ArgLVal.getAddress()}});
+           {Var, FO.UIntPtrCastRequired
+                     ? castValueFromUintptr(CGF, I->getLocation(),
+                                            FD->getType(), Args[Cnt]->getName(),
+                                            ArgLVal, VarTy->isReferenceType())
+                     : ArgLVal.getAddress()}});
     } else {
       // If 'this' is captured, load it into CXXThisValue.
       assert(I->capturesThis());
-      CXXThisValue = CGF.EmitLoadOfLValue(ArgLVal, Args[Cnt]->getLocation())
-                         .getScalarVal();
+      CXXThisValue = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
       LocalAddrs.insert({Args[Cnt], {nullptr, ArgLVal.getAddress()}});
     }
     ++Cnt;
@@ -604,7 +601,7 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) {
           I->second.second,
           I->second.first ? I->second.first->getType() : Arg->getType(),
           AlignmentSource::Decl);
-      CallArg = WrapperCGF.EmitLoadOfScalar(LV, SourceLocation());
+      CallArg = WrapperCGF.EmitLoadOfScalar(LV, S.getLocStart());
     } else {
       auto EI = VLASizes.find(Arg);
       if (EI != VLASizes.end())
@@ -613,7 +610,7 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) {
         LValue LV = WrapperCGF.MakeAddrLValue(WrapperCGF.GetAddrOfLocalVar(Arg),
                                               Arg->getType(),
                                               AlignmentSource::Decl);
-        CallArg = WrapperCGF.EmitLoadOfScalar(LV, SourceLocation());
+        CallArg = WrapperCGF.EmitLoadOfScalar(LV, S.getLocStart());
       }
     }
     CallArgs.emplace_back(WrapperCGF.EmitFromMemory(CallArg, Arg->getType()));
@@ -695,7 +692,9 @@ void CodeGenFunction::EmitOMPCopy(QualType OriginalType, Address DestAddr,
     auto *BO = dyn_cast<BinaryOperator>(Copy);
     if (BO && BO->getOpcode() == BO_Assign) {
       // Perform simple memcpy for simple copying.
-      EmitAggregateAssign(DestAddr, SrcAddr, OriginalType);
+      LValue Dest = MakeAddrLValue(DestAddr, OriginalType);
+      LValue Src = MakeAddrLValue(SrcAddr, OriginalType);
+      EmitAggregateAssign(Dest, Src, OriginalType);
     } else {
       // For arrays with complex element types perform element by element
       // copying.
@@ -738,7 +737,12 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
           cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl());
   }
   llvm::DenseSet<const VarDecl *> EmittedAsFirstprivate;
-  CGCapturedStmtInfo CapturesInfo(cast<CapturedStmt>(*D.getAssociatedStmt()));
+  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
+  getOpenMPCaptureRegions(CaptureRegions, D.getDirectiveKind());
+  // Force emission of the firstprivate copy if the directive does not emit
+  // outlined function, like omp for, omp simd, omp distribute etc.
+  bool MustEmitFirstprivateCopy =
+      CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown;
   for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>()) {
     auto IRef = C->varlist_begin();
     auto InitsRef = C->inits().begin();
@@ -746,9 +750,8 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
       auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(*IRef)->getDecl());
       bool ThisFirstprivateIsLastprivate =
           Lastprivates.count(OrigVD->getCanonicalDecl()) > 0;
-      auto *CapFD = CapturesInfo.lookup(OrigVD);
       auto *FD = CapturedStmtInfo->lookup(OrigVD);
-      if (!ThisFirstprivateIsLastprivate && FD && (FD == CapFD) &&
+      if (!MustEmitFirstprivateCopy && !ThisFirstprivateIsLastprivate && FD &&
           !FD->getType()->isReferenceType()) {
         EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl());
         ++IRef;
@@ -764,7 +767,8 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
         DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD),
                         /*RefersToEnclosingVariableOrCapture=*/FD != nullptr,
                         (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc());
-        Address OriginalAddr = EmitLValue(&DRE).getAddress();
+        LValue OriginalLVal = EmitLValue(&DRE);
+        Address OriginalAddr = OriginalLVal.getAddress();
         QualType Type = VD->getType();
         if (Type->isArrayType()) {
           // Emit VarDecl with copy init for arrays.
@@ -775,8 +779,9 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
             auto *Init = VD->getInit();
             if (!isa<CXXConstructExpr>(Init) || isTrivialInitializer(Init)) {
               // Perform simple memcpy.
-              EmitAggregateAssign(Emission.getAllocatedAddress(), OriginalAddr,
-                                  Type);
+              LValue Dest = MakeAddrLValue(Emission.getAllocatedAddress(),
+                                           Type);
+              EmitAggregateAssign(Dest, OriginalLVal, Type);
             } else {
               EmitOMPAggregateAssign(
                   Emission.getAllocatedAddress(), OriginalAddr, Type,
@@ -1272,7 +1277,7 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
     CGF.EmitOMPPrivateClause(S, PrivateScope);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getCapturedStmt(OMPD_parallel)->getCapturedStmt());
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen,
@@ -1734,7 +1739,7 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     emitOMPSimdRegion(CGF, S, Action);
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
 }
 
@@ -2029,14 +2034,18 @@ emitDistributeParallelForInnerBounds(CodeGenFunction &CGF,
   // the current ones.
   LValue PrevLB = CGF.EmitLValue(LS.getPrevLowerBoundVariable());
   LValue PrevUB = CGF.EmitLValue(LS.getPrevUpperBoundVariable());
-  llvm::Value *PrevLBVal = CGF.EmitLoadOfScalar(PrevLB, SourceLocation());
+  llvm::Value *PrevLBVal = CGF.EmitLoadOfScalar(
+      PrevLB, LS.getPrevLowerBoundVariable()->getExprLoc());
   PrevLBVal = CGF.EmitScalarConversion(
       PrevLBVal, LS.getPrevLowerBoundVariable()->getType(),
-      LS.getIterationVariable()->getType(), SourceLocation());
-  llvm::Value *PrevUBVal = CGF.EmitLoadOfScalar(PrevUB, SourceLocation());
+      LS.getIterationVariable()->getType(),
+      LS.getPrevLowerBoundVariable()->getExprLoc());
+  llvm::Value *PrevUBVal = CGF.EmitLoadOfScalar(
+      PrevUB, LS.getPrevUpperBoundVariable()->getExprLoc());
   PrevUBVal = CGF.EmitScalarConversion(
       PrevUBVal, LS.getPrevUpperBoundVariable()->getType(),
-      LS.getIterationVariable()->getType(), SourceLocation());
+      LS.getIterationVariable()->getType(),
+      LS.getPrevUpperBoundVariable()->getExprLoc());
 
   CGF.EmitStoreOfScalar(PrevLBVal, LB);
   CGF.EmitStoreOfScalar(PrevUBVal, UB);
@@ -2062,10 +2071,10 @@ emitDistributeParallelForDispatchBounds(CodeGenFunction &CGF,
   // is not normalized as each team only executes its own assigned
   // distribute chunk
   QualType IteratorTy = IVExpr->getType();
-  llvm::Value *LBVal = CGF.EmitLoadOfScalar(LB, /*Volatile=*/false, IteratorTy,
-                                            SourceLocation());
-  llvm::Value *UBVal = CGF.EmitLoadOfScalar(UB, /*Volatile=*/false, IteratorTy,
-                                            SourceLocation());
+  llvm::Value *LBVal =
+      CGF.EmitLoadOfScalar(LB, /*Volatile=*/false, IteratorTy, S.getLocStart());
+  llvm::Value *UBVal =
+      CGF.EmitLoadOfScalar(UB, /*Volatile=*/false, IteratorTy, S.getLocStart());
   return {LBVal, UBVal};
 }
 
@@ -2122,7 +2131,7 @@ void CodeGenFunction::EmitOMPDistributeParallelForDirective(
     CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
                               S.getDistInc());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_parallel);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen);
 }
 
@@ -2132,7 +2141,7 @@ void CodeGenFunction::EmitOMPDistributeParallelForSimdDirective(
     CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
                               S.getDistInc());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_parallel);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen);
 }
 
@@ -2141,7 +2150,7 @@ void CodeGenFunction::EmitOMPDistributeSimdDirective(
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitOMPDistributeLoop(S, emitOMPLoopBodyWithStopPoint, S.getInc());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
 }
 
@@ -2167,28 +2176,6 @@ void CodeGenFunction::EmitOMPTargetSimdDirective(
   emitCommonOMPTargetDirective(*this, S, CodeGen);
 }
 
-void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForDirective(
-    const OMPTargetTeamsDistributeParallelForDirective &S) {
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
-  CGM.getOpenMPRuntime().emitInlinedDirective(
-      *this, OMPD_target_teams_distribute_parallel_for,
-      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-        CGF.EmitStmt(
-            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
-      });
-}
-
-void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForSimdDirective(
-    const OMPTargetTeamsDistributeParallelForSimdDirective &S) {
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
-  CGM.getOpenMPRuntime().emitInlinedDirective(
-      *this, OMPD_target_teams_distribute_parallel_for_simd,
-      [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-        CGF.EmitStmt(
-            cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
-      });
-}
-
 namespace {
   struct ScheduleKindModifiersTy {
     OpenMPScheduleClauseKind Kind;
@@ -2425,7 +2412,7 @@ void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
                                                  emitDispatchForLoopBounds);
   };
   {
-    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    OMPLexicalScope Scope(*this, S, OMPD_unknown);
     CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_for, CodeGen,
                                                 S.hasCancel());
   }
@@ -2445,7 +2432,7 @@ void CodeGenFunction::EmitOMPForSimdDirective(const OMPForSimdDirective &S) {
                                                  emitDispatchForLoopBounds);
   };
   {
-    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    OMPLexicalScope Scope(*this, S, OMPD_unknown);
     CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, CodeGen);
   }
 
@@ -2465,8 +2452,8 @@ static LValue createSectionLVal(CodeGenFunction &CGF, QualType Ty,
 }
 
 void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
-  auto *Stmt = cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt();
-  auto *CS = dyn_cast<CompoundStmt>(Stmt);
+  const Stmt *Stmt = S.getInnermostCapturedStmt()->getCapturedStmt();
+  const auto *CS = dyn_cast<CompoundStmt>(Stmt);
   bool HasLastprivates = false;
   auto &&CodeGen = [&S, Stmt, CS, &HasLastprivates](CodeGenFunction &CGF,
                                                     PrePostActionTy &) {
@@ -2494,7 +2481,7 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
                         OK_Ordinary, S.getLocStart(), FPOptions());
     // Increment for loop counter.
     UnaryOperator Inc(&IVRefExpr, UO_PreInc, KmpInt32Ty, VK_RValue, OK_Ordinary,
-                      S.getLocStart());
+                      S.getLocStart(), true);
     auto BodyGen = [Stmt, CS, &S, &IV](CodeGenFunction &CGF) {
       // Iterate through all sections and emit a switch construct:
       // switch (IV) {
@@ -2508,9 +2495,9 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
       // }
       // .omp.sections.exit:
       auto *ExitBB = CGF.createBasicBlock(".omp.sections.exit");
-      auto *SwitchStmt = CGF.Builder.CreateSwitch(
-          CGF.EmitLoadOfLValue(IV, S.getLocStart()).getScalarVal(), ExitBB,
-          CS == nullptr ? 1 : CS->size());
+      auto *SwitchStmt =
+          CGF.Builder.CreateSwitch(CGF.EmitLoadOfScalar(IV, S.getLocStart()),
+                                   ExitBB, CS == nullptr ? 1 : CS->size());
       if (CS) {
         unsigned CaseNumber = 0;
         for (auto *SubStmt : CS->children()) {
@@ -2606,7 +2593,7 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) {
 
 void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) {
   {
-    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    OMPLexicalScope Scope(*this, S, OMPD_unknown);
     EmitSections(S);
   }
   // Emit an implicit barrier at the end.
@@ -2618,9 +2605,9 @@ void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) {
 
 void CodeGenFunction::EmitOMPSectionDirective(const OMPSectionDirective &S) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_section, CodeGen,
                                               S.hasCancel());
 }
@@ -2649,10 +2636,10 @@ void CodeGenFunction::EmitOMPSingleDirective(const OMPSingleDirective &S) {
     (void)CGF.EmitOMPFirstprivateClause(S, SingleScope);
     CGF.EmitOMPPrivateClause(S, SingleScope);
     (void)SingleScope.Privatize();
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
   };
   {
-    OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+    OMPLexicalScope Scope(*this, S, OMPD_unknown);
     CGM.getOpenMPRuntime().emitSingleRegion(*this, CodeGen, S.getLocStart(),
                                             CopyprivateVars, DestExprs,
                                             SrcExprs, AssignmentOps);
@@ -2669,21 +2656,21 @@ void CodeGenFunction::EmitOMPSingleDirective(const OMPSingleDirective &S) {
 void CodeGenFunction::EmitOMPMasterDirective(const OMPMasterDirective &S) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     Action.Enter(CGF);
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getLocStart());
 }
 
 void CodeGenFunction::EmitOMPCriticalDirective(const OMPCriticalDirective &S) {
   auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     Action.Enter(CGF);
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
   };
   Expr *Hint = nullptr;
   if (auto *HintClause = S.getSingleClause<OMPHintClause>())
     Hint = HintClause->getHint();
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitCriticalRegion(*this,
                                             S.getDirectiveName().getAsString(),
                                             CodeGen, S.getLocStart(), Hint);
@@ -2725,12 +2712,12 @@ void CodeGenFunction::EmitOMPParallelSectionsDirective(
                                  emitEmptyBoundParameters);
 }
 
-void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
-                                                const RegionCodeGenTy &BodyGen,
-                                                const TaskGenTy &TaskGen,
-                                                OMPTaskDataTy &Data) {
+void CodeGenFunction::EmitOMPTaskBasedDirective(
+    const OMPExecutableDirective &S, const OpenMPDirectiveKind CapturedRegion,
+    const RegionCodeGenTy &BodyGen, const TaskGenTy &TaskGen,
+    OMPTaskDataTy &Data) {
   // Emit outlined function for task construct.
-  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
+  const CapturedStmt *CS = S.getCapturedStmt(CapturedRegion);
   auto *I = CS->getCapturedDecl()->param_begin();
   auto *PartId = std::next(I);
   auto *TaskT = std::next(I, 4);
@@ -2831,8 +2818,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
   for (const auto *C : S.getClausesOfKind<OMPDependClause>())
     for (auto *IRef : C->varlists())
       Data.Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef));
-  auto &&CodeGen = [&Data, &S, CS, &BodyGen, &LastprivateDstsOrigs](
-      CodeGenFunction &CGF, PrePostActionTy &Action) {
+  auto &&CodeGen = [&Data, &S, CS, &BodyGen, &LastprivateDstsOrigs,
+                    CapturedRegion](CodeGenFunction &CGF,
+                                    PrePostActionTy &Action) {
     // Set proper addresses for generated private copies.
     OMPPrivateScope Scope(CGF);
     if (!Data.PrivateVars.empty() || !Data.FirstprivateVars.empty() ||
@@ -2889,7 +2877,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
       }
     }
     if (Data.Reductions) {
-      OMPLexicalScope LexScope(CGF, S, /*AsInlined=*/true);
+      OMPLexicalScope LexScope(CGF, S, CapturedRegion);
       ReductionCodeGen RedCG(Data.ReductionVars, Data.ReductionCopies,
                              Data.ReductionOps);
       llvm::Value *ReductionsPtr = CGF.Builder.CreateLoad(
@@ -2904,7 +2892,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
                         Replacement.getPointer(), CGF.getContext().VoidPtrTy,
                         CGF.getContext().getPointerType(
                             Data.ReductionCopies[Cnt]->getType()),
-                        SourceLocation()),
+                        Data.ReductionCopies[Cnt]->getExprLoc()),
                     Replacement.getAlignment());
         Replacement = RedCG.adjustPrivateAddress(CGF, Cnt, Replacement);
         Scope.addPrivate(RedCG.getBaseDecl(Cnt),
@@ -2946,15 +2934,16 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
         RedCG.emitAggregateType(CGF, Cnt);
         // The taskgroup descriptor variable is always implicit firstprivate and
         // privatized already during procoessing of the firstprivates.
-        llvm::Value *ReductionsPtr = CGF.EmitLoadOfScalar(
-            CGF.EmitLValue(TaskgroupDescriptors[Cnt]), SourceLocation());
+        llvm::Value *ReductionsPtr =
+            CGF.EmitLoadOfScalar(CGF.EmitLValue(TaskgroupDescriptors[Cnt]),
+                                 TaskgroupDescriptors[Cnt]->getExprLoc());
         Address Replacement = CGF.CGM.getOpenMPRuntime().getTaskReductionItem(
             CGF, S.getLocStart(), ReductionsPtr, RedCG.getSharedLValue(Cnt));
         Replacement = Address(
             CGF.EmitScalarConversion(
                 Replacement.getPointer(), CGF.getContext().VoidPtrTy,
                 CGF.getContext().getPointerType(InRedPrivs[Cnt]->getType()),
-                SourceLocation()),
+                InRedPrivs[Cnt]->getExprLoc()),
             Replacement.getAlignment());
         Replacement = RedCG.adjustPrivateAddress(CGF, Cnt, Replacement);
         InRedScope.addPrivate(RedCG.getBaseDecl(Cnt),
@@ -2980,27 +2969,24 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
 
 static ImplicitParamDecl *
 createImplicitFirstprivateForType(ASTContext &C, OMPTaskDataTy &Data,
-                                  QualType Ty, CapturedDecl *CD) {
-  auto *OrigVD = ImplicitParamDecl::Create(
-      C, CD, SourceLocation(), /*Id=*/nullptr, Ty, ImplicitParamDecl::Other);
-  auto *OrigRef =
-      DeclRefExpr::Create(C, NestedNameSpecifierLoc(), SourceLocation(), OrigVD,
-                          /*RefersToEnclosingVariableOrCapture=*/false,
-                          SourceLocation(), Ty, VK_LValue);
-  auto *PrivateVD = ImplicitParamDecl::Create(
-      C, CD, SourceLocation(), /*Id=*/nullptr, Ty, ImplicitParamDecl::Other);
+                                  QualType Ty, CapturedDecl *CD,
+                                  SourceLocation Loc) {
+  auto *OrigVD = ImplicitParamDecl::Create(C, CD, Loc, /*Id=*/nullptr, Ty,
+                                           ImplicitParamDecl::Other);
+  auto *OrigRef = DeclRefExpr::Create(
+      C, NestedNameSpecifierLoc(), SourceLocation(), OrigVD,
+      /*RefersToEnclosingVariableOrCapture=*/false, Loc, Ty, VK_LValue);
+  auto *PrivateVD = ImplicitParamDecl::Create(C, CD, Loc, /*Id=*/nullptr, Ty,
+                                              ImplicitParamDecl::Other);
   auto *PrivateRef = DeclRefExpr::Create(
       C, NestedNameSpecifierLoc(), SourceLocation(), PrivateVD,
-      /*RefersToEnclosingVariableOrCapture=*/false, SourceLocation(), Ty,
-      VK_LValue);
+      /*RefersToEnclosingVariableOrCapture=*/false, Loc, Ty, VK_LValue);
   QualType ElemType = C.getBaseElementType(Ty);
-  auto *InitVD =
-      ImplicitParamDecl::Create(C, CD, SourceLocation(), /*Id=*/nullptr,
-                                ElemType, ImplicitParamDecl::Other);
-  auto *InitRef =
-      DeclRefExpr::Create(C, NestedNameSpecifierLoc(), SourceLocation(), InitVD,
-                          /*RefersToEnclosingVariableOrCapture=*/false,
-                          SourceLocation(), ElemType, VK_LValue);
+  auto *InitVD = ImplicitParamDecl::Create(C, CD, Loc, /*Id=*/nullptr, ElemType,
+                                           ImplicitParamDecl::Other);
+  auto *InitRef = DeclRefExpr::Create(
+      C, NestedNameSpecifierLoc(), SourceLocation(), InitVD,
+      /*RefersToEnclosingVariableOrCapture=*/false, Loc, ElemType, VK_LValue);
   PrivateVD->setInitStyle(VarDecl::CInit);
   PrivateVD->setInit(ImplicitCastExpr::Create(C, ElemType, CK_LValueToRValue,
                                               InitRef, /*BasePath=*/nullptr,
@@ -3047,14 +3033,15 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     QualType BaseAndPointersType = getContext().getConstantArrayType(
         getContext().VoidPtrTy, ArrSize, ArrayType::Normal,
         /*IndexTypeQuals=*/0);
-    BPVD = createImplicitFirstprivateForType(getContext(), Data,
-                                             BaseAndPointersType, CD);
-    PVD = createImplicitFirstprivateForType(getContext(), Data,
-                                            BaseAndPointersType, CD);
+    BPVD = createImplicitFirstprivateForType(
+        getContext(), Data, BaseAndPointersType, CD, S.getLocStart());
+    PVD = createImplicitFirstprivateForType(
+        getContext(), Data, BaseAndPointersType, CD, S.getLocStart());
     QualType SizesType = getContext().getConstantArrayType(
         getContext().getSizeType(), ArrSize, ArrayType::Normal,
         /*IndexTypeQuals=*/0);
-    SVD = createImplicitFirstprivateForType(getContext(), Data, SizesType, CD);
+    SVD = createImplicitFirstprivateForType(getContext(), Data, SizesType, CD,
+                                            S.getLocStart());
     TargetScope.addPrivate(
         BPVD, [&InputInfo]() { return InputInfo.BasePointersArray; });
     TargetScope.addPrivate(PVD,
@@ -3099,16 +3086,17 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
     }
     // Privatize all private variables except for in_reduction items.
     (void)Scope.Privatize();
-    InputInfo.BasePointersArray = CGF.Builder.CreateConstArrayGEP(
-        CGF.GetAddrOfLocalVar(BPVD), /*Index=*/0, CGF.getPointerSize());
-    InputInfo.PointersArray = CGF.Builder.CreateConstArrayGEP(
-        CGF.GetAddrOfLocalVar(PVD), /*Index=*/0, CGF.getPointerSize());
-    InputInfo.SizesArray = CGF.Builder.CreateConstArrayGEP(
-        CGF.GetAddrOfLocalVar(SVD), /*Index=*/0, CGF.getSizeSize());
+    if (InputInfo.NumberOfTargetItems > 0) {
+      InputInfo.BasePointersArray = CGF.Builder.CreateConstArrayGEP(
+          CGF.GetAddrOfLocalVar(BPVD), /*Index=*/0, CGF.getPointerSize());
+      InputInfo.PointersArray = CGF.Builder.CreateConstArrayGEP(
+          CGF.GetAddrOfLocalVar(PVD), /*Index=*/0, CGF.getPointerSize());
+      InputInfo.SizesArray = CGF.Builder.CreateConstArrayGEP(
+          CGF.GetAddrOfLocalVar(SVD), /*Index=*/0, CGF.getSizeSize());
+    }
 
     Action.Enter(CGF);
-    OMPLexicalScope LexScope(CGF, S, /*AsInlined=*/true,
-                             /*EmitPreInitStmt=*/false);
+    OMPLexicalScope LexScope(CGF, S, OMPD_task, /*EmitPreInitStmt=*/false);
     BodyGen(CGF);
   };
   auto *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
@@ -3125,7 +3113,7 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
 
 void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
   // Emit outlined function for task construct.
-  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
+  const CapturedStmt *CS = S.getCapturedStmt(OMPD_task);
   auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
   auto SharedsTy = getContext().getRecordType(CS->getCapturedRecordDecl());
   const Expr *IfCond = nullptr;
@@ -3150,7 +3138,7 @@ void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
                                             SharedsTy, CapturedStruct, IfCond,
                                             Data);
   };
-  EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data);
+  EmitOMPTaskBasedDirective(S, OMPD_task, BodyGen, TaskGen, Data);
 }
 
 void CodeGenFunction::EmitOMPTaskyieldDirective(
@@ -3199,9 +3187,9 @@ void CodeGenFunction::EmitOMPTaskgroupDirective(
       CGF.EmitStoreOfScalar(ReductionDesc, CGF.GetAddrOfLocalVar(VD),
                             /*Volatile=*/false, E->getType());
     }
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitTaskgroupRegion(*this, CodeGen, S.getLocStart());
 }
 
@@ -3409,7 +3397,7 @@ void CodeGenFunction::EmitOMPDistributeDirective(
 
     CGF.EmitOMPDistributeLoop(S, emitOMPLoopBodyWithStopPoint, S.getInc());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_distribute, CodeGen);
 }
 
@@ -3424,7 +3412,9 @@ static llvm::Function *emitOutlinedOrderedFunction(CodeGenModule &CGM,
 }
 
 void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) {
-  if (!S.getAssociatedStmt()) {
+  if (S.hasClausesOfKind<OMPDependClause>()) {
+    assert(!S.getAssociatedStmt() &&
+           "No associated statement must be in ordered depend construct.");
     for (const auto *DC : S.getClausesOfKind<OMPDependClause>())
       CGM.getOpenMPRuntime().emitDoacrossOrdered(*this, DC);
     return;
@@ -3432,8 +3422,8 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) {
   auto *C = S.getSingleClause<OMPSIMDClause>();
   auto &&CodeGen = [&S, C, this](CodeGenFunction &CGF,
                                  PrePostActionTy &Action) {
+    const CapturedStmt *CS = S.getInnermostCapturedStmt();
     if (C) {
-      auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
       llvm::SmallVector<llvm::Value *, 16> CapturedVars;
       CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
       auto *OutlinedFn = emitOutlinedOrderedFunction(CGM, CS);
@@ -3441,11 +3431,10 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) {
                                                       OutlinedFn, CapturedVars);
     } else {
       Action.Enter(CGF);
-      CGF.EmitStmt(
-          cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      CGF.EmitStmt(CS->getCapturedStmt());
     }
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitOrderedRegion(*this, CodeGen, S.getLocStart(), !C);
 }
 
@@ -3889,8 +3878,7 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) {
     }
   }
 
-  const auto *CS =
-      S.getAssociatedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
+  const auto *CS = S.getInnermostCapturedStmt()->IgnoreContainers();
   if (const auto *EWC = dyn_cast<ExprWithCleanups>(CS)) {
     enterFullExpression(EWC);
   }
@@ -3910,7 +3898,7 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) {
                       S.getV(), S.getExpr(), S.getUpdateExpr(),
                       S.isXLHSInRHSPart(), S.getLocStart());
   };
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_unknown);
   CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_atomic, CodeGen);
 }
 
@@ -3919,7 +3907,6 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
                                          const RegionCodeGenTy &CodeGen) {
   assert(isOpenMPTargetExecutionDirective(S.getDirectiveKind()));
   CodeGenModule &CGM = CGF.CGM;
-  const CapturedStmt &CS = *S.getCapturedStmt(OMPD_target);
 
   llvm::Function *Fn = nullptr;
   llvm::Constant *FnID = nullptr;
@@ -3967,11 +3954,8 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
   // Emit target region as a standalone region.
   CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, ParentName, Fn, FnID,
                                                     IsOffloadEntry, CodeGen);
-  OMPLexicalScope Scope(CGF, S);
-  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-  CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
-  CGM.getOpenMPRuntime().emitTargetCall(CGF, S, Fn, FnID, IfCond, Device,
-                                        CapturedVars);
+  OMPLexicalScope Scope(CGF, S, OMPD_task);
+  CGM.getOpenMPRuntime().emitTargetCall(CGF, S, Fn, FnID, IfCond, Device);
 }
 
 static void emitTargetRegion(CodeGenFunction &CGF, const OMPTargetDirective &S,
@@ -3982,7 +3966,7 @@ static void emitTargetRegion(CodeGenFunction &CGF, const OMPTargetDirective &S,
   (void)PrivateScope.Privatize();
 
   Action.Enter(CGF);
-  CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+  CGF.EmitStmt(S.getCapturedStmt(OMPD_target)->getCapturedStmt());
 }
 
 void CodeGenFunction::EmitOMPTargetDeviceFunction(CodeGenModule &CGM,
@@ -4039,7 +4023,7 @@ void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) {
     CGF.EmitOMPPrivateClause(S, PrivateScope);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
-    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+    CGF.EmitStmt(S.getCapturedStmt(OMPD_teams)->getCapturedStmt());
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
   };
   emitCommonOMPTeamsDirective(*this, S, OMPD_distribute, CodeGen);
@@ -4266,6 +4250,105 @@ void CodeGenFunction::EmitOMPTeamsDistributeParallelForSimdDirective(
                                    [](CodeGenFunction &) { return nullptr; });
 }
 
+static void emitTargetTeamsDistributeParallelForRegion(
+    CodeGenFunction &CGF, const OMPTargetTeamsDistributeParallelForDirective &S,
+    PrePostActionTy &Action) {
+  auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
+                              S.getDistInc());
+  };
+
+  // Emit teams region as a standalone region.
+  auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
+                                                 PrePostActionTy &) {
+    CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+    CGF.EmitOMPReductionClauseInit(S, PrivateScope);
+    (void)PrivateScope.Privatize();
+    CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
+        CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
+    CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
+  };
+
+  emitCommonOMPTeamsDirective(CGF, S, OMPD_distribute_parallel_for,
+                              CodeGenTeams);
+  emitPostUpdateForReductionClause(CGF, S,
+                                   [](CodeGenFunction &) { return nullptr; });
+}
+
+void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
+    CodeGenModule &CGM, StringRef ParentName,
+    const OMPTargetTeamsDistributeParallelForDirective &S) {
+  // Emit SPMD target teams distribute parallel for region as a standalone
+  // region.
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    emitTargetTeamsDistributeParallelForRegion(CGF, S, Action);
+  };
+  llvm::Function *Fn;
+  llvm::Constant *Addr;
+  // Emit target region as a standalone region.
+  CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+      S, ParentName, Fn, Addr, /*IsOffloadEntry=*/true, CodeGen);
+  assert(Fn && Addr && "Target device function emission failed.");
+}
+
+void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForDirective(
+    const OMPTargetTeamsDistributeParallelForDirective &S) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    emitTargetTeamsDistributeParallelForRegion(CGF, S, Action);
+  };
+  emitCommonOMPTargetDirective(*this, S, CodeGen);
+}
+
+static void emitTargetTeamsDistributeParallelForSimdRegion(
+    CodeGenFunction &CGF,
+    const OMPTargetTeamsDistributeParallelForSimdDirective &S,
+    PrePostActionTy &Action) {
+  auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+    CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
+                              S.getDistInc());
+  };
+
+  // Emit teams region as a standalone region.
+  auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
+                                                 PrePostActionTy &) {
+    CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+    CGF.EmitOMPReductionClauseInit(S, PrivateScope);
+    (void)PrivateScope.Privatize();
+    CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
+        CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
+    CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
+  };
+
+  emitCommonOMPTeamsDirective(CGF, S, OMPD_distribute_parallel_for_simd,
+                              CodeGenTeams);
+  emitPostUpdateForReductionClause(CGF, S,
+                                   [](CodeGenFunction &) { return nullptr; });
+}
+
+void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForSimdDeviceFunction(
+    CodeGenModule &CGM, StringRef ParentName,
+    const OMPTargetTeamsDistributeParallelForSimdDirective &S) {
+  // Emit SPMD target teams distribute parallel for simd region as a standalone
+  // region.
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    emitTargetTeamsDistributeParallelForSimdRegion(CGF, S, Action);
+  };
+  llvm::Function *Fn;
+  llvm::Constant *Addr;
+  // Emit target region as a standalone region.
+  CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+      S, ParentName, Fn, Addr, /*IsOffloadEntry=*/true, CodeGen);
+  assert(Fn && Addr && "Target device function emission failed.");
+}
+
+void CodeGenFunction::EmitOMPTargetTeamsDistributeParallelForSimdDirective(
+    const OMPTargetTeamsDistributeParallelForSimdDirective &S) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    emitTargetTeamsDistributeParallelForSimdRegion(CGF, S, Action);
+  };
+  emitCommonOMPTargetDirective(*this, S, CodeGen);
+}
+
 void CodeGenFunction::EmitOMPCancellationPointDirective(
     const OMPCancellationPointDirective &S) {
   CGM.getOpenMPRuntime().emitCancellationPointCall(*this, S.getLocStart(),
@@ -4383,10 +4466,9 @@ void CodeGenFunction::EmitOMPTargetDataDirective(
   DevicePointerPrivActionTy PrivAction(PrivatizeDevicePointers);
 
   auto &&CodeGen = [&S, &Info, &PrivatizeDevicePointers](
-      CodeGenFunction &CGF, PrePostActionTy &Action) {
+                       CodeGenFunction &CGF, PrePostActionTy &Action) {
     auto &&InnermostCodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
-      CGF.EmitStmt(
-          cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
+      CGF.EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt());
     };
 
     // Codegen that selects wheather to generate the privatization code or not.
@@ -4468,7 +4550,7 @@ void CodeGenFunction::EmitOMPTargetEnterDataDirective(
   if (auto *C = S.getSingleClause<OMPDeviceClause>())
     Device = C->getDevice();
 
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_task);
   CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
 }
 
@@ -4489,7 +4571,7 @@ void CodeGenFunction::EmitOMPTargetExitDataDirective(
   if (auto *C = S.getSingleClause<OMPDeviceClause>())
     Device = C->getDevice();
 
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_task);
   CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
 }
 
@@ -4626,7 +4708,7 @@ static void mapParam(CodeGenFunction &CGF, const DeclRefExpr *Helper,
 void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
   assert(isOpenMPTaskLoopDirective(S.getDirectiveKind()));
   // Emit outlined function for task construct.
-  auto CS = cast<CapturedStmt>(S.getAssociatedStmt());
+  const CapturedStmt *CS = S.getCapturedStmt(OMPD_taskloop);
   auto CapturedStruct = GenerateCapturedStmtArgument(*CS);
   auto SharedsTy = getContext().getRecordType(CS->getCapturedRecordDecl());
   const Expr *IfCond = nullptr;
@@ -4748,15 +4830,16 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_taskloop,
                                                     CodeGen);
   };
-  if (Data.Nogroup)
-    EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data);
-  else {
+  if (Data.Nogroup) {
+    EmitOMPTaskBasedDirective(S, OMPD_taskloop, BodyGen, TaskGen, Data);
+  } else {
     CGM.getOpenMPRuntime().emitTaskgroupRegion(
         *this,
         [&S, &BodyGen, &TaskGen, &Data](CodeGenFunction &CGF,
                                         PrePostActionTy &Action) {
           Action.Enter(CGF);
-          CGF.EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data);
+          CGF.EmitOMPTaskBasedDirective(S, OMPD_taskloop, BodyGen, TaskGen,
+                                        Data);
         },
         S.getLocStart());
   }
@@ -4789,7 +4872,7 @@ void CodeGenFunction::EmitOMPTargetUpdateDirective(
   if (auto *C = S.getSingleClause<OMPDeviceClause>())
     Device = C->getDevice();
 
-  OMPLexicalScope Scope(*this, S, /*AsInlined=*/true);
+  OMPLexicalScope Scope(*this, S, OMPD_task);
   CGM.getOpenMPRuntime().emitTargetDataStandAloneCall(*this, S, IfCond, Device);
 }
 
@@ -4811,10 +4894,7 @@ void CodeGenFunction::EmitSimpleOMPExecutableDirective(
           }
         }
       }
-      const auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
-      while (const auto *CCS = dyn_cast<CapturedStmt>(CS->getCapturedStmt()))
-        CS = CCS;
-      CGF.EmitStmt(CS->getCapturedStmt());
+      CGF.EmitStmt(D.getInnermostCapturedStmt()->getCapturedStmt());
     }
   };
   OMPSimdLexicalScope Scope(*this, D);
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index 9dbd7cc3fcbf..e62d3e75b711 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -70,7 +70,7 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext)
       IsSanitizerScope(false), CurFuncIsThunk(false), AutoreleaseResult(false),
       SawAsmBlock(false), IsOutlinedSEHHelper(false), BlockInfo(nullptr),
       BlockPointer(nullptr), LambdaThisCaptureField(nullptr),
-      NormalCleanupDest(nullptr), NextCleanupDestIndex(1),
+      NormalCleanupDest(Address::invalid()), NextCleanupDestIndex(1),
       FirstBlockInfo(nullptr), EHResumeBlock(nullptr), ExceptionSlot(nullptr),
       EHSelectorSlot(nullptr), DebugInfo(CGM.getModuleDebugInfo()),
       DisableDebugInfo(false), DidCallStackSave(false), IndirectBranch(nullptr),
@@ -419,6 +419,9 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   EmitIfUsed(*this, TerminateHandler);
   EmitIfUsed(*this, UnreachableBlock);
 
+  for (const auto &FuncletAndParent : TerminateFunclets)
+    EmitIfUsed(*this, FuncletAndParent.second);
+
   if (CGM.getCodeGenOpts().EmitDeclMetadata)
     EmitDeclMetadata();
 
@@ -436,10 +439,11 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   // if compiled with no optimizations. We do it for coroutine as the lifetime
   // of CleanupDestSlot alloca make correct coroutine frame building very
   // difficult.
-  if (NormalCleanupDest && isCoroutine()) {
+  if (NormalCleanupDest.isValid() && isCoroutine()) {
     llvm::DominatorTree DT(*CurFn);
-    llvm::PromoteMemToReg(NormalCleanupDest, DT);
-    NormalCleanupDest = nullptr;
+    llvm::PromoteMemToReg(
+        cast<llvm::AllocaInst>(NormalCleanupDest.getPointer()), DT);
+    NormalCleanupDest = Address::invalid();
   }
 }
 
@@ -884,7 +888,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
   }
 
   // Apply xray attributes to the function (as a string, for now)
-  if (D && ShouldXRayInstrumentFunction()) {
+  bool InstrumentXray = ShouldXRayInstrumentFunction();
+  if (D && InstrumentXray) {
     if (const auto *XRayAttr = D->getAttr<XRayInstrumentAttr>()) {
       if (XRayAttr->alwaysXRayInstrument())
         Fn->addFnAttr("function-instrument", "xray-always");
@@ -921,8 +926,13 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
   if (getLangOpts().CPlusPlus && SanOpts.has(SanitizerKind::Function)) {
     if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) {
       if (llvm::Constant *PrologueSig = getPrologueSignature(CGM, FD)) {
+        // Remove any (C++17) exception specifications, to allow calling e.g. a
+        // noexcept function through a non-noexcept pointer.
+        auto ProtoTy =
+          getContext().getFunctionTypeWithExceptionSpec(FD->getType(),
+                                                        EST_None);
         llvm::Constant *FTRTTIConst =
-            CGM.GetAddrOfRTTIDescriptor(FD->getType(), /*ForEH=*/true);
+            CGM.GetAddrOfRTTIDescriptor(ProtoTy, /*ForEH=*/true);
         llvm::Constant *FTRTTIConstEncoded =
             EncodeAddrForUseInPrologue(Fn, FTRTTIConst);
         llvm::Constant *PrologueStructElems[] = {PrologueSig,
@@ -2297,6 +2307,60 @@ void CodeGenFunction::EmitSanitizerStatReport(llvm::SanitizerStatKind SSK) {
   CGM.getSanStats().create(IRB, SSK);
 }
 
+llvm::Value *
+CodeGenFunction::FormResolverCondition(const MultiVersionResolverOption &RO) {
+  llvm::Value *TrueCondition = nullptr;
+  if (!RO.ParsedAttribute.Architecture.empty())
+    TrueCondition = EmitX86CpuIs(RO.ParsedAttribute.Architecture);
+
+  if (!RO.ParsedAttribute.Features.empty()) {
+    SmallVector<StringRef, 8> FeatureList;
+    llvm::for_each(RO.ParsedAttribute.Features,
+                   [&FeatureList](const std::string &Feature) {
+                     FeatureList.push_back(StringRef{Feature}.substr(1));
+                   });
+    llvm::Value *FeatureCmp = EmitX86CpuSupports(FeatureList);
+    TrueCondition = TrueCondition ? Builder.CreateAnd(TrueCondition, FeatureCmp)
+                                  : FeatureCmp;
+  }
+  return TrueCondition;
+}
+
+void CodeGenFunction::EmitMultiVersionResolver(
+    llvm::Function *Resolver, ArrayRef<MultiVersionResolverOption> Options) {
+  assert((getContext().getTargetInfo().getTriple().getArch() ==
+              llvm::Triple::x86 ||
+          getContext().getTargetInfo().getTriple().getArch() ==
+              llvm::Triple::x86_64) &&
+         "Only implemented for x86 targets");
+
+  // Main function's basic block.
+  llvm::BasicBlock *CurBlock = createBasicBlock("entry", Resolver);
+  Builder.SetInsertPoint(CurBlock);
+  EmitX86CpuInit();
+
+  llvm::Function *DefaultFunc = nullptr;
+  for (const MultiVersionResolverOption &RO : Options) {
+    Builder.SetInsertPoint(CurBlock);
+    llvm::Value *TrueCondition = FormResolverCondition(RO);
+
+    if (!TrueCondition) {
+      DefaultFunc = RO.Function;
+    } else {
+      llvm::BasicBlock *RetBlock = createBasicBlock("ro_ret", Resolver);
+      llvm::IRBuilder<> RetBuilder(RetBlock);
+      RetBuilder.CreateRet(RO.Function);
+      CurBlock = createBasicBlock("ro_else", Resolver);
+      Builder.CreateCondBr(TrueCondition, RetBlock, CurBlock);
+    }
+  }
+
+  assert(DefaultFunc && "No default version?");
+  // Emit return from the 'else-ist' block.
+  Builder.SetInsertPoint(CurBlock);
+  Builder.CreateRet(DefaultFunc);
+}
+
 llvm::DebugLoc CodeGenFunction::SourceLocToDebugLoc(SourceLocation Location) {
   if (CGDebugInfo *DI = getDebugInfo())
     return DI->SourceLocToDebugLoc(Location);
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index a7c322d3b95c..75071a8e3229 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -34,6 +34,7 @@
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Debug.h"
@@ -433,7 +434,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   };
 
   /// i32s containing the indexes of the cleanup destinations.
-  llvm::AllocaInst *NormalCleanupDest;
+  Address NormalCleanupDest;
 
   unsigned NextCleanupDestIndex;
 
@@ -1420,6 +1421,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::BasicBlock *TerminateHandler;
   llvm::BasicBlock *TrapBB;
 
+  /// Terminate funclets keyed by parent funclet pad.
+  llvm::MapVector<llvm::Value *, llvm::BasicBlock *> TerminateFunclets;
+
   /// True if we need emit the life-time markers.
   const bool ShouldEmitLifetimeMarkers;
 
@@ -1808,6 +1812,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// getTerminateLandingPad - Return a landing pad that just calls terminate.
   llvm::BasicBlock *getTerminateLandingPad();
 
+  /// getTerminateLandingPad - Return a cleanup funclet that just calls
+  /// terminate.
+  llvm::BasicBlock *getTerminateFunclet();
+
   /// getTerminateHandler - Return a handler (not a landing pad, just
   /// a catch handler) that just calls terminate.  This is used when
   /// a terminate scope encloses a try.
@@ -2093,16 +2101,14 @@ class CodeGenFunction : public CodeGenTypeCache {
   ///
   /// The difference to EmitAggregateCopy is that tail padding is not copied.
   /// This is required for correctness when assigning non-POD structures in C++.
-  void EmitAggregateAssign(Address DestPtr, Address SrcPtr,
-                           QualType EltTy) {
+  void EmitAggregateAssign(LValue Dest, LValue Src, QualType EltTy) {
     bool IsVolatile = hasVolatileMember(EltTy);
-    EmitAggregateCopy(DestPtr, SrcPtr, EltTy, IsVolatile, true);
+    EmitAggregateCopy(Dest, Src, EltTy, IsVolatile, /* isAssignment= */ true);
   }
 
-  void EmitAggregateCopyCtor(Address DestPtr, Address SrcPtr,
-                             QualType DestTy, QualType SrcTy) {
-    EmitAggregateCopy(DestPtr, SrcPtr, SrcTy, /*IsVolatile=*/false,
-                      /*IsAssignment=*/false);
+  void EmitAggregateCopyCtor(LValue Dest, LValue Src) {
+    EmitAggregateCopy(Dest, Src, Src.getType(),
+                      /* IsVolatile= */ false, /* IsAssignment= */ false);
   }
 
   /// EmitAggregateCopy - Emit an aggregate copy.
@@ -2111,9 +2117,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// volatile.
   /// \param isAssignment - If false, allow padding to be copied.  This often
   /// yields more efficient.
-  void EmitAggregateCopy(Address DestPtr, Address SrcPtr,
-                         QualType EltTy, bool isVolatile=false,
-                         bool isAssignment = false);
+  void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy,
+                         bool isVolatile = false, bool isAssignment = false);
 
   /// GetAddrOfLocalVar - Return the address of a local variable.
   Address GetAddrOfLocalVar(const VarDecl *VD) {
@@ -2824,6 +2829,7 @@ class CodeGenFunction : public CodeGenTypeCache {
                                         const OMPTaskDataTy & /*Data*/)>
       TaskGenTy;
   void EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
+                                 const OpenMPDirectiveKind CapturedRegion,
                                  const RegionCodeGenTy &BodyGen,
                                  const TaskGenTy &TaskGen, OMPTaskDataTy &Data);
   struct OMPTargetDataInfo {
@@ -2933,6 +2939,15 @@ class CodeGenFunction : public CodeGenTypeCache {
   static void EmitOMPTargetSimdDeviceFunction(CodeGenModule &CGM,
                                               StringRef ParentName,
                                               const OMPTargetSimdDirective &S);
+  /// Emit device code for the target teams distribute parallel for simd
+  /// directive.
+  static void EmitOMPTargetTeamsDistributeParallelForSimdDeviceFunction(
+      CodeGenModule &CGM, StringRef ParentName,
+      const OMPTargetTeamsDistributeParallelForSimdDirective &S);
+
+  static void EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
+      CodeGenModule &CGM, StringRef ParentName,
+      const OMPTargetTeamsDistributeParallelForDirective &S);
   /// \brief Emit inner loop of the worksharing/simd construct.
   ///
   /// \param S Directive, for which the inner loop must be emitted.
@@ -3976,6 +3991,29 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   void EmitSanitizerStatReport(llvm::SanitizerStatKind SSK);
 
+  struct MultiVersionResolverOption {
+    llvm::Function *Function;
+    TargetAttr::ParsedTargetAttr ParsedAttribute;
+    unsigned Priority;
+    MultiVersionResolverOption(const TargetInfo &TargInfo, llvm::Function *F,
+                               const clang::TargetAttr::ParsedTargetAttr &PT)
+        : Function(F), ParsedAttribute(PT), Priority(0u) {
+      for (StringRef Feat : PT.Features)
+        Priority = std::max(Priority,
+                            TargInfo.multiVersionSortPriority(Feat.substr(1)));
+
+      if (!PT.Architecture.empty())
+        Priority = std::max(Priority,
+                            TargInfo.multiVersionSortPriority(PT.Architecture));
+    }
+
+    bool operator>(const MultiVersionResolverOption &Other) const {
+      return Priority > Other.Priority;
+    }
+  };
+  void EmitMultiVersionResolver(llvm::Function *Resolver,
+                                ArrayRef<MultiVersionResolverOption> Options);
+
 private:
   QualType getVarArgType(const Expr *Arg);
 
@@ -3992,6 +4030,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitX86CpuSupports(const CallExpr *E);
   llvm::Value *EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs);
   llvm::Value *EmitX86CpuInit();
+  llvm::Value *FormResolverCondition(const MultiVersionResolverOption &RO);
 };
 
 /// Helper class with most of the code for saving a value for a
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index df33fc3e8200..eba886db3e35 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -395,6 +395,7 @@ void CodeGenModule::Release() {
   applyGlobalValReplacements();
   applyReplacements();
   checkAliases();
+  emitMultiVersionFunctions();
   EmitCXXGlobalInitFunc();
   EmitCXXGlobalDtorFunc();
   EmitCXXThreadLocalInitFunc();
@@ -456,6 +457,10 @@ void CodeGenModule::Release() {
     // Indicate that we want CodeView in the metadata.
     getModule().addModuleFlag(llvm::Module::Warning, "CodeView", 1);
   }
+  if (CodeGenOpts.ControlFlowGuard) {
+    // We want function ID tables for Control Flow Guard.
+    getModule().addModuleFlag(llvm::Module::Warning, "cfguard", 1);
+  }
   if (CodeGenOpts.OptimizationLevel > 0 && CodeGenOpts.StrictVTablePointers) {
     // We don't support LTO with 2 with different StrictVTablePointers
     // FIXME: we could support it by stripping all the information introduced
@@ -501,6 +506,20 @@ void CodeGenModule::Release() {
     getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1);
   }
 
+  if (CodeGenOpts.CFProtectionReturn &&
+      Target.checkCFProtectionReturnSupported(getDiags())) {
+    // Indicate that we want to instrument return control flow protection.
+    getModule().addModuleFlag(llvm::Module::Override, "cf-protection-return",
+                              1);
+  }
+
+  if (CodeGenOpts.CFProtectionBranch &&
+      Target.checkCFProtectionBranchSupported(getDiags())) {
+    // Indicate that we want to instrument branch control flow protection.
+    getModule().addModuleFlag(llvm::Module::Override, "cf-protection-branch",
+                              1);
+  }
+
   if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
     // Indicate whether __nvvm_reflect should be configured to flush denormal
     // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
@@ -632,6 +651,14 @@ CodeGenModule::mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA,
   return TBAA->mergeTBAAInfoForConditionalOperator(InfoA, InfoB);
 }
 
+TBAAAccessInfo
+CodeGenModule::mergeTBAAInfoForMemoryTransfer(TBAAAccessInfo DestInfo,
+                                              TBAAAccessInfo SrcInfo) {
+  if (!TBAA)
+    return TBAAAccessInfo();
+  return TBAA->mergeTBAAInfoForConditionalOperator(DestInfo, SrcInfo);
+}
+
 void CodeGenModule::DecorateInstructionWithTBAA(llvm::Instruction *Inst,
                                                 TBAAAccessInfo TBAAInfo) {
   if (llvm::MDNode *Tag = getTBAAAccessTagInfo(TBAAInfo))
@@ -675,6 +702,8 @@ llvm::ConstantInt *CodeGenModule::getSize(CharUnits size) {
 void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
                                         const NamedDecl *D,
                                         ForDefinition_t IsForDefinition) const {
+  if (GV->hasDLLImportStorageClass())
+    return;
   // Internal definitions always have default visibility.
   if (GV->hasLocalLinkage()) {
     GV->setVisibility(llvm::GlobalValue::DefaultVisibility);
@@ -725,36 +754,52 @@ void CodeGenModule::setTLSMode(llvm::GlobalValue *GV, const VarDecl &D) const {
   GV->setThreadLocalMode(TLM);
 }
 
-StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
-  GlobalDecl CanonicalGD = GD.getCanonicalDecl();
+static void AppendTargetMangling(const CodeGenModule &CGM,
+                                 const TargetAttr *Attr, raw_ostream &Out) {
+  if (Attr->isDefaultVersion())
+    return;
 
-  // Some ABIs don't have constructor variants.  Make sure that base and
-  // complete constructors get mangled the same.
-  if (const auto *CD = dyn_cast<CXXConstructorDecl>(CanonicalGD.getDecl())) {
-    if (!getTarget().getCXXABI().hasConstructorVariants()) {
-      CXXCtorType OrigCtorType = GD.getCtorType();
-      assert(OrigCtorType == Ctor_Base || OrigCtorType == Ctor_Complete);
-      if (OrigCtorType == Ctor_Base)
-        CanonicalGD = GlobalDecl(CD, Ctor_Complete);
-    }
+  Out << '.';
+  const auto &Target = CGM.getTarget();
+  TargetAttr::ParsedTargetAttr Info =
+      Attr->parse([&Target](StringRef LHS, StringRef RHS) {
+                    // Multiversioning doesn't allow "no-${feature}", so we can
+                    // only have "+" prefixes here.
+                    assert(LHS.startswith("+") && RHS.startswith("+") &&
+                           "Features should always have a prefix.");
+                    return Target.multiVersionSortPriority(LHS.substr(1)) >
+                           Target.multiVersionSortPriority(RHS.substr(1));
+                  });
+
+  bool IsFirst = true;
+
+  if (!Info.Architecture.empty()) {
+    IsFirst = false;
+    Out << "arch_" << Info.Architecture;
   }
 
-  auto FoundName = MangledDeclNames.find(CanonicalGD);
-  if (FoundName != MangledDeclNames.end())
-    return FoundName->second;
+  for (StringRef Feat : Info.Features) {
+    if (!IsFirst)
+      Out << '_';
+    IsFirst = false;
+    Out << Feat.substr(1);
+  }
+}
 
-  const auto *ND = cast<NamedDecl>(GD.getDecl());
+static std::string getMangledNameImpl(const CodeGenModule &CGM, GlobalDecl GD,
+                                      const NamedDecl *ND,
+                                      bool OmitTargetMangling = false) {
   SmallString<256> Buffer;
-  StringRef Str;
-  if (getCXXABI().getMangleContext().shouldMangleDeclName(ND)) {
+  llvm::raw_svector_ostream Out(Buffer);
+  MangleContext &MC = CGM.getCXXABI().getMangleContext();
+  if (MC.shouldMangleDeclName(ND)) {
     llvm::raw_svector_ostream Out(Buffer);
     if (const auto *D = dyn_cast<CXXConstructorDecl>(ND))
-      getCXXABI().getMangleContext().mangleCXXCtor(D, GD.getCtorType(), Out);
+      MC.mangleCXXCtor(D, GD.getCtorType(), Out);
     else if (const auto *D = dyn_cast<CXXDestructorDecl>(ND))
-      getCXXABI().getMangleContext().mangleCXXDtor(D, GD.getDtorType(), Out);
+      MC.mangleCXXDtor(D, GD.getDtorType(), Out);
     else
-      getCXXABI().getMangleContext().mangleName(ND, Out);
-    Str = Out.str();
+      MC.mangleName(ND, Out);
   } else {
     IdentifierInfo *II = ND->getIdentifier();
     assert(II && "Attempt to mangle unnamed decl.");
@@ -764,14 +809,78 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
         FD->getType()->castAs<FunctionType>()->getCallConv() == CC_X86RegCall) {
       llvm::raw_svector_ostream Out(Buffer);
       Out << "__regcall3__" << II->getName();
-      Str = Out.str();
     } else {
-      Str = II->getName();
+      Out << II->getName();
+    }
+  }
+
+  if (const auto *FD = dyn_cast<FunctionDecl>(ND))
+    if (FD->isMultiVersion() && !OmitTargetMangling)
+      AppendTargetMangling(CGM, FD->getAttr<TargetAttr>(), Out);
+  return Out.str();
+}
+
+void CodeGenModule::UpdateMultiVersionNames(GlobalDecl GD,
+                                            const FunctionDecl *FD) {
+  if (!FD->isMultiVersion())
+    return;
+
+  // Get the name of what this would be without the 'target' attribute.  This
+  // allows us to lookup the version that was emitted when this wasn't a
+  // multiversion function.
+  std::string NonTargetName =
+      getMangledNameImpl(*this, GD, FD, /*OmitTargetMangling=*/true);
+  GlobalDecl OtherGD;
+  if (lookupRepresentativeDecl(NonTargetName, OtherGD)) {
+    assert(OtherGD.getCanonicalDecl()
+               .getDecl()
+               ->getAsFunction()
+               ->isMultiVersion() &&
+           "Other GD should now be a multiversioned function");
+    // OtherFD is the version of this function that was mangled BEFORE
+    // becoming a MultiVersion function.  It potentially needs to be updated.
+    const FunctionDecl *OtherFD =
+        OtherGD.getCanonicalDecl().getDecl()->getAsFunction();
+    std::string OtherName = getMangledNameImpl(*this, OtherGD, OtherFD);
+    // This is so that if the initial version was already the 'default'
+    // version, we don't try to update it.
+    if (OtherName != NonTargetName) {
+      // Remove instead of erase, since others may have stored the StringRef
+      // to this.
+      const auto ExistingRecord = Manglings.find(NonTargetName);
+      if (ExistingRecord != std::end(Manglings))
+        Manglings.remove(&(*ExistingRecord));
+      auto Result = Manglings.insert(std::make_pair(OtherName, OtherGD));
+      MangledDeclNames[OtherGD.getCanonicalDecl()] = Result.first->first();
+      if (llvm::GlobalValue *Entry = GetGlobalValue(NonTargetName))
+        Entry->setName(OtherName);
     }
   }
+}
+
+StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
+  GlobalDecl CanonicalGD = GD.getCanonicalDecl();
+
+  // Some ABIs don't have constructor variants.  Make sure that base and
+  // complete constructors get mangled the same.
+  if (const auto *CD = dyn_cast<CXXConstructorDecl>(CanonicalGD.getDecl())) {
+    if (!getTarget().getCXXABI().hasConstructorVariants()) {
+      CXXCtorType OrigCtorType = GD.getCtorType();
+      assert(OrigCtorType == Ctor_Base || OrigCtorType == Ctor_Complete);
+      if (OrigCtorType == Ctor_Base)
+        CanonicalGD = GlobalDecl(CD, Ctor_Complete);
+    }
+  }
+
+  auto FoundName = MangledDeclNames.find(CanonicalGD);
+  if (FoundName != MangledDeclNames.end())
+    return FoundName->second;
+
 
   // Keep the first result in the case of a mangling collision.
-  auto Result = Manglings.insert(std::make_pair(Str, GD));
+  const auto *ND = cast<NamedDecl>(GD.getDecl());
+  auto Result =
+      Manglings.insert(std::make_pair(getMangledNameImpl(*this, GD, ND), GD));
   return MangledDeclNames[CanonicalGD] = Result.first->first();
 }
 
@@ -2061,6 +2170,77 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
+void CodeGenModule::emitMultiVersionFunctions() {
+  for (GlobalDecl GD : MultiVersionFuncs) {
+    SmallVector<CodeGenFunction::MultiVersionResolverOption, 10> Options;
+    const FunctionDecl *FD = cast<FunctionDecl>(GD.getDecl());
+    getContext().forEachMultiversionedFunctionVersion(
+        FD, [this, &GD, &Options](const FunctionDecl *CurFD) {
+          GlobalDecl CurGD{
+              (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
+          StringRef MangledName = getMangledName(CurGD);
+          llvm::Constant *Func = GetGlobalValue(MangledName);
+          if (!Func) {
+            if (CurFD->isDefined()) {
+              EmitGlobalFunctionDefinition(CurGD, nullptr);
+              Func = GetGlobalValue(MangledName);
+            } else {
+              const CGFunctionInfo &FI =
+                  getTypes().arrangeGlobalDeclaration(GD);
+              llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
+              Func = GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
+                                       /*DontDefer=*/false, ForDefinition);
+            }
+            assert(Func && "This should have just been created");
+          }
+          Options.emplace_back(getTarget(), cast<llvm::Function>(Func),
+                               CurFD->getAttr<TargetAttr>()->parse());
+        });
+
+    llvm::Function *ResolverFunc = cast<llvm::Function>(
+        GetGlobalValue((getMangledName(GD) + ".resolver").str()));
+    if (supportsCOMDAT())
+      ResolverFunc->setComdat(
+          getModule().getOrInsertComdat(ResolverFunc->getName()));
+    std::stable_sort(
+        Options.begin(), Options.end(),
+        std::greater<CodeGenFunction::MultiVersionResolverOption>());
+    CodeGenFunction CGF(*this);
+    CGF.EmitMultiVersionResolver(ResolverFunc, Options);
+  }
+}
+
+/// If an ifunc for the specified mangled name is not in the module, create and
+/// return an llvm IFunc Function with the specified type.
+llvm::Constant *
+CodeGenModule::GetOrCreateMultiVersionIFunc(GlobalDecl GD, llvm::Type *DeclTy,
+                                            StringRef MangledName,
+                                            const FunctionDecl *FD) {
+  std::string IFuncName = (MangledName + ".ifunc").str();
+  if (llvm::GlobalValue *IFuncGV = GetGlobalValue(IFuncName))
+    return IFuncGV;
+
+  // Since this is the first time we've created this IFunc, make sure
+  // that we put this multiversioned function into the list to be
+  // replaced later.
+  MultiVersionFuncs.push_back(GD);
+
+  std::string ResolverName = (MangledName + ".resolver").str();
+  llvm::Type *ResolverType = llvm::FunctionType::get(
+      llvm::PointerType::get(DeclTy,
+                             Context.getTargetAddressSpace(FD->getType())),
+      false);
+  llvm::Constant *Resolver =
+      GetOrCreateLLVMFunction(ResolverName, ResolverType, GlobalDecl{},
+                              /*ForVTable=*/false);
+  llvm::GlobalIFunc *GIF = llvm::GlobalIFunc::create(
+      DeclTy, 0, llvm::Function::ExternalLinkage, "", Resolver, &getModule());
+  GIF->setName(IFuncName);
+  SetCommonAttributes(FD, GIF);
+
+  return GIF;
+}
+
 /// GetOrCreateLLVMFunction - If the specified mangled name is not in the
 /// module, create and return an llvm Function with the specified type. If there
 /// is something in the module with the specified name, return it potentially
@@ -2074,6 +2254,16 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
     ForDefinition_t IsForDefinition) {
   const Decl *D = GD.getDecl();
 
+  // Any attempts to use a MultiVersion function should result in retrieving
+  // the iFunc instead. Name Mangling will handle the rest of the changes.
+  if (const FunctionDecl *FD = cast_or_null<FunctionDecl>(D)) {
+    if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) {
+      UpdateMultiVersionNames(GD, FD);
+      if (!IsForDefinition)
+        return GetOrCreateMultiVersionIFunc(GD, Ty, MangledName, FD);
+    }
+  }
+
   // Lookup the entry, lazily creating it if necessary.
   llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
   if (Entry) {
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index 22c4463b2c81..42e9e1be3166 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -324,6 +324,10 @@ class CodeGenModule : public CodeGenTypeCache {
   /// is defined once we get to the end of the of the translation unit.
   std::vector<GlobalDecl> Aliases;
 
+  /// List of multiversion functions that have to be emitted.  Used to make sure
+  /// we properly emit the iFunc.
+  std::vector<GlobalDecl> MultiVersionFuncs;
+
   typedef llvm::StringMap<llvm::TrackingVH<llvm::Constant> > ReplacementsTy;
   ReplacementsTy Replacements;
 
@@ -685,6 +689,11 @@ class CodeGenModule : public CodeGenTypeCache {
   TBAAAccessInfo mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA,
                                                      TBAAAccessInfo InfoB);
 
+  /// mergeTBAAInfoForMemoryTransfer - Get merged TBAA information for the
+  /// purposes of memory transfer calls.
+  TBAAAccessInfo mergeTBAAInfoForMemoryTransfer(TBAAAccessInfo DestInfo,
+                                                TBAAAccessInfo SrcInfo);
+
   /// getTBAAInfoForSubobject - Get TBAA information for an access with a given
   /// base lvalue.
   TBAAAccessInfo getTBAAInfoForSubobject(LValue Base, QualType AccessType) {
@@ -1247,6 +1256,12 @@ class CodeGenModule : public CodeGenTypeCache {
       llvm::AttributeList ExtraAttrs = llvm::AttributeList(),
       ForDefinition_t IsForDefinition = NotForDefinition);
 
+  llvm::Constant *GetOrCreateMultiVersionIFunc(GlobalDecl GD,
+                                               llvm::Type *DeclTy,
+                                               StringRef MangledName,
+                                               const FunctionDecl *FD);
+  void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD);
+
   llvm::Constant *GetOrCreateLLVMGlobal(StringRef MangledName,
                                         llvm::PointerType *PTy,
                                         const VarDecl *D,
@@ -1319,6 +1334,8 @@ class CodeGenModule : public CodeGenTypeCache {
 
   void checkAliases();
 
+  void emitMultiVersionFunctions();
+
   /// Emit any vtables which we deferred and still have a use for.
   void EmitDeferredVTables();
 
diff --git a/lib/CodeGen/CodeGenTBAA.cpp b/lib/CodeGen/CodeGenTBAA.cpp
index ad473032db17..90509b0d29cb 100644
--- a/lib/CodeGen/CodeGenTBAA.cpp
+++ b/lib/CodeGen/CodeGenTBAA.cpp
@@ -391,3 +391,21 @@ CodeGenTBAA::mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA,
   // access type regardless of their base types.
   return TBAAAccessInfo::getMayAliasInfo();
 }
+
+TBAAAccessInfo
+CodeGenTBAA::mergeTBAAInfoForMemoryTransfer(TBAAAccessInfo DestInfo,
+                                            TBAAAccessInfo SrcInfo) {
+  if (DestInfo == SrcInfo)
+    return DestInfo;
+
+  if (!DestInfo || !SrcInfo)
+    return TBAAAccessInfo();
+
+  if (DestInfo.isMayAlias() || SrcInfo.isMayAlias())
+    return TBAAAccessInfo::getMayAliasInfo();
+
+  // TODO: Implement the rest of the logic here. For example, two accesses
+  // with same final access types result in an access to an object of that final
+  // access type regardless of their base types.
+  return TBAAAccessInfo::getMayAliasInfo();
+}
diff --git a/lib/CodeGen/CodeGenTBAA.h b/lib/CodeGen/CodeGenTBAA.h
index a5b1f66bcd1a..c3e5c30643d1 100644
--- a/lib/CodeGen/CodeGenTBAA.h
+++ b/lib/CodeGen/CodeGenTBAA.h
@@ -201,6 +201,11 @@ class CodeGenTBAA {
   /// purpose of conditional operator.
   TBAAAccessInfo mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA,
                                                      TBAAAccessInfo InfoB);
+
+  /// mergeTBAAInfoForMemoryTransfer - Get merged TBAA information for the
+  /// purpose of memory transfer calls.
+  TBAAAccessInfo mergeTBAAInfoForMemoryTransfer(TBAAAccessInfo DestInfo,
+                                                TBAAAccessInfo SrcInfo);
 };
 
 }  // end namespace CodeGen
diff --git a/lib/CodeGen/CoverageMappingGen.cpp b/lib/CodeGen/CoverageMappingGen.cpp
index 89a30dc7040c..5eb48c2b579d 100644
--- a/lib/CodeGen/CoverageMappingGen.cpp
+++ b/lib/CodeGen/CoverageMappingGen.cpp
@@ -74,7 +74,10 @@ class SourceMappingRegion {
 
   bool hasEndLoc() const { return LocEnd.hasValue(); }
 
-  void setEndLoc(SourceLocation Loc) { LocEnd = Loc; }
+  void setEndLoc(SourceLocation Loc) {
+    assert(Loc.isValid() && "Setting an invalid end location");
+    LocEnd = Loc;
+  }
 
   SourceLocation getEndLoc() const {
     assert(LocEnd && "Region has no end location");
diff --git a/lib/CodeGen/ItaniumCXXABI.cpp b/lib/CodeGen/ItaniumCXXABI.cpp
index c375b82ea936..861f29a298b6 100644
--- a/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/lib/CodeGen/ItaniumCXXABI.cpp
@@ -3435,9 +3435,7 @@ static unsigned extractPBaseFlags(ASTContext &Ctx, QualType &Type) {
   if (auto *Proto = Type->getAs<FunctionProtoType>()) {
     if (Proto->isNothrow(Ctx)) {
       Flags |= ItaniumRTTIBuilder::PTI_Noexcept;
-      Type = Ctx.getFunctionType(
-          Proto->getReturnType(), Proto->getParamTypes(),
-          Proto->getExtProtoInfo().withExceptionSpec(EST_None));
+      Type = Ctx.getFunctionTypeWithExceptionSpec(Type, EST_None);
     }
   }
 
@@ -3904,7 +3902,9 @@ static void InitCatchParam(CodeGenFunction &CGF,
     llvm::Value *rawAdjustedExn = CallBeginCatch(CGF, Exn, true);
     Address adjustedExn(CGF.Builder.CreateBitCast(rawAdjustedExn, PtrTy),
                         caughtExnAlignment);
-    CGF.EmitAggregateCopy(ParamAddr, adjustedExn, CatchType);
+    LValue Dest = CGF.MakeAddrLValue(ParamAddr, CatchType);
+    LValue Src = CGF.MakeAddrLValue(adjustedExn, CatchType);
+    CGF.EmitAggregateCopy(Dest, Src, CatchType);
     return;
   }
 
diff --git a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
index d0760b9cc2a6..150b64fb2316 100644
--- a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
+++ b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
@@ -71,9 +71,8 @@ class PCHContainerGenerator : public ASTConsumer {
     }
 
     bool VisitImportDecl(ImportDecl *D) {
-      auto *Import = cast<ImportDecl>(D);
-      if (!Import->getImportedOwningModule())
-        DI.EmitImportDecl(*Import);
+      if (!D->getImportedOwningModule())
+        DI.EmitImportDecl(*D);
       return true;
     }
 
@@ -229,6 +228,11 @@ class PCHContainerGenerator : public ASTConsumer {
       Builder->getModuleDebugInfo()->completeRequiredType(RD);
   }
 
+  void HandleImplicitImportDecl(ImportDecl *D) override {
+    if (!D->getImportedOwningModule())
+      Builder->getModuleDebugInfo()->EmitImportDecl(*D);
+  }
+
   /// Emit a container holding the serialized AST.
   void HandleTranslationUnit(ASTContext &Ctx) override {
     assert(M && VMContext && Builder);
diff --git a/lib/CodeGen/SwiftCallingConv.cpp b/lib/CodeGen/SwiftCallingConv.cpp
index fc8e36d2c599..2a12078e198a 100644
--- a/lib/CodeGen/SwiftCallingConv.cpp
+++ b/lib/CodeGen/SwiftCallingConv.cpp
@@ -579,11 +579,9 @@ bool SwiftAggLowering::shouldPassIndirectly(bool asReturnValue) const {
   // Empty types don't need to be passed indirectly.
   if (Entries.empty()) return false;
 
-  CharUnits totalSize = Entries.back().End;
-
   // Avoid copying the array of types when there's just a single element.
   if (Entries.size() == 1) {
-    return getSwiftABIInfo(CGM).shouldPassIndirectlyForSwift(totalSize,
+    return getSwiftABIInfo(CGM).shouldPassIndirectlyForSwift(
                                                            Entries.back().Type,
                                                              asReturnValue);    
   }
@@ -593,8 +591,14 @@ bool SwiftAggLowering::shouldPassIndirectly(bool asReturnValue) const {
   for (auto &entry : Entries) {
     componentTys.push_back(entry.Type);
   }
-  return getSwiftABIInfo(CGM).shouldPassIndirectlyForSwift(totalSize,
-                                                           componentTys,
+  return getSwiftABIInfo(CGM).shouldPassIndirectlyForSwift(componentTys,
+                                                           asReturnValue);
+}
+
+bool swiftcall::shouldPassIndirectly(CodeGenModule &CGM,
+                                     ArrayRef<llvm::Type*> componentTys,
+                                     bool asReturnValue) {
+  return getSwiftABIInfo(CGM).shouldPassIndirectlyForSwift(componentTys,
                                                            asReturnValue);
 }
 
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 4b8006428f8f..8641d872724f 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -201,10 +201,6 @@ bool ABIInfo::isHomogeneousAggregateSmallEnough(const Type *Base,
   return false;
 }
 
-bool ABIInfo::shouldSignExtUnsignedType(QualType Ty) const {
-  return false;
-}
-
 LLVM_DUMP_METHOD void ABIArgInfo::dump() const {
   raw_ostream &OS = llvm::errs();
   OS << "(ABIArgInfo Kind=";
@@ -682,8 +678,8 @@ ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const {
   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
     Ty = EnumTy->getDecl()->getIntegerType();
 
-  return (Ty->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                        : ABIArgInfo::getDirect());
 }
 
 ABIArgInfo DefaultABIInfo::classifyReturnType(QualType RetTy) const {
@@ -697,8 +693,8 @@ ABIArgInfo DefaultABIInfo::classifyReturnType(QualType RetTy) const {
   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
     RetTy = EnumTy->getDecl()->getIntegerType();
 
-  return (RetTy->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                           : ABIArgInfo::getDirect());
 }
 
 //===----------------------------------------------------------------------===//
@@ -845,8 +841,8 @@ ABIArgInfo PNaClABIInfo::classifyArgumentType(QualType Ty) const {
     return ABIArgInfo::getDirect();
   }
 
-  return (Ty->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                        : ABIArgInfo::getDirect());
 }
 
 ABIArgInfo PNaClABIInfo::classifyReturnType(QualType RetTy) const {
@@ -861,8 +857,8 @@ ABIArgInfo PNaClABIInfo::classifyReturnType(QualType RetTy) const {
   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
     RetTy = EnumTy->getDecl()->getIntegerType();
 
-  return (RetTy->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                           : ABIArgInfo::getDirect());
 }
 
 /// IsX86_MMXType - Return true if this is an MMX type.
@@ -1028,8 +1024,7 @@ class X86_32ABIInfo : public SwiftABIInfo {
       IsMCUABI(CGT.getTarget().getTriple().isOSIAMCU()),
       DefaultNumRegisterParameters(NumRegisterParameters) {}
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type*> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
                                     bool asReturnValue) const override {
     // LLVM's x86-32 lowering currently only assigns up to three
     // integer registers and three fp registers.  Oddly, it'll use up to
@@ -1404,8 +1399,8 @@ ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy,
   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
     RetTy = EnumTy->getDecl()->getIntegerType();
 
-  return (RetTy->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                           : ABIArgInfo::getDirect());
 }
 
 static bool isSSEVectorType(ASTContext &Context, QualType Ty) {
@@ -1677,8 +1672,8 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
 
   if (Ty->isPromotableIntegerType()) {
     if (InReg)
-      return ABIArgInfo::getExtendInReg();
-    return ABIArgInfo::getExtend();
+      return ABIArgInfo::getExtendInReg(Ty);
+    return ABIArgInfo::getExtend(Ty);
   }
 
   if (InReg)
@@ -2168,8 +2163,7 @@ class X86_64ABIInfo : public SwiftABIInfo {
     return Has64BitPointers;
   }
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type*> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
                                     bool asReturnValue) const override {
     return occupiesMoreThan(CGT, scalars, /*total*/ 4);
   }  
@@ -2201,8 +2195,7 @@ class WinX86_64ABIInfo : public SwiftABIInfo {
     return isX86VectorCallAggregateSmallEnough(NumMembers);
   }
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type *> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type *> scalars,
                                     bool asReturnValue) const override {
     return occupiesMoreThan(CGT, scalars, /*total*/ 4);
   }
@@ -2868,8 +2861,8 @@ ABIArgInfo X86_64ABIInfo::getIndirectReturnResult(QualType Ty) const {
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
       Ty = EnumTy->getDecl()->getIntegerType();
 
-    return (Ty->isPromotableIntegerType() ?
-            ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+    return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                          : ABIArgInfo::getDirect());
   }
 
   return getNaturalAlignIndirect(Ty);
@@ -2901,8 +2894,8 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty,
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
       Ty = EnumTy->getDecl()->getIntegerType();
 
-    return (Ty->isPromotableIntegerType() ?
-            ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+    return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                          : ABIArgInfo::getDirect());
   }
 
   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
@@ -3271,7 +3264,7 @@ classifyReturnType(QualType RetTy) const {
 
       if (RetTy->isIntegralOrEnumerationType() &&
           RetTy->isPromotableIntegerType())
-        return ABIArgInfo::getExtend();
+        return ABIArgInfo::getExtend(RetTy);
     }
     break;
 
@@ -3416,7 +3409,7 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
 
       if (Ty->isIntegralOrEnumerationType() &&
           Ty->isPromotableIntegerType())
-        return ABIArgInfo::getExtend();
+        return ABIArgInfo::getExtend(Ty);
     }
 
     break;
@@ -3941,7 +3934,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
   // extended.
   const BuiltinType *BT = Ty->getAs<BuiltinType>();
   if (BT && BT->getKind() == BuiltinType::Bool)
-    return ABIArgInfo::getExtend();
+    return ABIArgInfo::getExtend(Ty);
 
   // Mingw64 GCC uses the old 80 bit extended precision floating point unit. It
   // passes them indirectly through memory.
@@ -4694,8 +4687,8 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
                                    /*Realign=*/TyAlign > ABIAlign);
   }
 
-  return (isPromotableTypeForABI(Ty) ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (isPromotableTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
+                                     : ABIArgInfo::getDirect());
 }
 
 ABIArgInfo
@@ -4749,8 +4742,8 @@ PPC64_SVR4_ABIInfo::classifyReturnType(QualType RetTy) const {
     return getNaturalAlignIndirect(RetTy);
   }
 
-  return (isPromotableTypeForABI(RetTy) ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (isPromotableTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                        : ABIArgInfo::getDirect());
 }
 
 // Based on ARMABIInfo::EmitVAArg, adjusted for 64-bit machine.
@@ -4922,8 +4915,7 @@ class AArch64ABIInfo : public SwiftABIInfo {
   Address EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
                       QualType Ty) const override;
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type*> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
                                     bool asReturnValue) const override {
     return occupiesMoreThan(CGT, scalars, /*total*/ 4);
   }
@@ -5002,7 +4994,7 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty) const {
       Ty = EnumTy->getDecl()->getIntegerType();
 
     return (Ty->isPromotableIntegerType() && isDarwinPCS()
-                ? ABIArgInfo::getExtend()
+                ? ABIArgInfo::getExtend(Ty)
                 : ABIArgInfo::getDirect());
   }
 
@@ -5072,7 +5064,7 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy) const {
       RetTy = EnumTy->getDecl()->getIntegerType();
 
     return (RetTy->isPromotableIntegerType() && isDarwinPCS()
-                ? ABIArgInfo::getExtend()
+                ? ABIArgInfo::getExtend(RetTy)
                 : ABIArgInfo::getDirect());
   }
 
@@ -5521,8 +5513,7 @@ class ARMABIInfo : public SwiftABIInfo {
   llvm::CallingConv::ID getABIDefaultCC() const;
   void setCCs();
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type*> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
                                     bool asReturnValue) const override {
     return occupiesMoreThan(CGT, scalars, /*total*/ 4);
   }
@@ -5730,10 +5721,11 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
     return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
   }
 
-  // __fp16 gets passed as if it were an int or float, but with the top 16 bits
-  // unspecified. This is not done for OpenCL as it handles the half type
-  // natively, and does not need to interwork with AAPCS code.
-  if (Ty->isHalfType() && !getContext().getLangOpts().NativeHalfArgsAndReturns) {
+  // _Float16 and __fp16 get passed as if it were an int or float, but with
+  // the top 16 bits unspecified. This is not done for OpenCL as it handles the
+  // half type natively, and does not need to interwork with AAPCS code.
+  if ((Ty->isFloat16Type() || Ty->isHalfType()) &&
+      !getContext().getLangOpts().NativeHalfArgsAndReturns) {
     llvm::Type *ResType = IsEffectivelyAAPCS_VFP ?
       llvm::Type::getFloatTy(getVMContext()) :
       llvm::Type::getInt32Ty(getVMContext());
@@ -5746,7 +5738,7 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
       Ty = EnumTy->getDecl()->getIntegerType();
     }
 
-    return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend()
+    return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
                                           : ABIArgInfo::getDirect());
   }
 
@@ -5928,10 +5920,11 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
     return getNaturalAlignIndirect(RetTy);
   }
 
-  // __fp16 gets returned as if it were an int or float, but with the top 16
-  // bits unspecified. This is not done for OpenCL as it handles the half type
-  // natively, and does not need to interwork with AAPCS code.
-  if (RetTy->isHalfType() && !getContext().getLangOpts().NativeHalfArgsAndReturns) {
+  // _Float16 and __fp16 get returned as if it were an int or float, but with
+  // the top 16 bits unspecified. This is not done for OpenCL as it handles the
+  // half type natively, and does not need to interwork with AAPCS code.
+  if ((RetTy->isFloat16Type() || RetTy->isHalfType()) &&
+      !getContext().getLangOpts().NativeHalfArgsAndReturns) {
     llvm::Type *ResType = IsEffectivelyAAPCS_VFP ?
       llvm::Type::getFloatTy(getVMContext()) :
       llvm::Type::getInt32Ty(getVMContext());
@@ -5943,7 +5936,7 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
     if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
       RetTy = EnumTy->getDecl()->getIntegerType();
 
-    return RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend()
+    return RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
                                             : ABIArgInfo::getDirect();
   }
 
@@ -6176,8 +6169,8 @@ ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType RetTy) const {
   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
     RetTy = EnumTy->getDecl()->getIntegerType();
 
-  return (RetTy->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                           : ABIArgInfo::getDirect());
 }
 
 ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const {
@@ -6189,8 +6182,8 @@ ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const {
   if (isAggregateTypeForABI(Ty))
     return getNaturalAlignIndirect(Ty, /* byval */ true);
 
-  return (Ty->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                        : ABIArgInfo::getDirect());
 }
 
 void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const {
@@ -6313,8 +6306,7 @@ class SystemZABIInfo : public SwiftABIInfo {
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                     QualType Ty) const override;
 
-  bool shouldPassIndirectlyForSwift(CharUnits totalSize,
-                                    ArrayRef<llvm::Type*> scalars,
+  bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
                                     bool asReturnValue) const override {
     return occupiesMoreThan(CGT, scalars, /*total*/ 4);
   }
@@ -6586,8 +6578,8 @@ ABIArgInfo SystemZABIInfo::classifyReturnType(QualType RetTy) const {
     return ABIArgInfo::getDirect();
   if (isCompoundType(RetTy) || getContext().getTypeSize(RetTy) > 64)
     return getNaturalAlignIndirect(RetTy);
-  return (isPromotableIntegerType(RetTy) ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (isPromotableIntegerType(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                         : ABIArgInfo::getDirect());
 }
 
 ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
@@ -6597,7 +6589,7 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
 
   // Integers and enums are extended to full register width.
   if (isPromotableIntegerType(Ty))
-    return ABIArgInfo::getExtend();
+    return ABIArgInfo::getExtend(Ty);
 
   // Handle vector types and vector-like structure types.  Note that
   // as opposed to float-like structure types, we do not allow any
@@ -6705,7 +6697,7 @@ class MipsABIInfo : public ABIInfo {
   void computeInfo(CGFunctionInfo &FI) const override;
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                     QualType Ty) const override;
-  bool shouldSignExtUnsignedType(QualType Ty) const override;
+  ABIArgInfo extendType(QualType Ty) const;
 };
 
 class MIPSTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -6898,7 +6890,7 @@ MipsABIInfo::classifyArgumentType(QualType Ty, uint64_t &Offset) const {
 
   // All integral types are promoted to the GPR width.
   if (Ty->isIntegralOrEnumerationType())
-    return ABIArgInfo::getExtend();
+    return extendType(Ty);
 
   return ABIArgInfo::getDirect(
       nullptr, 0, IsO32 ? nullptr : getPaddingType(OrigOffset, CurrOffset));
@@ -6980,8 +6972,8 @@ ABIArgInfo MipsABIInfo::classifyReturnType(QualType RetTy) const {
   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
     RetTy = EnumTy->getDecl()->getIntegerType();
 
-  return (RetTy->isPromotableIntegerType() ?
-          ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+  return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                           : ABIArgInfo::getDirect());
 }
 
 void MipsABIInfo::computeInfo(CGFunctionInfo &FI) const {
@@ -7047,14 +7039,14 @@ Address MipsABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
   return Addr;
 }
 
-bool MipsABIInfo::shouldSignExtUnsignedType(QualType Ty) const {
+ABIArgInfo MipsABIInfo::extendType(QualType Ty) const {
   int TySize = getContext().getTypeSize(Ty);
 
   // MIPS64 ABI requires unsigned 32 bit integers to be sign extended.
   if (Ty->isUnsignedIntegerOrEnumerationType() && TySize == 32)
-    return true;
+    return ABIArgInfo::getSignExtend(Ty);
 
-  return false;
+  return ABIArgInfo::getExtend(Ty);
 }
 
 bool
@@ -7227,8 +7219,8 @@ ABIArgInfo HexagonABIInfo::classifyArgumentType(QualType Ty) const {
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
       Ty = EnumTy->getDecl()->getIntegerType();
 
-    return (Ty->isPromotableIntegerType() ?
-            ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+    return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
+                                          : ABIArgInfo::getDirect());
   }
 
   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
@@ -7265,8 +7257,8 @@ ABIArgInfo HexagonABIInfo::classifyReturnType(QualType RetTy) const {
     if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
       RetTy = EnumTy->getDecl()->getIntegerType();
 
-    return (RetTy->isPromotableIntegerType() ?
-            ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
+    return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend(RetTy)
+                                             : ABIArgInfo::getDirect());
   }
 
   if (isEmptyRecord(getContext(), RetTy, true))
@@ -7409,7 +7401,7 @@ ABIArgInfo LanaiABIInfo::classifyArgumentType(QualType Ty,
   if (Ty->isPromotableIntegerType()) {
     if (InReg)
       return ABIArgInfo::getDirectInReg();
-    return ABIArgInfo::getExtend();
+    return ABIArgInfo::getExtend(Ty);
   }
   if (InReg)
     return ABIArgInfo::getDirectInReg();
@@ -7991,7 +7983,7 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const {
 
   // Integer types smaller than a register are extended.
   if (Size < 64 && Ty->isIntegerType())
-    return ABIArgInfo::getExtend();
+    return ABIArgInfo::getExtend(Ty);
 
   // Other non-aggregates go in registers.
   if (!isAggregateTypeForABI(Ty))
@@ -8780,6 +8772,182 @@ static bool getTypeString(SmallStringEnc &Enc, const Decl *D,
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+// RISCV ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+class RISCVABIInfo : public DefaultABIInfo {
+private:
+  unsigned XLen; // Size of the integer ('x') registers in bits.
+  static const int NumArgGPRs = 8;
+
+public:
+  RISCVABIInfo(CodeGen::CodeGenTypes &CGT, unsigned XLen)
+      : DefaultABIInfo(CGT), XLen(XLen) {}
+
+  // DefaultABIInfo's classifyReturnType and classifyArgumentType are
+  // non-virtual, but computeInfo is virtual, so we overload it.
+  void computeInfo(CGFunctionInfo &FI) const override;
+
+  ABIArgInfo classifyArgumentType(QualType Ty, bool IsFixed,
+                                  int &ArgGPRsLeft) const;
+  ABIArgInfo classifyReturnType(QualType RetTy) const;
+
+  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                    QualType Ty) const override;
+
+  ABIArgInfo extendType(QualType Ty) const;
+};
+} // end anonymous namespace
+
+void RISCVABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  QualType RetTy = FI.getReturnType();
+  if (!getCXXABI().classifyReturnType(FI))
+    FI.getReturnInfo() = classifyReturnType(RetTy);
+
+  // IsRetIndirect is true if classifyArgumentType indicated the value should
+  // be passed indirect or if the type size is greater than 2*xlen. e.g. fp128
+  // is passed direct in LLVM IR, relying on the backend lowering code to
+  // rewrite the argument list and pass indirectly on RV32.
+  bool IsRetIndirect = FI.getReturnInfo().getKind() == ABIArgInfo::Indirect ||
+                       getContext().getTypeSize(RetTy) > (2 * XLen);
+
+  // We must track the number of GPRs used in order to conform to the RISC-V
+  // ABI, as integer scalars passed in registers should have signext/zeroext
+  // when promoted, but are anyext if passed on the stack. As GPR usage is
+  // different for variadic arguments, we must also track whether we are
+  // examining a vararg or not.
+  int ArgGPRsLeft = IsRetIndirect ? NumArgGPRs - 1 : NumArgGPRs;
+  int NumFixedArgs = FI.getNumRequiredArgs();
+
+  int ArgNum = 0;
+  for (auto &ArgInfo : FI.arguments()) {
+    bool IsFixed = ArgNum < NumFixedArgs;
+    ArgInfo.info = classifyArgumentType(ArgInfo.type, IsFixed, ArgGPRsLeft);
+    ArgNum++;
+  }
+}
+
+ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+                                              int &ArgGPRsLeft) const {
+  assert(ArgGPRsLeft <= NumArgGPRs && "Arg GPR tracking underflow");
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  // Structures with either a non-trivial destructor or a non-trivial
+  // copy constructor are always passed indirectly.
+  if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+    if (ArgGPRsLeft)
+      ArgGPRsLeft -= 1;
+    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+                                           CGCXXABI::RAA_DirectInMemory);
+  }
+
+  // Ignore empty structs/unions.
+  if (isEmptyRecord(getContext(), Ty, true))
+    return ABIArgInfo::getIgnore();
+
+  uint64_t Size = getContext().getTypeSize(Ty);
+  uint64_t NeededAlign = getContext().getTypeAlign(Ty);
+  bool MustUseStack = false;
+  // Determine the number of GPRs needed to pass the current argument
+  // according to the ABI. 2*XLen-aligned varargs are passed in "aligned"
+  // register pairs, so may consume 3 registers.
+  int NeededArgGPRs = 1;
+  if (!IsFixed && NeededAlign == 2 * XLen)
+    NeededArgGPRs = 2 + (ArgGPRsLeft % 2);
+  else if (Size > XLen && Size <= 2 * XLen)
+    NeededArgGPRs = 2;
+
+  if (NeededArgGPRs > ArgGPRsLeft) {
+    MustUseStack = true;
+    NeededArgGPRs = ArgGPRsLeft;
+  }
+
+  ArgGPRsLeft -= NeededArgGPRs;
+
+  if (!isAggregateTypeForABI(Ty) && !Ty->isVectorType()) {
+    // Treat an enum type as its underlying type.
+    if (const EnumType *EnumTy = Ty->getAs<EnumType>())
+      Ty = EnumTy->getDecl()->getIntegerType();
+
+    // All integral types are promoted to XLen width, unless passed on the
+    // stack.
+    if (Size < XLen && Ty->isIntegralOrEnumerationType() && !MustUseStack) {
+      return extendType(Ty);
+    }
+
+    return ABIArgInfo::getDirect();
+  }
+
+  // Aggregates which are <= 2*XLen will be passed in registers if possible,
+  // so coerce to integers.
+  if (Size <= 2 * XLen) {
+    unsigned Alignment = getContext().getTypeAlign(Ty);
+
+    // Use a single XLen int if possible, 2*XLen if 2*XLen alignment is
+    // required, and a 2-element XLen array if only XLen alignment is required.
+    if (Size <= XLen) {
+      return ABIArgInfo::getDirect(
+          llvm::IntegerType::get(getVMContext(), XLen));
+    } else if (Alignment == 2 * XLen) {
+      return ABIArgInfo::getDirect(
+          llvm::IntegerType::get(getVMContext(), 2 * XLen));
+    } else {
+      return ABIArgInfo::getDirect(llvm::ArrayType::get(
+          llvm::IntegerType::get(getVMContext(), XLen), 2));
+    }
+  }
+  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
+}
+
+ABIArgInfo RISCVABIInfo::classifyReturnType(QualType RetTy) const {
+  if (RetTy->isVoidType())
+    return ABIArgInfo::getIgnore();
+
+  int ArgGPRsLeft = 2;
+
+  // The rules for return and argument types are the same, so defer to
+  // classifyArgumentType.
+  return classifyArgumentType(RetTy, /*IsFixed=*/true, ArgGPRsLeft);
+}
+
+Address RISCVABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                                QualType Ty) const {
+  CharUnits SlotSize = CharUnits::fromQuantity(XLen / 8);
+
+  // Empty records are ignored for parameter passing purposes.
+  if (isEmptyRecord(getContext(), Ty, true)) {
+    Address Addr(CGF.Builder.CreateLoad(VAListAddr), SlotSize);
+    Addr = CGF.Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(Ty));
+    return Addr;
+  }
+
+  std::pair<CharUnits, CharUnits> SizeAndAlign =
+      getContext().getTypeInfoInChars(Ty);
+
+  // Arguments bigger than 2*Xlen bytes are passed indirectly.
+  bool IsIndirect = SizeAndAlign.first > 2 * SlotSize;
+
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect, SizeAndAlign,
+                          SlotSize, /*AllowHigherAlign=*/true);
+}
+
+ABIArgInfo RISCVABIInfo::extendType(QualType Ty) const {
+  int TySize = getContext().getTypeSize(Ty);
+  // RV64 ABI requires unsigned 32 bit integers to be sign extended.
+  if (XLen == 64 && Ty->isUnsignedIntegerOrEnumerationType() && TySize == 32)
+    return ABIArgInfo::getSignExtend(Ty);
+  return ABIArgInfo::getExtend(Ty);
+}
+
+namespace {
+class RISCVTargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  RISCVTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, unsigned XLen)
+      : TargetCodeGenInfo(new RISCVABIInfo(CGT, XLen)) {}
+};
+} // namespace
 
 //===----------------------------------------------------------------------===//
 // Driver code
@@ -8894,6 +9062,11 @@ const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() {
   case llvm::Triple::msp430:
     return SetCGInfo(new MSP430TargetCodeGenInfo(Types));
 
+  case llvm::Triple::riscv32:
+    return SetCGInfo(new RISCVTargetCodeGenInfo(Types, 32));
+  case llvm::Triple::riscv64:
+    return SetCGInfo(new RISCVTargetCodeGenInfo(Types, 64));
+
   case llvm::Triple::systemz: {
     bool HasVector = getTarget().getABI() == "vector";
     return SetCGInfo(new SystemZTargetCodeGenInfo(Types, HasVector));
diff --git a/lib/Driver/Action.cpp b/lib/Driver/Action.cpp
index 85e466a4409d..70ff131d7820 100644
--- a/lib/Driver/Action.cpp
+++ b/lib/Driver/Action.cpp
@@ -29,6 +29,7 @@ const char *Action::getClassName(ActionClass AC) {
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
   case MigrateJobClass: return "migrator";
+  case FortranFrontendJobClass: return "fortran-frontend";
   case CompileJobClass: return "compiler";
   case BackendJobClass: return "backend";
   case AssembleJobClass: return "assembler";
@@ -323,6 +324,12 @@ void MigrateJobAction::anchor() {}
 MigrateJobAction::MigrateJobAction(Action *Input, types::ID OutputType)
     : JobAction(MigrateJobClass, Input, OutputType) {}
 
+void FortranFrontendJobAction::anchor() {}
+
+FortranFrontendJobAction::FortranFrontendJobAction(Action *Input,
+                                             types::ID OutputType)
+    : JobAction(FortranFrontendJobClass, Input, OutputType) {}
+
 void CompileJobAction::anchor() {}
 
 CompileJobAction::CompileJobAction(Action *Input, types::ID OutputType)
diff --git a/lib/Driver/CMakeLists.txt b/lib/Driver/CMakeLists.txt
index 5bf91f2be981..e229b3bebdd7 100644
--- a/lib/Driver/CMakeLists.txt
+++ b/lib/Driver/CMakeLists.txt
@@ -25,6 +25,7 @@ add_clang_library(clangDriver
   ToolChains/Arch/ARM.cpp
   ToolChains/Arch/Mips.cpp
   ToolChains/Arch/PPC.cpp
+  ToolChains/Arch/RISCV.cpp
   ToolChains/Arch/Sparc.cpp
   ToolChains/Arch/SystemZ.cpp
   ToolChains/Arch/X86.cpp
@@ -40,6 +41,7 @@ add_clang_library(clangDriver
   ToolChains/Cuda.cpp
   ToolChains/Darwin.cpp
   ToolChains/DragonFly.cpp
+  ToolChains/Flang.cpp
   ToolChains/FreeBSD.cpp
   ToolChains/Fuchsia.cpp
   ToolChains/Gnu.cpp
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index 9ae33b80f889..4f7d320980d9 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -62,6 +62,7 @@
 #include "llvm/Option/OptSpecifier.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -70,6 +71,7 @@
 #include "llvm/Support/Program.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/StringSaver.h"
 #include <map>
 #include <memory>
 #include <utility>
@@ -92,7 +94,8 @@ Driver::Driver(StringRef ClangExecutable, StringRef DefaultTargetTriple,
       CCPrintHeadersFilename(nullptr), CCLogDiagnosticsFilename(nullptr),
       CCCPrintBindings(false), CCPrintHeaders(false), CCLogDiagnostics(false),
       CCGenDiagnostics(false), DefaultTargetTriple(DefaultTargetTriple),
-      CCCGenericGCCName(""), CheckInputsExist(true), CCCUsePCH(true),
+      CCCGenericGCCName(""), Saver(Alloc),
+      CheckInputsExist(true), CCCUsePCH(true),
       GenReproducer(false), SuppressMissingInputWarning(false) {
 
   // Provide a sane fallback if no VFS is specified.
@@ -103,6 +106,13 @@ Driver::Driver(StringRef ClangExecutable, StringRef DefaultTargetTriple,
   Dir = llvm::sys::path::parent_path(ClangExecutable);
   InstalledDir = Dir; // Provide a sensible default installed dir.
 
+#if defined(CLANG_CONFIG_FILE_SYSTEM_DIR)
+  SystemConfigDir = CLANG_CONFIG_FILE_SYSTEM_DIR;
+#endif
+#if defined(CLANG_CONFIG_FILE_USER_DIR)
+  UserConfigDir = CLANG_CONFIG_FILE_USER_DIR;
+#endif
+
   // Compute the path to the resource directory.
   StringRef ClangResourceDir(CLANG_RESOURCE_DIR);
   SmallString<128> P(Dir);
@@ -143,6 +153,7 @@ void Driver::setDriverModeFromOption(StringRef Opt) {
                          .Case("g++", GXXMode)
                          .Case("cpp", CPPMode)
                          .Case("cl", CLMode)
+                         .Case("fortran", FortranMode)
                          .Default(~0U);
 
   if (M != ~0U)
@@ -178,9 +189,19 @@ InputArgList Driver::ParseArgStrings(ArrayRef<const char *> ArgStrings,
   // Check for unsupported options.
   for (const Arg *A : Args) {
     if (A->getOption().hasFlag(options::Unsupported)) {
-      Diag(diag::err_drv_unsupported_opt) << A->getAsString(Args);
-      ContainsError |= Diags.getDiagnosticLevel(diag::err_drv_unsupported_opt,
-                                                SourceLocation()) >
+      unsigned DiagID;
+      auto ArgString = A->getAsString(Args);
+      std::string Nearest;
+      if (getOpts().findNearest(
+            ArgString, Nearest, IncludedFlagsBitmask,
+            ExcludedFlagsBitmask | options::Unsupported) > 1) {
+        DiagID = diag::err_drv_unsupported_opt;
+        Diag(DiagID) << ArgString;
+      } else {
+        DiagID = diag::err_drv_unsupported_opt_with_suggestion;
+        Diag(DiagID) << ArgString << Nearest;
+      }
+      ContainsError |= Diags.getDiagnosticLevel(DiagID, SourceLocation()) >
                        DiagnosticsEngine::Warning;
       continue;
     }
@@ -195,11 +216,20 @@ InputArgList Driver::ParseArgStrings(ArrayRef<const char *> ArgStrings,
   }
 
   for (const Arg *A : Args.filtered(options::OPT_UNKNOWN)) {
-    auto ID = IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl
-                         : diag::err_drv_unknown_argument;
-
-    Diags.Report(ID) << A->getAsString(Args);
-    ContainsError |= Diags.getDiagnosticLevel(ID, SourceLocation()) >
+    unsigned DiagID;
+    auto ArgString = A->getAsString(Args);
+    std::string Nearest;
+    if (getOpts().findNearest(
+          ArgString, Nearest, IncludedFlagsBitmask, ExcludedFlagsBitmask) > 1) {
+      DiagID = IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl
+                          : diag::err_drv_unknown_argument;
+      Diags.Report(DiagID) << ArgString;
+    } else {
+      DiagID = IsCLMode() ? diag::warn_drv_unknown_argument_clang_cl_with_suggestion
+                          : diag::err_drv_unknown_argument_with_suggestion;
+      Diags.Report(DiagID) << ArgString << Nearest;
+    }
+    ContainsError |= Diags.getDiagnosticLevel(DiagID, SourceLocation()) >
                      DiagnosticsEngine::Warning;
   }
 
@@ -221,6 +251,10 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL,
       (PhaseArg = DAL.getLastArg(options::OPT__SLASH_P))) {
     FinalPhase = phases::Preprocess;
 
+    // -fsyntax-only stops Fortran compilation after FortranFrontend
+  } else if (IsFortranMode() && (PhaseArg = DAL.getLastArg(options::OPT_fsyntax_only))) {
+    FinalPhase = phases::FortranFrontend;
+
     // --precompile only runs up to precompilation.
   } else if ((PhaseArg = DAL.getLastArg(options::OPT__precompile))) {
     FinalPhase = phases::Precompile;
@@ -600,6 +634,216 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
   //
 }
 
+/// Looks the given directories for the specified file.
+///
+/// \param[out] FilePath File path, if the file was found.
+/// \param[in]  Dirs Directories used for the search.
+/// \param[in]  FileName Name of the file to search for.
+/// \return True if file was found.
+///
+/// Looks for file specified by FileName sequentially in directories specified
+/// by Dirs.
+///
+static bool searchForFile(SmallVectorImpl<char> &FilePath,
+                          ArrayRef<std::string> Dirs,
+                          StringRef FileName) {
+  SmallString<128> WPath;
+  for (const StringRef &Dir : Dirs) {
+    if (Dir.empty())
+      continue;
+    WPath.clear();
+    llvm::sys::path::append(WPath, Dir, FileName);
+    llvm::sys::path::native(WPath);
+    if (llvm::sys::fs::is_regular_file(WPath)) {
+      FilePath = std::move(WPath);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Driver::readConfigFile(StringRef FileName) {
+  // Try reading the given file.
+  SmallVector<const char *, 32> NewCfgArgs;
+  if (!llvm::cl::readConfigFile(FileName, Saver, NewCfgArgs)) {
+    Diag(diag::err_drv_cannot_read_config_file) << FileName;
+    return true;
+  }
+
+  // Read options from config file.
+  llvm::SmallString<128> CfgFileName(FileName);
+  llvm::sys::path::native(CfgFileName);
+  ConfigFile = CfgFileName.str();
+  bool ContainErrors;
+  CfgOptions = llvm::make_unique<InputArgList>(
+      ParseArgStrings(NewCfgArgs, ContainErrors));
+  if (ContainErrors) {
+    CfgOptions.reset();
+    return true;
+  }
+
+  if (CfgOptions->hasArg(options::OPT_config)) {
+    CfgOptions.reset();
+    Diag(diag::err_drv_nested_config_file);
+    return true;
+  }
+
+  // Claim all arguments that come from a configuration file so that the driver
+  // does not warn on any that is unused.
+  for (Arg *A : *CfgOptions)
+    A->claim();
+  return false;
+}
+
+bool Driver::loadConfigFile() {
+  std::string CfgFileName;
+  bool FileSpecifiedExplicitly = false;
+
+  // Process options that change search path for config files.
+  if (CLOptions) {
+    if (CLOptions->hasArg(options::OPT_config_system_dir_EQ)) {
+      SmallString<128> CfgDir;
+      CfgDir.append(
+          CLOptions->getLastArgValue(options::OPT_config_system_dir_EQ));
+      if (!CfgDir.empty()) {
+        if (llvm::sys::fs::make_absolute(CfgDir).value() != 0)
+          SystemConfigDir.clear();
+        else
+          SystemConfigDir = std::string(CfgDir.begin(), CfgDir.end());
+      }
+    }
+    if (CLOptions->hasArg(options::OPT_config_user_dir_EQ)) {
+      SmallString<128> CfgDir;
+      CfgDir.append(
+          CLOptions->getLastArgValue(options::OPT_config_user_dir_EQ));
+      if (!CfgDir.empty()) {
+        if (llvm::sys::fs::make_absolute(CfgDir).value() != 0)
+          UserConfigDir.clear();
+        else
+          UserConfigDir = std::string(CfgDir.begin(), CfgDir.end());
+      }
+    }
+  }
+
+  // First try to find config file specified in command line.
+  if (CLOptions) {
+    std::vector<std::string> ConfigFiles =
+        CLOptions->getAllArgValues(options::OPT_config);
+    if (ConfigFiles.size() > 1) {
+      Diag(diag::err_drv_duplicate_config);
+      return true;
+    }
+
+    if (!ConfigFiles.empty()) {
+      CfgFileName = ConfigFiles.front();
+      assert(!CfgFileName.empty());
+
+      // If argument contains directory separator, treat it as a path to
+      // configuration file.
+      if (llvm::sys::path::has_parent_path(CfgFileName)) {
+        SmallString<128> CfgFilePath;
+        if (llvm::sys::path::is_relative(CfgFileName))
+          llvm::sys::fs::current_path(CfgFilePath);
+        llvm::sys::path::append(CfgFilePath, CfgFileName);
+        if (!llvm::sys::fs::is_regular_file(CfgFilePath)) {
+          Diag(diag::err_drv_config_file_not_exist) << CfgFilePath;
+          return true;
+        }
+        return readConfigFile(CfgFilePath);
+      }
+
+      FileSpecifiedExplicitly = true;
+    }
+  }
+
+  // If config file is not specified explicitly, try to deduce configuration
+  // from executable name. For instance, an executable 'armv7l-clang' will
+  // search for config file 'armv7l-clang.cfg'.
+  if (CfgFileName.empty() && !ClangNameParts.TargetPrefix.empty())
+    CfgFileName = ClangNameParts.TargetPrefix + '-' + ClangNameParts.ModeSuffix;
+
+  if (CfgFileName.empty())
+    return false;
+
+  // Determine architecture part of the file name, if it is present.
+  StringRef CfgFileArch = CfgFileName;
+  size_t ArchPrefixLen = CfgFileArch.find('-');
+  if (ArchPrefixLen == StringRef::npos)
+    ArchPrefixLen = CfgFileArch.size();
+  llvm::Triple CfgTriple;
+  CfgFileArch = CfgFileArch.take_front(ArchPrefixLen);
+  CfgTriple = llvm::Triple(llvm::Triple::normalize(CfgFileArch));
+  if (CfgTriple.getArch() == llvm::Triple::ArchType::UnknownArch)
+    ArchPrefixLen = 0;
+
+  if (!StringRef(CfgFileName).endswith(".cfg"))
+    CfgFileName += ".cfg";
+
+  // If config file starts with architecture name and command line options
+  // redefine architecture (with options like -m32 -LE etc), try finding new
+  // config file with that architecture.
+  SmallString<128> FixedConfigFile;
+  size_t FixedArchPrefixLen = 0;
+  if (ArchPrefixLen) {
+    // Get architecture name from config file name like 'i386.cfg' or
+    // 'armv7l-clang.cfg'.
+    // Check if command line options changes effective triple.
+    llvm::Triple EffectiveTriple = computeTargetTriple(*this,
+                                             CfgTriple.getTriple(), *CLOptions);
+    if (CfgTriple.getArch() != EffectiveTriple.getArch()) {
+      FixedConfigFile = EffectiveTriple.getArchName();
+      FixedArchPrefixLen = FixedConfigFile.size();
+      // Append the rest of original file name so that file name transforms
+      // like: i386-clang.cfg -> x86_64-clang.cfg.
+      if (ArchPrefixLen < CfgFileName.size())
+        FixedConfigFile += CfgFileName.substr(ArchPrefixLen);
+    }
+  }
+
+  // Prepare list of directories where config file is searched for.
+  SmallVector<std::string, 3> CfgFileSearchDirs;
+  CfgFileSearchDirs.push_back(UserConfigDir);
+  CfgFileSearchDirs.push_back(SystemConfigDir);
+  CfgFileSearchDirs.push_back(Dir);
+
+  // Try to find config file. First try file with corrected architecture.
+  llvm::SmallString<128> CfgFilePath;
+  if (!FixedConfigFile.empty()) {
+    if (searchForFile(CfgFilePath, CfgFileSearchDirs, FixedConfigFile))
+      return readConfigFile(CfgFilePath);
+    // If 'x86_64-clang.cfg' was not found, try 'x86_64.cfg'.
+    FixedConfigFile.resize(FixedArchPrefixLen);
+    FixedConfigFile.append(".cfg");
+    if (searchForFile(CfgFilePath, CfgFileSearchDirs, FixedConfigFile))
+      return readConfigFile(CfgFilePath);
+  }
+
+  // Then try original file name.
+  if (searchForFile(CfgFilePath, CfgFileSearchDirs, CfgFileName))
+    return readConfigFile(CfgFilePath);
+
+  // Finally try removing driver mode part: 'x86_64-clang.cfg' -> 'x86_64.cfg'.
+  if (!ClangNameParts.ModeSuffix.empty() &&
+      !ClangNameParts.TargetPrefix.empty()) {
+    CfgFileName.assign(ClangNameParts.TargetPrefix);
+    CfgFileName.append(".cfg");
+    if (searchForFile(CfgFilePath, CfgFileSearchDirs, CfgFileName))
+      return readConfigFile(CfgFilePath);
+  }
+
+  // Report error but only if config file was specified explicitly, by option
+  // --config. If it was deduced from executable name, it is not an error.
+  if (FileSpecifiedExplicitly) {
+    Diag(diag::err_drv_config_file_not_found) << CfgFileName;
+    for (const std::string &SearchDir : CfgFileSearchDirs)
+      if (!SearchDir.empty())
+        Diag(diag::note_drv_config_file_searched_in) << SearchDir;
+    return true;
+  }
+
+  return false;
+}
+
 Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   llvm::PrettyStackTraceString CrashInfo("Compilation construction");
 
@@ -623,12 +867,35 @@ Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
 
   // FIXME: What are we going to do with -V and -b?
 
+  // Arguments specified in command line.
+  bool ContainsError;
+  CLOptions = llvm::make_unique<InputArgList>(
+      ParseArgStrings(ArgList.slice(1), ContainsError));
+
+  // Try parsing configuration file.
+  if (!ContainsError)
+    ContainsError = loadConfigFile();
+  bool HasConfigFile = !ContainsError && (CfgOptions.get() != nullptr);
+
+  // All arguments, from both config file and command line.
+  InputArgList Args = std::move(HasConfigFile ? std::move(*CfgOptions)
+                                              : std::move(*CLOptions));
+  if (HasConfigFile)
+    for (auto *Opt : *CLOptions) {
+      const Arg *BaseArg = &Opt->getBaseArg();
+      if (BaseArg == Opt)
+        BaseArg = nullptr;
+      Arg *Copy = new llvm::opt::Arg(Opt->getOption(), Opt->getSpelling(),
+                                     Args.size(), BaseArg);
+      Copy->getValues() = Opt->getValues();
+      if (Opt->isClaimed())
+        Copy->claim();
+      Args.append(Copy);
+    }
+
   // FIXME: This stuff needs to go into the Compilation, not the driver.
   bool CCCPrintPhases;
 
-  bool ContainsError;
-  InputArgList Args = ParseArgStrings(ArgList.slice(1), ContainsError);
-
   // Silence driver warnings if requested
   Diags.setIgnoreAllWarnings(Args.hasArg(options::OPT_w));
 
@@ -1144,6 +1411,10 @@ void Driver::PrintVersion(const Compilation &C, raw_ostream &OS) const {
 
   // Print out the install directory.
   OS << "InstalledDir: " << InstalledDir << '\n';
+
+  // If configuration file was used, print its path.
+  if (!ConfigFile.empty())
+    OS << "Configuration file: " << ConfigFile << '\n';
 }
 
 /// PrintDiagnosticCategories - Implement the --print-diagnostic-categories
@@ -1250,6 +1521,15 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     SuppressMissingInputWarning = true;
   }
 
+  if (C.getArgs().hasArg(options::OPT_v)) {
+    if (!SystemConfigDir.empty())
+      llvm::errs() << "System configuration file directory: "
+                   << SystemConfigDir << "\n";
+    if (!UserConfigDir.empty())
+      llvm::errs() << "User configuration file directory: "
+                   << UserConfigDir << "\n";
+  }
+
   const ToolChain &TC = C.getDefaultToolChain();
 
   if (C.getArgs().hasArg(options::OPT_v))
@@ -2767,6 +3047,13 @@ Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
     }
     return C.MakeAction<PrecompileJobAction>(Input, OutputTy);
   }
+  case phases::FortranFrontend: {
+    if (Args.hasArg(options::OPT_fsyntax_only))
+      return C.MakeAction<FortranFrontendJobAction>(Input,
+                                                    types::TY_Nothing);
+    return C.MakeAction<FortranFrontendJobAction>(Input,
+                                                  types::TY_LLVM_IR);
+  }
   case phases::Compile: {
     if (Args.hasArg(options::OPT_fsyntax_only))
       return C.MakeAction<CompileJobAction>(Input, types::TY_Nothing);
diff --git a/lib/Driver/Phases.cpp b/lib/Driver/Phases.cpp
index 7ae270857f4d..5f5e206f7d98 100644
--- a/lib/Driver/Phases.cpp
+++ b/lib/Driver/Phases.cpp
@@ -17,6 +17,7 @@ const char *phases::getPhaseName(ID Id) {
   switch (Id) {
   case Preprocess: return "preprocessor";
   case Precompile: return "precompiler";
+  case FortranFrontend: return "fortran-frontend";
   case Compile: return "compiler";
   case Backend: return "backend";
   case Assemble: return "assembler";
diff --git a/lib/Driver/SanitizerArgs.cpp b/lib/Driver/SanitizerArgs.cpp
index 3c985a1f71d7..f1d6044033fc 100644
--- a/lib/Driver/SanitizerArgs.cpp
+++ b/lib/Driver/SanitizerArgs.cpp
@@ -112,7 +112,7 @@ static bool getDefaultBlacklist(const Driver &D, SanitizerMask Kinds,
 
   if (BlacklistFile) {
     clang::SmallString<64> Path(D.ResourceDir);
-    llvm::sys::path::append(Path, BlacklistFile);
+    llvm::sys::path::append(Path, "share", BlacklistFile);
     BLPath = Path.str();
     return true;
   }
diff --git a/lib/Driver/ToolChain.cpp b/lib/Driver/ToolChain.cpp
index f96a1182e3ca..eab990e236ae 100644
--- a/lib/Driver/ToolChain.cpp
+++ b/lib/Driver/ToolChain.cpp
@@ -11,6 +11,7 @@
 #include "ToolChains/CommonArgs.h"
 #include "ToolChains/Arch/ARM.h"
 #include "ToolChains/Clang.h"
+#include "ToolChains/Flang.h"
 #include "clang/Basic/ObjCRuntime.h"
 #include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Config/config.h"
@@ -137,6 +138,7 @@ const DriverSuffix *FindDriverSuffix(StringRef ProgName, size_t &Pos) {
       {"cpp", "--driver-mode=cpp"},
       {"cl", "--driver-mode=cl"},
       {"++", "--driver-mode=g++"},
+      {"flang", "--driver-mode=fortran"},
   };
 
   for (size_t i = 0; i < llvm::array_lengthof(DriverSuffixes); ++i) {
@@ -255,6 +257,12 @@ Tool *ToolChain::getAssemble() const {
   return Assemble.get();
 }
 
+Tool *ToolChain::getFlangFrontend() const {
+  if (!FlangFrontend)
+    FlangFrontend.reset(new tools::FlangFrontend(*this));
+  return FlangFrontend.get();
+}
+
 Tool *ToolChain::getClangAs() const {
   if (!Assemble)
     Assemble.reset(new tools::ClangAs(*this));
@@ -301,6 +309,9 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
   case Action::OffloadBundlingJobClass:
   case Action::OffloadUnbundlingJobClass:
     return getOffloadBundler();
+
+  case Action::FortranFrontendJobClass:
+    return getFlangFrontend();
   }
 
   llvm_unreachable("Invalid tool kind.");
@@ -749,6 +760,49 @@ void ToolChain::AddCCKextLibArgs(const ArgList &Args,
   CmdArgs.push_back("-lcc_kext");
 }
 
+void ToolChain::AddFortranStdlibLibArgs(const ArgList &Args,
+                                    ArgStringList &CmdArgs) const {
+ bool staticFlangLibs = false;
+ bool useOpenMP = false;
+
+  if (Args.hasArg(options::OPT_staticFlangLibs)) {
+    for (auto *A: Args.filtered(options::OPT_staticFlangLibs)) {
+      A->claim();
+      staticFlangLibs = true;
+    }
+  }
+
+  Arg *A = Args.getLastArg(options::OPT_mp, options::OPT_nomp,
+                           options::OPT_fopenmp, options::OPT_fno_openmp);
+  if (A &&
+      (A->getOption().matches(options::OPT_mp) ||
+       A->getOption().matches(options::OPT_fopenmp))) {
+      useOpenMP = true;
+  }
+
+  if (staticFlangLibs) {
+    CmdArgs.push_back("-Bstatic");
+  }
+  CmdArgs.push_back("-lflang");
+  CmdArgs.push_back("-lflangrti");
+  if( useOpenMP ) {
+    CmdArgs.push_back("-lomp");
+  }
+  else {
+    CmdArgs.push_back("-lompstub");
+  }
+  if( staticFlangLibs ) {
+    CmdArgs.push_back("-Bdynamic");
+  }
+
+  CmdArgs.push_back("-lm");
+  CmdArgs.push_back("-lrt");
+
+  // Allways link Fortran executables with Pthreads
+  CmdArgs.push_back("-lpthread");
+
+}
+
 bool ToolChain::AddFastMathRuntimeIfAvailable(const ArgList &Args,
                                               ArgStringList &CmdArgs) const {
   // Do not check for -fno-fast-math or -fno-unsafe-math when -Ofast passed
diff --git a/lib/Driver/ToolChains/Ananas.cpp b/lib/Driver/ToolChains/Ananas.cpp
index ee072cc03e7c..8b70aea103b5 100644
--- a/lib/Driver/ToolChains/Ananas.cpp
+++ b/lib/Driver/ToolChains/Ananas.cpp
@@ -64,8 +64,19 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (!D.SysRoot.empty())
     CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot));
 
-  // Ananas only supports static linkage for now.
-  CmdArgs.push_back("-Bstatic");
+  if (Args.hasArg(options::OPT_static)) {
+    CmdArgs.push_back("-Bstatic");
+  } else {
+    if (Args.hasArg(options::OPT_rdynamic))
+      CmdArgs.push_back("-export-dynamic");
+    if (Args.hasArg(options::OPT_shared)) {
+      CmdArgs.push_back("-Bshareable");
+    } else {
+      Args.AddAllArgs(CmdArgs, options::OPT_pie);
+      CmdArgs.push_back("-dynamic-linker");
+      CmdArgs.push_back("/lib/ld-ananas.so");
+    }
+  }
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
@@ -75,9 +86,15 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
-    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o")));
+    if (!Args.hasArg(options::OPT_shared)) {
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o")));
+    }
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o")));
-    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtbegin.o")));
+    if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie)) {
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtbeginS.o")));
+    } else {
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtbegin.o")));
+    }
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_L);
@@ -97,7 +114,10 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-lc");
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
-    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o")));
+    if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie))
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtendS.o")));
+    else
+      CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o")));
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o")));
   }
 
diff --git a/lib/Driver/ToolChains/Arch/RISCV.cpp b/lib/Driver/ToolChains/Arch/RISCV.cpp
new file mode 100644
index 000000000000..2b4ae88365a6
--- /dev/null
+++ b/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -0,0 +1,61 @@
+//===--- RISCV.cpp - RISCV Helpers for Tools --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace clang::driver;
+using namespace clang::driver::tools;
+using namespace clang;
+using namespace llvm::opt;
+
+void riscv::getRISCVTargetFeatures(const Driver &D, const ArgList &Args,
+                                   std::vector<StringRef> &Features) {
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+    StringRef MArch = A->getValue();
+    // TODO: handle rv64
+    std::pair<StringRef, StringRef> MArchSplit = StringRef(MArch).split("rv32");
+    if (!MArchSplit.second.size())
+      return;
+
+    for (char c : MArchSplit.second) {
+      switch (c) {
+      case 'i':
+        break;
+      case 'm':
+        Features.push_back("+m");
+        break;
+      case 'a':
+        Features.push_back("+a");
+        break;
+      case 'f':
+        Features.push_back("+f");
+        break;
+      case 'd':
+        Features.push_back("+d");
+        break;
+      case 'c':
+        Features.push_back("+c");
+        break;
+      }
+    }
+  }
+}
+
+StringRef riscv::getRISCVABI(const ArgList &Args, const llvm::Triple &Triple) {
+  if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ))
+    return A->getValue();
+
+  return Triple.getArch() == llvm::Triple::riscv32 ? "ilp32" : "lp64";
+}
diff --git a/lib/Driver/ToolChains/Arch/RISCV.h b/lib/Driver/ToolChains/Arch/RISCV.h
new file mode 100644
index 000000000000..beda14979fab
--- /dev/null
+++ b/lib/Driver/ToolChains/Arch/RISCV.h
@@ -0,0 +1,32 @@
+//===--- RISCV.h - RISCV-specific Tool Helpers ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_RISCV_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_RISCV_H
+
+#include "clang/Driver/Driver.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Option/Option.h"
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace driver {
+namespace tools {
+namespace riscv {
+void getRISCVTargetFeatures(const Driver &D, const llvm::opt::ArgList &Args,
+                            std::vector<llvm::StringRef> &Features);
+StringRef getRISCVABI(const llvm::opt::ArgList &Args,
+                      const llvm::Triple &Triple);
+} // end namespace riscv
+} // namespace tools
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ARCH_RISCV_H
diff --git a/lib/Driver/ToolChains/Arch/X86.cpp b/lib/Driver/ToolChains/Arch/X86.cpp
index a18b2aa35b03..7a4f836d2e1a 100644
--- a/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/lib/Driver/ToolChains/Arch/X86.cpp
@@ -40,26 +40,29 @@ const char *x86::getX86TargetCPU(const ArgList &Args,
       return Args.MakeArgString(CPU);
   }
 
-  if (const Arg *A = Args.getLastArg(options::OPT__SLASH_arch)) {
-    // Mapping built by referring to X86TargetInfo::getDefaultFeatures().
+  if (const Arg *A = Args.getLastArgNoClaim(options::OPT__SLASH_arch)) {
+    // Mapping built by looking at lib/Basic's X86TargetInfo::initFeatureMap().
     StringRef Arch = A->getValue();
-    const char *CPU;
-    if (Triple.getArch() == llvm::Triple::x86) {
+    const char *CPU = nullptr;
+    if (Triple.getArch() == llvm::Triple::x86) {  // 32-bit-only /arch: flags.
       CPU = llvm::StringSwitch<const char *>(Arch)
                 .Case("IA32", "i386")
                 .Case("SSE", "pentium3")
                 .Case("SSE2", "pentium4")
-                .Case("AVX", "sandybridge")
-                .Case("AVX2", "haswell")
                 .Default(nullptr);
-    } else {
+    }
+    if (CPU == nullptr) {  // 32-bit and 64-bit /arch: flags.
       CPU = llvm::StringSwitch<const char *>(Arch)
                 .Case("AVX", "sandybridge")
                 .Case("AVX2", "haswell")
+                .Case("AVX512F", "knl")
+                .Case("AVX512", "skylake-avx512")
                 .Default(nullptr);
     }
-    if (CPU)
+    if (CPU) {
+      A->claim();
       return CPU;
+    }
   }
 
   // Select the default CPU if none was given (or detection failed).
@@ -141,30 +144,6 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
       Features.push_back("+ssse3");
   }
 
-  // Set features according to the -arch flag on MSVC.
-  if (Arg *A = Args.getLastArg(options::OPT__SLASH_arch)) {
-    StringRef Arch = A->getValue();
-    bool ArchUsed = false;
-    // First, look for flags that are shared in x86 and x86-64.
-    if (ArchType == llvm::Triple::x86_64 || ArchType == llvm::Triple::x86) {
-      if (Arch == "AVX" || Arch == "AVX2") {
-        ArchUsed = true;
-        Features.push_back(Args.MakeArgString("+" + Arch.lower()));
-      }
-    }
-    // Then, look for x86-specific flags.
-    if (ArchType == llvm::Triple::x86) {
-      if (Arch == "IA32") {
-        ArchUsed = true;
-      } else if (Arch == "SSE" || Arch == "SSE2") {
-        ArchUsed = true;
-        Features.push_back(Args.MakeArgString("+" + Arch.lower()));
-      }
-    }
-    if (!ArchUsed)
-      D.Diag(clang::diag::warn_drv_unused_argument) << A->getAsString(Args);
-  }
-
   // Now add any that the user explicitly requested on the command line,
   // which may override the defaults.
   handleTargetFeaturesGroup(Args, Features, options::OPT_m_x86_Features_Group);
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index c445669a5ad4..ef5e64bc9443 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -12,6 +12,7 @@
 #include "Arch/ARM.h"
 #include "Arch/Mips.h"
 #include "Arch/PPC.h"
+#include "Arch/RISCV.h"
 #include "Arch/Sparc.h"
 #include "Arch/SystemZ.h"
 #include "Arch/X86.h"
@@ -327,6 +328,10 @@ static void getTargetFeatures(const ToolChain &TC, const llvm::Triple &Triple,
   case llvm::Triple::ppc64le:
     ppc::getPPCTargetFeatures(D, Triple, Args, Features);
     break;
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64:
+    riscv::getRISCVTargetFeatures(D, Args, Features);
+    break;
   case llvm::Triple::systemz:
     systemz::getSystemZTargetFeatures(Args, Features);
     break;
@@ -547,6 +552,14 @@ static bool useFramePointerForTargetByDefault(const ArgList &Args,
     }
   }
 
+  switch (Triple.getArch()) {
+    case llvm::Triple::riscv32:
+    case llvm::Triple::riscv64:
+      return !areOptimizationsEnabled(Args);
+    default:
+      break;
+  }
+
   if (Triple.isOSWindows()) {
     switch (Triple.getArch()) {
     case llvm::Triple::x86:
@@ -1282,6 +1295,8 @@ static bool isSignedCharDefault(const llvm::Triple &Triple) {
 
   case llvm::Triple::hexagon:
   case llvm::Triple::ppc64le:
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64:
   case llvm::Triple::systemz:
   case llvm::Triple::xcore:
     return false;
@@ -1391,6 +1406,11 @@ void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
     AddPPCTargetArgs(Args, CmdArgs);
     break;
 
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64:
+    AddRISCVTargetArgs(Args, CmdArgs);
+    break;
+
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
   case llvm::Triple::sparcv9:
@@ -1668,6 +1688,25 @@ void Clang::AddPPCTargetArgs(const ArgList &Args,
   }
 }
 
+void Clang::AddRISCVTargetArgs(const ArgList &Args,
+                               ArgStringList &CmdArgs) const {
+  // FIXME: currently defaults to the soft-float ABIs. Will need to be
+  // expanded to select ilp32f, ilp32d, lp64f, lp64d when appropiate.
+  const char *ABIName = nullptr;
+  const llvm::Triple &Triple = getToolChain().getTriple();
+  if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ))
+    ABIName = A->getValue();
+  else if (Triple.getArch() == llvm::Triple::riscv32)
+    ABIName = "ilp32";
+  else if (Triple.getArch() == llvm::Triple::riscv64)
+    ABIName = "lp64";
+  else
+    llvm_unreachable("Unexpected triple!");
+
+  CmdArgs.push_back("-target-abi");
+  CmdArgs.push_back(ABIName);
+}
+
 void Clang::AddSparcTargetArgs(const ArgList &Args,
                                ArgStringList &CmdArgs) const {
   sparc::FloatABI FloatABI =
@@ -1722,6 +1761,9 @@ void Clang::AddX86TargetArgs(const ArgList &Args,
       getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument)
           << A->getOption().getName() << Value;
     }
+  } else if (getToolChain().getDriver().IsCLMode()) {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("-x86-asm-syntax=intel");
   }
 
   // Set flags to support MCU ABI.
@@ -3583,7 +3625,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       options::OPT_finstrument_function_entry_bare))
     A->render(Args, CmdArgs);
 
-  addPGOAndCoverageFlags(C, D, Output, Args, CmdArgs);
+  // NVPTX doesn't support PGO or coverage. There's no runtime support for
+  // sampling, overhead of call arc collection is way too high and there's no
+  // way to collect the output.
+  if (!Triple.isNVPTX())
+    addPGOAndCoverageFlags(C, D, Output, Args, CmdArgs);
 
   if (auto *ABICompatArg = Args.getLastArg(options::OPT_fclang_abi_compat_EQ))
     ABICompatArg->render(Args, CmdArgs);
@@ -3798,6 +3844,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(A->getValue());
   }
 
+  if (Args.hasFlag(options::OPT_fstack_size_section,
+                   options::OPT_fno_stack_size_section, RawTriple.isPS4()))
+    CmdArgs.push_back("-fstack-size-section");
+
   CmdArgs.push_back("-ferror-limit");
   if (Arg *A = Args.getLastArg(options::OPT_ferror_limit_EQ))
     CmdArgs.push_back(A->getValue());
@@ -4004,6 +4054,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // Forward -cl options to -cc1
   RenderOpenCLOptions(Args, CmdArgs);
 
+  if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
+    CmdArgs.push_back(
+        Args.MakeArgString(Twine("-fcf-protection=") + A->getValue()));
+  }
+
   // Forward -f options with positive and negative forms; we translate
   // these by hand.
   if (Arg *A = getLastProfileSampleUseArg(Args)) {
@@ -4639,6 +4694,37 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fwhole-program-vtables");
   }
 
+  if (Arg *A = Args.getLastArg(options::OPT_fexperimental_isel,
+                               options::OPT_fno_experimental_isel)) {
+    CmdArgs.push_back("-mllvm");
+    if (A->getOption().matches(options::OPT_fexperimental_isel)) {
+      CmdArgs.push_back("-global-isel=1");
+
+      // GISel is on by default on AArch64 -O0, so don't bother adding
+      // the fallback remarks for it. Other combinations will add a warning of
+      // some kind.
+      bool IsArchSupported = Triple.getArch() == llvm::Triple::aarch64;
+      bool IsOptLevelSupported = false;
+
+      Arg *A = Args.getLastArg(options::OPT_O_Group);
+      if (Triple.getArch() == llvm::Triple::aarch64) {
+        if (!A || A->getOption().matches(options::OPT_O0))
+          IsOptLevelSupported = true;
+      }
+      if (!IsArchSupported || !IsOptLevelSupported) {
+        CmdArgs.push_back("-mllvm");
+        CmdArgs.push_back("-global-isel-abort=2");
+
+        if (!IsArchSupported)
+          D.Diag(diag::warn_drv_experimental_isel_incomplete) << Triple.getArchName();
+        else
+          D.Diag(diag::warn_drv_experimental_isel_incomplete_opt);
+      }
+    } else {
+      CmdArgs.push_back("-global-isel=0");
+    }
+  }
+
   // Finally add the compile command to the compilation.
   if (Args.hasArg(options::OPT__SLASH_fallback) &&
       Output.getType() == types::TY_Object &&
@@ -5070,6 +5156,10 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
     else
       CmdArgs.push_back("msvc");
   }
+
+  if (Args.hasArg(options::OPT__SLASH_Guard) &&
+      Args.getLastArgValue(options::OPT__SLASH_Guard).equals_lower("cf"))
+    CmdArgs.push_back("-cfguard");
 }
 
 visualstudio::Compiler *Clang::getCLFallback() const {
diff --git a/lib/Driver/ToolChains/Clang.h b/lib/Driver/ToolChains/Clang.h
index e23822b9c678..8893d88a8ca9 100644
--- a/lib/Driver/ToolChains/Clang.h
+++ b/lib/Driver/ToolChains/Clang.h
@@ -60,6 +60,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
                         llvm::opt::ArgStringList &CmdArgs) const;
   void AddR600TargetArgs(const llvm::opt::ArgList &Args,
                          llvm::opt::ArgStringList &CmdArgs) const;
+  void AddRISCVTargetArgs(const llvm::opt::ArgList &Args,
+                          llvm::opt::ArgStringList &CmdArgs) const;
   void AddSparcTargetArgs(const llvm::opt::ArgList &Args,
                           llvm::opt::ArgStringList &CmdArgs) const;
   void AddSystemZTargetArgs(const llvm::opt::ArgList &Args,
diff --git a/lib/Driver/ToolChains/CommonArgs.cpp b/lib/Driver/ToolChains/CommonArgs.cpp
index f26880123d8c..9d272130e996 100644
--- a/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/lib/Driver/ToolChains/CommonArgs.cpp
@@ -56,6 +56,23 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;
 
+/// \brief Determine if Fortran link libraies are needed
+bool tools::needFortranLibs(const Driver &D, const ArgList &Args) {
+  if (D.IsFortranMode() && !Args.hasArg(options::OPT_nostdlib) &&
+      !Args.hasArg(options::OPT_noFlangLibs)) {
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Determine if Fortran "main" object is needed
+static bool needFortranMain(const Driver &D, const ArgList &Args) {
+  return (needFortranLibs(D, Args)
+       && (!Args.hasArg(options::OPT_Mnomain) ||
+           !Args.hasArg(options::OPT_no_fortran_main)));
+}
+
 void tools::addPathIfExists(const Driver &D, const Twine &Path,
                             ToolChain::path_list &Paths) {
   if (D.getVFS().exists(Path))
@@ -138,6 +155,7 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
                             const ArgList &Args, ArgStringList &CmdArgs,
                             const JobAction &JA) {
   const Driver &D = TC.getDriver();
+  bool SeenFirstLinkerInput = false;
 
   // Add extra linker input arguments which are not treated as inputs
   // (constructed via -Xarch_).
@@ -162,6 +180,14 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
       continue;
     }
 
+    // Add Fortan "main" before the first linker input
+    if (!SeenFirstLinkerInput) {
+      if (needFortranMain(D, Args)) {
+        CmdArgs.push_back("-lflangmain");
+      }
+      SeenFirstLinkerInput = true;
+    }
+
     // Otherwise, this is a linker input argument.
     const Arg &A = II.getInputArg();
 
@@ -179,6 +205,15 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
     }
   }
 
+  if (!SeenFirstLinkerInput && needFortranMain(D, Args)) {
+    CmdArgs.push_back("-lflangmain");
+  }
+
+  // Claim "no Fortran main" arguments
+  for (auto Arg : Args.filtered(options::OPT_no_fortran_main, options::OPT_Mnomain)) {
+    Arg->claim();
+  }
+
   // LIBRARY_PATH - included following the user specified library paths.
   //                and only supported on native toolchains.
   if (!TC.isCrossCompiling()) {
@@ -363,12 +398,9 @@ unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
   return Parallelism;
 }
 
-// CloudABI and WebAssembly use -ffunction-sections and -fdata-sections by
-// default.
+// CloudABI uses -ffunction-sections and -fdata-sections by default.
 bool tools::isUseSeparateSections(const llvm::Triple &Triple) {
-  return Triple.getOS() == llvm::Triple::CloudABI ||
-         Triple.getArch() == llvm::Triple::wasm32 ||
-         Triple.getArch() == llvm::Triple::wasm64;
+  return Triple.getOS() == llvm::Triple::CloudABI;
 }
 
 void tools::AddGoldPlugin(const ToolChain &ToolChain, const ArgList &Args,
@@ -549,12 +581,16 @@ void tools::linkSanitizerRuntimeDeps(const ToolChain &TC,
       TC.getTriple().getOS() != llvm::Triple::NetBSD &&
       TC.getTriple().getOS() != llvm::Triple::RTEMS)
     CmdArgs.push_back("-ldl");
-  // Required for forkpty on some OSes
+  // Required for functions like forkpty on some OSes
   if (TC.getTriple().getOS() == llvm::Triple::NetBSD)
     CmdArgs.push_back("-lutil");
   // Required for backtrace on some OSes
-  if (TC.getTriple().getOS() == llvm::Triple::NetBSD)
+  if (TC.getTriple().getOS() == llvm::Triple::NetBSD ||
+      TC.getTriple().getOS() == llvm::Triple::FreeBSD)
     CmdArgs.push_back("-lexecinfo");
+  // Required for kvm (kernel memory interface) on some OSes
+  if (TC.getTriple().getOS() == llvm::Triple::NetBSD)
+    CmdArgs.push_back("-lkvm");
 }
 
 static void
diff --git a/lib/Driver/ToolChains/CommonArgs.h b/lib/Driver/ToolChains/CommonArgs.h
index 012f5b9f87ae..6e1ed090611e 100644
--- a/lib/Driver/ToolChains/CommonArgs.h
+++ b/lib/Driver/ToolChains/CommonArgs.h
@@ -20,6 +20,8 @@ namespace clang {
 namespace driver {
 namespace tools {
 
+bool needFortranLibs(const Driver &D, const llvm::opt::ArgList &Args);
+
 void addPathIfExists(const Driver &D, const Twine &Path,
                      ToolChain::path_list &Paths);
 
diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp
index bc4820797b2f..e513e818ebfb 100644
--- a/lib/Driver/ToolChains/Cuda.cpp
+++ b/lib/Driver/ToolChains/Cuda.cpp
@@ -8,18 +8,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "Cuda.h"
-#include "InputInfo.h"
 #include "CommonArgs.h"
+#include "InputInfo.h"
 #include "clang/Basic/Cuda.h"
-#include "clang/Config/config.h"
 #include "clang/Basic/VirtualFileSystem.h"
-#include "clang/Driver/Distro.h"
+#include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
+#include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
 #include <system_error>
 
 using namespace clang::driver;
@@ -52,6 +54,8 @@ static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
     return CudaVersion::CUDA_80;
   if (Major == 9 && Minor == 0)
     return CudaVersion::CUDA_90;
+  if (Major == 9 && Minor == 1)
+    return CudaVersion::CUDA_91;
   return CudaVersion::UNKNOWN;
 }
 
@@ -59,42 +63,75 @@ CudaInstallationDetector::CudaInstallationDetector(
     const Driver &D, const llvm::Triple &HostTriple,
     const llvm::opt::ArgList &Args)
     : D(D) {
-  SmallVector<std::string, 4> CudaPathCandidates;
+  struct Candidate {
+    std::string Path;
+    bool StrictChecking;
+
+    Candidate(std::string Path, bool StrictChecking = false)
+        : Path(Path), StrictChecking(StrictChecking) {}
+  };
+  SmallVector<Candidate, 4> Candidates;
 
   // In decreasing order so we prefer newer versions to older versions.
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
 
   if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
-    CudaPathCandidates.push_back(
-        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ));
+    Candidates.emplace_back(
+        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
   } else if (HostTriple.isOSWindows()) {
     for (const char *Ver : Versions)
-      CudaPathCandidates.push_back(
+      Candidates.emplace_back(
           D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
           Ver);
   } else {
-    CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
+    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
+      // Try to find ptxas binary. If the executable is located in a directory
+      // called 'bin/', its parent directory might be a good guess for a valid
+      // CUDA installation.
+      // However, some distributions might installs 'ptxas' to /usr/bin. In that
+      // case the candidate would be '/usr' which passes the following checks
+      // because '/usr/include' exists as well. To avoid this case, we always
+      // check for the directory potentially containing files for libdevice,
+      // even if the user passes -nocudalib.
+      if (llvm::ErrorOr<std::string> ptxas =
+              llvm::sys::findProgramByName("ptxas")) {
+        SmallString<256> ptxasAbsolutePath;
+        llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath);
+
+        StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath);
+        if (llvm::sys::path::filename(ptxasDir) == "bin")
+          Candidates.emplace_back(llvm::sys::path::parent_path(ptxasDir),
+                                  /*StrictChecking=*/true);
+      }
+    }
+
+    Candidates.emplace_back(D.SysRoot + "/usr/local/cuda");
     for (const char *Ver : Versions)
-      CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-" + Ver);
+      Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver);
 
     if (Distro(D.getVFS()).IsDebian())
       // Special case for Debian to have nvidia-cuda-toolkit work
       // out of the box. More info on http://bugs.debian.org/882505
-      CudaPathCandidates.push_back(D.SysRoot + "/usr/lib/cuda");
+      Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda");
   }
 
-  for (const auto &CudaPath : CudaPathCandidates) {
-    if (CudaPath.empty() || !D.getVFS().exists(CudaPath))
+  bool NoCudaLib = Args.hasArg(options::OPT_nocudalib);
+
+  for (const auto &Candidate : Candidates) {
+    InstallPath = Candidate.Path;
+    if (InstallPath.empty() || !D.getVFS().exists(InstallPath))
       continue;
 
-    InstallPath = CudaPath;
-    BinPath = CudaPath + "/bin";
+    BinPath = InstallPath + "/bin";
     IncludePath = InstallPath + "/include";
     LibDevicePath = InstallPath + "/nvvm/libdevice";
 
     auto &FS = D.getVFS();
     if (!(FS.exists(IncludePath) && FS.exists(BinPath)))
       continue;
+    bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking);
+    if (CheckLibDevice && !FS.exists(LibDevicePath))
+      continue;
 
     // On Linux, we have both lib and lib64 directories, and we need to choose
     // based on our triple.  On MacOS, we have only a lib directory.
@@ -119,14 +156,18 @@ CudaInstallationDetector::CudaInstallationDetector(
       Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
     }
 
-    if (Version == CudaVersion::CUDA_90) {
-      // CUDA-9 uses single libdevice file for all GPU variants.
+    if (Version >= CudaVersion::CUDA_90) {
+      // CUDA-9+ uses single libdevice file for all GPU variants.
       std::string FilePath = LibDevicePath + "/libdevice.10.bc";
       if (FS.exists(FilePath)) {
-        for (const char *GpuArch :
+        for (const char *GpuArchName :
              {"sm_20", "sm_30", "sm_32", "sm_35", "sm_50", "sm_52", "sm_53",
-              "sm_60", "sm_61", "sm_62", "sm_70"})
-          LibDeviceMap[GpuArch] = FilePath;
+                   "sm_60", "sm_61", "sm_62", "sm_70", "sm_72"}) {
+          const CudaArch GpuArch = StringToCudaArch(GpuArchName);
+          if (Version >= MinVersionForCudaArch(GpuArch) &&
+              Version <= MaxVersionForCudaArch(GpuArch))
+            LibDeviceMap[GpuArchName] = FilePath;
+        }
       }
     } else {
       std::error_code EC;
@@ -174,7 +215,7 @@ CudaInstallationDetector::CudaInstallationDetector(
 
     // Check that we have found at least one libdevice that we can link in if
     // -nocudalib hasn't been specified.
-    if (LibDeviceMap.empty() && !Args.hasArg(options::OPT_nocudalib))
+    if (LibDeviceMap.empty() && !NoCudaLib)
       continue;
 
     IsValid = true;
diff --git a/lib/Driver/ToolChains/Flang.cpp b/lib/Driver/ToolChains/Flang.cpp
new file mode 100644
index 000000000000..eb619a860e93
--- /dev/null
+++ b/lib/Driver/ToolChains/Flang.cpp
@@ -0,0 +1,851 @@
+//===--- Flang.cpp - Flang+LLVM ToolChain Implementations -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Flang.h"
+#include "CommonArgs.h"
+#include "InputInfo.h"
+#include "clang/Basic/CharInfo.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/ObjCRuntime.h"
+#include "clang/Basic/Version.h"
+#include "clang/Config/config.h"
+#include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/SanitizerArgs.h"
+#include "clang/Driver/XRayArgs.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/YAMLParser.h"
+
+#ifdef LLVM_ON_UNIX
+#include <unistd.h> // For getuid().
+#endif
+
+using namespace clang::driver;
+using namespace clang::driver::tools;
+using namespace clang;
+using namespace llvm::opt;
+
+void FlangFrontend::ConstructJob(Compilation &C, const JobAction &JA,
+                         const InputInfo &Output, const InputInfoList &Inputs,
+                         const ArgList &Args, const char *LinkingOutput) const {
+  ArgStringList CommonCmdArgs;
+  ArgStringList UpperCmdArgs;
+  ArgStringList LowerCmdArgs;
+  SmallString<256> Stem;
+  std::string OutFile;
+  bool NeedIEEE = true;
+
+  // Check number of inputs for sanity. We need at least one input.
+  assert(Inputs.size() >= 1 && "Must have at least one input.");
+
+  /***** Process file arguments to both parts *****/
+  const InputInfo &Input = Inputs[0];
+  types::ID InputType = Input.getType();
+  // Check file type sanity
+  assert(types::isFortran(InputType) && "Can only accept Fortran");
+
+  if (Args.hasArg(options::OPT_fsyntax_only)) {
+    // For -fsyntax-only produce temp files only
+    Stem = C.getDriver().GetTemporaryPath("", "");
+  } else {
+    OutFile = Output.getFilename();
+    Stem = llvm::sys::path::filename(OutFile);
+    llvm::sys::path::replace_extension(Stem, "");
+  }
+
+  // Add input file name to the compilation line
+  UpperCmdArgs.push_back(Input.getBaseInput());
+
+  // Add temporary output for ILM
+  const char * ILMFile = Args.MakeArgString(Stem + ".ilm");
+  LowerCmdArgs.push_back(ILMFile);
+  C.addTempFile(ILMFile);
+
+  /***** Process common args *****/
+
+  // Override IEEE mode if needed
+  if (Args.hasArg(options::OPT_Ofast) ||
+      Args.hasArg(options::OPT_ffast_math) ||
+      Args.hasArg(options::OPT_fno_fast_math) ||
+      Args.hasArg(options::OPT_Kieee_on) ||
+      Args.hasArg(options::OPT_Kieee_off)) {
+    auto A = Args.getLastArg(options::OPT_Ofast,
+                             options::OPT_ffast_math,
+                             options::OPT_fno_fast_math,
+                             options::OPT_Kieee_on,
+                             options::OPT_Kieee_off);
+    auto Opt = A->getOption();
+    if (Opt.matches(options::OPT_Ofast) ||
+        Opt.matches(options::OPT_ffast_math) ||
+        Opt.matches(options::OPT_Kieee_off)) {
+      NeedIEEE = false;
+    }
+  }
+
+  // -Kieee is on by default
+  if (!Args.hasArg(options::OPT_Kieee_off)) {
+    CommonCmdArgs.push_back("-y"); // Common: -y 129 2
+    CommonCmdArgs.push_back("129");
+    CommonCmdArgs.push_back("2");
+    // Lower: -x 6 0x100
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("6");
+    LowerCmdArgs.push_back("0x100");
+    // -x 42 0x400000
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("42");
+    LowerCmdArgs.push_back("0x400000");
+    // -y 129 4
+    LowerCmdArgs.push_back("-y");
+    LowerCmdArgs.push_back("129");
+    LowerCmdArgs.push_back("4");
+    // -x 129 0x400
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("129");
+    LowerCmdArgs.push_back("0x400");
+    for (auto Arg : Args.filtered(options::OPT_Kieee_on)) {
+      Arg->claim();
+    }
+  } else {
+    for (auto Arg : Args.filtered(options::OPT_Kieee_off)) {
+      Arg->claim();
+    }
+  }
+
+  // Add "inform level" flag
+  if (Args.hasArg(options::OPT_Minform_EQ)) {
+    // Parse arguments to set its value
+    for (Arg *A : Args.filtered(options::OPT_Minform_EQ)) {
+      A->claim();
+      CommonCmdArgs.push_back("-inform");
+      CommonCmdArgs.push_back(A->getValue(0));
+    }
+  } else {
+    // Default value
+    CommonCmdArgs.push_back("-inform");
+    CommonCmdArgs.push_back("warn");
+  }
+
+  for (auto Arg : Args.filtered(options::OPT_Msave_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-save");
+  }
+
+  for (auto Arg : Args.filtered(options::OPT_Msave_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-nosave");
+  }
+
+  // Treat denormalized numbers as zero: On
+  for (auto Arg : Args.filtered(options::OPT_Mdaz_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("129");
+    CommonCmdArgs.push_back("4");
+    CommonCmdArgs.push_back("-y");
+    CommonCmdArgs.push_back("129");
+    CommonCmdArgs.push_back("0x400");
+  }
+
+  // Treat denormalized numbers as zero: Off
+  for (auto Arg : Args.filtered(options::OPT_Mdaz_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-y");
+    CommonCmdArgs.push_back("129");
+    CommonCmdArgs.push_back("4");
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("129");
+    CommonCmdArgs.push_back("0x400");
+  }
+
+  // Bounds checking: On
+  for (auto Arg : Args.filtered(options::OPT_Mbounds_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("70");
+    CommonCmdArgs.push_back("2");
+  }
+
+  // Bounds checking: Off
+  for (auto Arg : Args.filtered(options::OPT_Mbounds_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-y");
+    CommonCmdArgs.push_back("70");
+    CommonCmdArgs.push_back("2");
+  }
+
+  // Generate code allowing recursive subprograms
+  for (auto Arg : Args.filtered(options::OPT_Mrecursive_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-recursive");
+  }
+
+  // Disable recursive subprograms
+  for (auto Arg : Args.filtered(options::OPT_Mrecursive_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-norecursive");
+  }
+
+  // Enable generating reentrant code (disable optimizations that inhibit it)
+  for (auto Arg : Args.filtered(options::OPT_Mreentrant_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-reentrant");
+  }
+
+  // Allow optimizations inhibiting reentrancy
+  for (auto Arg : Args.filtered(options::OPT_Mreentrant_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-noreentrant");
+  }
+
+  // Swap byte order for unformatted IO
+  for (auto Arg : Args.filtered(options::OPT_Mbyteswapio, options::OPT_byteswapio)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("125");
+    CommonCmdArgs.push_back("2");
+  }
+
+  // Treat backslashes as regular characters
+  for (auto Arg : Args.filtered(options::OPT_fnobackslash, options::OPT_Mbackslash)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("124");
+    CommonCmdArgs.push_back("0x40");
+  }
+
+  // Treat backslashes as C-style escape characters
+  for (auto Arg : Args.filtered(options::OPT_fbackslash, options::OPT_Mnobackslash)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-y");
+    CommonCmdArgs.push_back("124");
+    CommonCmdArgs.push_back("0x40");
+  }
+
+  // handle OpemMP options
+  if (auto *A = Args.getLastArg(options::OPT_mp, options::OPT_nomp,
+                             options::OPT_fopenmp, options::OPT_fno_openmp)) {
+    for (auto Arg : Args.filtered(options::OPT_mp, options::OPT_nomp)) {
+      Arg->claim();
+    }
+    for (auto Arg : Args.filtered(options::OPT_fopenmp,
+                                  options::OPT_fno_openmp)) {
+      Arg->claim();
+    }
+
+    if (A->getOption().matches(options::OPT_mp) ||
+        A->getOption().matches(options::OPT_fopenmp)) {
+
+      CommonCmdArgs.push_back("-mp");
+
+       // Allocate threadprivate data local to the thread
+      CommonCmdArgs.push_back("-x");
+      CommonCmdArgs.push_back("69");
+      CommonCmdArgs.push_back("0x200");
+
+      // Use the 'fair' schedule as the default static schedule
+      // for parallel do loops
+      CommonCmdArgs.push_back("-x");
+      CommonCmdArgs.push_back("69");
+      CommonCmdArgs.push_back("0x400");
+    }
+  }
+
+  // Align large objects on cache lines
+  for (auto Arg : Args.filtered(options::OPT_Mcache_align_on)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("119");
+    CommonCmdArgs.push_back("0x10000000");
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("129");
+    LowerCmdArgs.push_back("0x40000000");
+  }
+
+  // Disable special alignment of large objects
+  for (auto Arg : Args.filtered(options::OPT_Mcache_align_off)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-y");
+    CommonCmdArgs.push_back("119");
+    CommonCmdArgs.push_back("0x10000000");
+    LowerCmdArgs.push_back("-y");
+    LowerCmdArgs.push_back("129");
+    LowerCmdArgs.push_back("0x40000000");
+  }
+
+  // -Mstack_arrays
+  for (auto Arg : Args.filtered(options::OPT_Mstackarrays)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("54");
+    CommonCmdArgs.push_back("8");
+  }
+
+  // -g should produce DWARFv2
+  for (auto Arg : Args.filtered(options::OPT_g_Flag)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("120");
+    CommonCmdArgs.push_back("0x200");
+  }
+
+  // -gdwarf-2
+  for (auto Arg : Args.filtered(options::OPT_gdwarf_2)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("120");
+    CommonCmdArgs.push_back("0x200");
+  }
+
+  // -gdwarf-3
+  for (auto Arg : Args.filtered(options::OPT_gdwarf_3)) {
+    Arg->claim();
+    CommonCmdArgs.push_back("-x");
+    CommonCmdArgs.push_back("120");
+    CommonCmdArgs.push_back("0x4000");
+  }
+
+  // -Mipa has no effect
+  if (Arg *A = Args.getLastArg(options::OPT_Mipa)) {
+    getToolChain().getDriver().Diag(diag::warn_drv_clang_unsupported)
+      << A->getAsString(Args);
+  }
+
+  // -Minline has no effect
+  if (Arg *A = Args.getLastArg(options::OPT_Minline_on)) {
+    getToolChain().getDriver().Diag(diag::warn_drv_clang_unsupported)
+      << A->getAsString(Args);
+  }
+
+  // Handle -fdefault-real-8 (and its alias, -r8) and -fno-default-real-8
+  if (Arg *A = Args.getLastArg(options::OPT_r8,
+                               options::OPT_default_real_8_f,
+                               options::OPT_default_real_8_fno)) {
+    const char * fl;
+    // For -f version add -x flag, for -fno add -y
+    if (A->getOption().matches(options::OPT_default_real_8_fno)) {
+      fl = "-y";
+    } else {
+      fl = "-x";
+    }
+
+    for (Arg *A : Args.filtered(options::OPT_r8,
+                                options::OPT_default_real_8_f,
+                                options::OPT_default_real_8_fno)) {
+      A->claim();
+    }
+
+    UpperCmdArgs.push_back(fl);
+    UpperCmdArgs.push_back("124");
+    UpperCmdArgs.push_back("0x8");
+    UpperCmdArgs.push_back(fl);
+    UpperCmdArgs.push_back("124");
+    UpperCmdArgs.push_back("0x80000");
+  }
+
+  // Process and claim -i8/-fdefault-integer-8/-fno-default-integer-8 argument
+  if (Arg *A = Args.getLastArg(options::OPT_i8,
+                               options::OPT_default_integer_8_f,
+                               options::OPT_default_integer_8_fno)) {
+    const char * fl;
+
+    if (A->getOption().matches(options::OPT_default_integer_8_fno)) {
+      fl = "-y";
+    } else {
+      fl = "-x";
+    }
+
+    for (Arg *A : Args.filtered(options::OPT_i8,
+                                options::OPT_default_integer_8_f,
+                                options::OPT_default_integer_8_fno)) {
+      A->claim();
+    }
+
+    UpperCmdArgs.push_back(fl);
+    UpperCmdArgs.push_back("124");
+    UpperCmdArgs.push_back("0x10");
+  }
+
+  // Set a -x flag for first part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Hx_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -y flag for first part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Hy_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    UpperCmdArgs.push_back("-y");
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -q (debug) flag for first part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Hq_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    UpperCmdArgs.push_back("-q");
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -qq (debug) flag for first part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Hqq_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    UpperCmdArgs.push_back("-qq");
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    UpperCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Pass an arbitrary flag for first part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Wh_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    SmallVector<StringRef, 8> PassArgs;
+    Value.split(PassArgs, StringRef(","));
+    for (StringRef PassArg : PassArgs) {
+      UpperCmdArgs.push_back(Args.MakeArgString(PassArg));
+    }
+  }
+
+  // Flush to zero mode
+  // Disabled by default, but can be enabled by a switch
+  if (Args.hasArg(options::OPT_Mflushz_on)) {
+    // For -Mflushz set -x 129 2 for second part of Fortran frontend
+    for (Arg *A: Args.filtered(options::OPT_Mflushz_on)) {
+      A->claim();
+      LowerCmdArgs.push_back("-x");
+      LowerCmdArgs.push_back("129");
+      LowerCmdArgs.push_back("2");
+    }
+  } else {
+    LowerCmdArgs.push_back("-y");
+    LowerCmdArgs.push_back("129");
+    LowerCmdArgs.push_back("2");
+    for (Arg *A: Args.filtered(options::OPT_Mflushz_off)) {
+      A->claim();
+    }
+  }
+
+  // Enable FMA
+  for (Arg *A: Args.filtered(options::OPT_Mfma_on, options::OPT_fma)) {
+    A->claim();
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("172");
+    LowerCmdArgs.push_back("0x40000000");
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("179");
+    LowerCmdArgs.push_back("1");
+  }
+
+  // Disable FMA
+  for (Arg *A: Args.filtered(options::OPT_Mfma_off, options::OPT_nofma)) {
+    A->claim();
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("171");
+    LowerCmdArgs.push_back("0x40000000");
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("178");
+    LowerCmdArgs.push_back("1");
+  }
+
+  // For -fPIC set -x 62 8 for second part of Fortran frontend
+  for (Arg *A: Args.filtered(options::OPT_fPIC)) {
+    A->claim();
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back("62");
+    LowerCmdArgs.push_back("8");
+  }
+
+  StringRef OptOStr("0");
+  if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
+    if (A->getOption().matches(options::OPT_O4)) {
+      OptOStr = "4"; // FIXME what should this be?
+    } else if (A->getOption().matches(options::OPT_Ofast)) {
+      OptOStr = "2"; // FIXME what should this be?
+    } else if (A->getOption().matches(options::OPT_O0)) {
+      // intentionally do nothing
+    } else {
+      assert(A->getOption().matches(options::OPT_O) && "Must have a -O flag");
+      StringRef S(A->getValue());
+      if ((S == "s") || (S == "z")) {
+	// -Os = size; -Oz = more size
+	OptOStr = "2"; // FIXME -Os|-Oz => -opt ?
+      } else if ((S == "1") || (S == "2") || (S == "3")) {
+	OptOStr = S;
+      } else {
+	OptOStr = "4";
+      }
+    }
+  }
+  unsigned OptLevel = std::stoi(OptOStr.str());
+
+  if (Args.hasArg(options::OPT_g_Group)) {
+    // pass -g to lower
+    LowerCmdArgs.push_back("-debug");
+  }
+
+  if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math)) {
+    if (A->getOption().matches(options::OPT_ffast_math)) {
+      LowerCmdArgs.push_back("-x");
+    } else {
+      LowerCmdArgs.push_back("-y");
+    }
+    LowerCmdArgs.push_back("216");
+    LowerCmdArgs.push_back("1");
+  }
+
+  // IEEE compatibility mode
+  LowerCmdArgs.push_back("-ieee");
+  if (NeedIEEE) {
+    LowerCmdArgs.push_back("1");
+  } else {
+    LowerCmdArgs.push_back("0");
+  }
+
+  /***** Upper part of the Fortran frontend *****/
+
+  // TODO do we need to invoke this under GDB sometimes?
+  const char *UpperExec = Args.MakeArgString(getToolChain().GetProgramPath("flang1"));
+
+  UpperCmdArgs.push_back("-opt"); UpperCmdArgs.push_back(Args.MakeArgString(OptOStr));
+  UpperCmdArgs.push_back("-terse"); UpperCmdArgs.push_back("1");
+  UpperCmdArgs.push_back("-inform"); UpperCmdArgs.push_back("warn");
+  UpperCmdArgs.push_back("-nohpf");
+  UpperCmdArgs.push_back("-nostatic");
+  UpperCmdArgs.append(CommonCmdArgs.begin(), CommonCmdArgs.end()); // Append common arguments
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("19"); UpperCmdArgs.push_back("0x400000");
+  UpperCmdArgs.push_back("-quad");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("59"); UpperCmdArgs.push_back("4");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("15"); UpperCmdArgs.push_back("2");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("49"); UpperCmdArgs.push_back("0x400004");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("51"); UpperCmdArgs.push_back("0x20");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("57"); UpperCmdArgs.push_back("0x4c");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("58"); UpperCmdArgs.push_back("0x10000");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("124"); UpperCmdArgs.push_back("0x1000");
+  UpperCmdArgs.push_back("-tp"); UpperCmdArgs.push_back("px");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("57"); UpperCmdArgs.push_back("0xfb0000");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("58"); UpperCmdArgs.push_back("0x78031040");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("47"); UpperCmdArgs.push_back("0x08");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("48"); UpperCmdArgs.push_back("4608");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("49"); UpperCmdArgs.push_back("0x100");
+  if (OptLevel >= 2) {
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("70");
+    UpperCmdArgs.push_back("0x6c00");
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("119");
+    UpperCmdArgs.push_back("0x10000000");
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("129");
+    UpperCmdArgs.push_back("2");
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("47");
+    UpperCmdArgs.push_back("0x400000");
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("52");
+    UpperCmdArgs.push_back("2");
+  }
+
+  // Add system include arguments.
+  getToolChain().AddFlangSystemIncludeArgs(Args, UpperCmdArgs);
+
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("unix");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__unix");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__unix__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("linux");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__linux");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__linux__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__NO_MATH_INLINES");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__LP64__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__x86_64");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__x86_64__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__LONG_MAX__=9223372036854775807L");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__SIZE_TYPE__=unsigned long int");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__PTRDIFF_TYPE__=long int");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__THROW=");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__extension__=");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__amd_64__amd64__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__k8");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__k8__");
+  UpperCmdArgs.push_back("-def"); UpperCmdArgs.push_back("__PGLLVM__");
+
+  // Enable preprocessor
+  if (Args.hasArg(options::OPT_Mpreprocess) ||
+      Args.hasArg(options::OPT_cpp) ||
+      types::getPreprocessedType(InputType) != types::TY_INVALID) {
+    UpperCmdArgs.push_back("-preprocess");
+    for (auto Arg : Args.filtered(options::OPT_Mpreprocess, options::OPT_cpp)) {
+      Arg->claim();
+    }
+  }
+
+  // Enable standards checking
+  if (Args.hasArg(options::OPT_Mstandard)) {
+    UpperCmdArgs.push_back("-standard");
+    for (auto Arg : Args.filtered(options::OPT_Mstandard)) {
+      Arg->claim();
+    }
+  }
+
+  // Free or fixed form file
+  if (Args.hasArg(options::OPT_fortran_format_Group)) {
+    // Override file name suffix, scan arguments for that
+    for (Arg *A : Args.filtered(options::OPT_fortran_format_Group)) {
+      A->claim();
+      switch (A->getOption().getID()) {
+        default:
+          llvm_unreachable("missed a case");
+         case options::OPT_fixed_form_on:
+         case options::OPT_free_form_off:
+         case options::OPT_Mfixed:
+         case options::OPT_Mfree_off:
+         case options::OPT_Mfreeform_off:
+           UpperCmdArgs.push_back("-nofreeform");
+           break;
+         case options::OPT_free_form_on:
+         case options::OPT_fixed_form_off:
+         case options::OPT_Mfree_on:
+         case options::OPT_Mfreeform_on:
+           UpperCmdArgs.push_back("-freeform");
+           break;
+      }
+    }
+  } else {
+    // Deduce format from file name suffix
+    if (types::isFreeFormFortran(InputType)) {
+      UpperCmdArgs.push_back("-freeform");
+    } else {
+      UpperCmdArgs.push_back("-nofreeform");
+    }
+  }
+
+  // Extend lines to 132 characters
+  for (auto Arg : Args.filtered(options::OPT_Mextend)) {
+    Arg->claim();
+    UpperCmdArgs.push_back("-extend");
+  }
+
+  for (auto Arg : Args.filtered(options::OPT_ffixed_line_length_VALUE)) {
+    StringRef Value = Arg->getValue();
+    if (Value == "72") {
+      Arg->claim();
+    } else if (Value == "132") {
+      Arg->claim();
+      UpperCmdArgs.push_back("-extend");
+    } else {
+      getToolChain().getDriver().Diag(diag::err_drv_unsupported_fixed_line_length)
+        << Arg->getAsString(Args);
+    }
+  }
+
+  // Add user-defined include directories
+  for (auto Arg : Args.filtered(options::OPT_I)) {
+    Arg->claim();
+    UpperCmdArgs.push_back("-idir");
+    UpperCmdArgs.push_back(Arg->getValue(0));
+  }
+
+  // Add user-defined module directories
+  for (auto Arg : Args.filtered(options::OPT_ModuleDir, options::OPT_J)) {
+    Arg->claim();
+    UpperCmdArgs.push_back("-moddir");
+    UpperCmdArgs.push_back(Arg->getValue(0));
+  }
+
+  // "Define" preprocessor flags
+  for (auto Arg : Args.filtered(options::OPT_D)) {
+    Arg->claim();
+    UpperCmdArgs.push_back("-def");
+    UpperCmdArgs.push_back(Arg->getValue(0));
+  }
+
+  // "Define" preprocessor flags
+  for (auto Arg : Args.filtered(options::OPT_U)) {
+    Arg->claim();
+    UpperCmdArgs.push_back("-undef");
+    UpperCmdArgs.push_back(Arg->getValue(0));
+  }
+
+  UpperCmdArgs.push_back("-vect"); UpperCmdArgs.push_back("48");
+
+  // Semantics for assignments to allocatables
+  if (Arg *A = Args.getLastArg(options::OPT_Mallocatable_EQ)) {
+    // Argument is passed explicitly
+    StringRef Value = A->getValue();
+    if (Value == "03") { // Enable Fortran 2003 semantics
+      UpperCmdArgs.push_back("-x"); // Set XBIT
+    } else if (Value == "95") { // Enable Fortran 2003 semantics
+      UpperCmdArgs.push_back("-y"); // Unset XBIT
+    } else {
+      getToolChain().getDriver().Diag(diag::err_drv_invalid_allocatable_mode)
+        << A->getAsString(Args);
+    }
+  } else { // No argument passed
+    UpperCmdArgs.push_back("-y"); // Default is 1995
+  }
+  UpperCmdArgs.push_back("54"); UpperCmdArgs.push_back("1"); // XBIT value
+
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("70"); UpperCmdArgs.push_back("0x40000000");
+  UpperCmdArgs.push_back("-y"); UpperCmdArgs.push_back("163"); UpperCmdArgs.push_back("0xc0000000");
+  UpperCmdArgs.push_back("-x"); UpperCmdArgs.push_back("189"); UpperCmdArgs.push_back("0x10");
+
+  // Enable NULL pointer checking
+  if (Args.hasArg(options::OPT_Mchkptr)) {
+    UpperCmdArgs.push_back("-x");
+    UpperCmdArgs.push_back("70");
+    UpperCmdArgs.push_back("4");
+    for (auto Arg : Args.filtered(options::OPT_Mchkptr)) {
+      Arg->claim();
+    }
+  }
+
+  const char * STBFile = Args.MakeArgString(Stem + ".stb");
+  C.addTempFile(STBFile);
+  UpperCmdArgs.push_back("-stbfile");
+  UpperCmdArgs.push_back(STBFile);
+
+  const char * ModuleExportFile = Args.MakeArgString(Stem + ".cmod");
+  C.addTempFile(ModuleExportFile);
+  UpperCmdArgs.push_back("-modexport");
+  UpperCmdArgs.push_back(ModuleExportFile);
+
+  const char * ModuleIndexFile = Args.MakeArgString(Stem + ".cmdx");
+  C.addTempFile(ModuleIndexFile);
+  UpperCmdArgs.push_back("-modindex");
+  UpperCmdArgs.push_back(ModuleIndexFile);
+
+  UpperCmdArgs.push_back("-output");
+  UpperCmdArgs.push_back(ILMFile);
+
+  C.addCommand(llvm::make_unique<Command>(JA, *this, UpperExec, UpperCmdArgs, Inputs));
+
+  // For -fsyntax-only that is it
+  if (Args.hasArg(options::OPT_fsyntax_only)) return;
+
+  /***** Lower part of Fortran frontend *****/
+
+  const char *LowerExec = Args.MakeArgString(getToolChain().GetProgramPath("flang2"));
+
+  // TODO FLANG arg handling
+  LowerCmdArgs.push_back("-fn"); LowerCmdArgs.push_back(Input.getBaseInput());
+  LowerCmdArgs.push_back("-opt"); LowerCmdArgs.push_back(Args.MakeArgString(OptOStr));
+  LowerCmdArgs.push_back("-terse"); LowerCmdArgs.push_back("1");
+  LowerCmdArgs.push_back("-inform"); LowerCmdArgs.push_back("warn");
+  LowerCmdArgs.append(CommonCmdArgs.begin(), CommonCmdArgs.end()); // Append common arguments
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("51"); LowerCmdArgs.push_back("0x20");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("119"); LowerCmdArgs.push_back("0xa10000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("122"); LowerCmdArgs.push_back("0x40");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("123"); LowerCmdArgs.push_back("0x1000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("127"); LowerCmdArgs.push_back("4");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("127"); LowerCmdArgs.push_back("17");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("19"); LowerCmdArgs.push_back("0x400000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("28"); LowerCmdArgs.push_back("0x40000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("120"); LowerCmdArgs.push_back("0x10000000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("70"); LowerCmdArgs.push_back("0x8000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("122"); LowerCmdArgs.push_back("1");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("125"); LowerCmdArgs.push_back("0x20000");
+  LowerCmdArgs.push_back("-quad");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("59"); LowerCmdArgs.push_back("4");
+  LowerCmdArgs.push_back("-tp"); LowerCmdArgs.push_back("px");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("120"); LowerCmdArgs.push_back("0x1000"); // debug lite
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("124"); LowerCmdArgs.push_back("0x1400");
+  LowerCmdArgs.push_back("-y"); LowerCmdArgs.push_back("15"); LowerCmdArgs.push_back("2");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("57"); LowerCmdArgs.push_back("0x3b0000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("58"); LowerCmdArgs.push_back("0x48000000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("49"); LowerCmdArgs.push_back("0x100");
+  LowerCmdArgs.push_back("-astype"); LowerCmdArgs.push_back("0");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("183"); LowerCmdArgs.push_back("4");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("121"); LowerCmdArgs.push_back("0x800");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("54"); LowerCmdArgs.push_back("0x10");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("70"); LowerCmdArgs.push_back("0x40000000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("249"); LowerCmdArgs.push_back("50"); // LLVM version
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("124"); LowerCmdArgs.push_back("1");
+  LowerCmdArgs.push_back("-y"); LowerCmdArgs.push_back("163"); LowerCmdArgs.push_back("0xc0000000");
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("189"); LowerCmdArgs.push_back("0x10");
+  LowerCmdArgs.push_back("-y"); LowerCmdArgs.push_back("189"); LowerCmdArgs.push_back("0x4000000");
+
+  // Remove "noinline" attriblute
+  LowerCmdArgs.push_back("-x"); LowerCmdArgs.push_back("183"); LowerCmdArgs.push_back("0x10");
+
+  // Set a -x flag for second part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Mx_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    LowerCmdArgs.push_back("-x");
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -y flag for second part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_My_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    LowerCmdArgs.push_back("-y");
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -q (debug) flag for second part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Mq_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    LowerCmdArgs.push_back("-q");
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Set a -qq (debug) flag for second part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Mqq_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    auto XFlag = Value.split(",");
+    LowerCmdArgs.push_back("-qq");
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.first));
+    LowerCmdArgs.push_back(Args.MakeArgString(XFlag.second));
+  }
+
+  // Pass an arbitrary flag for second part of Fortran frontend
+  for (Arg *A : Args.filtered(options::OPT_Wm_EQ)) {
+    A->claim();
+    StringRef Value = A->getValue();
+    SmallVector<StringRef, 8> PassArgs;
+    Value.split(PassArgs, StringRef(","));
+    for (StringRef PassArg : PassArgs) {
+      LowerCmdArgs.push_back(Args.MakeArgString(PassArg));
+    }
+  }
+
+  LowerCmdArgs.push_back("-stbfile");
+  LowerCmdArgs.push_back(STBFile);
+
+  LowerCmdArgs.push_back("-asm"); LowerCmdArgs.push_back(Args.MakeArgString(OutFile));
+
+  C.addCommand(llvm::make_unique<Command>(JA, *this, LowerExec, LowerCmdArgs, Inputs));
+}
+
diff --git a/lib/Driver/ToolChains/Flang.h b/lib/Driver/ToolChains/Flang.h
new file mode 100644
index 000000000000..2c3eb7c895a0
--- /dev/null
+++ b/lib/Driver/ToolChains/Flang.h
@@ -0,0 +1,50 @@
+//===--- Flang.h - Flang Tool and ToolChain Implementations ====-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_Flang_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_Flang_H
+
+#include "MSVC.h"
+#include "clang/Basic/DebugInfoOptions.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/Tool.h"
+#include "clang/Driver/Types.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace driver {
+
+namespace tools {
+
+/// \brief Flang Fortran frontend
+class LLVM_LIBRARY_VISIBILITY FlangFrontend : public Tool {
+public:
+  FlangFrontend(const ToolChain &TC)
+      : Tool("flang:frontend",
+             "Fortran frontend to LLVM", TC,
+             RF_Full) {}
+
+  bool hasGoodDiagnostics() const override { return true; }
+  bool hasIntegratedAssembler() const override { return false; }
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+} // end namespace tools
+
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_CLANG_H
diff --git a/lib/Driver/ToolChains/FreeBSD.cpp b/lib/Driver/ToolChains/FreeBSD.cpp
index dd0334b9c28b..79ad910b8d46 100644
--- a/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/lib/Driver/ToolChains/FreeBSD.cpp
@@ -392,6 +392,8 @@ SanitizerMask FreeBSD::getSupportedSanitizers() const {
   }
   if (IsX86 || IsX86_64) {
     Res |= SanitizerKind::SafeStack;
+    Res |= SanitizerKind::Fuzzer;
+    Res |= SanitizerKind::FuzzerNoLink;
   }
   return Res;
 }
diff --git a/lib/Driver/ToolChains/Fuchsia.cpp b/lib/Driver/ToolChains/Fuchsia.cpp
index 269d34d18f1e..5f4995c77a44 100644
--- a/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/lib/Driver/ToolChains/Fuchsia.cpp
@@ -278,8 +278,9 @@ void Fuchsia::AddCXXStdlibLibArgs(const ArgList &Args,
 
 SanitizerMask Fuchsia::getSupportedSanitizers() const {
   SanitizerMask Res = ToolChain::getSupportedSanitizers();
-  Res |= SanitizerKind::SafeStack;
   Res |= SanitizerKind::Address;
+  Res |= SanitizerKind::Fuzzer;
+  Res |= SanitizerKind::SafeStack;
   Res |= SanitizerKind::Scudo;
   return Res;
 }
diff --git a/lib/Driver/ToolChains/Gnu.cpp b/lib/Driver/ToolChains/Gnu.cpp
index 7845781f12c4..b8afecf9c0dd 100644
--- a/lib/Driver/ToolChains/Gnu.cpp
+++ b/lib/Driver/ToolChains/Gnu.cpp
@@ -8,13 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "Gnu.h"
-#include "Linux.h"
 #include "Arch/ARM.h"
 #include "Arch/Mips.h"
 #include "Arch/PPC.h"
+#include "Arch/RISCV.h"
 #include "Arch/Sparc.h"
 #include "Arch/SystemZ.h"
 #include "CommonArgs.h"
+#include "Linux.h"
 #include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Config/config.h" // for GCC_INSTALL_PREFIX
 #include "clang/Driver/Compilation.h"
@@ -271,6 +272,10 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
     return "elf64ppc";
   case llvm::Triple::ppc64le:
     return "elf64lppc";
+  case llvm::Triple::riscv32:
+    return "elf32lriscv";
+  case llvm::Triple::riscv64:
+    return "elf64lriscv";
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
     return "elf32_sparc";
@@ -465,6 +470,16 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   // The profile runtime also needs access to system libraries.
   getToolChain().addProfileRTLibs(Args, CmdArgs);
 
+  // Add Fortran runtime libraries
+  if (needFortranLibs(D, Args)) {
+    ToolChain.AddFortranStdlibLibArgs(Args, CmdArgs);
+  } else {
+  // Claim "no Flang libraries" arguments if any
+    for (auto Arg : Args.filtered(options::OPT_noFlangLibs)) {
+      Arg->claim();
+    }
+  }
+
   if (D.CCCIsCXX() &&
       !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
     if (ToolChain.ShouldLinkCXXStdlib(Args)) {
@@ -624,6 +639,18 @@ void tools::gnutools::Assembler::ConstructJob(Compilation &C,
       ppc::getPPCAsmModeForCPU(getCPUName(Args, getToolChain().getTriple())));
     break;
   }
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64: {
+    StringRef ABIName = riscv::getRISCVABI(Args, getToolChain().getTriple());
+    CmdArgs.push_back("-mabi");
+    CmdArgs.push_back(ABIName.data());
+    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+      StringRef MArch = A->getValue();
+      CmdArgs.push_back("-march");
+      CmdArgs.push_back(MArch.data());
+    }
+    break;
+  }
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel: {
     CmdArgs.push_back("-32");
@@ -856,6 +883,10 @@ static bool isMicroMips(const ArgList &Args) {
   return A && A->getOption().matches(options::OPT_mmicromips);
 }
 
+static bool isRISCV(llvm::Triple::ArchType Arch) {
+  return Arch == llvm::Triple::riscv32 || Arch == llvm::Triple::riscv64;
+}
+
 static Multilib makeMultilib(StringRef commonSuffix) {
   return Multilib(commonSuffix, commonSuffix, commonSuffix);
 }
@@ -1401,11 +1432,48 @@ static void findAndroidArmMultilibs(const Driver &D,
     Result.Multilibs = AndroidArmMultilibs;
 }
 
+static void findRISCVMultilibs(const Driver &D,
+                               const llvm::Triple &TargetTriple, StringRef Path,
+                               const ArgList &Args, DetectedMultilibs &Result) {
+
+  FilterNonExistent NonExistent(Path, "/crtbegin.o", D.getVFS());
+  Multilib Ilp32 = makeMultilib("lib32/ilp32").flag("+m32").flag("+mabi=ilp32");
+  Multilib Ilp32f =
+      makeMultilib("lib32/ilp32f").flag("+m32").flag("+mabi=ilp32f");
+  Multilib Ilp32d =
+      makeMultilib("lib32/ilp32d").flag("+m32").flag("+mabi=ilp32d");
+  Multilib Lp64 = makeMultilib("lib64/lp64").flag("+m64").flag("+mabi=lp64");
+  Multilib Lp64f = makeMultilib("lib64/lp64f").flag("+m64").flag("+mabi=lp64f");
+  Multilib Lp64d = makeMultilib("lib64/lp64d").flag("+m64").flag("+mabi=lp64d");
+  MultilibSet RISCVMultilibs =
+      MultilibSet()
+          .Either({Ilp32, Ilp32f, Ilp32d, Lp64, Lp64f, Lp64d})
+          .FilterOut(NonExistent);
+
+  Multilib::flags_list Flags;
+  bool IsRV64 = TargetTriple.getArch() == llvm::Triple::riscv64;
+  StringRef ABIName = tools::riscv::getRISCVABI(Args, TargetTriple);
+
+  addMultilibFlag(!IsRV64, "m32", Flags);
+  addMultilibFlag(IsRV64, "m64", Flags);
+  addMultilibFlag(ABIName == "ilp32", "mabi=ilp32", Flags);
+  addMultilibFlag(ABIName == "ilp32f", "mabi=ilp32f", Flags);
+  addMultilibFlag(ABIName == "ilp32d", "mabi=ilp32d", Flags);
+  addMultilibFlag(ABIName == "lp64", "mabi=lp64", Flags);
+  addMultilibFlag(ABIName == "lp64f", "mabi=lp64f", Flags);
+  addMultilibFlag(ABIName == "lp64d", "mabi=lp64d", Flags);
+
+  if (RISCVMultilibs.select(Flags, Result.SelectedMultilib))
+    Result.Multilibs = RISCVMultilibs;
+}
+
 static bool findBiarchMultilibs(const Driver &D,
                                 const llvm::Triple &TargetTriple,
                                 StringRef Path, const ArgList &Args,
                                 bool NeedsBiarchSuffix,
                                 DetectedMultilibs &Result) {
+  Multilib Default;
+
   // Some versions of SUSE and Fedora on ppc64 put 32-bit libs
   // in what would normally be GCCInstallPath and put the 64-bit
   // libs in a subdirectory named 64. The simple logic we follow is that
@@ -1413,10 +1481,26 @@ static bool findBiarchMultilibs(const Driver &D,
   // we use that. If not, and if not a biarch triple alias, we look for
   // crtbegin.o without the subdirectory.
 
-  Multilib Default;
+  StringRef Suff64 = "/64";
+  // Solaris uses platform-specific suffixes instead of /64.
+  if (TargetTriple.getOS() == llvm::Triple::Solaris) {
+    switch (TargetTriple.getArch()) {
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      Suff64 = "/amd64";
+      break;
+    case llvm::Triple::sparc:
+    case llvm::Triple::sparcv9:
+      Suff64 = "/sparcv9";
+      break;
+    default:
+      break;
+    }
+  }
+
   Multilib Alt64 = Multilib()
-                       .gccSuffix("/64")
-                       .includeSuffix("/64")
+                       .gccSuffix(Suff64)
+                       .includeSuffix(Suff64)
                        .flag("-m32")
                        .flag("+m64")
                        .flag("-mx32");
@@ -1613,21 +1697,17 @@ void Generic_GCC::GCCInstallationDetector::init(
     // If we have a SysRoot, try that first.
     if (!D.SysRoot.empty()) {
       Prefixes.push_back(D.SysRoot);
-      Prefixes.push_back(D.SysRoot + "/usr");
+      AddDefaultGCCPrefixes(TargetTriple, Prefixes, D.SysRoot);
     }
 
     // Then look for gcc installed alongside clang.
     Prefixes.push_back(D.InstalledDir + "/..");
 
-    // Then look for distribution supplied gcc installations.
+    // Next, look for prefix(es) that correspond to distribution-supplied gcc
+    // installations.
     if (D.SysRoot.empty()) {
-      // Look for RHEL devtoolsets.
-      Prefixes.push_back("/opt/rh/devtoolset-6/root/usr");
-      Prefixes.push_back("/opt/rh/devtoolset-4/root/usr");
-      Prefixes.push_back("/opt/rh/devtoolset-3/root/usr");
-      Prefixes.push_back("/opt/rh/devtoolset-2/root/usr");
-      // And finally in /usr.
-      Prefixes.push_back("/usr");
+      // Typically /usr.
+      AddDefaultGCCPrefixes(TargetTriple, Prefixes, D.SysRoot);
     }
   }
 
@@ -1698,6 +1778,48 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const {
   return false;
 }
 
+void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
+    const llvm::Triple &TargetTriple, SmallVectorImpl<std::string> &Prefixes,
+    StringRef SysRoot) {
+  if (TargetTriple.getOS() == llvm::Triple::Solaris) {
+    // Solaris is a special case.
+    // The GCC installation is under
+    //   /usr/gcc/<major>.<minor>/lib/gcc/<triple>/<major>.<minor>.<patch>/
+    // so we need to find those /usr/gcc/*/lib/gcc libdirs and go with
+    // /usr/gcc/<version> as a prefix.
+
+    std::string PrefixDir = SysRoot.str() + "/usr/gcc";
+    std::error_code EC;
+    for (vfs::directory_iterator LI = D.getVFS().dir_begin(PrefixDir, EC), LE;
+         !EC && LI != LE; LI = LI.increment(EC)) {
+      StringRef VersionText = llvm::sys::path::filename(LI->getName());
+      GCCVersion CandidateVersion = GCCVersion::Parse(VersionText);
+
+      // Filter out obviously bad entries.
+      if (CandidateVersion.Major == -1 || CandidateVersion.isOlderThan(4, 1, 1))
+        continue;
+
+      std::string CandidatePrefix = PrefixDir + "/" + VersionText.str();
+      std::string CandidateLibPath = CandidatePrefix + "/lib/gcc";
+      if (!D.getVFS().exists(CandidateLibPath))
+        continue;
+
+      Prefixes.push_back(CandidatePrefix);
+    }
+    return;
+  }
+
+  // Non-Solaris is much simpler - most systems just go with "/usr".
+  if (SysRoot.empty() && TargetTriple.getOS() == llvm::Triple::Linux) {
+    // Yet, still look for RHEL devtoolsets.
+    Prefixes.push_back("/opt/rh/devtoolset-6/root/usr");
+    Prefixes.push_back("/opt/rh/devtoolset-4/root/usr");
+    Prefixes.push_back("/opt/rh/devtoolset-3/root/usr");
+    Prefixes.push_back("/opt/rh/devtoolset-2/root/usr");
+  }
+  Prefixes.push_back(SysRoot.str() + "/usr");
+}
+
 /*static*/ void Generic_GCC::GCCInstallationDetector::CollectLibDirsAndTriples(
     const llvm::Triple &TargetTriple, const llvm::Triple &BiarchTriple,
     SmallVectorImpl<StringRef> &LibDirs,
@@ -1783,6 +1905,10 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const {
       "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu",
       "powerpc64le-suse-linux", "ppc64le-redhat-linux"};
 
+  static const char *const RISCV32LibDirs[] = {"/lib", "/lib32"};
+  static const char *const RISCVTriples[] = {"riscv32-unknown-linux-gnu",
+                                             "riscv64-unknown-linux-gnu"};
+
   static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"};
   static const char *const SPARCv8Triples[] = {"sparc-linux-gnu",
                                                "sparcv8-linux-gnu"};
@@ -1795,17 +1921,49 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const {
       "s390x-linux-gnu", "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu",
       "s390x-suse-linux", "s390x-redhat-linux"};
 
-  // Solaris.
-  static const char *const SolarisSPARCLibDirs[] = {"/gcc"};
-  static const char *const SolarisSPARCTriples[] = {"sparc-sun-solaris2.11",
-                                                    "i386-pc-solaris2.11"};
 
   using std::begin;
   using std::end;
 
   if (TargetTriple.getOS() == llvm::Triple::Solaris) {
-    LibDirs.append(begin(SolarisSPARCLibDirs), end(SolarisSPARCLibDirs));
-    TripleAliases.append(begin(SolarisSPARCTriples), end(SolarisSPARCTriples));
+    static const char *const SolarisLibDirs[] = {"/lib"};
+    static const char *const SolarisSparcV8Triples[] = {
+        "sparc-sun-solaris2.11", "sparc-sun-solaris2.12"};
+    static const char *const SolarisSparcV9Triples[] = {
+        "sparcv9-sun-solaris2.11", "sparcv9-sun-solaris2.12"};
+    static const char *const SolarisX86Triples[] = {"i386-pc-solaris2.11",
+                                                    "i386-pc-solaris2.12"};
+    static const char *const SolarisX86_64Triples[] = {"x86_64-pc-solaris2.11",
+                                                       "x86_64-pc-solaris2.12"};
+    LibDirs.append(begin(SolarisLibDirs), end(SolarisLibDirs));
+    BiarchLibDirs.append(begin(SolarisLibDirs), end(SolarisLibDirs));
+    switch (TargetTriple.getArch()) {
+    case llvm::Triple::x86:
+      TripleAliases.append(begin(SolarisX86Triples), end(SolarisX86Triples));
+      BiarchTripleAliases.append(begin(SolarisX86_64Triples),
+                                 end(SolarisX86_64Triples));
+      break;
+    case llvm::Triple::x86_64:
+      TripleAliases.append(begin(SolarisX86_64Triples),
+                           end(SolarisX86_64Triples));
+      BiarchTripleAliases.append(begin(SolarisX86Triples),
+                                 end(SolarisX86Triples));
+      break;
+    case llvm::Triple::sparc:
+      TripleAliases.append(begin(SolarisSparcV8Triples),
+                           end(SolarisSparcV8Triples));
+      BiarchTripleAliases.append(begin(SolarisSparcV9Triples),
+                                 end(SolarisSparcV9Triples));
+      break;
+    case llvm::Triple::sparcv9:
+      TripleAliases.append(begin(SolarisSparcV9Triples),
+                           end(SolarisSparcV9Triples));
+      BiarchTripleAliases.append(begin(SolarisSparcV8Triples),
+                                 end(SolarisSparcV8Triples));
+      break;
+    default:
+      break;
+    }
     return;
   }
 
@@ -1928,6 +2086,12 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const {
     LibDirs.append(begin(PPC64LELibDirs), end(PPC64LELibDirs));
     TripleAliases.append(begin(PPC64LETriples), end(PPC64LETriples));
     break;
+  case llvm::Triple::riscv32:
+    LibDirs.append(begin(RISCV32LibDirs), end(RISCV32LibDirs));
+    BiarchLibDirs.append(begin(RISCV32LibDirs), end(RISCV32LibDirs));
+    TripleAliases.append(begin(RISCVTriples), end(RISCVTriples));
+    BiarchTripleAliases.append(begin(RISCVTriples), end(RISCVTriples));
+    break;
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
     LibDirs.append(begin(SPARCv8LibDirs), end(SPARCv8LibDirs));
@@ -1960,56 +2124,6 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const {
     BiarchTripleAliases.push_back(BiarchTriple.str());
 }
 
-void Generic_GCC::GCCInstallationDetector::scanLibDirForGCCTripleSolaris(
-    const llvm::Triple &TargetArch, const llvm::opt::ArgList &Args,
-    const std::string &LibDir, StringRef CandidateTriple,
-    bool NeedsBiarchSuffix) {
-  // Solaris is a special case. The GCC installation is under
-  // /usr/gcc/<major>.<minor>/lib/gcc/<triple>/<major>.<minor>.<patch>/, so we
-  // need to iterate twice.
-  std::error_code EC;
-  for (vfs::directory_iterator LI = D.getVFS().dir_begin(LibDir, EC), LE;
-       !EC && LI != LE; LI = LI.increment(EC)) {
-    StringRef VersionText = llvm::sys::path::filename(LI->getName());
-    GCCVersion CandidateVersion = GCCVersion::Parse(VersionText);
-
-    if (CandidateVersion.Major != -1) // Filter obviously bad entries.
-      if (!CandidateGCCInstallPaths.insert(LI->getName()).second)
-        continue; // Saw this path before; no need to look at it again.
-    if (CandidateVersion.isOlderThan(4, 1, 1))
-      continue;
-    if (CandidateVersion <= Version)
-      continue;
-
-    GCCInstallPath =
-        LibDir + "/" + VersionText.str() + "/lib/gcc/" + CandidateTriple.str();
-    if (!D.getVFS().exists(GCCInstallPath))
-      continue;
-
-    // If we make it here there has to be at least one GCC version, let's just
-    // use the latest one.
-    std::error_code EEC;
-    for (vfs::directory_iterator
-             LLI = D.getVFS().dir_begin(GCCInstallPath, EEC),
-             LLE;
-         !EEC && LLI != LLE; LLI = LLI.increment(EEC)) {
-
-      StringRef SubVersionText = llvm::sys::path::filename(LLI->getName());
-      GCCVersion CandidateSubVersion = GCCVersion::Parse(SubVersionText);
-
-      if (CandidateSubVersion > Version)
-        Version = CandidateSubVersion;
-    }
-
-    GCCTriple.setTriple(CandidateTriple);
-
-    GCCInstallPath += "/" + Version.Text;
-    GCCParentLibPath = GCCInstallPath + "/../../../../";
-
-    IsValid = true;
-  }
-}
-
 bool Generic_GCC::GCCInstallationDetector::ScanGCCForMultilibs(
     const llvm::Triple &TargetTriple, const ArgList &Args,
     StringRef Path, bool NeedsBiarchSuffix) {
@@ -2025,6 +2139,8 @@ bool Generic_GCC::GCCInstallationDetector::ScanGCCForMultilibs(
   } else if (tools::isMipsArch(TargetArch)) {
     if (!findMIPSMultilibs(D, TargetTriple, Path, Args, Detected))
       return false;
+  } else if (isRISCV(TargetArch)) {
+    findRISCVMultilibs(D, TargetTriple, Path, Args, Detected);
   } else if (!findBiarchMultilibs(D, TargetTriple, Path, Args,
                                   NeedsBiarchSuffix, Detected)) {
     return false;
@@ -2041,12 +2157,6 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
     const llvm::Triple &TargetTriple, const ArgList &Args,
     const std::string &LibDir, StringRef CandidateTriple,
     bool NeedsBiarchSuffix) {
-  if (TargetTriple.getOS() == llvm::Triple::Solaris) {
-    scanLibDirForGCCTripleSolaris(TargetTriple, Args, LibDir, CandidateTriple,
-                                  NeedsBiarchSuffix);
-    return;
-  }
-
   llvm::Triple::ArchType TargetArch = TargetTriple.getArch();
   // Locations relative to the system lib directory where GCC's triple-specific
   // directories might reside.
@@ -2059,31 +2169,33 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
     // Whether this library suffix is relevant for the triple.
     bool Active;
   } Suffixes[] = {
-    // This is the normal place.
-    {"gcc/" + CandidateTriple.str(), "../..", true},
-
-    // Debian puts cross-compilers in gcc-cross.
-    {"gcc-cross/" + CandidateTriple.str(), "../..", true},
-
-    // The Freescale PPC SDK has the gcc libraries in
-    // <sysroot>/usr/lib/<triple>/x.y.z so have a look there as well. Only do
-    // this on Freescale triples, though, since some systems put a *lot* of
-    // files in that location, not just GCC installation data.
-    {CandidateTriple.str(), "..",
-      TargetTriple.getVendor() == llvm::Triple::Freescale},
-
-    // Natively multiarch systems sometimes put the GCC triple-specific
-    // directory within their multiarch lib directory, resulting in the
-    // triple appearing twice.
-    {CandidateTriple.str() + "/gcc/" + CandidateTriple.str(), "../../..", true},
-
-    // Deal with cases (on Ubuntu) where the system architecture could be i386
-    // but the GCC target architecture could be (say) i686.
-    // FIXME: It may be worthwhile to generalize this and look for a second
-    // triple.
-    {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..",
-      TargetArch == llvm::Triple::x86}
-  };
+      // This is the normal place.
+      {"gcc/" + CandidateTriple.str(), "../..", true},
+
+      // Debian puts cross-compilers in gcc-cross.
+      {"gcc-cross/" + CandidateTriple.str(), "../..",
+       TargetTriple.getOS() != llvm::Triple::Solaris},
+
+      // The Freescale PPC SDK has the gcc libraries in
+      // <sysroot>/usr/lib/<triple>/x.y.z so have a look there as well. Only do
+      // this on Freescale triples, though, since some systems put a *lot* of
+      // files in that location, not just GCC installation data.
+      {CandidateTriple.str(), "..",
+       TargetTriple.getVendor() == llvm::Triple::Freescale},
+
+      // Natively multiarch systems sometimes put the GCC triple-specific
+      // directory within their multiarch lib directory, resulting in the
+      // triple appearing twice.
+      {CandidateTriple.str() + "/gcc/" + CandidateTriple.str(), "../../..",
+       TargetTriple.getOS() != llvm::Triple::Solaris},
+
+      // Deal with cases (on Ubuntu) where the system architecture could be i386
+      // but the GCC target architecture could be (say) i686.
+      // FIXME: It may be worthwhile to generalize this and look for a second
+      // triple.
+      {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..",
+       (TargetArch == llvm::Triple::x86 &&
+        TargetTriple.getOS() != llvm::Triple::Solaris)}};
 
   for (auto &Suffix : Suffixes) {
     if (!Suffix.Active)
@@ -2240,6 +2352,8 @@ bool Generic_GCC::IsIntegratedAssemblerDefault() const {
   case llvm::Triple::ppc:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64:
   case llvm::Triple::systemz:
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
diff --git a/lib/Driver/ToolChains/Gnu.h b/lib/Driver/ToolChains/Gnu.h
index f29342b95a07..9bad560c4d0e 100644
--- a/lib/Driver/ToolChains/Gnu.h
+++ b/lib/Driver/ToolChains/Gnu.h
@@ -250,6 +250,10 @@ class LLVM_LIBRARY_VISIBILITY Generic_GCC : public ToolChain {
                              SmallVectorImpl<StringRef> &BiarchLibDirs,
                              SmallVectorImpl<StringRef> &BiarchTripleAliases);
 
+    void AddDefaultGCCPrefixes(const llvm::Triple &TargetTriple,
+                               SmallVectorImpl<std::string> &Prefixes,
+                               StringRef SysRoot);
+
     bool ScanGCCForMultilibs(const llvm::Triple &TargetTriple,
                              const llvm::opt::ArgList &Args,
                              StringRef Path,
@@ -261,12 +265,6 @@ class LLVM_LIBRARY_VISIBILITY Generic_GCC : public ToolChain {
                                 StringRef CandidateTriple,
                                 bool NeedsBiarchSuffix = false);
 
-    void scanLibDirForGCCTripleSolaris(const llvm::Triple &TargetArch,
-                                       const llvm::opt::ArgList &Args,
-                                       const std::string &LibDir,
-                                       StringRef CandidateTriple,
-                                       bool NeedsBiarchSuffix = false);
-
     bool ScanGentooGccConfig(const llvm::Triple &TargetTriple,
                              const llvm::opt::ArgList &Args,
                              StringRef CandidateTriple,
@@ -307,8 +305,8 @@ class LLVM_LIBRARY_VISIBILITY Generic_GCC : public ToolChain {
   /// \brief Check whether the target triple's architecture is 32-bits.
   bool isTarget32Bit() const { return getTriple().isArch32Bit(); }
 
-  // FIXME: This should be final, but the Solaris tool chain does weird
-  // things we can't easily represent.
+  // FIXME: This should be final, but the CrossWindows toolchain does weird
+  // things that can't be easily generalized.
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
diff --git a/lib/Driver/ToolChains/Hexagon.cpp b/lib/Driver/ToolChains/Hexagon.cpp
index 2debf0e2de54..31281737c7a1 100644
--- a/lib/Driver/ToolChains/Hexagon.cpp
+++ b/lib/Driver/ToolChains/Hexagon.cpp
@@ -46,7 +46,7 @@ static void handleHVXWarnings(const Driver &D, const ArgList &Args) {
   // Handle the unsupported values passed to mhvx-length.
   if (Arg *A = Args.getLastArg(options::OPT_mhexagon_hvx_length_EQ)) {
     StringRef Val = A->getValue();
-    if (Val != "64B" && Val != "128B")
+    if (!Val.equals_lower("64b") && !Val.equals_lower("128b"))
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getOption().getName() << Val;
   }
diff --git a/lib/Driver/ToolChains/Linux.cpp b/lib/Driver/ToolChains/Linux.cpp
index 1301cdf114ae..1347c1a272bc 100644
--- a/lib/Driver/ToolChains/Linux.cpp
+++ b/lib/Driver/ToolChains/Linux.cpp
@@ -11,6 +11,7 @@
 #include "Arch/ARM.h"
 #include "Arch/Mips.h"
 #include "Arch/PPC.h"
+#include "Arch/RISCV.h"
 #include "CommonArgs.h"
 #include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Config/config.h"
@@ -176,6 +177,9 @@ static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
       Triple.getEnvironment() == llvm::Triple::GNUX32)
     return "libx32";
 
+  if (Triple.getArch() == llvm::Triple::riscv32)
+    return "lib32";
+
   return Triple.isArch32Bit() ? "lib" : "lib64";
 }
 
@@ -226,6 +230,8 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
   const bool IsAndroid = Triple.isAndroid();
   const bool IsMips = tools::isMipsArch(Arch);
   const bool IsHexagon = Arch == llvm::Triple::hexagon;
+  const bool IsRISCV =
+      Arch == llvm::Triple::riscv32 || Arch == llvm::Triple::riscv64;
 
   if (IsMips && !SysRoot.empty())
     ExtraOpts.push_back("--sysroot=" + SysRoot);
@@ -333,6 +339,11 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
   addPathIfExists(D, SysRoot + "/lib/../" + OSLibDir, Paths);
   addPathIfExists(D, SysRoot + "/usr/lib/" + MultiarchTriple, Paths);
   addPathIfExists(D, SysRoot + "/usr/lib/../" + OSLibDir, Paths);
+  if (IsRISCV) {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, Triple);
+    addPathIfExists(D, SysRoot + "/" + OSLibDir + "/" + ABIName, Paths);
+    addPathIfExists(D, SysRoot + "/usr/" + OSLibDir + "/" + ABIName, Paths);
+  }
 
   // Try walking via the GCC triple path in case of biarch or multiarch GCC
   // installations with strange symlinks.
@@ -511,6 +522,18 @@ std::string Linux::getDynamicLinker(const ArgList &Args) const {
     Loader =
         (tools::ppc::hasPPCAbiArg(Args, "elfv1")) ? "ld64.so.1" : "ld64.so.2";
     break;
+  case llvm::Triple::riscv32: {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, Triple);
+    LibDir = "lib";
+    Loader = ("ld-linux-riscv32-" + ABIName + ".so.1").str();
+    break;
+  }
+  case llvm::Triple::riscv64: {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, Triple);
+    LibDir = "lib";
+    Loader = ("ld-linux-riscv64-" + ABIName + ".so.1").str();
+    break;
+  }
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
     LibDir = "lib";
@@ -543,6 +566,194 @@ std::string Linux::getDynamicLinker(const ArgList &Args) const {
   return "/" + LibDir + "/" + Loader;
 }
 
+/// Convert path list to Fortran frontend argument
+static void AddFlangSysIncludeArg(const ArgList &DriverArgs,
+                                  ArgStringList &Flang1Args,
+                                  ToolChain::path_list IncludePathList) {
+  std::string ArgValue; // Path argument value
+
+  // Make up argument value consisting of paths separated by colons
+  bool first = true;
+  for (auto P : IncludePathList) {
+    if (first) {
+      first = false;
+    } else {
+      ArgValue += ":";
+    }
+    ArgValue += P;
+  }
+
+  // Add the argument
+  Flang1Args.push_back("-stdinc");
+  Flang1Args.push_back(DriverArgs.MakeArgString(ArgValue));
+}
+
+void Linux::AddFlangSystemIncludeArgs(const ArgList &DriverArgs,
+                                      ArgStringList &Flang1Args) const {
+  path_list IncludePathList;
+  const Driver &D = getDriver();
+  std::string SysRoot = computeSysRoot();
+
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
+    return;
+
+  {
+    SmallString<128> P(D.InstalledDir);
+    llvm::sys::path::append(P, "../include");
+    IncludePathList.push_back(P.str());
+  }
+
+  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
+    IncludePathList.push_back(SysRoot + "/usr/local/include");
+
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include");
+    IncludePathList.push_back(P.str());
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc)) {
+    AddFlangSysIncludeArg(DriverArgs, Flang1Args, IncludePathList);
+    return;
+  }
+
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
+      IncludePathList.push_back(Prefix.str() + dir.str());
+    }
+    AddFlangSysIncludeArg(DriverArgs, Flang1Args, IncludePathList);
+    return;
+  }
+
+  // Lacking those, try to detect the correct set of system includes for the
+  // target triple.
+
+  // Add include directories specific to the selected multilib set and multilib.
+  if (GCCInstallation.isValid()) {
+    const auto &Callback = Multilibs.includeDirsCallback();
+    if (Callback) {
+      for (const auto &Path : Callback(GCCInstallation.getMultilib()))
+        addExternCSystemIncludeIfExists(
+            DriverArgs, Flang1Args, GCCInstallation.getInstallPath() + Path);
+    }
+  }
+
+  // Implement generic Debian multiarch support.
+  const StringRef X86_64MultiarchIncludeDirs[] = {
+      "/usr/include/x86_64-linux-gnu",
+
+      // FIXME: These are older forms of multiarch. It's not clear that they're
+      // in use in any released version of Debian, so we should consider
+      // removing them.
+      "/usr/include/i686-linux-gnu/64", "/usr/include/i486-linux-gnu/64"};
+  const StringRef X86MultiarchIncludeDirs[] = {
+      "/usr/include/i386-linux-gnu",
+
+      // FIXME: These are older forms of multiarch. It's not clear that they're
+      // in use in any released version of Debian, so we should consider
+      // removing them.
+      "/usr/include/x86_64-linux-gnu/32", "/usr/include/i686-linux-gnu",
+      "/usr/include/i486-linux-gnu"};
+  const StringRef AArch64MultiarchIncludeDirs[] = {
+      "/usr/include/aarch64-linux-gnu"};
+  const StringRef ARMMultiarchIncludeDirs[] = {
+      "/usr/include/arm-linux-gnueabi"};
+  const StringRef ARMHFMultiarchIncludeDirs[] = {
+      "/usr/include/arm-linux-gnueabihf"};
+  const StringRef MIPSMultiarchIncludeDirs[] = {"/usr/include/mips-linux-gnu"};
+  const StringRef MIPSELMultiarchIncludeDirs[] = {
+      "/usr/include/mipsel-linux-gnu"};
+  const StringRef MIPS64MultiarchIncludeDirs[] = {
+      "/usr/include/mips64-linux-gnu", "/usr/include/mips64-linux-gnuabi64"};
+  const StringRef MIPS64ELMultiarchIncludeDirs[] = {
+      "/usr/include/mips64el-linux-gnu",
+      "/usr/include/mips64el-linux-gnuabi64"};
+  const StringRef PPCMultiarchIncludeDirs[] = {
+      "/usr/include/powerpc-linux-gnu"};
+  const StringRef PPC64MultiarchIncludeDirs[] = {
+      "/usr/include/powerpc64-linux-gnu"};
+  const StringRef PPC64LEMultiarchIncludeDirs[] = {
+      "/usr/include/powerpc64le-linux-gnu"};
+  const StringRef SparcMultiarchIncludeDirs[] = {
+      "/usr/include/sparc-linux-gnu"};
+  const StringRef Sparc64MultiarchIncludeDirs[] = {
+      "/usr/include/sparc64-linux-gnu"};
+  ArrayRef<StringRef> MultiarchIncludeDirs;
+  switch (getTriple().getArch()) {
+  case llvm::Triple::x86_64:
+    MultiarchIncludeDirs = X86_64MultiarchIncludeDirs;
+    break;
+  case llvm::Triple::x86:
+    MultiarchIncludeDirs = X86MultiarchIncludeDirs;
+    break;
+  case llvm::Triple::aarch64:
+  case llvm::Triple::aarch64_be:
+    MultiarchIncludeDirs = AArch64MultiarchIncludeDirs;
+    break;
+  case llvm::Triple::arm:
+    if (getTriple().getEnvironment() == llvm::Triple::GNUEABIHF)
+      MultiarchIncludeDirs = ARMHFMultiarchIncludeDirs;
+    else
+      MultiarchIncludeDirs = ARMMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::mips:
+    MultiarchIncludeDirs = MIPSMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::mipsel:
+    MultiarchIncludeDirs = MIPSELMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::mips64:
+    MultiarchIncludeDirs = MIPS64MultiarchIncludeDirs;
+    break;
+  case llvm::Triple::mips64el:
+    MultiarchIncludeDirs = MIPS64ELMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::ppc:
+    MultiarchIncludeDirs = PPCMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::ppc64:
+    MultiarchIncludeDirs = PPC64MultiarchIncludeDirs;
+    break;
+  case llvm::Triple::ppc64le:
+    MultiarchIncludeDirs = PPC64LEMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::sparc:
+    MultiarchIncludeDirs = SparcMultiarchIncludeDirs;
+    break;
+  case llvm::Triple::sparcv9:
+    MultiarchIncludeDirs = Sparc64MultiarchIncludeDirs;
+    break;
+  default:
+    break;
+  }
+  for (StringRef Dir : MultiarchIncludeDirs) {
+    if (llvm::sys::fs::exists(SysRoot + Dir)) {
+      IncludePathList.push_back(SysRoot + Dir.str());
+      break;
+    }
+  }
+
+  if (getTriple().getOS() == llvm::Triple::RTEMS) {
+    AddFlangSysIncludeArg(DriverArgs, Flang1Args, IncludePathList);
+    return;
+  }
+
+  // Add an include of '/include' directly. This isn't provided by default by
+  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
+  // add even when Clang is acting as-if it were a system compiler.
+  IncludePathList.push_back(SysRoot + "/include");
+
+  IncludePathList.push_back(SysRoot + "/usr/include");
+
+  AddFlangSysIncludeArg(DriverArgs, Flang1Args, IncludePathList);
+}
+
 void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                       ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
@@ -823,6 +1034,8 @@ bool Linux::isPIEDefault() const {
 SanitizerMask Linux::getSupportedSanitizers() const {
   const bool IsX86 = getTriple().getArch() == llvm::Triple::x86;
   const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
+  const bool IsMIPS = getTriple().getArch() == llvm::Triple::mips ||
+                      getTriple().getArch() == llvm::Triple::mipsel;
   const bool IsMIPS64 = getTriple().getArch() == llvm::Triple::mips64 ||
                         getTriple().getArch() == llvm::Triple::mips64el;
   const bool IsPowerPC64 = getTriple().getArch() == llvm::Triple::ppc64 ||
@@ -852,7 +1065,7 @@ SanitizerMask Linux::getSupportedSanitizers() const {
     Res |= SanitizerKind::Efficiency;
   if (IsX86 || IsX86_64)
     Res |= SanitizerKind::Function;
-  if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsArmArch)
+  if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsMIPS || IsArmArch)
     Res |= SanitizerKind::Scudo;
   if (IsAArch64)
     Res |= SanitizerKind::HWAddress;
diff --git a/lib/Driver/ToolChains/Linux.h b/lib/Driver/ToolChains/Linux.h
index 9778c1832ccc..34e41752d934 100644
--- a/lib/Driver/ToolChains/Linux.h
+++ b/lib/Driver/ToolChains/Linux.h
@@ -27,6 +27,9 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF {
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  AddFlangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &Flang1Args) const override;
   std::string findLibCxxIncludePath() const override;
   void addLibStdCxxIncludePaths(
       const llvm::opt::ArgList &DriverArgs,
diff --git a/lib/Driver/ToolChains/Myriad.cpp b/lib/Driver/ToolChains/Myriad.cpp
index 06079b109dd1..2c0b3162588d 100644
--- a/lib/Driver/ToolChains/Myriad.cpp
+++ b/lib/Driver/ToolChains/Myriad.cpp
@@ -107,7 +107,6 @@ void tools::SHAVE::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(
         Args.MakeArgString(std::string("-i:") + A->getValue(0)));
   }
-  CmdArgs.push_back("-elf"); // Output format.
   CmdArgs.push_back(II.getFilename());
   CmdArgs.push_back(
       Args.MakeArgString(std::string("-o:") + Output.getFilename()));
diff --git a/lib/Driver/ToolChains/Solaris.cpp b/lib/Driver/ToolChains/Solaris.cpp
index 9fe6e9d520d0..bdf40668f112 100644
--- a/lib/Driver/ToolChains/Solaris.cpp
+++ b/lib/Driver/ToolChains/Solaris.cpp
@@ -122,6 +122,21 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
+static StringRef getSolarisLibSuffix(const llvm::Triple &Triple) {
+  switch (Triple.getArch()) {
+  case llvm::Triple::x86:
+  case llvm::Triple::sparc:
+    break;
+  case llvm::Triple::x86_64:
+    return "/amd64";
+  case llvm::Triple::sparcv9:
+    return "/sparcv9";
+  default:
+    llvm_unreachable("Unsupported architecture");
+  }
+  return "";
+}
+
 /// Solaris - Solaris tool chain which can call as(1) and ld(1) directly.
 
 Solaris::Solaris(const Driver &D, const llvm::Triple &Triple,
@@ -130,32 +145,24 @@ Solaris::Solaris(const Driver &D, const llvm::Triple &Triple,
 
   GCCInstallation.init(Triple, Args);
 
+  StringRef LibSuffix = getSolarisLibSuffix(Triple);
   path_list &Paths = getFilePaths();
-  if (GCCInstallation.isValid())
-    addPathIfExists(D, GCCInstallation.getInstallPath(), Paths);
-
-  addPathIfExists(D, getDriver().getInstalledDir(), Paths);
-  if (getDriver().getInstalledDir() != getDriver().Dir)
-    addPathIfExists(D, getDriver().Dir, Paths);
-
-  addPathIfExists(D, getDriver().SysRoot + getDriver().Dir + "/../lib", Paths);
-
-  std::string LibPath = "/usr/lib/";
-  switch (Triple.getArch()) {
-  case llvm::Triple::x86:
-  case llvm::Triple::sparc:
-    break;
-  case llvm::Triple::x86_64:
-    LibPath += "amd64/";
-    break;
-  case llvm::Triple::sparcv9:
-    LibPath += "sparcv9/";
-    break;
-  default:
-    llvm_unreachable("Unsupported architecture");
+  if (GCCInstallation.isValid()) {
+    // On Solaris gcc uses both an architecture-specific path with triple in it
+    // as well as a more generic lib path (+arch suffix).
+    addPathIfExists(D,
+                    GCCInstallation.getInstallPath() +
+                        GCCInstallation.getMultilib().gccSuffix(),
+                    Paths);
+    addPathIfExists(D, GCCInstallation.getParentLibPath() + LibSuffix, Paths);
   }
 
-  addPathIfExists(D, getDriver().SysRoot + LibPath, Paths);
+  // If we are currently running Clang inside of the requested system root,
+  // add its parent library path to those searched.
+  if (StringRef(D.Dir).startswith(D.SysRoot))
+    addPathIfExists(D, D.Dir + "/../lib", Paths);
+
+  addPathIfExists(D, D.SysRoot + "/usr/lib" + LibSuffix, Paths);
 }
 
 Tool *Solaris::buildAssembler() const {
@@ -164,30 +171,72 @@ Tool *Solaris::buildAssembler() const {
 
 Tool *Solaris::buildLinker() const { return new tools::solaris::Linker(*this); }
 
-void Solaris::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
-                                           ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdlibinc) ||
-      DriverArgs.hasArg(options::OPT_nostdincxx))
+void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                        ArgStringList &CC1Args) const {
+  const Driver &D = getDriver();
+
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+    return;
+
+  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
+    addSystemInclude(DriverArgs, CC1Args, D.SysRoot + "/usr/local/include");
+
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include");
+    addSystemInclude(DriverArgs, CC1Args, P);
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
     return;
 
-  // Include the support directory for things like xlocale and fudged system
-  // headers.
-  // FIXME: This is a weird mix of libc++ and libstdc++. We should also be
-  // checking the value of -stdlib= here and adding the includes for libc++
-  // rather than libstdc++ if it's requested.
-  addSystemInclude(DriverArgs, CC1Args, "/usr/include/c++/v1/support/solaris");
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(D.SysRoot) : "";
+      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
+    }
+    return;
+  }
 
+  // Add include directories specific to the selected multilib set and multilib.
   if (GCCInstallation.isValid()) {
-    GCCVersion Version = GCCInstallation.getVersion();
-    addSystemInclude(DriverArgs, CC1Args,
-                     getDriver().SysRoot + "/usr/gcc/" +
-                     Version.MajorStr + "." +
-                     Version.MinorStr +
-                     "/include/c++/" + Version.Text);
-    addSystemInclude(DriverArgs, CC1Args,
-                     getDriver().SysRoot + "/usr/gcc/" + Version.MajorStr +
-                     "." + Version.MinorStr + "/include/c++/" +
-                     Version.Text + "/" +
-                     GCCInstallation.getTriple().str());
+    const MultilibSet::IncludeDirsFunc &Callback =
+        Multilibs.includeDirsCallback();
+    if (Callback) {
+      for (const auto &Path : Callback(GCCInstallation.getMultilib()))
+        addExternCSystemIncludeIfExists(
+            DriverArgs, CC1Args, GCCInstallation.getInstallPath() + Path);
+    }
   }
+
+  addExternCSystemInclude(DriverArgs, CC1Args, D.SysRoot + "/usr/include");
+}
+
+void Solaris::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  // We need a detected GCC installation on Solaris (similar to Linux)
+  // to provide libstdc++'s headers.
+  if (!GCCInstallation.isValid())
+    return;
+
+  // By default, look for the C++ headers in an include directory adjacent to
+  // the lib directory of the GCC installation.
+  // On Solaris this usually looks like /usr/gcc/X.Y/include/c++/X.Y.Z
+  StringRef LibDir = GCCInstallation.getParentLibPath();
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+  const Multilib &Multilib = GCCInstallation.getMultilib();
+  const GCCVersion &Version = GCCInstallation.getVersion();
+
+  // The primary search for libstdc++ supports multiarch variants.
+  addLibStdCXXIncludePaths(LibDir.str() + "/../include", "/c++/" + Version.Text,
+                           TripleStr,
+                           /*GCCMultiarchTriple*/ "",
+                           /*TargetMultiarchTriple*/ "",
+                           Multilib.includeSuffix(), DriverArgs, CC1Args);
 }
diff --git a/lib/Driver/ToolChains/Solaris.h b/lib/Driver/ToolChains/Solaris.h
index 787917afab6e..b2a092b64807 100644
--- a/lib/Driver/ToolChains/Solaris.h
+++ b/lib/Driver/ToolChains/Solaris.h
@@ -57,9 +57,13 @@ class LLVM_LIBRARY_VISIBILITY Solaris : public Generic_ELF {
 
   bool IsIntegratedAssemblerDefault() const override { return true; }
 
-  void AddClangCXXStdlibIncludeArgs(
-      const llvm::opt::ArgList &DriverArgs,
-      llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
 
   unsigned GetDefaultDwarfVersion() const override { return 2; }
 
diff --git a/lib/Driver/ToolChains/WebAssembly.cpp b/lib/Driver/ToolChains/WebAssembly.cpp
index 8ae1b6c2f55d..94f7279bbdba 100644
--- a/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/lib/Driver/ToolChains/WebAssembly.cpp
@@ -11,6 +11,7 @@
 #include "CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 
@@ -62,8 +63,6 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     if (Args.hasArg(options::OPT_pthread))
       CmdArgs.push_back("-lpthread");
 
-    CmdArgs.push_back("-allow-undefined-file");
-    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("wasm.syms")));
     CmdArgs.push_back("-lc");
     AddRunTimeLibs(ToolChain, ToolChain.getDriver(), CmdArgs, Args);
   }
@@ -119,6 +118,12 @@ ToolChain::RuntimeLibType WebAssembly::GetDefaultRuntimeLibType() const {
 }
 
 ToolChain::CXXStdlibType WebAssembly::GetCXXStdlibType(const ArgList &Args) const {
+  if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) {
+    StringRef Value = A->getValue();
+    if (Value != "libc++")
+      getDriver().Diag(diag::err_drv_invalid_stdlib_name)
+          << A->getAsString(Args);
+  }
   return ToolChain::CST_Libcxx;
 }
 
@@ -136,6 +141,19 @@ void WebAssembly::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                      getDriver().SysRoot + "/include/c++/v1");
 }
 
+void WebAssembly::AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
+                                      llvm::opt::ArgStringList &CmdArgs) const {
+
+  switch (GetCXXStdlibType(Args)) {
+  case ToolChain::CST_Libcxx:
+    CmdArgs.push_back("-lc++");
+    CmdArgs.push_back("-lc++abi");
+    break;
+  case ToolChain::CST_Libstdcxx:
+    llvm_unreachable("invalid stdlib name");
+  }
+}
+
 std::string WebAssembly::getThreadModel() const {
   // The WebAssembly MVP does not yet support threads; for now, use the
   // "single" threading model, which lowers atomics to non-atomic operations.
diff --git a/lib/Driver/ToolChains/WebAssembly.h b/lib/Driver/ToolChains/WebAssembly.h
index 8784e12dfb0e..cdbb34ff919f 100644
--- a/lib/Driver/ToolChains/WebAssembly.h
+++ b/lib/Driver/ToolChains/WebAssembly.h
@@ -62,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain {
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs) const override;
   std::string getThreadModel() const override;
 
   const char *getDefaultLinker() const override {
diff --git a/lib/Driver/Types.cpp b/lib/Driver/Types.cpp
old mode 100644
new mode 100755
index ab63f0e81b12..526db0fabef5
--- a/lib/Driver/Types.cpp
+++ b/lib/Driver/Types.cpp
@@ -99,6 +99,8 @@ bool types::isAcceptedByClang(ID Id) {
 
   case TY_Asm:
   case TY_C: case TY_PP_C:
+  case TY_F_FreeForm: case TY_PP_F_FreeForm:
+  case TY_F_FixedForm: case TY_PP_F_FixedForm:
   case TY_CL:
   case TY_CUDA: case TY_PP_CUDA:
   case TY_CUDA_DEVICE:
@@ -117,6 +119,33 @@ bool types::isAcceptedByClang(ID Id) {
   }
 }
 
+bool types::isFortran(ID Id) {
+  switch (Id) {
+  default:
+    return false;
+
+  case TY_F_FreeForm:
+  case TY_PP_F_FreeForm:
+  case TY_F_FixedForm:
+  case TY_PP_F_FixedForm:
+    return true;
+  }
+}
+
+bool types::isFreeFormFortran(ID Id) {
+  if (!isFortran(Id))
+    return false;
+
+  return (Id == TY_F_FreeForm || Id == TY_PP_F_FreeForm);
+}
+
+bool types::isFixedFormFortran(ID Id) {
+  if (!isFortran(Id))
+    return false;
+
+  return (Id == TY_F_FixedForm || Id == TY_PP_F_FixedForm);
+}
+
 bool types::isObjC(ID Id) {
   switch (Id) {
   default:
@@ -178,8 +207,8 @@ types::ID types::lookupTypeForExtension(llvm::StringRef Ext) {
   return llvm::StringSwitch<types::ID>(Ext)
            .Case("c", TY_C)
            .Case("C", TY_CXX)
-           .Case("F", TY_Fortran)
-           .Case("f", TY_PP_Fortran)
+           .Case("F", TY_F_FixedForm)
+           .Case("f", TY_PP_F_FixedForm)
            .Case("h", TY_CHeader)
            .Case("H", TY_CXXHeader)
            .Case("i", TY_PP_C)
@@ -212,18 +241,22 @@ types::ID types::lookupTypeForExtension(llvm::StringRef Ext) {
            .Case("cui", TY_PP_CUDA)
            .Case("cxx", TY_CXX)
            .Case("CXX", TY_CXX)
-           .Case("F90", TY_Fortran)
-           .Case("f90", TY_PP_Fortran)
-           .Case("F95", TY_Fortran)
-           .Case("f95", TY_PP_Fortran)
-           .Case("for", TY_PP_Fortran)
-           .Case("FOR", TY_PP_Fortran)
-           .Case("fpp", TY_Fortran)
-           .Case("FPP", TY_Fortran)
            .Case("gch", TY_PCH)
            .Case("hpp", TY_CXXHeader)
            .Case("iim", TY_PP_CXXModule)
            .Case("lib", TY_Object)
+           .Case("for", TY_PP_F_FixedForm)
+           .Case("FOR", TY_PP_F_FixedForm)
+           .Case("fpp", TY_F_FixedForm)
+           .Case("FPP", TY_F_FixedForm)
+           .Case("f90", TY_PP_F_FreeForm)
+           .Case("f95", TY_PP_F_FreeForm)
+           .Case("f03", TY_PP_F_FreeForm)
+           .Case("f08", TY_PP_F_FreeForm)
+           .Case("F90", TY_F_FreeForm)
+           .Case("F95", TY_F_FreeForm)
+           .Case("F03", TY_F_FreeForm)
+           .Case("F08", TY_F_FreeForm)
            .Case("mii", TY_PP_ObjCXX)
            .Case("obj", TY_Object)
            .Case("pch", TY_PCH)
@@ -248,7 +281,9 @@ types::ID types::lookupTypeForTypeSpecifier(const char *Name) {
 // FIXME: Why don't we just put this list in the defs file, eh.
 void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) {
   if (Id != TY_Object) {
-    if (getPreprocessedType(Id) != TY_INVALID) {
+    // Delegate preprocessing to the "upper" part of Fortran compiler,
+    // preprocess for other preprocessable inputs
+    if (getPreprocessedType(Id) != TY_INVALID && !isFortran(Id)) {
       P.push_back(phases::Preprocess);
     }
 
@@ -258,6 +293,9 @@ void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) {
 
     if (!onlyPrecompileType(Id)) {
       if (!onlyAssembleType(Id)) {
+        if (isFortran(Id)) {
+          P.push_back(phases::FortranFrontend);
+        }
         P.push_back(phases::Compile);
         P.push_back(phases::Backend);
       }
diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp
index 4735ab3564f0..72d2078797e5 100644
--- a/lib/Format/BreakableToken.cpp
+++ b/lib/Format/BreakableToken.cpp
@@ -214,11 +214,11 @@ unsigned BreakableStringLiteral::getContentStartColumn(unsigned LineIndex,
 
 BreakableStringLiteral::BreakableStringLiteral(
     const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
-    StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding,
-    const FormatStyle &Style)
+    StringRef Postfix, unsigned UnbreakableTailLength, bool InPPDirective,
+    encoding::Encoding Encoding, const FormatStyle &Style)
     : BreakableToken(Tok, InPPDirective, Encoding, Style),
       StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix),
-      UnbreakableTailLength(Tok.UnbreakableTailLength) {
+      UnbreakableTailLength(UnbreakableTailLength) {
   assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
   Line = Tok.TokenText.substr(
       Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
diff --git a/lib/Format/BreakableToken.h b/lib/Format/BreakableToken.h
index 8ef26ef464da..eba48f75e88f 100644
--- a/lib/Format/BreakableToken.h
+++ b/lib/Format/BreakableToken.h
@@ -238,8 +238,8 @@ class BreakableStringLiteral : public BreakableToken {
   /// after formatting.
   BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
                          StringRef Prefix, StringRef Postfix,
-                         bool InPPDirective, encoding::Encoding Encoding,
-                         const FormatStyle &Style);
+                         unsigned UnbreakableTailLength, bool InPPDirective,
+                         encoding::Encoding Encoding, const FormatStyle &Style);
 
   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
                  unsigned ReflowColumn,
diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp
index a3d38b244c5c..3711ee01c482 100644
--- a/lib/Format/ContinuationIndenter.cpp
+++ b/lib/Format/ContinuationIndenter.cpp
@@ -102,28 +102,59 @@ static llvm::Optional<StringRef> getRawStringDelimiter(StringRef TokenText) {
   return Delimiter;
 }
 
+// Returns the canonical delimiter for \p Language, or the empty string if no
+// canonical delimiter is specified.
+static StringRef
+getCanonicalRawStringDelimiter(const FormatStyle &Style,
+                               FormatStyle::LanguageKind Language) {
+  for (const auto &Format : Style.RawStringFormats) {
+    if (Format.Language == Language)
+      return StringRef(Format.CanonicalDelimiter);
+  }
+  return "";
+}
+
 RawStringFormatStyleManager::RawStringFormatStyleManager(
     const FormatStyle &CodeStyle) {
   for (const auto &RawStringFormat : CodeStyle.RawStringFormats) {
-    FormatStyle Style;
-    if (!getPredefinedStyle(RawStringFormat.BasedOnStyle,
-                            RawStringFormat.Language, &Style)) {
-      Style = getLLVMStyle();
-      Style.Language = RawStringFormat.Language;
+    llvm::Optional<FormatStyle> LanguageStyle =
+        CodeStyle.GetLanguageStyle(RawStringFormat.Language);
+    if (!LanguageStyle) {
+      FormatStyle PredefinedStyle;
+      if (!getPredefinedStyle(RawStringFormat.BasedOnStyle,
+                              RawStringFormat.Language, &PredefinedStyle)) {
+        PredefinedStyle = getLLVMStyle();
+        PredefinedStyle.Language = RawStringFormat.Language;
+      }
+      LanguageStyle = PredefinedStyle;
+    }
+    LanguageStyle->ColumnLimit = CodeStyle.ColumnLimit;
+    for (StringRef Delimiter : RawStringFormat.Delimiters) {
+      DelimiterStyle.insert({Delimiter, *LanguageStyle});
+    }
+    for (StringRef EnclosingFunction : RawStringFormat.EnclosingFunctions) {
+      EnclosingFunctionStyle.insert({EnclosingFunction, *LanguageStyle});
     }
-    Style.ColumnLimit = CodeStyle.ColumnLimit;
-    DelimiterStyle.insert({RawStringFormat.Delimiter, Style});
   }
 }
 
 llvm::Optional<FormatStyle>
-RawStringFormatStyleManager::get(StringRef Delimiter) const {
+RawStringFormatStyleManager::getDelimiterStyle(StringRef Delimiter) const {
   auto It = DelimiterStyle.find(Delimiter);
   if (It == DelimiterStyle.end())
     return None;
   return It->second;
 }
 
+llvm::Optional<FormatStyle>
+RawStringFormatStyleManager::getEnclosingFunctionStyle(
+    StringRef EnclosingFunction) const {
+  auto It = EnclosingFunctionStyle.find(EnclosingFunction);
+  if (It == EnclosingFunctionStyle.end())
+    return None;
+  return It->second;
+}
+
 ContinuationIndenter::ContinuationIndenter(const FormatStyle &Style,
                                            const AdditionalKeywords &Keywords,
                                            const SourceManager &SourceMgr,
@@ -1293,14 +1324,32 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
     const FormatToken &Current, LineState &State,
     const FormatStyle &RawStringStyle, bool DryRun) {
   unsigned StartColumn = State.Column - Current.ColumnWidth;
-  auto Delimiter = *getRawStringDelimiter(Current.TokenText);
+  StringRef OldDelimiter = *getRawStringDelimiter(Current.TokenText);
+  StringRef NewDelimiter =
+      getCanonicalRawStringDelimiter(Style, RawStringStyle.Language);
+  if (NewDelimiter.empty() || OldDelimiter.empty())
+    NewDelimiter = OldDelimiter;
   // The text of a raw string is between the leading 'R"delimiter(' and the
   // trailing 'delimiter)"'.
-  unsigned PrefixSize = 3 + Delimiter.size();
-  unsigned SuffixSize = 2 + Delimiter.size();
+  unsigned OldPrefixSize = 3 + OldDelimiter.size();
+  unsigned OldSuffixSize = 2 + OldDelimiter.size();
+  // We create a virtual text environment which expects a null-terminated
+  // string, so we cannot use StringRef.
+  std::string RawText =
+      Current.TokenText.substr(OldPrefixSize).drop_back(OldSuffixSize);
+  if (NewDelimiter != OldDelimiter) {
+    // Don't update to the canonical delimiter 'deli' if ')deli"' occurs in the
+    // raw string.
+    std::string CanonicalDelimiterSuffix = (")" + NewDelimiter + "\"").str();
+    if (StringRef(RawText).contains(CanonicalDelimiterSuffix))
+      NewDelimiter = OldDelimiter;
+  }
+
+  unsigned NewPrefixSize = 3 + NewDelimiter.size();
+  unsigned NewSuffixSize = 2 + NewDelimiter.size();
 
-  // The first start column is the column the raw text starts.
-  unsigned FirstStartColumn = StartColumn + PrefixSize;
+  // The first start column is the column the raw text starts after formatting.
+  unsigned FirstStartColumn = StartColumn + NewPrefixSize;
 
   // The next start column is the intended indentation a line break inside
   // the raw string at level 0. It is determined by the following rules:
@@ -1311,7 +1360,7 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
   // These rules have the advantage that the formatted content both does not
   // violate the rectangle rule and visually flows within the surrounding
   // source.
-  bool ContentStartsOnNewline = Current.TokenText[PrefixSize] == '\n';
+  bool ContentStartsOnNewline = Current.TokenText[OldPrefixSize] == '\n';
   unsigned NextStartColumn = ContentStartsOnNewline
                                  ? State.Stack.back().Indent + Style.IndentWidth
                                  : FirstStartColumn;
@@ -1325,12 +1374,9 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
   //   - if the raw string prefix does not start on a newline, it is the current
   //     indent.
   unsigned LastStartColumn = Current.NewlinesBefore
-                                 ? FirstStartColumn - PrefixSize
+                                 ? FirstStartColumn - NewPrefixSize
                                  : State.Stack.back().Indent;
 
-  std::string RawText =
-      Current.TokenText.substr(PrefixSize).drop_back(SuffixSize);
-
   std::pair<tooling::Replacements, unsigned> Fixes = internal::reformat(
       RawStringStyle, RawText, {tooling::Range(0, RawText.size())},
       FirstStartColumn, NextStartColumn, LastStartColumn, "<stdin>",
@@ -1343,8 +1389,33 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
     return 0;
   }
   if (!DryRun) {
+    if (NewDelimiter != OldDelimiter) {
+      // In 'R"delimiter(...', the delimiter starts 2 characters after the start
+      // of the token.
+      SourceLocation PrefixDelimiterStart =
+          Current.Tok.getLocation().getLocWithOffset(2);
+      auto PrefixErr = Whitespaces.addReplacement(tooling::Replacement(
+          SourceMgr, PrefixDelimiterStart, OldDelimiter.size(), NewDelimiter));
+      if (PrefixErr) {
+        llvm::errs()
+            << "Failed to update the prefix delimiter of a raw string: "
+            << llvm::toString(std::move(PrefixErr)) << "\n";
+      }
+      // In 'R"delimiter(...)delimiter"', the suffix delimiter starts at
+      // position length - 1 - |delimiter|.
+      SourceLocation SuffixDelimiterStart =
+          Current.Tok.getLocation().getLocWithOffset(Current.TokenText.size() -
+                                                     1 - OldDelimiter.size());
+      auto SuffixErr = Whitespaces.addReplacement(tooling::Replacement(
+          SourceMgr, SuffixDelimiterStart, OldDelimiter.size(), NewDelimiter));
+      if (SuffixErr) {
+        llvm::errs()
+            << "Failed to update the suffix delimiter of a raw string: "
+            << llvm::toString(std::move(SuffixErr)) << "\n";
+      }
+    }
     SourceLocation OriginLoc =
-        Current.Tok.getLocation().getLocWithOffset(PrefixSize);
+        Current.Tok.getLocation().getLocWithOffset(OldPrefixSize);
     for (const tooling::Replacement &Fix : Fixes.first) {
       auto Err = Whitespaces.addReplacement(tooling::Replacement(
           SourceMgr, OriginLoc.getLocWithOffset(Fix.getOffset()),
@@ -1357,7 +1428,7 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
   }
   unsigned RawLastLineEndColumn = getLastLineEndColumn(
       *NewCode, FirstStartColumn, Style.TabWidth, Encoding);
-  State.Column = RawLastLineEndColumn + SuffixSize;
+  State.Column = RawLastLineEndColumn + NewSuffixSize;
   return Fixes.second;
 }
 
@@ -1430,6 +1501,26 @@ unsigned ContinuationIndenter::handleEndOfLine(const FormatToken &Current,
   return Penalty;
 }
 
+// Returns the enclosing function name of a token, or the empty string if not
+// found.
+static StringRef getEnclosingFunctionName(const FormatToken &Current) {
+  // Look for: 'function(' or 'function<templates>(' before Current.
+  auto Tok = Current.getPreviousNonComment();
+  if (!Tok || !Tok->is(tok::l_paren))
+    return "";
+  Tok = Tok->getPreviousNonComment();
+  if (!Tok)
+    return "";
+  if (Tok->is(TT_TemplateCloser)) {
+    Tok = Tok->MatchingParen;
+    if (Tok)
+      Tok = Tok->getPreviousNonComment();
+  }
+  if (!Tok || !Tok->is(tok::identifier))
+    return "";
+  return Tok->TokenText;
+}
+
 llvm::Optional<FormatStyle>
 ContinuationIndenter::getRawStringStyle(const FormatToken &Current,
                                         const LineState &State) {
@@ -1438,7 +1529,10 @@ ContinuationIndenter::getRawStringStyle(const FormatToken &Current,
   auto Delimiter = getRawStringDelimiter(Current.TokenText);
   if (!Delimiter)
     return None;
-  auto RawStringStyle = RawStringFormats.get(*Delimiter);
+  auto RawStringStyle = RawStringFormats.getDelimiterStyle(*Delimiter);
+  if (!RawStringStyle)
+    RawStringStyle = RawStringFormats.getEnclosingFunctionStyle(
+        getEnclosingFunctionName(Current));
   if (!RawStringStyle)
     return None;
   RawStringStyle->ColumnLimit = getColumnLimit(State);
@@ -1482,9 +1576,16 @@ std::unique_ptr<BreakableToken> ContinuationIndenter::createBreakableToken(
           Text.startswith(Prefix = "u8\"") ||
           Text.startswith(Prefix = "L\""))) ||
         (Text.startswith(Prefix = "_T(\"") && Text.endswith(Postfix = "\")"))) {
+      // We need this to address the case where there is an unbreakable tail
+      // only if certain other formatting decisions have been taken. The
+      // UnbreakableTailLength of Current is an overapproximation is that case
+      // and we need to be correct here.
+      unsigned UnbreakableTailLength = (State.NextToken && canBreak(State))
+                                           ? 0
+                                           : Current.UnbreakableTailLength;
       return llvm::make_unique<BreakableStringLiteral>(
-          Current, StartColumn, Prefix, Postfix, State.Line->InPPDirective,
-          Encoding, Style);
+          Current, StartColumn, Prefix, Postfix, UnbreakableTailLength,
+          State.Line->InPPDirective, Encoding, Style);
     }
   } else if (Current.is(TT_BlockComment)) {
     if (!Style.ReflowComments ||
diff --git a/lib/Format/ContinuationIndenter.h b/lib/Format/ContinuationIndenter.h
index ded7bfab4267..c257ded0ebfd 100644
--- a/lib/Format/ContinuationIndenter.h
+++ b/lib/Format/ContinuationIndenter.h
@@ -38,10 +38,14 @@ class WhitespaceManager;
 
 struct RawStringFormatStyleManager {
   llvm::StringMap<FormatStyle> DelimiterStyle;
+  llvm::StringMap<FormatStyle> EnclosingFunctionStyle;
 
   RawStringFormatStyleManager(const FormatStyle &CodeStyle);
 
-  llvm::Optional<FormatStyle> get(StringRef Delimiter) const;
+  llvm::Optional<FormatStyle> getDelimiterStyle(StringRef Delimiter) const;
+
+  llvm::Optional<FormatStyle>
+  getEnclosingFunctionStyle(StringRef EnclosingFunction) const;
 };
 
 class ContinuationIndenter {
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index 217c6729ee39..631d186e629a 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -32,6 +32,7 @@
 #include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Path.h"
@@ -455,8 +456,10 @@ template <> struct ScalarEnumerationTraits<FormatStyle::IncludeBlocksStyle> {
 
 template <> struct MappingTraits<FormatStyle::RawStringFormat> {
   static void mapping(IO &IO, FormatStyle::RawStringFormat &Format) {
-    IO.mapOptional("Delimiter", Format.Delimiter);
     IO.mapOptional("Language", Format.Language);
+    IO.mapOptional("Delimiters", Format.Delimiters);
+    IO.mapOptional("EnclosingFunctions", Format.EnclosingFunctions);
+    IO.mapOptional("CanonicalDelimiter", Format.CanonicalDelimiter);
     IO.mapOptional("BasedOnStyle", Format.BasedOnStyle);
   }
 };
@@ -641,7 +644,6 @@ FormatStyle getLLVMStyle() {
   LLVMStyle.SpacesBeforeTrailingComments = 1;
   LLVMStyle.Standard = FormatStyle::LS_Cpp11;
   LLVMStyle.UseTab = FormatStyle::UT_Never;
-  LLVMStyle.RawStringFormats = {{"pb", FormatStyle::LK_TextProto, "google"}};
   LLVMStyle.ReflowComments = true;
   LLVMStyle.SpacesInParentheses = false;
   LLVMStyle.SpacesInSquareBrackets = false;
@@ -673,6 +675,12 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   if (Language == FormatStyle::LK_TextProto) {
     FormatStyle GoogleStyle = getGoogleStyle(FormatStyle::LK_Proto);
     GoogleStyle.Language = FormatStyle::LK_TextProto;
+
+    // Text protos are currently mostly formatted inside C++ raw string literals
+    // and often the current breaking behavior of string literals is not
+    // beneficial there. Investigate turning this on once proper string reflow
+    // has been implemented.
+    GoogleStyle.BreakStringLiterals = false;
     return GoogleStyle;
   }
 
@@ -693,8 +701,20 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   GoogleStyle.IndentCaseLabels = true;
   GoogleStyle.KeepEmptyLinesAtTheStartOfBlocks = false;
   GoogleStyle.ObjCSpaceAfterProperty = false;
-  GoogleStyle.ObjCSpaceBeforeProtocolList = false;
+  GoogleStyle.ObjCSpaceBeforeProtocolList = true;
   GoogleStyle.PointerAlignment = FormatStyle::PAS_Left;
+  GoogleStyle.RawStringFormats = {{
+      FormatStyle::LK_TextProto,
+      /*Delimiters=*/
+      {
+          "pb",
+          "PB",
+      },
+      /*EnclosingFunctionNames=*/
+      {},
+      /*CanonicalDelimiter=*/"",
+      /*BasedOnStyle=*/"google",
+  }};
   GoogleStyle.SpacesBeforeTrailingComments = 2;
   GoogleStyle.Standard = FormatStyle::LS_Auto;
 
@@ -730,6 +750,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   } else if (Language == FormatStyle::LK_Proto) {
     GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
     GoogleStyle.SpacesInContainerLiterals = false;
+    GoogleStyle.Cpp11BracedListStyle = false;
   } else if (Language == FormatStyle::LK_ObjC) {
     GoogleStyle.ColumnLimit = 100;
   }
@@ -859,7 +880,7 @@ std::error_code parseConfiguration(StringRef Text, FormatStyle *Style) {
   assert(Language != FormatStyle::LK_None);
   if (Text.trim().empty())
     return make_error_code(ParseError::Error);
-
+  Style->StyleSet.Clear();
   std::vector<FormatStyle> Styles;
   llvm::yaml::Input Input(Text);
   // DocumentListTraits<vector<FormatStyle>> uses the context to get default
@@ -888,15 +909,23 @@ std::error_code parseConfiguration(StringRef Text, FormatStyle *Style) {
   // Look for a suitable configuration starting from the end, so we can
   // find the configuration for the specific language first, and the default
   // configuration (which can only be at slot 0) after it.
+  FormatStyle::FormatStyleSet StyleSet;
+  bool LanguageFound = false;
   for (int i = Styles.size() - 1; i >= 0; --i) {
-    if (Styles[i].Language == Language ||
-        Styles[i].Language == FormatStyle::LK_None) {
-      *Style = Styles[i];
-      Style->Language = Language;
-      return make_error_code(ParseError::Success);
-    }
-  }
-  return make_error_code(ParseError::Unsuitable);
+    if (Styles[i].Language != FormatStyle::LK_None)
+      StyleSet.Add(Styles[i]);
+    if (Styles[i].Language == Language)
+      LanguageFound = true;
+  }
+  if (!LanguageFound) {
+    if (Styles.empty() || Styles[0].Language != FormatStyle::LK_None)
+      return make_error_code(ParseError::Unsuitable);
+    FormatStyle DefaultStyle = Styles[0];
+    DefaultStyle.Language = Language;
+    StyleSet.Add(std::move(DefaultStyle));
+  }
+  *Style = *StyleSet.Get(Language);
+  return make_error_code(ParseError::Success);
 }
 
 std::string configurationAsText(const FormatStyle &Style) {
@@ -910,6 +939,38 @@ std::string configurationAsText(const FormatStyle &Style) {
   return Stream.str();
 }
 
+llvm::Optional<FormatStyle>
+FormatStyle::FormatStyleSet::Get(FormatStyle::LanguageKind Language) const {
+  if (!Styles)
+    return None;
+  auto It = Styles->find(Language);
+  if (It == Styles->end())
+    return None;
+  FormatStyle Style = It->second;
+  Style.StyleSet = *this;
+  return Style;
+}
+
+void FormatStyle::FormatStyleSet::Add(FormatStyle Style) {
+  assert(Style.Language != LK_None &&
+         "Cannot add a style for LK_None to a StyleSet");
+  assert(
+      !Style.StyleSet.Styles &&
+      "Cannot add a style associated with an existing StyleSet to a StyleSet");
+  if (!Styles)
+    Styles = std::make_shared<MapType>();
+  (*Styles)[Style.Language] = std::move(Style);
+}
+
+void FormatStyle::FormatStyleSet::Clear() {
+  Styles.reset();
+}
+
+llvm::Optional<FormatStyle>
+FormatStyle::GetLanguageStyle(FormatStyle::LanguageKind Language) const {
+  return StyleSet.Get(Language);
+}
+
 namespace {
 
 class JavaScriptRequoter : public TokenAnalyzer {
@@ -1341,6 +1402,103 @@ class Cleaner : public TokenAnalyzer {
   std::set<FormatToken *, FormatTokenLess> DeletedTokens;
 };
 
+class ObjCHeaderStyleGuesser : public TokenAnalyzer {
+public:
+  ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style)
+      : TokenAnalyzer(Env, Style), IsObjC(false) {}
+
+  std::pair<tooling::Replacements, unsigned>
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens) override {
+    assert(Style.Language == FormatStyle::LK_Cpp);
+    IsObjC = guessIsObjC(AnnotatedLines, Tokens.getKeywords());
+    tooling::Replacements Result;
+    return {Result, 0};
+  }
+
+  bool isObjC() { return IsObjC; }
+
+private:
+  static bool guessIsObjC(const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+                          const AdditionalKeywords &Keywords) {
+    // Keep this array sorted, since we are binary searching over it.
+    static constexpr llvm::StringLiteral FoundationIdentifiers[] = {
+        "CGFloat",
+        "NSAffineTransform",
+        "NSArray",
+        "NSAttributedString",
+        "NSCache",
+        "NSCharacterSet",
+        "NSCountedSet",
+        "NSData",
+        "NSDataDetector",
+        "NSDecimal",
+        "NSDecimalNumber",
+        "NSDictionary",
+        "NSEdgeInsets",
+        "NSHashTable",
+        "NSIndexPath",
+        "NSIndexSet",
+        "NSInteger",
+        "NSLocale",
+        "NSMapTable",
+        "NSMutableArray",
+        "NSMutableAttributedString",
+        "NSMutableCharacterSet",
+        "NSMutableData",
+        "NSMutableDictionary",
+        "NSMutableIndexSet",
+        "NSMutableOrderedSet",
+        "NSMutableSet",
+        "NSMutableString",
+        "NSNumber",
+        "NSNumberFormatter",
+        "NSOrderedSet",
+        "NSPoint",
+        "NSPointerArray",
+        "NSRange",
+        "NSRect",
+        "NSRegularExpression",
+        "NSSet",
+        "NSSize",
+        "NSString",
+        "NSUInteger",
+        "NSURL",
+        "NSURLComponents",
+        "NSURLQueryItem",
+        "NSUUID",
+    };
+
+    for (auto &Line : AnnotatedLines) {
+      for (FormatToken *FormatTok = Line->First->Next; FormatTok;
+           FormatTok = FormatTok->Next) {
+        if ((FormatTok->Previous->is(tok::at) &&
+             (FormatTok->isObjCAtKeyword(tok::objc_interface) ||
+              FormatTok->isObjCAtKeyword(tok::objc_implementation) ||
+              FormatTok->isObjCAtKeyword(tok::objc_protocol) ||
+              FormatTok->isObjCAtKeyword(tok::objc_end) ||
+              FormatTok->isOneOf(tok::numeric_constant, tok::l_square,
+                                 tok::l_brace))) ||
+            (FormatTok->Tok.isAnyIdentifier() &&
+             std::binary_search(std::begin(FoundationIdentifiers),
+                                std::end(FoundationIdentifiers),
+                                FormatTok->TokenText)) ||
+            FormatTok->is(TT_ObjCStringLiteral) ||
+            FormatTok->isOneOf(Keywords.kw_NS_ENUM, Keywords.kw_NS_OPTIONS,
+                               TT_ObjCBlockLBrace, TT_ObjCBlockLParen,
+                               TT_ObjCDecl, TT_ObjCForIn, TT_ObjCMethodExpr,
+                               TT_ObjCMethodSpecifier, TT_ObjCProperty)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  bool IsObjC;
+};
+
 struct IncludeDirective {
   StringRef Filename;
   StringRef Text;
@@ -2126,14 +2284,15 @@ llvm::Expected<FormatStyle> getStyle(StringRef StyleName, StringRef FileName,
   FormatStyle Style = getLLVMStyle();
   Style.Language = getLanguageByFileName(FileName);
 
-  // This is a very crude detection of whether a header contains ObjC code that
-  // should be improved over time and probably be done on tokens, not one the
-  // bare content of the file.
-  if (Style.Language == FormatStyle::LK_Cpp && FileName.endswith(".h") &&
-      (Code.contains("\n- (") || Code.contains("\n+ (") ||
-       Code.contains("\n@end\n") || Code.contains("\n@end ") ||
-       Code.endswith("@end")))
-    Style.Language = FormatStyle::LK_ObjC;
+  if (Style.Language == FormatStyle::LK_Cpp && FileName.endswith(".h")) {
+    std::unique_ptr<Environment> Env =
+        Environment::CreateVirtualEnvironment(Code, FileName, /*Ranges=*/{});
+    ObjCHeaderStyleGuesser Guesser(*Env, Style);
+    Guesser.process();
+    if (Guesser.isObjC()) {
+      Style.Language = FormatStyle::LK_ObjC;
+    }
+  }
 
   FormatStyle FallbackStyle = getNoStyle();
   if (!getPredefinedStyle(FallbackStyleName, Style.Language, &FallbackStyle))
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index 3dc0ab0e7cca..60c11a3d0807 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -441,7 +441,8 @@ struct FormatToken {
   }
 
   prec::Level getPrecedence() const {
-    return getBinOpPrecedence(Tok.getKind(), true, true);
+    return getBinOpPrecedence(Tok.getKind(), /*GreaterThanIsOperator=*/true,
+                              /*CPlusPlus11=*/true);
   }
 
   /// \brief Returns the previous token ignoring comments.
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp
index 298c72b002f8..05629eda09b2 100644
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -462,13 +462,15 @@ class AnnotatingParser {
           FormatToken *Previous = CurrentToken->getPreviousNonComment();
           if (Previous->is(TT_JsTypeOptionalQuestion))
             Previous = Previous->getPreviousNonComment();
-          if (((CurrentToken->is(tok::colon) &&
-                (!Contexts.back().ColonIsDictLiteral || !Style.isCpp())) ||
-               Style.Language == FormatStyle::LK_Proto ||
-               Style.Language == FormatStyle::LK_TextProto) &&
-              (Previous->Tok.getIdentifierInfo() ||
-               Previous->is(tok::string_literal)))
-            Previous->Type = TT_SelectorName;
+          if ((CurrentToken->is(tok::colon) &&
+               (!Contexts.back().ColonIsDictLiteral || !Style.isCpp())) ||
+              Style.Language == FormatStyle::LK_Proto ||
+              Style.Language == FormatStyle::LK_TextProto) {
+            Left->Type = TT_DictLiteral;
+            if (Previous->Tok.getIdentifierInfo() ||
+                Previous->is(tok::string_literal))
+              Previous->Type = TT_SelectorName;
+          }
           if (CurrentToken->is(tok::colon) ||
               Style.Language == FormatStyle::LK_JavaScript)
             Left->Type = TT_DictLiteral;
@@ -1516,7 +1518,7 @@ class ExpressionParser {
                    AnnotatedLine &Line)
       : Style(Style), Keywords(Keywords), Current(Line.First) {}
 
-  /// \brief Parse expressions with the given operatore precedence.
+  /// \brief Parse expressions with the given operator precedence.
   void parse(int Precedence = 0) {
     // Skip 'return' and ObjC selector colons as they are not part of a binary
     // expression.
@@ -1723,15 +1725,18 @@ void TokenAnnotator::setCommentLineLevels(
       }
     }
 
-    if (NextNonCommentLine && CommentLine) {
-      // If the comment is currently aligned with the line immediately following
-      // it, that's probably intentional and we should keep it.
-      bool AlignedWithNextLine =
-          NextNonCommentLine->First->NewlinesBefore <= 1 &&
-          NextNonCommentLine->First->OriginalColumn ==
-              (*I)->First->OriginalColumn;
-      if (AlignedWithNextLine)
-        (*I)->Level = NextNonCommentLine->Level;
+    // If the comment is currently aligned with the line immediately following
+    // it, that's probably intentional and we should keep it.
+    if (NextNonCommentLine && CommentLine &&
+        NextNonCommentLine->First->NewlinesBefore <= 1 &&
+        NextNonCommentLine->First->OriginalColumn ==
+            (*I)->First->OriginalColumn) {
+      // Align comments for preprocessor lines with the # in column 0.
+      // Otherwise, align with the next line.
+      (*I)->Level = (NextNonCommentLine->Type == LT_PreprocessorDirective ||
+                     NextNonCommentLine->Type == LT_ImportStatement)
+                        ? 0
+                        : NextNonCommentLine->Level;
     } else {
       NextNonCommentLine = (*I)->First->isNot(tok::r_brace) ? (*I) : nullptr;
     }
@@ -2708,9 +2713,11 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
             Keywords.kw_readonly, Keywords.kw_abstract, Keywords.kw_get,
             Keywords.kw_set, Keywords.kw_async, Keywords.kw_await))
       return false; // Otherwise automatic semicolon insertion would trigger.
-    if (Left.Tok.getIdentifierInfo() &&
-        Right.startsSequence(tok::l_square, tok::r_square))
-      return false;  // breaking in "foo[]" creates illegal TS type syntax.
+    if (Right.NestingLevel == 0 &&
+        (Left.Tok.getIdentifierInfo() ||
+         Left.isOneOf(tok::r_square, tok::r_paren)) &&
+        Right.isOneOf(tok::l_square, tok::l_paren))
+      return false; // Otherwise automatic semicolon insertion would trigger.
     if (Left.is(TT_JsFatArrow) && Right.is(tok::l_brace))
       return false;
     if (Left.is(TT_JsTypeColon))
diff --git a/lib/Format/UnwrappedLineFormatter.cpp b/lib/Format/UnwrappedLineFormatter.cpp
index 60dc1a7169d1..253f89da9d14 100644
--- a/lib/Format/UnwrappedLineFormatter.cpp
+++ b/lib/Format/UnwrappedLineFormatter.cpp
@@ -304,9 +304,15 @@ class LineJoiner {
     if (TheLine->First->is(tok::l_brace) && TheLine->First == TheLine->Last &&
         I != AnnotatedLines.begin() &&
         I[-1]->First->isOneOf(tok::kw_if, tok::kw_while, tok::kw_for)) {
-      return Style.AllowShortBlocksOnASingleLine
-                 ? tryMergeSimpleBlock(I - 1, E, Limit)
-                 : 0;
+      unsigned MergedLines = 0;
+      if (Style.AllowShortBlocksOnASingleLine) {
+        MergedLines = tryMergeSimpleBlock(I - 1, E, Limit);
+        // If we managed to merge the block, discard the first merged line
+        // since we are merging starting from I.
+        if (MergedLines > 0)
+          --MergedLines;
+      }
+      return MergedLines;
     }
     // Try to merge a block with left brace wrapped that wasn't yet covered
     if (TheLine->Last->is(tok::l_brace)) {
diff --git a/lib/Format/UnwrappedLineParser.cpp b/lib/Format/UnwrappedLineParser.cpp
index b8608dcac9c7..1057fabb77c4 100644
--- a/lib/Format/UnwrappedLineParser.cpp
+++ b/lib/Format/UnwrappedLineParser.cpp
@@ -333,7 +333,19 @@ void UnwrappedLineParser::parseLevel(bool HasOpeningBrace) {
       nextToken();
       addUnwrappedLine();
       break;
-    case tok::kw_default:
+    case tok::kw_default: {
+      unsigned StoredPosition = Tokens->getPosition();
+      FormatToken *Next = Tokens->getNextToken();
+      FormatTok = Tokens->setPosition(StoredPosition);
+      if (Next && Next->isNot(tok::colon)) {
+        // default not followed by ':' is not a case label; treat it like
+        // an identifier.
+        parseStructuralElement();
+        break;
+      }
+      // Else, if it is 'default:', fall through to the case handling.
+      LLVM_FALLTHROUGH;
+    }
     case tok::kw_case:
       if (Style.Language == FormatStyle::LK_JavaScript &&
           Line->MustBeDeclaration) {
@@ -932,49 +944,6 @@ void UnwrappedLineParser::parseStructuralElement() {
     return;
   }
   switch (FormatTok->Tok.getKind()) {
-  case tok::at:
-    nextToken();
-    if (FormatTok->Tok.is(tok::l_brace)) {
-      nextToken();
-      parseBracedList();
-      break;
-    }
-    switch (FormatTok->Tok.getObjCKeywordID()) {
-    case tok::objc_public:
-    case tok::objc_protected:
-    case tok::objc_package:
-    case tok::objc_private:
-      return parseAccessSpecifier();
-    case tok::objc_interface:
-    case tok::objc_implementation:
-      return parseObjCInterfaceOrImplementation();
-    case tok::objc_protocol:
-      return parseObjCProtocol();
-    case tok::objc_end:
-      return; // Handled by the caller.
-    case tok::objc_optional:
-    case tok::objc_required:
-      nextToken();
-      addUnwrappedLine();
-      return;
-    case tok::objc_autoreleasepool:
-      nextToken();
-      if (FormatTok->Tok.is(tok::l_brace)) {
-        if (Style.BraceWrapping.AfterObjCDeclaration)
-          addUnwrappedLine();
-        parseBlock(/*MustBeDeclaration=*/false);
-      }
-      addUnwrappedLine();
-      return;
-    case tok::objc_try:
-      // This branch isn't strictly necessary (the kw_try case below would
-      // do this too after the tok::at is parsed above).  But be explicit.
-      parseTryCatch();
-      return;
-    default:
-      break;
-    }
-    break;
   case tok::kw_asm:
     nextToken();
     if (FormatTok->is(tok::l_brace)) {
@@ -1032,8 +1001,12 @@ void UnwrappedLineParser::parseStructuralElement() {
       // 'default: string' field declaration.
       break;
     nextToken();
-    parseLabel();
-    return;
+    if (FormatTok->is(tok::colon)) {
+      parseLabel();
+      return;
+    }
+    // e.g. "default void f() {}" in a Java interface.
+    break;
   case tok::kw_case:
     if (Style.Language == FormatStyle::LK_JavaScript && Line->MustBeDeclaration)
       // 'case: string' field declaration.
@@ -1117,6 +1090,44 @@ void UnwrappedLineParser::parseStructuralElement() {
       if (FormatTok->Tok.is(tok::l_brace)) {
         nextToken();
         parseBracedList();
+        break;
+      }
+      switch (FormatTok->Tok.getObjCKeywordID()) {
+      case tok::objc_public:
+      case tok::objc_protected:
+      case tok::objc_package:
+      case tok::objc_private:
+        return parseAccessSpecifier();
+      case tok::objc_interface:
+      case tok::objc_implementation:
+        return parseObjCInterfaceOrImplementation();
+      case tok::objc_protocol:
+        if (parseObjCProtocol())
+          return;
+        break;
+      case tok::objc_end:
+        return; // Handled by the caller.
+      case tok::objc_optional:
+      case tok::objc_required:
+        nextToken();
+        addUnwrappedLine();
+        return;
+      case tok::objc_autoreleasepool:
+        nextToken();
+        if (FormatTok->Tok.is(tok::l_brace)) {
+          if (Style.BraceWrapping.AfterObjCDeclaration)
+            addUnwrappedLine();
+          parseBlock(/*MustBeDeclaration=*/false);
+        }
+        addUnwrappedLine();
+        return;
+      case tok::objc_try:
+        // This branch isn't strictly necessary (the kw_try case below would
+        // do this too after the tok::at is parsed above).  But be explicit.
+        parseTryCatch();
+        return;
+      default:
+        break;
       }
       break;
     case tok::kw_enum:
@@ -2113,6 +2124,8 @@ void UnwrappedLineParser::parseObjCUntilAtEnd() {
 }
 
 void UnwrappedLineParser::parseObjCInterfaceOrImplementation() {
+  assert(FormatTok->Tok.getObjCKeywordID() == tok::objc_interface ||
+         FormatTok->Tok.getObjCKeywordID() == tok::objc_implementation);
   nextToken();
   nextToken(); // interface name
 
@@ -2140,8 +2153,21 @@ void UnwrappedLineParser::parseObjCInterfaceOrImplementation() {
   parseObjCUntilAtEnd();
 }
 
-void UnwrappedLineParser::parseObjCProtocol() {
+// Returns true for the declaration/definition form of @protocol,
+// false for the expression form.
+bool UnwrappedLineParser::parseObjCProtocol() {
+  assert(FormatTok->Tok.getObjCKeywordID() == tok::objc_protocol);
   nextToken();
+
+  if (FormatTok->is(tok::l_paren))
+    // The expression form of @protocol, e.g. "Protocol* p = @protocol(foo);".
+    return false;
+
+  // The definition/declaration form,
+  // @protocol Foo
+  // - (int)someMethod;
+  // @end
+
   nextToken(); // protocol name
 
   if (FormatTok->Tok.is(tok::less))
@@ -2150,11 +2176,13 @@ void UnwrappedLineParser::parseObjCProtocol() {
   // Check for protocol declaration.
   if (FormatTok->Tok.is(tok::semi)) {
     nextToken();
-    return addUnwrappedLine();
+    addUnwrappedLine();
+    return true;
   }
 
   addUnwrappedLine();
   parseObjCUntilAtEnd();
+  return true;
 }
 
 void UnwrappedLineParser::parseJavaScriptEs6ImportExport() {
diff --git a/lib/Format/UnwrappedLineParser.h b/lib/Format/UnwrappedLineParser.h
index 1d8ccabbd0f8..62238aba879c 100644
--- a/lib/Format/UnwrappedLineParser.h
+++ b/lib/Format/UnwrappedLineParser.h
@@ -119,7 +119,7 @@ class UnwrappedLineParser {
   void parseObjCProtocolList();
   void parseObjCUntilAtEnd();
   void parseObjCInterfaceOrImplementation();
-  void parseObjCProtocol();
+  bool parseObjCProtocol();
   void parseJavaScriptEs6ImportExport();
   bool tryToParseLambda();
   bool tryToParseLambdaIntroducer();
diff --git a/lib/Frontend/ASTUnit.cpp b/lib/Frontend/ASTUnit.cpp
index 1160df15a920..74abbf3b7259 100644
--- a/lib/Frontend/ASTUnit.cpp
+++ b/lib/Frontend/ASTUnit.cpp
@@ -2068,6 +2068,7 @@ void ASTUnit::CodeComplete(
   CodeCompleteOpts.IncludeCodePatterns = IncludeCodePatterns;
   CodeCompleteOpts.IncludeGlobals = CachedCompletionResults.empty();
   CodeCompleteOpts.IncludeBriefComments = IncludeBriefComments;
+  CodeCompleteOpts.LoadExternal = Consumer.loadExternal();
 
   assert(IncludeBriefComments == this->IncludeBriefCommentsInCodeCompletion);
 
diff --git a/lib/Frontend/CodeGenOptions.cpp b/lib/Frontend/CodeGenOptions.cpp
index 50bb9f951be4..84a39f2d570d 100644
--- a/lib/Frontend/CodeGenOptions.cpp
+++ b/lib/Frontend/CodeGenOptions.cpp
@@ -17,7 +17,7 @@ CodeGenOptions::CodeGenOptions() {
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default) set##Name(Default);
 #include "clang/Frontend/CodeGenOptions.def"
 
-  RelocationModel = "pic";
+  RelocationModel = llvm::Reloc::PIC_;
   memcpy(CoverageVersion, "402*", 4);
 }
 
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index 6ce719aac673..b107f65adf17 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -305,6 +305,14 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args,
     }
   }
 
+  llvm::raw_string_ostream os(Opts.FullCompilerInvocation);
+  for (unsigned i=0; i<Args.getNumInputArgStrings(); i++) {
+    if (i != 0)
+      os << " ";
+    os << Args.getArgString(i);
+  }
+  os.flush();
+
   return Success;
 }
 
@@ -330,15 +338,23 @@ static StringRef getCodeModel(ArgList &Args, DiagnosticsEngine &Diags) {
   return "default";
 }
 
-static StringRef getRelocModel(ArgList &Args, DiagnosticsEngine &Diags) {
+static llvm::Reloc::Model getRelocModel(ArgList &Args,
+                                        DiagnosticsEngine &Diags) {
   if (Arg *A = Args.getLastArg(OPT_mrelocation_model)) {
     StringRef Value = A->getValue();
-    if (Value == "static" || Value == "pic" || Value == "ropi" ||
-        Value == "rwpi" || Value == "ropi-rwpi" || Value == "dynamic-no-pic")
-      return Value;
+    auto RM = llvm::StringSwitch<llvm::Optional<llvm::Reloc::Model>>(Value)
+                  .Case("static", llvm::Reloc::Static)
+                  .Case("pic", llvm::Reloc::PIC_)
+                  .Case("ropi", llvm::Reloc::ROPI)
+                  .Case("rwpi", llvm::Reloc::RWPI)
+                  .Case("ropi-rwpi", llvm::Reloc::ROPI_RWPI)
+                  .Case("dynamic-no-pic", llvm::Reloc::DynamicNoPIC)
+                  .Default(None);
+    if (RM.hasValue())
+      return *RM;
     Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Value;
   }
-  return "pic";
+  return llvm::Reloc::PIC_;
 }
 
 /// \brief Create a new Regex instance out of the string value in \p RpassArg.
@@ -682,6 +698,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
                                        OPT_fno_function_sections, false);
   Opts.DataSections = Args.hasFlag(OPT_fdata_sections,
                                    OPT_fno_data_sections, false);
+  Opts.StackSizeSection =
+      Args.hasFlag(OPT_fstack_size_section, OPT_fno_stack_size_section, false);
   Opts.UniqueSectionNames = Args.hasFlag(OPT_funique_section_names,
                                          OPT_fno_unique_section_names, true);
 
@@ -719,6 +737,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.MainFileName = Args.getLastArgValue(OPT_main_file_name);
   Opts.VerifyModule = !Args.hasArg(OPT_disable_llvm_verifier);
 
+  Opts.ControlFlowGuard = Args.hasArg(OPT_cfguard);
+
   Opts.DisableGCov = Args.hasArg(OPT_test_coverage);
   Opts.EmitGcovArcs = Args.hasArg(OPT_femit_coverage_data);
   Opts.EmitGcovNotes = Args.hasArg(OPT_femit_coverage_notes);
@@ -796,6 +816,21 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.CallFEntry = Args.hasArg(OPT_mfentry);
   Opts.EmitOpenCLArgMetadata = Args.hasArg(OPT_cl_kernel_arg_info);
 
+  if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) {
+    StringRef Name = A->getValue();
+    if (Name == "full") {
+      Opts.CFProtectionReturn = 1;
+      Opts.CFProtectionBranch = 1;
+    } else if (Name == "return")
+      Opts.CFProtectionReturn = 1;
+    else if (Name == "branch")
+      Opts.CFProtectionBranch = 1;
+    else if (Name != "none") {
+      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Name;
+      Success = false;
+    }
+  }
+
   if (const Arg *A = Args.getLastArg(OPT_compress_debug_sections,
                                      OPT_compress_debug_sections_EQ)) {
     if (A->getOption().getID() == OPT_compress_debug_sections) {
@@ -2601,7 +2636,6 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
 }
 
 static void ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args,
-                                  FileManager &FileMgr,
                                   DiagnosticsEngine &Diags,
                                   frontend::ActionKind Action) {
   using namespace options;
@@ -2757,7 +2791,13 @@ bool CompilerInvocation::CreateFromArgs(CompilerInvocation &Res,
 
   // Issue errors on unknown arguments.
   for (const Arg *A : Args.filtered(OPT_UNKNOWN)) {
-    Diags.Report(diag::err_drv_unknown_argument) << A->getAsString(Args);
+    auto ArgString = A->getAsString(Args);
+    std::string Nearest;
+    if (Opts->findNearest(ArgString, Nearest, IncludedFlagsBitmask) > 1)
+      Diags.Report(diag::err_drv_unknown_argument) << ArgString;
+    else
+      Diags.Report(diag::err_drv_unknown_argument_with_suggestion)
+          << ArgString << Nearest;
     Success = false;
   }
 
@@ -2817,12 +2857,7 @@ bool CompilerInvocation::CreateFromArgs(CompilerInvocation &Res,
       !LangOpts.Sanitize.has(SanitizerKind::Address) &&
       !LangOpts.Sanitize.has(SanitizerKind::Memory);
 
-  // FIXME: ParsePreprocessorArgs uses the FileManager to read the contents of
-  // PCH file and find the original header name. Remove the need to do that in
-  // ParsePreprocessorArgs and remove the FileManager
-  // parameters from the function and the "FileManager.h" #include.
-  FileManager FileMgr(Res.getFileSystemOpts());
-  ParsePreprocessorArgs(Res.getPreprocessorOpts(), Args, FileMgr, Diags,
+  ParsePreprocessorArgs(Res.getPreprocessorOpts(), Args, Diags,
                         Res.getFrontendOpts().ProgramAction);
   ParsePreprocessorOutputArgs(Res.getPreprocessorOutputOpts(), Args,
                               Res.getFrontendOpts().ProgramAction);
diff --git a/lib/Frontend/FrontendAction.cpp b/lib/Frontend/FrontendAction.cpp
index 12226b231417..1b4d789b4d97 100644
--- a/lib/Frontend/FrontendAction.cpp
+++ b/lib/Frontend/FrontendAction.cpp
@@ -858,6 +858,13 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI,
       CI.getDiagnostics().Report(diag::err_module_map_not_found) << Filename;
   }
 
+  // Add a module declaration scope so that modules from -fmodule-map-file
+  // arguments may shadow modules found implicitly in search paths.
+  CI.getPreprocessor()
+      .getHeaderSearchInfo()
+      .getModuleMap()
+      .finishModuleDeclarationScope();
+
   // If we were asked to load any module files, do so now.
   for (const auto &ModuleFile : CI.getFrontendOpts().ModuleFiles)
     if (!CI.loadModuleFile(ModuleFile))
diff --git a/lib/Frontend/InitHeaderSearch.cpp b/lib/Frontend/InitHeaderSearch.cpp
index 8c6faced76ac..a098b6aaa31c 100644
--- a/lib/Frontend/InitHeaderSearch.cpp
+++ b/lib/Frontend/InitHeaderSearch.cpp
@@ -255,6 +255,7 @@ void InitHeaderSearch::AddDefaultCIncludePaths(const llvm::Triple &triple,
 
   switch (os) {
   case llvm::Triple::Linux:
+  case llvm::Triple::Solaris:
     llvm_unreachable("Include management is handled in the driver.");
 
   case llvm::Triple::CloudABI: {
@@ -396,6 +397,7 @@ AddDefaultCPlusPlusIncludePaths(const llvm::Triple &triple, const HeaderSearchOp
 
   switch (os) {
   case llvm::Triple::Linux:
+  case llvm::Triple::Solaris:
     llvm_unreachable("Include management is handled in the driver.");
     break;
   case llvm::Triple::Win32:
@@ -443,6 +445,7 @@ void InitHeaderSearch::AddDefaultIncludePaths(const LangOptions &Lang,
     break; // Everything else continues to use this routine's logic.
 
   case llvm::Triple::Linux:
+  case llvm::Triple::Solaris:
     return;
 
   case llvm::Triple::Win32:
diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp
index 639050f7c64b..321d963827d1 100644
--- a/lib/Frontend/InitPreprocessor.cpp
+++ b/lib/Frontend/InitPreprocessor.cpp
@@ -817,10 +817,6 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   DefineFloatMacros(Builder, "FLT", &TI.getFloatFormat(), "F");
   DefineFloatMacros(Builder, "DBL", &TI.getDoubleFormat(), "");
   DefineFloatMacros(Builder, "LDBL", &TI.getLongDoubleFormat(), "L");
-  if (TI.hasFloat128Type())
-    // FIXME: Switch away from the non-standard "Q" when we can
-    DefineFloatMacros(Builder, "FLT128", &TI.getFloat128Format(), "Q");
-
 
   // Define a __POINTER_WIDTH__ macro for stdint.h.
   Builder.defineMacro("__POINTER_WIDTH__",
diff --git a/lib/Frontend/PrecompiledPreamble.cpp b/lib/Frontend/PrecompiledPreamble.cpp
index 7e1323fd83bb..d761f6ee458b 100644
--- a/lib/Frontend/PrecompiledPreamble.cpp
+++ b/lib/Frontend/PrecompiledPreamble.cpp
@@ -485,20 +485,15 @@ bool PrecompiledPreamble::CanReuse(const CompilerInvocation &Invocation,
 void PrecompiledPreamble::AddImplicitPreamble(
     CompilerInvocation &CI, IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
     llvm::MemoryBuffer *MainFileBuffer) const {
-  assert(VFS && "VFS must not be null");
-
-  auto &PreprocessorOpts = CI.getPreprocessorOpts();
-
-  // Remap main file to point to MainFileBuffer.
-  auto MainFilePath = CI.getFrontendOpts().Inputs[0].getFile();
-  PreprocessorOpts.addRemappedFile(MainFilePath, MainFileBuffer);
-
-  // Configure ImpicitPCHInclude.
-  PreprocessorOpts.PrecompiledPreambleBytes.first = PreambleBytes.size();
-  PreprocessorOpts.PrecompiledPreambleBytes.second = PreambleEndsAtStartOfLine;
-  PreprocessorOpts.DisablePCHValidation = true;
+  PreambleBounds Bounds(PreambleBytes.size(), PreambleEndsAtStartOfLine);
+  configurePreamble(Bounds, CI, VFS, MainFileBuffer);
+}
 
-  setupPreambleStorage(Storage, PreprocessorOpts, VFS);
+void PrecompiledPreamble::OverridePreamble(
+    CompilerInvocation &CI, IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
+    llvm::MemoryBuffer *MainFileBuffer) const {
+  auto Bounds = ComputePreambleBounds(*CI.getLangOpts(), MainFileBuffer, 0);
+  configurePreamble(Bounds, CI, VFS, MainFileBuffer);
 }
 
 PrecompiledPreamble::PrecompiledPreamble(
@@ -681,6 +676,27 @@ PrecompiledPreamble::PreambleFileHash::createForMemoryBuffer(
   return Result;
 }
 
+void PrecompiledPreamble::configurePreamble(
+    PreambleBounds Bounds, CompilerInvocation &CI,
+    IntrusiveRefCntPtr<vfs::FileSystem> &VFS,
+    llvm::MemoryBuffer *MainFileBuffer) const {
+  assert(VFS);
+
+  auto &PreprocessorOpts = CI.getPreprocessorOpts();
+
+  // Remap main file to point to MainFileBuffer.
+  auto MainFilePath = CI.getFrontendOpts().Inputs[0].getFile();
+  PreprocessorOpts.addRemappedFile(MainFilePath, MainFileBuffer);
+
+  // Configure ImpicitPCHInclude.
+  PreprocessorOpts.PrecompiledPreambleBytes.first = Bounds.Size;
+  PreprocessorOpts.PrecompiledPreambleBytes.second =
+      Bounds.PreambleEndsAtStartOfLine;
+  PreprocessorOpts.DisablePCHValidation = true;
+
+  setupPreambleStorage(Storage, PreprocessorOpts, VFS);
+}
+
 void PrecompiledPreamble::setupPreambleStorage(
     const PCHStorage &Storage, PreprocessorOptions &PreprocessorOpts,
     IntrusiveRefCntPtr<vfs::FileSystem> &VFS) {
diff --git a/lib/Frontend/PrintPreprocessedOutput.cpp b/lib/Frontend/PrintPreprocessedOutput.cpp
index 2e023294f1e8..36ecdada0c8b 100644
--- a/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -752,7 +752,7 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
     } else if (Tok.isLiteral() && !Tok.needsCleaning() &&
                Tok.getLiteralData()) {
       OS.write(Tok.getLiteralData(), Tok.getLength());
-    } else if (Tok.getLength() < 256) {
+    } else if (Tok.getLength() < llvm::array_lengthof(Buffer)) {
       const char *TokPtr = Buffer;
       unsigned Len = PP.getSpelling(Tok, TokPtr);
       OS.write(TokPtr, Len);
diff --git a/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/lib/Frontend/Rewrite/RewriteModernObjC.cpp
index 1954b24aedad..55f81c286083 100644
--- a/lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ b/lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -2590,7 +2590,7 @@ Stmt *RewriteModernObjC::RewriteObjCStringLiteral(ObjCStringLiteral *Exp) {
   Expr *Unop = new (Context) UnaryOperator(DRE, UO_AddrOf,
                                  Context->getPointerType(DRE->getType()),
                                            VK_RValue, OK_Ordinary,
-                                           SourceLocation());
+                                           SourceLocation(), false);
   // cast to NSConstantString *
   CastExpr *cast = NoTypeInfoCStyleCastExpr(Context, Exp->getType(),
                                             CK_CPointerToObjCPointerCast, Unop);
@@ -3295,7 +3295,7 @@ Stmt *RewriteModernObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                              VK_RValue, OK_Ordinary,
-                                             SourceLocation());
+                                             SourceLocation(), false);
       SuperRep = NoTypeInfoCStyleCastExpr(Context,
                                           Context->getPointerType(superType),
                                           CK_BitCast, SuperRep);
@@ -3313,7 +3313,7 @@ Stmt *RewriteModernObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                              VK_RValue, OK_Ordinary,
-                                             SourceLocation());
+                                             SourceLocation(), false);
     }
     MsgExprs.push_back(SuperRep);
     break;
@@ -3389,7 +3389,7 @@ Stmt *RewriteModernObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                VK_RValue, OK_Ordinary,
-                               SourceLocation());
+                               SourceLocation(), false);
       SuperRep = NoTypeInfoCStyleCastExpr(Context,
                                Context->getPointerType(superType),
                                CK_BitCast, SuperRep);
@@ -4720,7 +4720,7 @@ Stmt *RewriteModernObjC::RewriteLocalVariableExternalStorage(DeclRefExpr *DRE) {
       return DRE;
   Expr *Exp = new (Context) UnaryOperator(DRE, UO_Deref, DRE->getType(),
                                           VK_LValue, OK_Ordinary,
-                                          DRE->getLocation());
+                                          DRE->getLocation(), false);
   // Need parens to enforce precedence.
   ParenExpr *PE = new (Context) ParenExpr(SourceLocation(), SourceLocation(), 
                                           Exp);
@@ -5314,7 +5314,7 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
                                 UO_AddrOf,
                                 Context->getPointerType(Context->VoidPtrTy), 
                                 VK_RValue, OK_Ordinary,
-                                SourceLocation());
+                                SourceLocation(), false);
   InitExprs.push_back(DescRefExpr); 
   
   // Add initializers for any closure decl refs.
@@ -5332,7 +5332,8 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
           QualType QT = (*I)->getType();
           QT = Context->getPointerType(QT);
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf, QT, VK_RValue,
-                                            OK_Ordinary, SourceLocation());
+                                            OK_Ordinary, SourceLocation(),
+                                            false);
         }
       } else if (isTopLevelBlockPointerType((*I)->getType())) {
         FD = SynthBlockInitFunctionDecl((*I)->getName());
@@ -5348,7 +5349,8 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
           QualType QT = (*I)->getType();
           QT = Context->getPointerType(QT);
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf, QT, VK_RValue,
-                                            OK_Ordinary, SourceLocation());
+                                            OK_Ordinary, SourceLocation(),
+                                            false);
         }
         
       }
@@ -5388,7 +5390,8 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
       if (!isNestedCapturedVar)
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf,
                                      Context->getPointerType(Exp->getType()),
-                                     VK_RValue, OK_Ordinary, SourceLocation());
+                                     VK_RValue, OK_Ordinary, SourceLocation(),
+                                     false);
       Exp = NoTypeInfoCStyleCastExpr(Context, castT, CK_BitCast, Exp);
       InitExprs.push_back(Exp);
     }
@@ -5414,7 +5417,7 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
   
   NewRep = new (Context) UnaryOperator(NewRep, UO_AddrOf,
                              Context->getPointerType(NewRep->getType()),
-                             VK_RValue, OK_Ordinary, SourceLocation());
+                             VK_RValue, OK_Ordinary, SourceLocation(), false);
   NewRep = NoTypeInfoCStyleCastExpr(Context, FType, CK_BitCast,
                                     NewRep);
   // Put Paren around the call.
@@ -7558,7 +7561,7 @@ Stmt *RewriteModernObjC::RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) {
       
       Expr *Exp = new (Context) UnaryOperator(castExpr, UO_Deref, IvarT,
                                               VK_LValue, OK_Ordinary,
-                                              SourceLocation());
+                                              SourceLocation(), false);
       PE = new (Context) ParenExpr(OldRange.getBegin(),
                                    OldRange.getEnd(),
                                    Exp);
diff --git a/lib/Frontend/Rewrite/RewriteObjC.cpp b/lib/Frontend/Rewrite/RewriteObjC.cpp
index 096b81bc3f08..37398e5de061 100644
--- a/lib/Frontend/Rewrite/RewriteObjC.cpp
+++ b/lib/Frontend/Rewrite/RewriteObjC.cpp
@@ -2511,7 +2511,7 @@ Stmt *RewriteObjC::RewriteObjCStringLiteral(ObjCStringLiteral *Exp) {
   Expr *Unop = new (Context) UnaryOperator(DRE, UO_AddrOf,
                                  Context->getPointerType(DRE->getType()),
                                            VK_RValue, OK_Ordinary,
-                                           SourceLocation());
+                                           SourceLocation(), false);
   // cast to NSConstantString *
   CastExpr *cast = NoTypeInfoCStyleCastExpr(Context, Exp->getType(),
                                             CK_CPointerToObjCPointerCast, Unop);
@@ -2712,7 +2712,7 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                              VK_RValue, OK_Ordinary,
-                                             SourceLocation());
+                                             SourceLocation(), false);
       SuperRep = NoTypeInfoCStyleCastExpr(Context,
                                           Context->getPointerType(superType),
                                           CK_BitCast, SuperRep);
@@ -2730,7 +2730,7 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                              VK_RValue, OK_Ordinary,
-                                             SourceLocation());
+                                             SourceLocation(), false);
     }
     MsgExprs.push_back(SuperRep);
     break;
@@ -2806,7 +2806,7 @@ Stmt *RewriteObjC::SynthMessageExpr(ObjCMessageExpr *Exp,
       SuperRep = new (Context) UnaryOperator(SuperRep, UO_AddrOf,
                                Context->getPointerType(SuperRep->getType()),
                                VK_RValue, OK_Ordinary,
-                               SourceLocation());
+                               SourceLocation(), false);
       SuperRep = NoTypeInfoCStyleCastExpr(Context,
                                Context->getPointerType(superType),
                                CK_BitCast, SuperRep);
@@ -3045,7 +3045,7 @@ Stmt *RewriteObjC::RewriteObjCProtocolExpr(ObjCProtocolExpr *Exp) {
                                                VK_LValue, SourceLocation());
   Expr *DerefExpr = new (Context) UnaryOperator(DRE, UO_AddrOf,
                              Context->getPointerType(DRE->getType()),
-                             VK_RValue, OK_Ordinary, SourceLocation());
+                             VK_RValue, OK_Ordinary, SourceLocation(), false);
   CastExpr *castExpr = NoTypeInfoCStyleCastExpr(Context, DerefExpr->getType(),
                                                 CK_BitCast,
                                                 DerefExpr);
@@ -3875,7 +3875,7 @@ Stmt *RewriteObjC::RewriteLocalVariableExternalStorage(DeclRefExpr *DRE) {
       return DRE;
   Expr *Exp = new (Context) UnaryOperator(DRE, UO_Deref, DRE->getType(),
                                           VK_LValue, OK_Ordinary,
-                                          DRE->getLocation());
+                                          DRE->getLocation(), false);
   // Need parens to enforce precedence.
   ParenExpr *PE = new (Context) ParenExpr(SourceLocation(), SourceLocation(), 
                                           Exp);
@@ -4438,7 +4438,7 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
                                 UO_AddrOf,
                                 Context->getPointerType(Context->VoidPtrTy), 
                                 VK_RValue, OK_Ordinary,
-                                SourceLocation());
+                                SourceLocation(), false);
   InitExprs.push_back(DescRefExpr); 
   
   // Add initializers for any closure decl refs.
@@ -4456,7 +4456,8 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
           QualType QT = (*I)->getType();
           QT = Context->getPointerType(QT);
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf, QT, VK_RValue,
-                                            OK_Ordinary, SourceLocation());
+                                            OK_Ordinary, SourceLocation(),
+                                            false);
         }
       } else if (isTopLevelBlockPointerType((*I)->getType())) {
         FD = SynthBlockInitFunctionDecl((*I)->getName());
@@ -4472,7 +4473,8 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
           QualType QT = (*I)->getType();
           QT = Context->getPointerType(QT);
           Exp = new (Context) UnaryOperator(Exp, UO_AddrOf, QT, VK_RValue,
-                                            OK_Ordinary, SourceLocation());
+                                            OK_Ordinary, SourceLocation(),
+                                            false);
         }
       }
       InitExprs.push_back(Exp);
@@ -4509,9 +4511,9 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
       // captured nested byref variable has its address passed. Do not take
       // its address again.
       if (!isNestedCapturedVar)
-          Exp = new (Context) UnaryOperator(Exp, UO_AddrOf,
-                                     Context->getPointerType(Exp->getType()),
-                                     VK_RValue, OK_Ordinary, SourceLocation());
+        Exp = new (Context) UnaryOperator(
+            Exp, UO_AddrOf, Context->getPointerType(Exp->getType()), VK_RValue,
+            OK_Ordinary, SourceLocation(), false);
       Exp = NoTypeInfoCStyleCastExpr(Context, castT, CK_BitCast, Exp);
       InitExprs.push_back(Exp);
     }
@@ -4529,7 +4531,7 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
                                   FType, VK_LValue, SourceLocation());
   NewRep = new (Context) UnaryOperator(NewRep, UO_AddrOf,
                              Context->getPointerType(NewRep->getType()),
-                             VK_RValue, OK_Ordinary, SourceLocation());
+                             VK_RValue, OK_Ordinary, SourceLocation(), false);
   NewRep = NoTypeInfoCStyleCastExpr(Context, FType, CK_BitCast,
                                     NewRep);
   BlockDeclRefs.clear();
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index 97ba3edea1c5..a6ea5b600f8d 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -34,7 +34,9 @@ set(files
   __clang_cuda_builtin_vars.h
   __clang_cuda_cmath.h
   __clang_cuda_complex_builtins.h
+  __clang_cuda_device_functions.h
   __clang_cuda_intrinsics.h
+  __clang_cuda_libdevice_declares.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
   cetintrin.h
@@ -116,7 +118,12 @@ set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
 
 # Generate arm_neon.h
 clang_tablegen(arm_neon.h -gen-arm-neon
+  -I ${CLANG_SOURCE_DIR}/include/clang/Basic/
   SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+# Generate arm_fp16.h
+clang_tablegen(arm_fp16.h -gen-arm-fp16
+  -I ${CLANG_SOURCE_DIR}/include/clang/Basic/
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_fp16.td)
 
 set(out_files)
 foreach( f ${files} ${cuda_wrapper_files} )
@@ -134,6 +141,11 @@ add_custom_command(OUTPUT ${output_dir}/arm_neon.h
   COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
   COMMENT "Copying clang's arm_neon.h...")
 list(APPEND out_files ${output_dir}/arm_neon.h)
+add_custom_command(OUTPUT ${output_dir}/arm_fp16.h
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h ${output_dir}/arm_fp16.h
+  COMMENT "Copying clang's arm_fp16.h...")
+list(APPEND out_files ${output_dir}/arm_fp16.h)
 
 add_custom_target(clang-headers ALL DEPENDS ${out_files})
 set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
@@ -144,6 +156,12 @@ install(
   PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
   DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
 
+install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMPONENT clang-headers
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+
 install(
   FILES ${cuda_wrapper_files}
   COMPONENT clang-headers
diff --git a/lib/Headers/__clang_cuda_device_functions.h b/lib/Headers/__clang_cuda_device_functions.h
new file mode 100644
index 000000000000..786700b5e95e
--- /dev/null
+++ b/lib/Headers/__clang_cuda_device_functions.h
@@ -0,0 +1,1318 @@
+/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+
+#if CUDA_VERSION < 9000
+#error This file is intended to be used with CUDA-9+ only.
+#endif
+
+// __DEVICE__ is a helper macro with common set of attributes for the wrappers
+// we implement in this file. We need static in order to avoid emitting unused
+// functions and __forceinline__ helps inlining these wrappers at -O1.
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ static __device__ __forceinline__
+
+// libdevice provides fast low precision and slow full-recision implementations
+// for some functions. Which one gets selected depends on
+// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fcuda-approx-transcendentals are in effect.
+#pragma push_macro("__FAST_OR_SLOW")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __FAST_OR_SLOW(fast, slow) fast
+#else
+#define __FAST_OR_SLOW(fast, slow) slow
+#endif
+
+__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
+__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
+__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
+__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
+__DEVICE__ unsigned long long __brevll(unsigned long long __a) {
+  return __nv_brevll(__a);
+}
+__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
+__DEVICE__ void __brkpt(int __a) { __brkpt(); }
+__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
+                                    unsigned int __c) {
+  return __nv_byte_perm(__a, __b, __c);
+}
+__DEVICE__ int __clz(int __a) { return __nv_clz(__a); }
+__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); }
+__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); }
+__DEVICE__ double __dAtomicAdd(double *__p, double __v) {
+  return __nvvm_atom_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) {
+  return __nvvm_atom_cta_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) {
+  return __nvvm_atom_sys_add_gen_d(__p, __v);
+}
+__DEVICE__ double __dadd_rd(double __a, double __b) {
+  return __nv_dadd_rd(__a, __b);
+}
+__DEVICE__ double __dadd_rn(double __a, double __b) {
+  return __nv_dadd_rn(__a, __b);
+}
+__DEVICE__ double __dadd_ru(double __a, double __b) {
+  return __nv_dadd_ru(__a, __b);
+}
+__DEVICE__ double __dadd_rz(double __a, double __b) {
+  return __nv_dadd_rz(__a, __b);
+}
+__DEVICE__ double __ddiv_rd(double __a, double __b) {
+  return __nv_ddiv_rd(__a, __b);
+}
+__DEVICE__ double __ddiv_rn(double __a, double __b) {
+  return __nv_ddiv_rn(__a, __b);
+}
+__DEVICE__ double __ddiv_ru(double __a, double __b) {
+  return __nv_ddiv_ru(__a, __b);
+}
+__DEVICE__ double __ddiv_rz(double __a, double __b) {
+  return __nv_ddiv_rz(__a, __b);
+}
+__DEVICE__ double __dmul_rd(double __a, double __b) {
+  return __nv_dmul_rd(__a, __b);
+}
+__DEVICE__ double __dmul_rn(double __a, double __b) {
+  return __nv_dmul_rn(__a, __b);
+}
+__DEVICE__ double __dmul_ru(double __a, double __b) {
+  return __nv_dmul_ru(__a, __b);
+}
+__DEVICE__ double __dmul_rz(double __a, double __b) {
+  return __nv_dmul_rz(__a, __b);
+}
+__DEVICE__ float __double2float_rd(double __a) {
+  return __nv_double2float_rd(__a);
+}
+__DEVICE__ float __double2float_rn(double __a) {
+  return __nv_double2float_rn(__a);
+}
+__DEVICE__ float __double2float_ru(double __a) {
+  return __nv_double2float_ru(__a);
+}
+__DEVICE__ float __double2float_rz(double __a) {
+  return __nv_double2float_rz(__a);
+}
+__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); }
+__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); }
+__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); }
+__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); }
+__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); }
+__DEVICE__ long long __double2ll_rd(double __a) {
+  return __nv_double2ll_rd(__a);
+}
+__DEVICE__ long long __double2ll_rn(double __a) {
+  return __nv_double2ll_rn(__a);
+}
+__DEVICE__ long long __double2ll_ru(double __a) {
+  return __nv_double2ll_ru(__a);
+}
+__DEVICE__ long long __double2ll_rz(double __a) {
+  return __nv_double2ll_rz(__a);
+}
+__DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); }
+__DEVICE__ unsigned int __double2uint_rd(double __a) {
+  return __nv_double2uint_rd(__a);
+}
+__DEVICE__ unsigned int __double2uint_rn(double __a) {
+  return __nv_double2uint_rn(__a);
+}
+__DEVICE__ unsigned int __double2uint_ru(double __a) {
+  return __nv_double2uint_ru(__a);
+}
+__DEVICE__ unsigned int __double2uint_rz(double __a) {
+  return __nv_double2uint_rz(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rd(double __a) {
+  return __nv_double2ull_rd(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rn(double __a) {
+  return __nv_double2ull_rn(__a);
+}
+__DEVICE__ unsigned long long __double2ull_ru(double __a) {
+  return __nv_double2ull_ru(__a);
+}
+__DEVICE__ unsigned long long __double2ull_rz(double __a) {
+  return __nv_double2ull_rz(__a);
+}
+__DEVICE__ long long __double_as_longlong(double __a) {
+  return __nv_double_as_longlong(__a);
+}
+__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); }
+__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); }
+__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); }
+__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); }
+__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); }
+__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); }
+__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); }
+__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); }
+__DEVICE__ double __dsub_rd(double __a, double __b) {
+  return __nv_dsub_rd(__a, __b);
+}
+__DEVICE__ double __dsub_rn(double __a, double __b) {
+  return __nv_dsub_rn(__a, __b);
+}
+__DEVICE__ double __dsub_ru(double __a, double __b) {
+  return __nv_dsub_ru(__a, __b);
+}
+__DEVICE__ double __dsub_rz(double __a, double __b) {
+  return __nv_dsub_rz(__a, __b);
+}
+__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); }
+__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); }
+__DEVICE__ float __fAtomicAdd(float *__p, float __v) {
+  return __nvvm_atom_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) {
+  return __nvvm_atom_cta_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) {
+  return __nvvm_atom_sys_add_gen_f(__p, __v);
+}
+__DEVICE__ float __fAtomicExch(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fAtomicExch_block(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fAtomicExch_system(float *__p, float __v) {
+  return __nv_int_as_float(
+      __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
+}
+__DEVICE__ float __fadd_rd(float __a, float __b) {
+  return __nv_fadd_rd(__a, __b);
+}
+__DEVICE__ float __fadd_rn(float __a, float __b) {
+  return __nv_fadd_rn(__a, __b);
+}
+__DEVICE__ float __fadd_ru(float __a, float __b) {
+  return __nv_fadd_ru(__a, __b);
+}
+__DEVICE__ float __fadd_rz(float __a, float __b) {
+  return __nv_fadd_rz(__a, __b);
+}
+__DEVICE__ float __fdiv_rd(float __a, float __b) {
+  return __nv_fdiv_rd(__a, __b);
+}
+__DEVICE__ float __fdiv_rn(float __a, float __b) {
+  return __nv_fdiv_rn(__a, __b);
+}
+__DEVICE__ float __fdiv_ru(float __a, float __b) {
+  return __nv_fdiv_ru(__a, __b);
+}
+__DEVICE__ float __fdiv_rz(float __a, float __b) {
+  return __nv_fdiv_rz(__a, __b);
+}
+__DEVICE__ float __fdividef(float __a, float __b) {
+  return __nv_fast_fdividef(__a, __b);
+}
+__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
+__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
+__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
+__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
+__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
+__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
+__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
+__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); }
+__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); }
+__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); }
+__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); }
+__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); }
+__DEVICE__ unsigned int __float2uint_rd(float __a) {
+  return __nv_float2uint_rd(__a);
+}
+__DEVICE__ unsigned int __float2uint_rn(float __a) {
+  return __nv_float2uint_rn(__a);
+}
+__DEVICE__ unsigned int __float2uint_ru(float __a) {
+  return __nv_float2uint_ru(__a);
+}
+__DEVICE__ unsigned int __float2uint_rz(float __a) {
+  return __nv_float2uint_rz(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rd(float __a) {
+  return __nv_float2ull_rd(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rn(float __a) {
+  return __nv_float2ull_rn(__a);
+}
+__DEVICE__ unsigned long long __float2ull_ru(float __a) {
+  return __nv_float2ull_ru(__a);
+}
+__DEVICE__ unsigned long long __float2ull_rz(float __a) {
+  return __nv_float2ull_rz(__a);
+}
+__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); }
+__DEVICE__ unsigned int __float_as_uint(float __a) {
+  return __nv_float_as_uint(__a);
+}
+__DEVICE__ double __fma_rd(double __a, double __b, double __c) {
+  return __nv_fma_rd(__a, __b, __c);
+}
+__DEVICE__ double __fma_rn(double __a, double __b, double __c) {
+  return __nv_fma_rn(__a, __b, __c);
+}
+__DEVICE__ double __fma_ru(double __a, double __b, double __c) {
+  return __nv_fma_ru(__a, __b, __c);
+}
+__DEVICE__ double __fma_rz(double __a, double __b, double __c) {
+  return __nv_fma_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rd(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rn(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_ru(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) {
+  return __nv_fmaf_ieee_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) {
+  return __nv_fmaf_rd(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) {
+  return __nv_fmaf_rn(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) {
+  return __nv_fmaf_ru(__a, __b, __c);
+}
+__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) {
+  return __nv_fmaf_rz(__a, __b, __c);
+}
+__DEVICE__ float __fmul_rd(float __a, float __b) {
+  return __nv_fmul_rd(__a, __b);
+}
+__DEVICE__ float __fmul_rn(float __a, float __b) {
+  return __nv_fmul_rn(__a, __b);
+}
+__DEVICE__ float __fmul_ru(float __a, float __b) {
+  return __nv_fmul_ru(__a, __b);
+}
+__DEVICE__ float __fmul_rz(float __a, float __b) {
+  return __nv_fmul_rz(__a, __b);
+}
+__DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); }
+__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); }
+__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); }
+__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); }
+__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); }
+__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); }
+__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); }
+__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); }
+__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); }
+__DEVICE__ float __fsub_rd(float __a, float __b) {
+  return __nv_fsub_rd(__a, __b);
+}
+__DEVICE__ float __fsub_rn(float __a, float __b) {
+  return __nv_fsub_rn(__a, __b);
+}
+__DEVICE__ float __fsub_ru(float __a, float __b) {
+  return __nv_fsub_ru(__a, __b);
+}
+__DEVICE__ float __fsub_rz(float __a, float __b) {
+  return __nv_fsub_rz(__a, __b);
+}
+__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); }
+__DEVICE__ double __hiloint2double(int __a, int __b) {
+  return __nv_hiloint2double(__a, __b);
+}
+__DEVICE__ int __iAtomicAdd(int *__p, int __v) {
+  return __nvvm_atom_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
+  __nvvm_atom_cta_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
+  __nvvm_atom_sys_add_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd(int *__p, int __v) {
+  return __nvvm_atom_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) {
+  return __nvvm_atom_cta_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) {
+  return __nvvm_atom_sys_and_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) {
+  return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v);
+}
+__DEVICE__ int __iAtomicExch(int *__p, int __v) {
+  return __nvvm_atom_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicExch_block(int *__p, int __v) {
+  return __nvvm_atom_cta_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicExch_system(int *__p, int __v) {
+  return __nvvm_atom_sys_xchg_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax(int *__p, int __v) {
+  return __nvvm_atom_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax_block(int *__p, int __v) {
+  return __nvvm_atom_cta_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMax_system(int *__p, int __v) {
+  return __nvvm_atom_sys_max_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin(int *__p, int __v) {
+  return __nvvm_atom_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin_block(int *__p, int __v) {
+  return __nvvm_atom_cta_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicMin_system(int *__p, int __v) {
+  return __nvvm_atom_sys_min_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr(int *__p, int __v) {
+  return __nvvm_atom_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr_block(int *__p, int __v) {
+  return __nvvm_atom_cta_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicOr_system(int *__p, int __v) {
+  return __nvvm_atom_sys_or_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor(int *__p, int __v) {
+  return __nvvm_atom_xor_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor_block(int *__p, int __v) {
+  return __nvvm_atom_cta_xor_gen_i(__p, __v);
+}
+__DEVICE__ int __iAtomicXor_system(int *__p, int __v) {
+  return __nvvm_atom_sys_xor_gen_i(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) {
+  return __nvvm_atom_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_max_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) {
+  return __nvvm_atom_min_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_min_gen_ll(__p, __v);
+}
+__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_min_gen_ll(__p, __v);
+}
+__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); }
+__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); }
+__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
+__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
+__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
+__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
+__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
+__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
+__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
+__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
+__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
+__DEVICE__ double __ll2double_rd(long long __a) {
+  return __nv_ll2double_rd(__a);
+}
+__DEVICE__ double __ll2double_rn(long long __a) {
+  return __nv_ll2double_rn(__a);
+}
+__DEVICE__ double __ll2double_ru(long long __a) {
+  return __nv_ll2double_ru(__a);
+}
+__DEVICE__ double __ll2double_rz(long long __a) {
+  return __nv_ll2double_rz(__a);
+}
+__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); }
+__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); }
+__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); }
+__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); }
+__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) {
+  return __nvvm_atom_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_and_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) {
+  return __nvvm_atom_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_or_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) {
+  return __nvvm_atom_xor_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) {
+  return __nvvm_atom_cta_xor_gen_ll(__p, __v);
+}
+__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) {
+  return __nvvm_atom_sys_xor_gen_ll(__p, __v);
+}
+__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); }
+__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); }
+__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); }
+__DEVICE__ double __longlong_as_double(long long __a) {
+  return __nv_longlong_as_double(__a);
+}
+__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); }
+__DEVICE__ long long __mul64hi(long long __a, long long __b) {
+  return __nv_mul64hi(__a, __b);
+}
+__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); }
+__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
+__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
+__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
+__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
+__DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
+__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
+__DEVICE__ float __powf(float __a, float __b) {
+  return __nv_fast_powf(__a, __b);
+}
+
+// Parameter must have a known integer value.
+#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a))
+__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
+__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
+  return __nv_sad(__a, __b, __c);
+}
+__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
+__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
+__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
+__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) {
+  return __nv_fast_sincosf(__a, __sptr, __cptr);
+}
+__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); }
+__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); }
+__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); }
+__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); }
+__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
+__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
+__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
+__DEVICE__ void __trap(void) { asm volatile("trap;"); }
+__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_add_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_and_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp,
+                                     unsigned int __v) {
+  return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int
+__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
+  return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int
+__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
+  return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_dec_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p,
+                                             unsigned int __v) {
+  return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_inc_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_max_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_min_gen_ui(__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_cta_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_sys_or_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) {
+  return __nvvm_atom_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p,
+                                           unsigned int __v) {
+  return __nvvm_atom_cta_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p,
+                                            unsigned int __v) {
+  return __nvvm_atom_sys_xor_gen_i((int *)__p, __v);
+}
+__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) {
+  return __nv_uhadd(__a, __b);
+}
+__DEVICE__ double __uint2double_rn(unsigned int __a) {
+  return __nv_uint2double_rn(__a);
+}
+__DEVICE__ float __uint2float_rd(unsigned int __a) {
+  return __nv_uint2float_rd(__a);
+}
+__DEVICE__ float __uint2float_rn(unsigned int __a) {
+  return __nv_uint2float_rn(__a);
+}
+__DEVICE__ float __uint2float_ru(unsigned int __a) {
+  return __nv_uint2float_ru(__a);
+}
+__DEVICE__ float __uint2float_rz(unsigned int __a) {
+  return __nv_uint2float_rz(__a);
+}
+__DEVICE__ float __uint_as_float(unsigned int __a) {
+  return __nv_uint_as_float(__a);
+} //
+__DEVICE__ double __ull2double_rd(unsigned long long __a) {
+  return __nv_ull2double_rd(__a);
+}
+__DEVICE__ double __ull2double_rn(unsigned long long __a) {
+  return __nv_ull2double_rn(__a);
+}
+__DEVICE__ double __ull2double_ru(unsigned long long __a) {
+  return __nv_ull2double_ru(__a);
+}
+__DEVICE__ double __ull2double_rz(unsigned long long __a) {
+  return __nv_ull2double_rz(__a);
+}
+__DEVICE__ float __ull2float_rd(unsigned long long __a) {
+  return __nv_ull2float_rd(__a);
+}
+__DEVICE__ float __ull2float_rn(unsigned long long __a) {
+  return __nv_ull2float_rn(__a);
+}
+__DEVICE__ float __ull2float_ru(unsigned long long __a) {
+  return __nv_ull2float_ru(__a);
+}
+__DEVICE__ float __ull2float_rz(unsigned long long __a) {
+  return __nv_ull2float_rz(__a);
+}
+__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p,
+                                                   unsigned long long __cmp,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p,
+                                                    unsigned long long __cmp,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p,
+                                                     unsigned long long __v) {
+  return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_max_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_min_gen_ull(__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p,
+                                            unsigned long long __v) {
+  return __nvvm_atom_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p,
+                                                  unsigned long long __v) {
+  return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p,
+                                             unsigned long long __v) {
+  return __nvvm_atom_xor_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p,
+                                                   unsigned long long __v) {
+  return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p,
+                                                    unsigned long long __v) {
+  return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v);
+}
+__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) {
+  return __nv_umul24(__a, __b);
+}
+__DEVICE__ unsigned long long __umul64hi(unsigned long long __a,
+                                         unsigned long long __b) {
+  return __nv_umul64hi(__a, __b);
+}
+__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) {
+  return __nv_umulhi(__a, __b);
+}
+__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) {
+  return __nv_urhadd(__a, __b);
+}
+__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b,
+                               unsigned int __c) {
+  return __nv_usad(__a, __b, __c);
+}
+__DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); }
+__DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); }
+__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffs2(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffs4(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffu2(__a, __b);
+}
+__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
+  return __nv_vabsdiffu4(__a, __b);
+}
+__DEVICE__ unsigned int __vabsss2(unsigned int __a) {
+  return __nv_vabsss2(__a);
+}
+__DEVICE__ unsigned int __vabsss4(unsigned int __a) {
+  return __nv_vabsss4(__a);
+}
+__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
+  return __nv_vadd2(__a, __b);
+}
+__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
+  return __nv_vadd4(__a, __b);
+}
+__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
+  return __nv_vaddss2(__a, __b);
+}
+__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
+  return __nv_vaddss4(__a, __b);
+}
+__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
+  return __nv_vaddus2(__a, __b);
+}
+__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
+  return __nv_vaddus4(__a, __b);
+}
+__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
+  return __nv_vavgs2(__a, __b);
+}
+__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
+  return __nv_vavgs4(__a, __b);
+}
+__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
+  return __nv_vavgu2(__a, __b);
+}
+__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
+  return __nv_vavgu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpeq2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpeq4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpges2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpges4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgeu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgeu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgts2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgts4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgtu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpgtu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmples2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmples4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpleu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpleu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmplts2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmplts4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpltu2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpltu4(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpne2(__a, __b);
+}
+__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
+  return __nv_vcmpne4(__a, __b);
+}
+__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
+  return __nv_vhaddu2(__a, __b);
+}
+__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
+  return __nv_vhaddu4(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxs2(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxs4(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxu2(__a, __b);
+}
+__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
+  return __nv_vmaxu4(__a, __b);
+}
+__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
+  return __nv_vmins2(__a, __b);
+}
+__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
+  return __nv_vmins4(__a, __b);
+}
+__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
+  return __nv_vminu2(__a, __b);
+}
+__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
+  return __nv_vminu4(__a, __b);
+}
+__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); }
+__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); }
+__DEVICE__ unsigned int __vnegss2(unsigned int __a) {
+  return __nv_vnegss2(__a);
+}
+__DEVICE__ unsigned int __vnegss4(unsigned int __a) {
+  return __nv_vnegss4(__a);
+}
+__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
+  return __nv_vsads2(__a, __b);
+}
+__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
+  return __nv_vsads4(__a, __b);
+}
+__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsadu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsadu4(__a, __b);
+}
+__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
+  return __nv_vseteq2(__a, __b);
+}
+__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
+  return __nv_vseteq4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetges2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetges4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgeu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgeu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgts2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgts4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgtu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetgtu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetles2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetles4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetleu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetleu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetlts2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetlts4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetltu2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetltu4(__a, __b);
+}
+__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
+  return __nv_vsetne2(__a, __b);
+}
+__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
+  return __nv_vsetne4(__a, __b);
+}
+__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
+  return __nv_vsub2(__a, __b);
+}
+__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
+  return __nv_vsub4(__a, __b);
+}
+__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
+  return __nv_vsubss2(__a, __b);
+}
+__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
+  return __nv_vsubss4(__a, __b);
+}
+__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
+  return __nv_vsubus2(__a, __b);
+}
+__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
+  return __nv_vsubus4(__a, __b);
+}
+__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
+__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
+__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
+__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
+__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
+__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
+__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
+__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
+__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
+__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
+__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
+__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
+__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
+__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
+__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
+__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
+__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
+__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
+__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
+__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); }
+__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
+__DEVICE__ double copysign(double __a, double __b) {
+  return __nv_copysign(__a, __b);
+}
+__DEVICE__ float copysignf(float __a, float __b) {
+  return __nv_copysignf(__a, __b);
+}
+__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
+__DEVICE__ float cosf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
+}
+__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
+__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
+__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
+__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
+__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
+__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
+__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
+__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
+__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
+__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
+__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
+__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
+__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
+__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
+__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
+__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
+__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
+__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
+__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
+__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
+__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
+__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
+__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
+__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
+__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
+__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
+__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
+__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
+__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
+__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
+__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
+__DEVICE__ float fdividef(float __a, float __b) {
+#if __FAST_MATH__ && !__CUDA_PREC_DIV
+  return __nv_fast_fdividef(__a, __b);
+#else
+  return __a / __b;
+#endif
+}
+__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
+__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
+__DEVICE__ double fma(double __a, double __b, double __c) {
+  return __nv_fma(__a, __b, __c);
+}
+__DEVICE__ float fmaf(float __a, float __b, float __c) {
+  return __nv_fmaf(__a, __b, __c);
+}
+__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
+__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
+__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
+__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
+__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
+__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
+__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
+__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
+__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
+__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
+__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
+__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
+__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
+__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
+__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
+__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
+__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
+__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
+__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
+__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
+__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
+__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
+__DEVICE__ long long llmax(long long __a, long long __b) {
+  return __nv_llmax(__a, __b);
+}
+__DEVICE__ long long llmin(long long __a, long long __b) {
+  return __nv_llmin(__a, __b);
+}
+__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
+__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
+__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
+__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
+__DEVICE__ double log(double __a) { return __nv_log(__a); }
+__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
+__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
+__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
+__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
+__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
+__DEVICE__ float log2f(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
+}
+__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
+__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
+__DEVICE__ float logf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
+}
+#if defined(__LP64__)
+__DEVICE__ long lrint(double __a) { return llrint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
+__DEVICE__ long lround(double __a) { return llround(__a); }
+__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
+#else
+__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
+__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
+__DEVICE__ long lround(double __a) { return round(__a); }
+__DEVICE__ long lroundf(float __a) { return roundf(__a); }
+#endif
+__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
+__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
+  return __builtin_memcpy(__a, __b, __c);
+}
+__DEVICE__ void *memset(void *__a, int __b, size_t __c) {
+  return __builtin_memset(__a, __b, __c);
+}
+__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
+__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
+__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
+__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
+__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
+__DEVICE__ double nextafter(double __a, double __b) {
+  return __nv_nextafter(__a, __b);
+}
+__DEVICE__ float nextafterf(float __a, float __b) {
+  return __nv_nextafterf(__a, __b);
+}
+__DEVICE__ double norm(int __dim, const double *__t) {
+  return __nv_norm(__dim, __t);
+}
+__DEVICE__ double norm3d(double __a, double __b, double __c) {
+  return __nv_norm3d(__a, __b, __c);
+}
+__DEVICE__ float norm3df(float __a, float __b, float __c) {
+  return __nv_norm3df(__a, __b, __c);
+}
+__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
+  return __nv_norm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
+  return __nv_norm4df(__a, __b, __c, __d);
+}
+__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
+__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
+__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
+__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
+__DEVICE__ float normf(int __dim, const float *__t) {
+  return __nv_normf(__dim, __t);
+}
+__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
+__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
+__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
+__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
+__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
+__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
+__DEVICE__ double remainder(double __a, double __b) {
+  return __nv_remainder(__a, __b);
+}
+__DEVICE__ float remainderf(float __a, float __b) {
+  return __nv_remainderf(__a, __b);
+}
+__DEVICE__ double remquo(double __a, double __b, int *__c) {
+  return __nv_remquo(__a, __b, __c);
+}
+__DEVICE__ float remquof(float __a, float __b, int *__c) {
+  return __nv_remquof(__a, __b, __c);
+}
+__DEVICE__ double rhypot(double __a, double __b) {
+  return __nv_rhypot(__a, __b);
+}
+__DEVICE__ float rhypotf(float __a, float __b) {
+  return __nv_rhypotf(__a, __b);
+}
+__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
+__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
+__DEVICE__ double rnorm(int __a, const double *__b) {
+  return __nv_rnorm(__a, __b);
+}
+__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
+  return __nv_rnorm3d(__a, __b, __c);
+}
+__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
+  return __nv_rnorm3df(__a, __b, __c);
+}
+__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
+  return __nv_rnorm4d(__a, __b, __c, __d);
+}
+__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
+  return __nv_rnorm4df(__a, __b, __c, __d);
+}
+__DEVICE__ float rnormf(int __dim, const float *__t) {
+  return __nv_rnormf(__dim, __t);
+}
+__DEVICE__ double round(double __a) { return __nv_round(__a); }
+__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
+__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
+__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
+__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
+__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
+__DEVICE__ double scalbln(double __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VAL : -HUGE_VAL;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.0 : -0.0;
+  return scalbn(__a, (int)__b);
+}
+__DEVICE__ float scalblnf(float __a, long __b) {
+  if (__b > INT_MAX)
+    return __a > 0 ? HUGE_VALF : -HUGE_VALF;
+  if (__b < INT_MIN)
+    return __a > 0 ? 0.f : -0.f;
+  return scalbnf(__a, (int)__b);
+}
+__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
+__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) {
+  return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr);
+}
+__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) {
+  return __nv_sincospi(__a, __sptr, __cptr);
+}
+__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) {
+  return __nv_sincospif(__a, __sptr, __cptr);
+}
+__DEVICE__ float sinf(float __a) {
+  return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
+}
+__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
+__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
+__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
+__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
+__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
+__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
+__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
+__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
+__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
+__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
+__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
+__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
+__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
+__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
+__DEVICE__ unsigned long long ullmax(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmax(__a, __b);
+}
+__DEVICE__ unsigned long long ullmin(unsigned long long __a,
+                                     unsigned long long __b) {
+  return __nv_ullmin(__a, __b);
+}
+__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
+  return __nv_umax(__a, __b);
+}
+__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
+  return __nv_umin(__a, __b);
+}
+__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
+__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
+__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
+__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
+__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
+__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__FAST_OR_SLOW")
+#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
diff --git a/lib/Headers/__clang_cuda_libdevice_declares.h b/lib/Headers/__clang_cuda_libdevice_declares.h
new file mode 100644
index 000000000000..73ea8f878183
--- /dev/null
+++ b/lib/Headers/__clang_cuda_libdevice_declares.h
@@ -0,0 +1,465 @@
+/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+__device__ int __nv_abs(int __a);
+__device__ double __nv_acos(double __a);
+__device__ float __nv_acosf(float __a);
+__device__ double __nv_acosh(double __a);
+__device__ float __nv_acoshf(float __a);
+__device__ double __nv_asin(double __a);
+__device__ float __nv_asinf(float __a);
+__device__ double __nv_asinh(double __a);
+__device__ float __nv_asinhf(float __a);
+__device__ double __nv_atan2(double __a, double __b);
+__device__ float __nv_atan2f(float __a, float __b);
+__device__ double __nv_atan(double __a);
+__device__ float __nv_atanf(float __a);
+__device__ double __nv_atanh(double __a);
+__device__ float __nv_atanhf(float __a);
+__device__ int __nv_brev(int __a);
+__device__ long long __nv_brevll(long long __a);
+__device__ int __nv_byte_perm(int __a, int __b, int __c);
+__device__ double __nv_cbrt(double __a);
+__device__ float __nv_cbrtf(float __a);
+__device__ double __nv_ceil(double __a);
+__device__ float __nv_ceilf(float __a);
+__device__ int __nv_clz(int __a);
+__device__ int __nv_clzll(long long __a);
+__device__ double __nv_copysign(double __a, double __b);
+__device__ float __nv_copysignf(float __a, float __b);
+__device__ double __nv_cos(double __a);
+__device__ float __nv_cosf(float __a);
+__device__ double __nv_cosh(double __a);
+__device__ float __nv_coshf(float __a);
+__device__ double __nv_cospi(double __a);
+__device__ float __nv_cospif(float __a);
+__device__ double __nv_cyl_bessel_i0(double __a);
+__device__ float __nv_cyl_bessel_i0f(float __a);
+__device__ double __nv_cyl_bessel_i1(double __a);
+__device__ float __nv_cyl_bessel_i1f(float __a);
+__device__ double __nv_dadd_rd(double __a, double __b);
+__device__ double __nv_dadd_rn(double __a, double __b);
+__device__ double __nv_dadd_ru(double __a, double __b);
+__device__ double __nv_dadd_rz(double __a, double __b);
+__device__ double __nv_ddiv_rd(double __a, double __b);
+__device__ double __nv_ddiv_rn(double __a, double __b);
+__device__ double __nv_ddiv_ru(double __a, double __b);
+__device__ double __nv_ddiv_rz(double __a, double __b);
+__device__ double __nv_dmul_rd(double __a, double __b);
+__device__ double __nv_dmul_rn(double __a, double __b);
+__device__ double __nv_dmul_ru(double __a, double __b);
+__device__ double __nv_dmul_rz(double __a, double __b);
+__device__ float __nv_double2float_rd(double __a);
+__device__ float __nv_double2float_rn(double __a);
+__device__ float __nv_double2float_ru(double __a);
+__device__ float __nv_double2float_rz(double __a);
+__device__ int __nv_double2hiint(double __a);
+__device__ int __nv_double2int_rd(double __a);
+__device__ int __nv_double2int_rn(double __a);
+__device__ int __nv_double2int_ru(double __a);
+__device__ int __nv_double2int_rz(double __a);
+__device__ long long __nv_double2ll_rd(double __a);
+__device__ long long __nv_double2ll_rn(double __a);
+__device__ long long __nv_double2ll_ru(double __a);
+__device__ long long __nv_double2ll_rz(double __a);
+__device__ int __nv_double2loint(double __a);
+__device__ unsigned int __nv_double2uint_rd(double __a);
+__device__ unsigned int __nv_double2uint_rn(double __a);
+__device__ unsigned int __nv_double2uint_ru(double __a);
+__device__ unsigned int __nv_double2uint_rz(double __a);
+__device__ unsigned long long __nv_double2ull_rd(double __a);
+__device__ unsigned long long __nv_double2ull_rn(double __a);
+__device__ unsigned long long __nv_double2ull_ru(double __a);
+__device__ unsigned long long __nv_double2ull_rz(double __a);
+__device__ unsigned long long __nv_double_as_longlong(double __a);
+__device__ double __nv_drcp_rd(double __a);
+__device__ double __nv_drcp_rn(double __a);
+__device__ double __nv_drcp_ru(double __a);
+__device__ double __nv_drcp_rz(double __a);
+__device__ double __nv_dsqrt_rd(double __a);
+__device__ double __nv_dsqrt_rn(double __a);
+__device__ double __nv_dsqrt_ru(double __a);
+__device__ double __nv_dsqrt_rz(double __a);
+__device__ double __nv_dsub_rd(double __a, double __b);
+__device__ double __nv_dsub_rn(double __a, double __b);
+__device__ double __nv_dsub_ru(double __a, double __b);
+__device__ double __nv_dsub_rz(double __a, double __b);
+__device__ double __nv_erfc(double __a);
+__device__ float __nv_erfcf(float __a);
+__device__ double __nv_erfcinv(double __a);
+__device__ float __nv_erfcinvf(float __a);
+__device__ double __nv_erfcx(double __a);
+__device__ float __nv_erfcxf(float __a);
+__device__ double __nv_erf(double __a);
+__device__ float __nv_erff(float __a);
+__device__ double __nv_erfinv(double __a);
+__device__ float __nv_erfinvf(float __a);
+__device__ double __nv_exp10(double __a);
+__device__ float __nv_exp10f(float __a);
+__device__ double __nv_exp2(double __a);
+__device__ float __nv_exp2f(float __a);
+__device__ double __nv_exp(double __a);
+__device__ float __nv_expf(float __a);
+__device__ double __nv_expm1(double __a);
+__device__ float __nv_expm1f(float __a);
+__device__ double __nv_fabs(double __a);
+__device__ float __nv_fabsf(float __a);
+__device__ float __nv_fadd_rd(float __a, float __b);
+__device__ float __nv_fadd_rn(float __a, float __b);
+__device__ float __nv_fadd_ru(float __a, float __b);
+__device__ float __nv_fadd_rz(float __a, float __b);
+__device__ float __nv_fast_cosf(float __a);
+__device__ float __nv_fast_exp10f(float __a);
+__device__ float __nv_fast_expf(float __a);
+__device__ float __nv_fast_fdividef(float __a, float __b);
+__device__ float __nv_fast_log10f(float __a);
+__device__ float __nv_fast_log2f(float __a);
+__device__ float __nv_fast_logf(float __a);
+__device__ float __nv_fast_powf(float __a, float __b);
+__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr);
+__device__ float __nv_fast_sinf(float __a);
+__device__ float __nv_fast_tanf(float __a);
+__device__ double __nv_fdim(double __a, double __b);
+__device__ float __nv_fdimf(float __a, float __b);
+__device__ float __nv_fdiv_rd(float __a, float __b);
+__device__ float __nv_fdiv_rn(float __a, float __b);
+__device__ float __nv_fdiv_ru(float __a, float __b);
+__device__ float __nv_fdiv_rz(float __a, float __b);
+__device__ int __nv_ffs(int __a);
+__device__ int __nv_ffsll(long long __a);
+__device__ int __nv_finitef(float __a);
+__device__ unsigned short __nv_float2half_rn(float __a);
+__device__ int __nv_float2int_rd(float __a);
+__device__ int __nv_float2int_rn(float __a);
+__device__ int __nv_float2int_ru(float __a);
+__device__ int __nv_float2int_rz(float __a);
+__device__ long long __nv_float2ll_rd(float __a);
+__device__ long long __nv_float2ll_rn(float __a);
+__device__ long long __nv_float2ll_ru(float __a);
+__device__ long long __nv_float2ll_rz(float __a);
+__device__ unsigned int __nv_float2uint_rd(float __a);
+__device__ unsigned int __nv_float2uint_rn(float __a);
+__device__ unsigned int __nv_float2uint_ru(float __a);
+__device__ unsigned int __nv_float2uint_rz(float __a);
+__device__ unsigned long long __nv_float2ull_rd(float __a);
+__device__ unsigned long long __nv_float2ull_rn(float __a);
+__device__ unsigned long long __nv_float2ull_ru(float __a);
+__device__ unsigned long long __nv_float2ull_rz(float __a);
+__device__ int __nv_float_as_int(float __a);
+__device__ unsigned int __nv_float_as_uint(float __a);
+__device__ double __nv_floor(double __a);
+__device__ float __nv_floorf(float __a);
+__device__ double __nv_fma(double __a, double __b, double __c);
+__device__ float __nv_fmaf(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
+__device__ double __nv_fma_rd(double __a, double __b, double __c);
+__device__ double __nv_fma_rn(double __a, double __b, double __c);
+__device__ double __nv_fma_ru(double __a, double __b, double __c);
+__device__ double __nv_fma_rz(double __a, double __b, double __c);
+__device__ double __nv_fmax(double __a, double __b);
+__device__ float __nv_fmaxf(float __a, float __b);
+__device__ double __nv_fmin(double __a, double __b);
+__device__ float __nv_fminf(float __a, float __b);
+__device__ double __nv_fmod(double __a, double __b);
+__device__ float __nv_fmodf(float __a, float __b);
+__device__ float __nv_fmul_rd(float __a, float __b);
+__device__ float __nv_fmul_rn(float __a, float __b);
+__device__ float __nv_fmul_ru(float __a, float __b);
+__device__ float __nv_fmul_rz(float __a, float __b);
+__device__ float __nv_frcp_rd(float __a);
+__device__ float __nv_frcp_rn(float __a);
+__device__ float __nv_frcp_ru(float __a);
+__device__ float __nv_frcp_rz(float __a);
+__device__ double __nv_frexp(double __a, int *__b);
+__device__ float __nv_frexpf(float __a, int *__b);
+__device__ float __nv_frsqrt_rn(float __a);
+__device__ float __nv_fsqrt_rd(float __a);
+__device__ float __nv_fsqrt_rn(float __a);
+__device__ float __nv_fsqrt_ru(float __a);
+__device__ float __nv_fsqrt_rz(float __a);
+__device__ float __nv_fsub_rd(float __a, float __b);
+__device__ float __nv_fsub_rn(float __a, float __b);
+__device__ float __nv_fsub_ru(float __a, float __b);
+__device__ float __nv_fsub_rz(float __a, float __b);
+__device__ int __nv_hadd(int __a, int __b);
+__device__ float __nv_half2float(unsigned short __h);
+__device__ double __nv_hiloint2double(int __a, int __b);
+__device__ double __nv_hypot(double __a, double __b);
+__device__ float __nv_hypotf(float __a, float __b);
+__device__ int __nv_ilogb(double __a);
+__device__ int __nv_ilogbf(float __a);
+__device__ double __nv_int2double_rn(int __a);
+__device__ float __nv_int2float_rd(int __a);
+__device__ float __nv_int2float_rn(int __a);
+__device__ float __nv_int2float_ru(int __a);
+__device__ float __nv_int2float_rz(int __a);
+__device__ float __nv_int_as_float(int __a);
+__device__ int __nv_isfinited(double __a);
+__device__ int __nv_isinfd(double __a);
+__device__ int __nv_isinff(float __a);
+__device__ int __nv_isnand(double __a);
+__device__ int __nv_isnanf(float __a);
+__device__ double __nv_j0(double __a);
+__device__ float __nv_j0f(float __a);
+__device__ double __nv_j1(double __a);
+__device__ float __nv_j1f(float __a);
+__device__ float __nv_jnf(int __a, float __b);
+__device__ double __nv_jn(int __a, double __b);
+__device__ double __nv_ldexp(double __a, int __b);
+__device__ float __nv_ldexpf(float __a, int __b);
+__device__ double __nv_lgamma(double __a);
+__device__ float __nv_lgammaf(float __a);
+__device__ double __nv_ll2double_rd(long long __a);
+__device__ double __nv_ll2double_rn(long long __a);
+__device__ double __nv_ll2double_ru(long long __a);
+__device__ double __nv_ll2double_rz(long long __a);
+__device__ float __nv_ll2float_rd(long long __a);
+__device__ float __nv_ll2float_rn(long long __a);
+__device__ float __nv_ll2float_ru(long long __a);
+__device__ float __nv_ll2float_rz(long long __a);
+__device__ long long __nv_llabs(long long __a);
+__device__ long long __nv_llmax(long long __a, long long __b);
+__device__ long long __nv_llmin(long long __a, long long __b);
+__device__ long long __nv_llrint(double __a);
+__device__ long long __nv_llrintf(float __a);
+__device__ long long __nv_llround(double __a);
+__device__ long long __nv_llroundf(float __a);
+__device__ double __nv_log10(double __a);
+__device__ float __nv_log10f(float __a);
+__device__ double __nv_log1p(double __a);
+__device__ float __nv_log1pf(float __a);
+__device__ double __nv_log2(double __a);
+__device__ float __nv_log2f(float __a);
+__device__ double __nv_logb(double __a);
+__device__ float __nv_logbf(float __a);
+__device__ double __nv_log(double __a);
+__device__ float __nv_logf(float __a);
+__device__ double __nv_longlong_as_double(long long __a);
+__device__ int __nv_max(int __a, int __b);
+__device__ int __nv_min(int __a, int __b);
+__device__ double __nv_modf(double __a, double *__b);
+__device__ float __nv_modff(float __a, float *__b);
+__device__ int __nv_mul24(int __a, int __b);
+__device__ long long __nv_mul64hi(long long __a, long long __b);
+__device__ int __nv_mulhi(int __a, int __b);
+__device__ double __nv_nan(const signed char *__a);
+__device__ float __nv_nanf(const signed char *__a);
+__device__ double __nv_nearbyint(double __a);
+__device__ float __nv_nearbyintf(float __a);
+__device__ double __nv_nextafter(double __a, double __b);
+__device__ float __nv_nextafterf(float __a, float __b);
+__device__ double __nv_norm3d(double __a, double __b, double __c);
+__device__ float __nv_norm3df(float __a, float __b, float __c);
+__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
+__device__ double __nv_normcdf(double __a);
+__device__ float __nv_normcdff(float __a);
+__device__ double __nv_normcdfinv(double __a);
+__device__ float __nv_normcdfinvf(float __a);
+__device__ float __nv_normf(int __a, const float *__b);
+__device__ double __nv_norm(int __a, const double *__b);
+__device__ int __nv_popc(int __a);
+__device__ int __nv_popcll(long long __a);
+__device__ double __nv_pow(double __a, double __b);
+__device__ float __nv_powf(float __a, float __b);
+__device__ double __nv_powi(double __a, int __b);
+__device__ float __nv_powif(float __a, int __b);
+__device__ double __nv_rcbrt(double __a);
+__device__ float __nv_rcbrtf(float __a);
+__device__ double __nv_rcp64h(double __a);
+__device__ double __nv_remainder(double __a, double __b);
+__device__ float __nv_remainderf(float __a, float __b);
+__device__ double __nv_remquo(double __a, double __b, int *__c);
+__device__ float __nv_remquof(float __a, float __b, int *__c);
+__device__ int __nv_rhadd(int __a, int __b);
+__device__ double __nv_rhypot(double __a, double __b);
+__device__ float __nv_rhypotf(float __a, float __b);
+__device__ double __nv_rint(double __a);
+__device__ float __nv_rintf(float __a);
+__device__ double __nv_rnorm3d(double __a, double __b, double __c);
+__device__ float __nv_rnorm3df(float __a, float __b, float __c);
+__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
+__device__ float __nv_rnormf(int __a, const float *__b);
+__device__ double __nv_rnorm(int __a, const double *__b);
+__device__ double __nv_round(double __a);
+__device__ float __nv_roundf(float __a);
+__device__ double __nv_rsqrt(double __a);
+__device__ float __nv_rsqrtf(float __a);
+__device__ int __nv_sad(int __a, int __b, int __c);
+__device__ float __nv_saturatef(float __a);
+__device__ double __nv_scalbn(double __a, int __b);
+__device__ float __nv_scalbnf(float __a, int __b);
+__device__ int __nv_signbitd(double __a);
+__device__ int __nv_signbitf(float __a);
+__device__ void __nv_sincos(double __a, double *__b, double *__c);
+__device__ void __nv_sincosf(float __a, float *__b, float *__c);
+__device__ void __nv_sincospi(double __a, double *__b, double *__c);
+__device__ void __nv_sincospif(float __a, float *__b, float *__c);
+__device__ double __nv_sin(double __a);
+__device__ float __nv_sinf(float __a);
+__device__ double __nv_sinh(double __a);
+__device__ float __nv_sinhf(float __a);
+__device__ double __nv_sinpi(double __a);
+__device__ float __nv_sinpif(float __a);
+__device__ double __nv_sqrt(double __a);
+__device__ float __nv_sqrtf(float __a);
+__device__ double __nv_tan(double __a);
+__device__ float __nv_tanf(float __a);
+__device__ double __nv_tanh(double __a);
+__device__ float __nv_tanhf(float __a);
+__device__ double __nv_tgamma(double __a);
+__device__ float __nv_tgammaf(float __a);
+__device__ double __nv_trunc(double __a);
+__device__ float __nv_truncf(float __a);
+__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
+__device__ double __nv_uint2double_rn(unsigned int __i);
+__device__ float __nv_uint2float_rd(unsigned int __a);
+__device__ float __nv_uint2float_rn(unsigned int __a);
+__device__ float __nv_uint2float_ru(unsigned int __a);
+__device__ float __nv_uint2float_rz(unsigned int __a);
+__device__ float __nv_uint_as_float(unsigned int __a);
+__device__ double __nv_ull2double_rd(unsigned long long __a);
+__device__ double __nv_ull2double_rn(unsigned long long __a);
+__device__ double __nv_ull2double_ru(unsigned long long __a);
+__device__ double __nv_ull2double_rz(unsigned long long __a);
+__device__ float __nv_ull2float_rd(unsigned long long __a);
+__device__ float __nv_ull2float_rn(unsigned long long __a);
+__device__ float __nv_ull2float_ru(unsigned long long __a);
+__device__ float __nv_ull2float_rz(unsigned long long __a);
+__device__ unsigned long long __nv_ullmax(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned long long __nv_ullmin(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
+__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
+                                            unsigned long long __b);
+__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
+                                  unsigned int __c);
+__device__ int __nv_vabs2(int __a);
+__device__ int __nv_vabs4(int __a);
+__device__ int __nv_vabsdiffs2(int __a, int __b);
+__device__ int __nv_vabsdiffs4(int __a, int __b);
+__device__ int __nv_vabsdiffu2(int __a, int __b);
+__device__ int __nv_vabsdiffu4(int __a, int __b);
+__device__ int __nv_vabsss2(int __a);
+__device__ int __nv_vabsss4(int __a);
+__device__ int __nv_vadd2(int __a, int __b);
+__device__ int __nv_vadd4(int __a, int __b);
+__device__ int __nv_vaddss2(int __a, int __b);
+__device__ int __nv_vaddss4(int __a, int __b);
+__device__ int __nv_vaddus2(int __a, int __b);
+__device__ int __nv_vaddus4(int __a, int __b);
+__device__ int __nv_vavgs2(int __a, int __b);
+__device__ int __nv_vavgs4(int __a, int __b);
+__device__ int __nv_vavgu2(int __a, int __b);
+__device__ int __nv_vavgu4(int __a, int __b);
+__device__ int __nv_vcmpeq2(int __a, int __b);
+__device__ int __nv_vcmpeq4(int __a, int __b);
+__device__ int __nv_vcmpges2(int __a, int __b);
+__device__ int __nv_vcmpges4(int __a, int __b);
+__device__ int __nv_vcmpgeu2(int __a, int __b);
+__device__ int __nv_vcmpgeu4(int __a, int __b);
+__device__ int __nv_vcmpgts2(int __a, int __b);
+__device__ int __nv_vcmpgts4(int __a, int __b);
+__device__ int __nv_vcmpgtu2(int __a, int __b);
+__device__ int __nv_vcmpgtu4(int __a, int __b);
+__device__ int __nv_vcmples2(int __a, int __b);
+__device__ int __nv_vcmples4(int __a, int __b);
+__device__ int __nv_vcmpleu2(int __a, int __b);
+__device__ int __nv_vcmpleu4(int __a, int __b);
+__device__ int __nv_vcmplts2(int __a, int __b);
+__device__ int __nv_vcmplts4(int __a, int __b);
+__device__ int __nv_vcmpltu2(int __a, int __b);
+__device__ int __nv_vcmpltu4(int __a, int __b);
+__device__ int __nv_vcmpne2(int __a, int __b);
+__device__ int __nv_vcmpne4(int __a, int __b);
+__device__ int __nv_vhaddu2(int __a, int __b);
+__device__ int __nv_vhaddu4(int __a, int __b);
+__device__ int __nv_vmaxs2(int __a, int __b);
+__device__ int __nv_vmaxs4(int __a, int __b);
+__device__ int __nv_vmaxu2(int __a, int __b);
+__device__ int __nv_vmaxu4(int __a, int __b);
+__device__ int __nv_vmins2(int __a, int __b);
+__device__ int __nv_vmins4(int __a, int __b);
+__device__ int __nv_vminu2(int __a, int __b);
+__device__ int __nv_vminu4(int __a, int __b);
+__device__ int __nv_vneg2(int __a);
+__device__ int __nv_vneg4(int __a);
+__device__ int __nv_vnegss2(int __a);
+__device__ int __nv_vnegss4(int __a);
+__device__ int __nv_vsads2(int __a, int __b);
+__device__ int __nv_vsads4(int __a, int __b);
+__device__ int __nv_vsadu2(int __a, int __b);
+__device__ int __nv_vsadu4(int __a, int __b);
+__device__ int __nv_vseteq2(int __a, int __b);
+__device__ int __nv_vseteq4(int __a, int __b);
+__device__ int __nv_vsetges2(int __a, int __b);
+__device__ int __nv_vsetges4(int __a, int __b);
+__device__ int __nv_vsetgeu2(int __a, int __b);
+__device__ int __nv_vsetgeu4(int __a, int __b);
+__device__ int __nv_vsetgts2(int __a, int __b);
+__device__ int __nv_vsetgts4(int __a, int __b);
+__device__ int __nv_vsetgtu2(int __a, int __b);
+__device__ int __nv_vsetgtu4(int __a, int __b);
+__device__ int __nv_vsetles2(int __a, int __b);
+__device__ int __nv_vsetles4(int __a, int __b);
+__device__ int __nv_vsetleu2(int __a, int __b);
+__device__ int __nv_vsetleu4(int __a, int __b);
+__device__ int __nv_vsetlts2(int __a, int __b);
+__device__ int __nv_vsetlts4(int __a, int __b);
+__device__ int __nv_vsetltu2(int __a, int __b);
+__device__ int __nv_vsetltu4(int __a, int __b);
+__device__ int __nv_vsetne2(int __a, int __b);
+__device__ int __nv_vsetne4(int __a, int __b);
+__device__ int __nv_vsub2(int __a, int __b);
+__device__ int __nv_vsub4(int __a, int __b);
+__device__ int __nv_vsubss2(int __a, int __b);
+__device__ int __nv_vsubss4(int __a, int __b);
+__device__ int __nv_vsubus2(int __a, int __b);
+__device__ int __nv_vsubus4(int __a, int __b);
+__device__ double __nv_y0(double __a);
+__device__ float __nv_y0f(float __a);
+__device__ double __nv_y1(double __a);
+__device__ float __nv_y1f(float __a);
+__device__ float __nv_ynf(int __a, float __b);
+__device__ double __nv_yn(int __a, double __b);
+
+} // extern "C"
+#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index a82a8490f367..a5eb4bd74ab3 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
 #include "cuda.h"
 #if !defined(CUDA_VERSION)
 #error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9010
 #error "Unsupported CUDA version!"
 #endif
 
@@ -84,6 +84,9 @@
 #define __DEVICE_FUNCTIONS_H__
 #define __MATH_FUNCTIONS_H__
 #define __COMMON_FUNCTIONS_H__
+// device_functions_decls is replaced by __clang_cuda_device_functions.h
+// included below.
+#define __DEVICE_FUNCTIONS_DECLS_H__
 
 #undef __CUDACC__
 #if CUDA_VERSION < 9000
@@ -137,20 +140,22 @@ inline __host__ double __signbitd(double x) {
 }
 #endif
 
-// We need decls for functions in CUDA's libdevice with __device__
-// attribute only. Alas they come either as __host__ __device__ or
-// with no attributes at all. To work around that, define __CUDA_RTC__
-// which produces HD variant and undef __host__ which gives us desided
-// decls with __device__ attribute.
-#pragma push_macro("__host__")
-#define __host__
-#define __CUDACC_RTC__
-#include "device_functions_decls.h"
-#undef __CUDACC_RTC__
+// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
+// to provide our own.
+#include <__clang_cuda_libdevice_declares.h>
 
-// Temporarily poison __host__ macro to ensure it's not used by any of
-// the headers we're about to include.
-#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+// Wrappers for many device-side standard library functions became compiler
+// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
+// provides its own implementation of the wrappers.
+#if CUDA_VERSION >= 9000
+#include <__clang_cuda_device_functions.h>
+#endif
+
+// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
+// counterpart does not do it, so we need to make it empty here to keep
+// following CUDA includes happy.
+#undef __THROW
+#define __THROW
 
 // CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
 // Previous versions used to check whether they are defined or not.
@@ -167,24 +172,20 @@ inline __host__ double __signbitd(double x) {
 #endif
 #endif
 
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#pragma push_macro("__host__")
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
 // device_functions.hpp and math_functions*.hpp use 'static
 // __forceinline__' (with no __device__) for definitions of device
 // functions. Temporarily redefine __forceinline__ to include
 // __device__.
 #pragma push_macro("__forceinline__")
 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
-
-#pragma push_macro("__float2half_rn")
-#if CUDA_VERSION >= 9000
-// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
-// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
-// device_functions.hpp out of the way.
-#define __float2half_rn  __float2half_rn_disabled
-#endif
-
+#if CUDA_VERSION < 9000
 #include "device_functions.hpp"
-#pragma pop_macro("__float2half_rn")
-
+#endif
 
 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
@@ -196,17 +197,27 @@ inline __host__ double __signbitd(double x) {
 #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
 #define __USE_FAST_MATH__ 1
 #endif
+
+#if CUDA_VERSION >= 9000
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
+
 #pragma pop_macro("__USE_FAST_MATH__")
 
+#if CUDA_VERSION < 9000
 #include "math_functions_dbl_ptx3.hpp"
+#endif
 #pragma pop_macro("__forceinline__")
 
 // Pull in host-only functions that are only available when neither
 // __CUDACC__ nor __CUDABE__ are defined.
 #undef __MATH_FUNCTIONS_HPP__
 #undef __CUDABE__
+#if CUDA_VERSION < 9000
 #include "math_functions.hpp"
+#endif
 // Alas, additional overloads for these functions are hard to get to.
 // Considering that we only need these overloads for a few functions,
 // we can provide them here.
@@ -222,22 +233,36 @@ static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
 static inline float normcdf(float __a) { return normcdff(__a); }
 static inline float erfcx(float __a) { return erfcxf(__a); }
 
+#if CUDA_VERSION < 9000
 // For some reason single-argument variant is not always declared by
 // CUDA headers. Alas, device_functions.hpp included below needs it.
 static inline __device__ void __brkpt(int __c) { __brkpt(); }
+#endif
 
 // Now include *.hpp with definitions of various GPU functions.  Alas,
 // a lot of thins get declared/defined with __host__ attribute which
 // we don't want and we have to define it out. We also have to include
 // {device,math}_functions.hpp again in order to extract the other
 // branch of #if/else inside.
-
 #define __host__
 #undef __CUDABE__
 #define __CUDACC__
+#if CUDA_VERSION >= 9000
+// Some atomic functions became compiler builtins in CUDA-9 , so we need their
+// declarations.
+#include "device_atomic_functions.h"
+#endif
 #undef __DEVICE_FUNCTIONS_HPP__
 #include "device_atomic_functions.hpp"
+#if CUDA_VERSION >= 9000
+#include "crt/device_functions.hpp"
+#include "crt/device_double_functions.hpp"
+#else
 #include "device_functions.hpp"
+#define __CUDABE__
+#include "device_double_functions.h"
+#undef __CUDABE__
+#endif
 #include "sm_20_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
 #include "sm_32_atomic_functions.hpp"
@@ -251,8 +276,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 // reason about our code.
 
 #if CUDA_VERSION >= 8000
+#pragma push_macro("__CUDA_ARCH__")
+#undef __CUDA_ARCH__
 #include "sm_60_atomic_functions.hpp"
 #include "sm_61_intrinsics.hpp"
+#pragma pop_macro("__CUDA_ARCH__")
 #endif
 
 #undef __MATH_FUNCTIONS_HPP__
@@ -279,7 +307,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 #endif
 #endif
 
+#if CUDA_VERSION >= 9000
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
 #pragma pop_macro("_GLIBCXX_MATH_H")
 #pragma pop_macro("_LIBCPP_VERSION")
 #pragma pop_macro("__GNUC__")
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index 3ff0e3aafdcc..064300a48798 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -1854,13 +1854,15 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
 static __inline__ __mmask64 __DEFAULT_FN_ATTRS
 _mm512_kunpackd (__mmask64 __A, __mmask64 __B)
 {
-  return (__mmask64)  (( __A  & 0xFFFFFFFF) | ( __B << 32));
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                (__mmask64) __B);
 }
 
 static __inline__ __mmask32 __DEFAULT_FN_ATTRS
 _mm512_kunpackw (__mmask32 __A, __mmask32 __B)
 {
-return (__mmask32)  (( __A  & 0xFFFF) | ( __B << 16));
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                (__mmask32) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index d34f0b1327ae..f5137428ba3f 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -8787,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B)
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kunpackb (__mmask16 __A, __mmask16 __B)
 {
-  return (__mmask16)  (( __A  & 0xFF) | ( __B << 8));
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
 }
 
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index dff5897b6bb6..27dc64424ae0 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -1120,7 +1120,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 /// \param A
 ///    A 256-bit vector of [8 x float].
 /// \param C
-///    An immediate integer operand specifying how the values are to be \n
+///    An immediate integer operand specifying how the values are to be
 ///    copied. \n
 ///    Bits [1:0]: \n
 ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
@@ -1150,7 +1150,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
 ///          returned vector. \n
 ///    Bits [7:6]: \n
-///      00: Bits [31:qq0] of the source are copied to bits [127:96] of the
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
 ///          returned vector. \n
 ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
 ///          returned vector. \n
@@ -1665,38 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
@@ -1725,38 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
@@ -1785,38 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
@@ -1845,38 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
@@ -1904,38 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
@@ -1963,38 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    0x00 : Equal (ordered, non-signaling)
-///    0x01 : Less-than (ordered, signaling)
-///    0x02 : Less-than-or-equal (ordered, signaling)
-///    0x03 : Unordered (non-signaling)
-///    0x04 : Not-equal (unordered, non-signaling)
-///    0x05 : Not-less-than (unordered, signaling)
-///    0x06 : Not-less-than-or-equal (unordered, signaling)
-///    0x07 : Ordered (non-signaling)
-///    0x08 : Equal (unordered, non-signaling)
-///    0x09 : Not-greater-than-or-equal (unordered, signaling)
-///    0x0a : Not-greater-than (unordered, signaling)
-///    0x0b : False (ordered, non-signaling)
-///    0x0c : Not-equal (ordered, non-signaling)
-///    0x0d : Greater-than-or-equal (ordered, signaling)
-///    0x0e : Greater-than (ordered, signaling)
-///    0x0f : True (unordered, non-signaling)
-///    0x10 : Equal (ordered, signaling)
-///    0x11 : Less-than (ordered, non-signaling)
-///    0x12 : Less-than-or-equal (ordered, non-signaling)
-///    0x13 : Unordered (signaling)
-///    0x14 : Not-equal (unordered, signaling)
-///    0x15 : Not-less-than (unordered, non-signaling)
-///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
-///    0x17 : Ordered (signaling)
-///    0x18 : Equal (unordered, signaling)
-///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
-///    0x1a : Not-greater-than (unordered, non-signaling)
-///    0x1b : False (ordered, signaling)
-///    0x1c : Not-equal (ordered, signaling)
-///    0x1d : Greater-than-or-equal (ordered, non-signaling)
-///    0x1e : Greater-than (ordered, non-signaling)
-///    0x1f : True (unordered, signaling)
+///    0x00: Equal (ordered, non-signaling) \n
+///    0x01: Less-than (ordered, signaling) \n
+///    0x02: Less-than-or-equal (ordered, signaling) \n
+///    0x03: Unordered (non-signaling) \n
+///    0x04: Not-equal (unordered, non-signaling) \n
+///    0x05: Not-less-than (unordered, signaling) \n
+///    0x06: Not-less-than-or-equal (unordered, signaling) \n
+///    0x07: Ordered (non-signaling) \n
+///    0x08: Equal (unordered, non-signaling) \n
+///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
+///    0x0A: Not-greater-than (unordered, signaling) \n
+///    0x0B: False (ordered, non-signaling) \n
+///    0x0C: Not-equal (ordered, non-signaling) \n
+///    0x0D: Greater-than-or-equal (ordered, signaling) \n
+///    0x0E: Greater-than (ordered, signaling) \n
+///    0x0F: True (unordered, non-signaling) \n
+///    0x10: Equal (ordered, signaling) \n
+///    0x11: Less-than (ordered, non-signaling) \n
+///    0x12: Less-than-or-equal (ordered, non-signaling) \n
+///    0x13: Unordered (signaling) \n
+///    0x14: Not-equal (unordered, signaling) \n
+///    0x15: Not-less-than (unordered, non-signaling) \n
+///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+///    0x17: Ordered (signaling) \n
+///    0x18: Equal (unordered, signaling) \n
+///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+///    0x1A: Not-greater-than (unordered, non-signaling) \n
+///    0x1B: False (ordered, signaling) \n
+///    0x1C: Not-equal (ordered, signaling) \n
+///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+///    0x1E: Greater-than (ordered, non-signaling) \n
+///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
@@ -2375,9 +2375,8 @@ _mm256_cvtss_f32(__m256 __a)
 }
 
 /* Vector replicate */
-/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
-///    vector of [8 x float] to float values in a 256-bit vector of
-///    [8 x float].
+/// \brief Moves and duplicates odd-indexed values from a 256-bit vector of
+///    [8 x float] to float values in a 256-bit vector of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2401,8 +2400,8 @@ _mm256_movehdup_ps(__m256 __a)
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
 }
 
-/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
-///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
+/// \brief Moves and duplicates even-indexed values from a 256-bit vector of
+///    [8 x float] to float values in a 256-bit vector of [8 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2998,7 +2997,7 @@ _mm256_movemask_pd(__m256d __a)
   return __builtin_ia32_movmskpd256((__v4df)__a);
 }
 
-/// \brief Extracts the sign bits of double-precision floating point elements
+/// \brief Extracts the sign bits of single-precision floating point elements
 ///    in a 256-bit vector of [8 x float] and writes them to the lower order
 ///    bits of the return value.
 ///
@@ -3007,7 +3006,7 @@ _mm256_movemask_pd(__m256d __a)
 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
 ///
 /// \param __a
-///    A 256-bit vector of [8 x float] containing the double-precision floating
+///    A 256-bit vector of [8 x float] containing the single-precision floating
 ///    point values with sign bits to be extracted.
 /// \returns The sign bits from the operand, written to bits [7:0].
 static __inline int __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 3372508a7f81..b332eeec205a 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b)
 
 /// \brief Calculates the square root of the lower double-precision value of
 ///    the second operand and returns it in the lower 64 bits of the result.
-///    The upper 64 bits of the result are copied from the upper double-
-///    precision value of the first operand.
+///    The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a)
 
 /// \brief Compares lower 64-bit double-precision values of both operands, and
 ///    returns the lesser of the pair of values in the lower 64-bits of the
-///    result. The upper 64 bits of the result are copied from the upper double-
-///    precision value of the first operand.
+///    result. The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares lower 64-bit double-precision values of both operands, and
 ///    returns the greater of the pair of values in the lower 64-bits of the
-///    result. The upper 64 bits of the result are copied from the upper double-
-///    precision value of the first operand.
+///    result. The upper 64 bits of the result are copied from the upper
+///    double-precision value of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
 }
 
 /// \brief Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality. The
-///    comparison yields 0 for false, 1 for true.
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_sd(__m128d __a, __m128d __b)
 {
@@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
 ///    the value in the first parameter is less than the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_sd(__m128d __a, __m128d __b)
 {
@@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
 ///    the value in the first parameter is less than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_sd(__m128d __a, __m128d __b)
 {
@@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
 ///    the value in the first parameter is greater than the corresponding value
 ///    in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_sd(__m128d __a, __m128d __b)
 {
@@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
 ///    the value in the first parameter is greater than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_sd(__m128d __a, __m128d __b)
 {
@@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
 ///    the value in the first parameter is unequal to the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.
+///    The comparison yields 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_sd(__m128d __a, __m128d __b)
 {
@@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
 ///    comparison yields 0 for false, 1 for true.
 ///
-///    If either of the two lower double-precision values is NaN, 1 is returned.
+///    If either of the two lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
 /// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 1 is returned.
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 {
@@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
 ///    the second parameter.
 ///
 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 1 is returned.
+///    double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
 /// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 1 is returned.
+///    lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 {
@@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
 ///    corresponding value in the second parameter.
 ///
 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 1 is returned.
+///    double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
 /// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 1 is returned.
+///     lower double-precision values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_sd(__m128d __a, __m128d __b)
 {
@@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
 ///    the second parameter.
 ///
 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
 /// \returns An integer containing the comparison result. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    lower double-precision values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 {
@@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+/// This intrinsic corresponds to the
+///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
 ///
 /// \param __dp
 ///    A pointer to a memory location that can store two double-precision
 ///    values.
 /// \param __a
 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
-///    of the values in \a dp.
+///    of the values in \a __dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_pd(double *__dp, __m128d __a)
 {
@@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a)
   _mm_store_pd(__dp, __a);
 }
 
-/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
-///    location.
+/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
+///    the upper and lower 64 bits of a memory location.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+/// This intrinsic corresponds to the
+///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
 ///
 /// \param __dp
-///    A pointer to a 128-bit memory location. The address of the memory
-///    location has to be 16-byte aligned.
+///    A pointer to a memory location that can store two double-precision
+///    values.
 /// \param __a
-///    A 128-bit vector of [2 x double] containing the values to be stored.
+///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+///    of the values in \a __dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd1(double *__dp, __m128d __a)
 {
@@ -3846,8 +3862,7 @@ _mm_set1_epi8(char __b)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
-///   instruction.
+/// This intrinsic does not correspond to a specific instruction.
 ///
 /// \param __q0
 ///    A 64-bit integral value used to initialize the lower 64 bits of the
@@ -4018,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
 ///    specified unaligned memory location. When a mask bit is 1, the
 ///    corresponding byte is written, otherwise it is not written.
 ///
-///    To minimize caching, the date is flagged as non-temporal (unlikely to be
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon). Exception and trap behavior for elements not selected
 ///    for storage to memory are implementation dependent.
 ///
@@ -4532,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b)
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
-/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
-///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4665,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic has no corresponding instruction.
+/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
@@ -4682,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
+/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit value.
@@ -4712,8 +4727,8 @@ _mm_move_epi64(__m128i __a)
   return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
 }
 
-/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
-///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
+///    [2 x double] and interleaves them into a 128-bit vector of [2 x
 ///    double].
 ///
 /// \headerfile <x86intrin.h>
@@ -4733,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b)
   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
 }
 
-/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors
 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
 ///    double].
 ///
@@ -4792,9 +4807,9 @@ _mm_movemask_pd(__m128d __a)
 ///    A 128-bit vector of [2 x double].
 /// \param i
 ///    An 8-bit immediate value. The least significant two bits specify which
-///    elements to copy from a and b: \n
-///    Bit[0] = 0: lower element of a copied to lower element of result. \n
-///    Bit[0] = 1: upper element of a copied to lower element of result. \n
+///    elements to copy from \a a and \a b: \n
+///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
 ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h
index d3421dc86c99..a332879500d4 100644
--- a/lib/Headers/immintrin.h
+++ b/lib/Headers/immintrin.h
@@ -247,6 +247,18 @@ _mm256_cvtph_ps(__m128i __a)
 #include <gfniintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
+/// \brief Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDPID </c> instruction.
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
+_rdpid_u32(void) {
+  return __builtin_ia32_rdpid();
+}
+#endif // __RDPID__
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 95d26cefa6f7..961d8dde6eb6 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -38,6 +38,7 @@ module _Builtin_intrinsics [system] [extern_c] {
     explicit module neon {
       requires neon
       header "arm_neon.h"
+      header "arm_fp16.h"
       export *
     }
   }
diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h
index ce204b04c030..e648b0f2f370 100644
--- a/lib/Headers/opencl-c.h
+++ b/lib/Headers/opencl-c.h
@@ -12862,7 +12862,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
  * Read memory barrier that orders only
  * loads.
  * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
  * combination of the following literal
  * values:
  * CLK_LOCAL_MEM_FENCE
@@ -12874,7 +12874,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
  * Write memory barrier that orders only
  * stores.
  * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
  * combination of the following literal
  * values:
  * CLK_LOCAL_MEM_FENCE
@@ -15421,8 +15421,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
 #define CLK_DEPTH_STENCIL     0x10BE
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #define CLK_sRGB              0x10BF
-#define CLK_sRGBA             0x10C1
 #define CLK_sRGBx             0x10C0
+#define CLK_sRGBA             0x10C1
 #define CLK_sBGRA             0x10C2
 #define CLK_ABGR              0x10C3
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
index 559ece2e3974..7ec08a1bcbb3 100644
--- a/lib/Headers/pmmintrin.h
+++ b/lib/Headers/pmmintrin.h
@@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 }
 
-/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
-///    vector of [4 x float] to float values stored in a 128-bit vector of
+/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
+///    of [4 x float] to float values stored in a 128-bit vector of
 ///    [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
 }
 
-/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
+/// \brief Duplicates even-indexed values from a 128-bit vector of
 ///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h
index c2fa5a452bce..e02775cea358 100644
--- a/lib/Headers/smmintrin.h
+++ b/lib/Headers/smmintrin.h
@@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    input vectors are used as an input for dot product; otherwise that input
 ///    is treated as zero. Bits [1:0] determine which elements of the result
 ///    will receive a copy of the final dot product, with bit [0] corresponding
-///    to the lowest element and bit [3] corresponding to the highest element of
+///    to the lowest element and bit [1] corresponding to the highest element of
 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
 ///    the corresponding element; otherwise that element is set to zero.
 #define _mm_dp_pd(X, Y, M) __extension__ ({\
@@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
 ///    Bits[3:0]: If any of these bits are set, the corresponding result
 ///    element is cleared.
-/// \returns A 128-bit vector of [4 x float] containing the copied single-
-///    precision floating point elements from the operands.
+/// \returns A 128-bit vector of [4 x float] containing the copied
+///    single-precision floating point elements from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
 
 /// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 279c0275d93f..5e8901cd8e59 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -1011,6 +1011,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 /// \brief Compares two 32-bit float values in the low-order bits of both
 ///    operands for equality and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
@@ -1022,7 +1024,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///    two lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
@@ -1033,6 +1036,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 ///    operands to determine if the first operand is less than the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
@@ -1044,7 +1049,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
@@ -1055,6 +1061,8 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 ///    operands to determine if the first operand is less than or equal to the
 ///    second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
@@ -1065,7 +1073,8 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
@@ -1076,6 +1085,8 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 ///    operands to determine if the first operand is greater than the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
@@ -1086,7 +1097,8 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///     two lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
@@ -1097,6 +1109,8 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 ///    operands to determine if the first operand is greater than or equal to
 ///    the second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
@@ -1107,7 +1121,8 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
@@ -1118,6 +1133,8 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 ///    operands to determine if the first operand is not equal to the second
 ///    operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
@@ -1128,7 +1145,8 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the
+///     two lower 32-bit values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
@@ -1139,6 +1157,8 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 ///    the low-order bits of both operands to determine equality and returns
 ///    the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1149,7 +1169,8 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
@@ -1160,6 +1181,8 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 ///    the low-order bits of both operands to determine if the first operand is
 ///    less than the second operand and returns the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1170,7 +1193,8 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
@@ -1182,6 +1206,8 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 ///    less than or equal to the second operand and returns the result of the
 ///    comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1192,7 +1218,8 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
@@ -1204,6 +1231,8 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 ///    greater than the second operand and returns the result of the
 ///    comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1214,7 +1243,8 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
@@ -1226,6 +1256,8 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 ///    greater than or equal to the second operand and returns the result of
 ///    the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1236,7 +1268,8 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower 32-bit values is NaN, 0 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
@@ -1247,6 +1280,8 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 ///    the low-order bits of both operands to determine inequality and returns
 ///    the result of the comparison.
 ///
+///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
@@ -1257,7 +1292,8 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower 32-bit values is NaN, 1 is returned.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
@@ -1571,7 +1607,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+/// This intrinsic has no corresponding instruction.
 ///
 /// \param __a
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1667,7 +1703,7 @@ _mm_load_ss(const float *__p)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
+/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
 ///    instruction.
 ///
 /// \param __p
@@ -1696,7 +1732,7 @@ _mm_load1_ps(const float *__p)
 /// \param __p
 ///    A pointer to a 128-bit memory location. The address of the memory
 ///    location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
@@ -1888,7 +1924,7 @@ _mm_setzero_ps(void)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
+/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
 ///
 /// \param __p
 ///    A pointer to a 64-bit memory location.
@@ -2163,7 +2199,7 @@ void _mm_sfence(void);
 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
 /// \endcode
 ///
-/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
 ///
 /// \param a
 ///    A 64-bit vector of [4 x i16].
@@ -2261,7 +2297,7 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 }
 
 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
-///    integer vector to create a 16-bit mask value. Zero-extends the value to
+///    integer vector to create an 8-bit mask value. Zero-extends the value to
 ///    32-bit integer and writes it to the destination.
 ///
 /// \headerfile <x86intrin.h>
@@ -2270,8 +2306,8 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 ///
 /// \param __a
 ///    A 64-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bit from each 8-bit element in the operand,
-///    written to bits [15:0].
+/// \returns The most significant bit from each 8-bit element in \a __a,
+///    written to bits [7:0].
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pi8(__m64 __a)
 {
@@ -2444,7 +2480,7 @@ extern "C" {
 ///    <li>
 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
+///      _MM_GET_ROUNDING_MODE().
 ///    </li>
 ///    <li>
 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
@@ -2457,11 +2493,11 @@ extern "C" {
 ///    </li>
 ///    </ul>
 ///
-///    For example, the expression below checks if an overflow exception has
+///    For example, the following expression checks if an overflow exception has
 ///    occurred:
 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
 ///
-///    The following example gets the current rounding mode:
+///    The following expression gets the current rounding mode:
 ///      _MM_GET_ROUNDING_MODE()
 ///
 /// \headerfile <x86intrin.h>
@@ -2511,10 +2547,12 @@ unsigned int _mm_getcsr(void);
 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
 ///
 ///    The following example sets the DAZ and FTZ flags:
-///      void setFlags() {
-///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
-///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
-///      }
+///    \code
+///    void setFlags() {
+///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+///    }
+///    \endcode
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2621,7 +2659,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
+///    instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
diff --git a/lib/Index/IndexDecl.cpp b/lib/Index/IndexDecl.cpp
index e14750e046eb..f1ef6c0ea21f 100644
--- a/lib/Index/IndexDecl.cpp
+++ b/lib/Index/IndexDecl.cpp
@@ -664,8 +664,11 @@ class IndexingDeclVisitor : public ConstDeclVisitor<IndexingDeclVisitor, bool> {
 
   bool VisitTemplateDecl(const TemplateDecl *D) {
 
-    // Index the default values for the template parameters.
     const NamedDecl *Parent = D->getTemplatedDecl();
+    if (!Parent)
+      return true;
+
+    // Index the default values for the template parameters.
     if (D->getTemplateParameters() &&
         shouldIndexTemplateParameterDefaultValue(Parent)) {
       const TemplateParameterList *Params = D->getTemplateParameters();
@@ -684,7 +687,7 @@ class IndexingDeclVisitor : public ConstDeclVisitor<IndexingDeclVisitor, bool> {
       }
     }
 
-    return Visit(D->getTemplatedDecl());
+    return Visit(Parent);
   }
 
   bool VisitFriendDecl(const FriendDecl *D) {
diff --git a/lib/Lex/HeaderSearch.cpp b/lib/Lex/HeaderSearch.cpp
index 6976294a2eaf..3528f8820fca 100644
--- a/lib/Lex/HeaderSearch.cpp
+++ b/lib/Lex/HeaderSearch.cpp
@@ -1580,9 +1580,15 @@ void HeaderSearch::loadSubdirectoryModuleMaps(DirectoryLookup &SearchDir) {
 std::string HeaderSearch::suggestPathToFileForDiagnostics(const FileEntry *File,
                                                           bool *IsSystem) {
   // FIXME: We assume that the path name currently cached in the FileEntry is
-  // the most appropriate one for this analysis (and that it's spelled the same
-  // way as the corresponding header search path).
-  StringRef Name = File->getName();
+  // the most appropriate one for this analysis (and that it's spelled the
+  // same way as the corresponding header search path).
+  return suggestPathToFileForDiagnostics(File->getName(), /*BuildDir=*/"",
+                                         IsSystem);
+}
+
+std::string HeaderSearch::suggestPathToFileForDiagnostics(
+    llvm::StringRef File, llvm::StringRef WorkingDir, bool *IsSystem) {
+  using namespace llvm::sys;
 
   unsigned BestPrefixLength = 0;
   unsigned BestSearchDir;
@@ -1593,12 +1599,17 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics(const FileEntry *File,
       continue;
 
     StringRef Dir = SearchDirs[I].getDir()->getName();
-    for (auto NI = llvm::sys::path::begin(Name),
-              NE = llvm::sys::path::end(Name),
-              DI = llvm::sys::path::begin(Dir),
-              DE = llvm::sys::path::end(Dir);
+    llvm::SmallString<32> DirPath(Dir.begin(), Dir.end());
+    if (!WorkingDir.empty() && !path::is_absolute(Dir)) {
+      auto err = fs::make_absolute(WorkingDir, DirPath);
+      if (!err)
+        path::remove_dots(DirPath, /*remove_dot_dot=*/true);
+      Dir = DirPath;
+    }
+    for (auto NI = path::begin(File), NE = path::end(File),
+              DI = path::begin(Dir), DE = path::end(Dir);
          /*termination condition in loop*/; ++NI, ++DI) {
-      // '.' components in Name are ignored.
+      // '.' components in File are ignored.
       while (NI != NE && *NI == ".")
         ++NI;
       if (NI == NE)
@@ -1608,9 +1619,9 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics(const FileEntry *File,
       while (DI != DE && *DI == ".")
         ++DI;
       if (DI == DE) {
-        // Dir is a prefix of Name, up to '.' components and choice of path
+        // Dir is a prefix of File, up to '.' components and choice of path
         // separators.
-        unsigned PrefixLength = NI - llvm::sys::path::begin(Name);
+        unsigned PrefixLength = NI - path::begin(File);
         if (PrefixLength > BestPrefixLength) {
           BestPrefixLength = PrefixLength;
           BestSearchDir = I;
@@ -1625,5 +1636,5 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics(const FileEntry *File,
 
   if (IsSystem)
     *IsSystem = BestPrefixLength ? BestSearchDir >= SystemDirIdx : false;
-  return Name.drop_front(BestPrefixLength);
+  return File.drop_front(BestPrefixLength);
 }
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 830354ab23f0..8bd4ab0ff9ca 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -2009,18 +2009,21 @@ bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
   const char *AfterLessPos = CurPtr;
   char C = getAndAdvanceChar(CurPtr, Result);
   while (C != '>') {
-    // Skip escaped characters.
-    if (C == '\\' && CurPtr < BufferEnd) {
-      // Skip the escaped character.
-      getAndAdvanceChar(CurPtr, Result);
-    } else if (C == '\n' || C == '\r' ||             // Newline.
-               (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
-                           isCodeCompletionPoint(CurPtr-1)))) {
+    // Skip escaped characters.  Escaped newlines will already be processed by
+    // getAndAdvanceChar.
+    if (C == '\\')
+      C = getAndAdvanceChar(CurPtr, Result);
+
+    if (C == '\n' || C == '\r' ||             // Newline.
+        (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
+                    isCodeCompletionPoint(CurPtr-1)))) {
       // If the filename is unterminated, then it must just be a lone <
       // character.  Return this as such.
       FormTokenWithChars(Result, AfterLessPos, tok::less);
       return true;
-    } else if (C == 0) {
+    }
+
+    if (C == 0) {
       NulCharacter = CurPtr-1;
     }
     C = getAndAdvanceChar(CurPtr, Result);
diff --git a/lib/Lex/ModuleMap.cpp b/lib/Lex/ModuleMap.cpp
index b3ac10c5c5ae..657eef167a69 100644
--- a/lib/Lex/ModuleMap.cpp
+++ b/lib/Lex/ModuleMap.cpp
@@ -281,6 +281,8 @@ ModuleMap::ModuleMap(SourceManager &SourceMgr, DiagnosticsEngine &Diags,
 ModuleMap::~ModuleMap() {
   for (auto &M : Modules)
     delete M.getValue();
+  for (auto *M : ShadowModules)
+    delete M;
 }
 
 void ModuleMap::setTarget(const TargetInfo &Target) {
@@ -751,7 +753,7 @@ std::pair<Module *, bool> ModuleMap::findOrCreateModule(StringRef Name,
   // Try to find an existing module with this name.
   if (Module *Sub = lookupModuleQualified(Name, Parent))
     return std::make_pair(Sub, false);
-  
+
   // Create a new module with this name.
   Module *Result = new Module(Name, SourceLocation(), Parent, IsFramework,
                               IsExplicit, NumCreatedModules++);
@@ -759,6 +761,7 @@ std::pair<Module *, bool> ModuleMap::findOrCreateModule(StringRef Name,
     if (LangOpts.CurrentModule == Name)
       SourceModule = Result;
     Modules[Name] = Result;
+    ModuleScopeIDs[Result] = CurrentModuleScopeID;
   }
   return std::make_pair(Result, true);
 }
@@ -927,6 +930,7 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir,
     if (LangOpts.CurrentModule == ModuleName)
       SourceModule = Result;
     Modules[ModuleName] = Result;
+    ModuleScopeIDs[Result] = CurrentModuleScopeID;
   }
 
   Result->IsSystem |= Attrs.IsSystem;
@@ -999,6 +1003,21 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir,
   return Result;
 }
 
+Module *ModuleMap::createShadowedModule(StringRef Name, bool IsFramework,
+                                        Module *ShadowingModule) {
+
+  // Create a new module with this name.
+  Module *Result =
+      new Module(Name, SourceLocation(), /*Parent=*/nullptr, IsFramework,
+                 /*IsExplicit=*/false, NumCreatedModules++);
+  Result->ShadowingModule = ShadowingModule;
+  Result->IsAvailable = false;
+  ModuleScopeIDs[Result] = CurrentModuleScopeID;
+  ShadowModules.push_back(Result);
+
+  return Result;
+}
+
 void ModuleMap::setUmbrellaHeader(Module *Mod, const FileEntry *UmbrellaHeader,
                                   Twine NameAsWritten) {
   Headers[UmbrellaHeader].push_back(KnownHeader(Mod, NormalHeader));
@@ -1319,7 +1338,7 @@ namespace clang {
 
     /// \brief Consume the current token and return its location.
     SourceLocation consumeToken();
-    
+
     /// \brief Skip tokens until we reach the a token with the given kind
     /// (or the end of the file).
     void skipUntil(MMToken::TokenKind K);
@@ -1345,20 +1364,17 @@ namespace clang {
     bool parseOptionalAttributes(Attributes &Attrs);
     
   public:
-    explicit ModuleMapParser(Lexer &L, SourceManager &SourceMgr, 
-                             const TargetInfo *Target,
-                             DiagnosticsEngine &Diags,
-                             ModuleMap &Map,
-                             const FileEntry *ModuleMapFile,
-                             const DirectoryEntry *Directory,
-                             bool IsSystem)
+    explicit ModuleMapParser(Lexer &L, SourceManager &SourceMgr,
+                             const TargetInfo *Target, DiagnosticsEngine &Diags,
+                             ModuleMap &Map, const FileEntry *ModuleMapFile,
+                             const DirectoryEntry *Directory, bool IsSystem)
         : L(L), SourceMgr(SourceMgr), Target(Target), Diags(Diags), Map(Map),
           ModuleMapFile(ModuleMapFile), Directory(Directory),
           IsSystem(IsSystem) {
       Tok.clear();
       consumeToken();
     }
-    
+
     bool parseModuleMapFile();
 
     bool terminatedByDirective() { return false; }
@@ -1787,6 +1803,7 @@ void ModuleMapParser::parseModuleDecl() {
   SourceLocation LBraceLoc = consumeToken();
   
   // Determine whether this (sub)module has already been defined.
+  Module *ShadowingModule = nullptr;
   if (Module *Existing = Map.lookupModuleQualified(ModuleName, ActiveModule)) {
     // We might see a (re)definition of a module that we already have a
     // definition for in two cases:
@@ -1812,23 +1829,35 @@ void ModuleMapParser::parseModuleDecl() {
       }
       return;
     }
-    
-    Diags.Report(ModuleNameLoc, diag::err_mmap_module_redefinition)
-      << ModuleName;
-    Diags.Report(Existing->DefinitionLoc, diag::note_mmap_prev_definition);
-    
-    // Skip the module definition.
-    skipUntil(MMToken::RBrace);
-    if (Tok.is(MMToken::RBrace))
-      consumeToken();
-    
-    HadError = true;
-    return;
+
+    if (!Existing->Parent && Map.mayShadowNewModule(Existing)) {
+      ShadowingModule = Existing;
+    } else {
+      // This is not a shawdowed module decl, it is an illegal redefinition.
+      Diags.Report(ModuleNameLoc, diag::err_mmap_module_redefinition)
+          << ModuleName;
+      Diags.Report(Existing->DefinitionLoc, diag::note_mmap_prev_definition);
+
+      // Skip the module definition.
+      skipUntil(MMToken::RBrace);
+      if (Tok.is(MMToken::RBrace))
+        consumeToken();
+
+      HadError = true;
+      return;
+    }
   }
 
   // Start defining this module.
-  ActiveModule = Map.findOrCreateModule(ModuleName, ActiveModule, Framework,
-                                        Explicit).first;
+  if (ShadowingModule) {
+    ActiveModule =
+        Map.createShadowedModule(ModuleName, Framework, ShadowingModule);
+  } else {
+    ActiveModule =
+        Map.findOrCreateModule(ModuleName, ActiveModule, Framework, Explicit)
+            .first;
+  }
+
   ActiveModule->DefinitionLoc = ModuleNameLoc;
   if (Attrs.IsSystem || IsSystem)
     ActiveModule->IsSystem = true;
@@ -2854,5 +2883,6 @@ bool ModuleMap::parseModuleMapFile(const FileEntry *File, bool IsSystem,
   // Notify callbacks that we parsed it.
   for (const auto &Cb : Callbacks)
     Cb->moduleMapFileRead(Start, *File, IsSystem);
+
   return Result;
 }
diff --git a/lib/Lex/PPCaching.cpp b/lib/Lex/PPCaching.cpp
index f5e8cdc25d38..9758557d7b44 100644
--- a/lib/Lex/PPCaching.cpp
+++ b/lib/Lex/PPCaching.cpp
@@ -105,8 +105,10 @@ void Preprocessor::CachingLex(Token &Result) {
 }
 
 void Preprocessor::EnterCachingLexMode() {
-  if (InCachingLexMode())
+  if (InCachingLexMode()) {
+    assert(CurLexerKind == CLK_CachingLexer && "Unexpected lexer kind");
     return;
+  }
 
   PushIncludeMacroStack();
   CurLexerKind = CLK_CachingLexer;
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index ca3e70fd1060..405d19bb2853 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp
@@ -558,7 +558,9 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation HashTokenLoc,
   // the #if block.
   CurPPLexer->LexingRawMode = false;
 
-  if (Callbacks)
+  // The last skipped range isn't actually skipped yet if it's truncated
+  // by the end of the preamble; we'll resume parsing after the preamble.
+  if (Callbacks && (Tok.isNot(tok::eof) || !isRecordingPreamble()))
     Callbacks->SourceRangeSkipped(
         SourceRange(HashTokenLoc, CurPPLexer->getSourceLocation()),
         Tok.getLocation());
@@ -1655,12 +1657,18 @@ bool Preprocessor::checkModuleIsAvailable(const LangOptions &LangOpts,
                                           DiagnosticsEngine &Diags, Module *M) {
   Module::Requirement Requirement;
   Module::UnresolvedHeaderDirective MissingHeader;
-  if (M->isAvailable(LangOpts, TargetInfo, Requirement, MissingHeader))
+  Module *ShadowingModule = nullptr;
+  if (M->isAvailable(LangOpts, TargetInfo, Requirement, MissingHeader,
+                     ShadowingModule))
     return false;
 
   if (MissingHeader.FileNameLoc.isValid()) {
     Diags.Report(MissingHeader.FileNameLoc, diag::err_module_header_missing)
         << MissingHeader.IsUmbrella << MissingHeader.FileName;
+  } else if (ShadowingModule) {
+    Diags.Report(M->DefinitionLoc, diag::err_module_shadowed) << M->Name;
+    Diags.Report(ShadowingModule->DefinitionLoc,
+                 diag::note_previous_definition);
   } else {
     // FIXME: Track the location at which the requirement was specified, and
     // use it here.
@@ -2024,6 +2032,15 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
 
   // Determine if we're switching to building a new submodule, and which one.
   if (auto *M = SuggestedModule.getModule()) {
+    if (M->getTopLevelModule()->ShadowingModule) {
+      // We are building a submodule that belongs to a shadowed module. This
+      // means we find header files in the shadowed module.
+      Diag(M->DefinitionLoc, diag::err_module_build_shadowed_submodule)
+        << M->getFullModuleName();
+      Diag(M->getTopLevelModule()->ShadowingModule->DefinitionLoc,
+           diag::note_previous_definition);
+      return;
+    }
     // When building a pch, -fmodule-name tells the compiler to textually
     // include headers in the specified module. We are not building the
     // specified module.
diff --git a/lib/Lex/PPLexerChange.cpp b/lib/Lex/PPLexerChange.cpp
index e484e9c4c3a3..f21787338b37 100644
--- a/lib/Lex/PPLexerChange.cpp
+++ b/lib/Lex/PPLexerChange.cpp
@@ -444,6 +444,7 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) {
       }
 
       CurPPLexer = nullptr;
+      recomputeCurLexerKind();
       return true;
     }
 
diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp
index b8acd92521fb..b9be03f4f20d 100644
--- a/lib/Lex/Pragma.cpp
+++ b/lib/Lex/Pragma.cpp
@@ -1601,44 +1601,6 @@ struct PragmaPopMacroHandler : public PragmaHandler {
   }
 };
 
-// Pragma STDC implementations.
-
-/// PragmaSTDC_FENV_ACCESSHandler - "\#pragma STDC FENV_ACCESS ...".
-struct PragmaSTDC_FENV_ACCESSHandler : public PragmaHandler {
-  PragmaSTDC_FENV_ACCESSHandler() : PragmaHandler("FENV_ACCESS") {}
-
-  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
-                    Token &Tok) override {
-    tok::OnOffSwitch OOS;
-    if (PP.LexOnOffSwitch(OOS))
-     return;
-    if (OOS == tok::OOS_ON)
-      PP.Diag(Tok, diag::warn_stdc_fenv_access_not_supported);
-  }
-};
-
-/// PragmaSTDC_CX_LIMITED_RANGEHandler - "\#pragma STDC CX_LIMITED_RANGE ...".
-struct PragmaSTDC_CX_LIMITED_RANGEHandler : public PragmaHandler {
-  PragmaSTDC_CX_LIMITED_RANGEHandler() : PragmaHandler("CX_LIMITED_RANGE") {}
-
-  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
-                    Token &Tok) override {
-    tok::OnOffSwitch OOS;
-    PP.LexOnOffSwitch(OOS);
-  }
-};
-
-/// PragmaSTDC_UnknownHandler - "\#pragma STDC ...".
-struct PragmaSTDC_UnknownHandler : public PragmaHandler {
-  PragmaSTDC_UnknownHandler() = default;
-
-  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
-                    Token &UnknownTok) override {
-    // C99 6.10.6p2, unknown forms are not allowed.
-    PP.Diag(UnknownTok, diag::ext_stdc_pragma_ignored);
-  }
-};
-
 /// PragmaARCCFCodeAuditedHandler - 
 ///   \#pragma clang arc_cf_code_audited begin/end
 struct PragmaARCCFCodeAuditedHandler : public PragmaHandler {
@@ -1814,17 +1776,15 @@ void Preprocessor::RegisterBuiltinPragmas() {
   ModuleHandler->AddPragma(new PragmaModuleEndHandler());
   ModuleHandler->AddPragma(new PragmaModuleBuildHandler());
   ModuleHandler->AddPragma(new PragmaModuleLoadHandler());
-
-  AddPragmaHandler("STDC", new PragmaSTDC_FENV_ACCESSHandler());
-  AddPragmaHandler("STDC", new PragmaSTDC_CX_LIMITED_RANGEHandler());
-  AddPragmaHandler("STDC", new PragmaSTDC_UnknownHandler());
+    
+  // Add region pragmas.
+  AddPragmaHandler(new PragmaRegionHandler("region"));
+  AddPragmaHandler(new PragmaRegionHandler("endregion"));
 
   // MS extensions.
   if (LangOpts.MicrosoftExt) {
     AddPragmaHandler(new PragmaWarningHandler());
     AddPragmaHandler(new PragmaIncludeAliasHandler());
-    AddPragmaHandler(new PragmaRegionHandler("region"));
-    AddPragmaHandler(new PragmaRegionHandler("endregion"));
   }
 
   // Pragmas added by plugins
@@ -1843,17 +1803,4 @@ void Preprocessor::IgnorePragmas() {
   // in Preprocessor::RegisterBuiltinPragmas().
   AddPragmaHandler("GCC", new EmptyPragmaHandler());
   AddPragmaHandler("clang", new EmptyPragmaHandler());
-  if (PragmaHandler *NS = PragmaHandlers->FindHandler("STDC")) {
-    // Preprocessor::RegisterBuiltinPragmas() already registers
-    // PragmaSTDC_UnknownHandler as the empty handler, so remove it first,
-    // otherwise there will be an assert about a duplicate handler.
-    PragmaNamespace *STDCNamespace = NS->getIfNamespace();
-    assert(STDCNamespace &&
-           "Invalid namespace, registered as a regular pragma handler!");
-    if (PragmaHandler *Existing = STDCNamespace->FindHandler("", false)) {
-      RemovePragmaHandler("STDC", Existing);
-      delete Existing;
-    }
-  }
-  AddPragmaHandler("STDC", new EmptyPragmaHandler());
 }
diff --git a/lib/Lex/PreprocessingRecord.cpp b/lib/Lex/PreprocessingRecord.cpp
index af439dbfa584..cd4a63f114e0 100644
--- a/lib/Lex/PreprocessingRecord.cpp
+++ b/lib/Lex/PreprocessingRecord.cpp
@@ -329,6 +329,23 @@ unsigned PreprocessingRecord::allocateLoadedEntities(unsigned NumEntities) {
   return Result;
 }
 
+unsigned PreprocessingRecord::allocateSkippedRanges(unsigned NumRanges) {
+  unsigned Result = SkippedRanges.size();
+  SkippedRanges.resize(SkippedRanges.size() + NumRanges);
+  SkippedRangesAllLoaded = false;
+  return Result;
+}
+
+void PreprocessingRecord::ensureSkippedRangesLoaded() {
+  if (SkippedRangesAllLoaded || !ExternalSource)
+    return;
+  for (unsigned Index = 0; Index != SkippedRanges.size(); ++Index) {
+    if (SkippedRanges[Index].isInvalid())
+      SkippedRanges[Index] = ExternalSource->ReadSkippedRange(Index);
+  }
+  SkippedRangesAllLoaded = true;
+}
+
 void PreprocessingRecord::RegisterMacroDefinition(MacroInfo *Macro,
                                                   MacroDefinitionRecord *Def) {
   MacroDefinitions[Macro] = Def;
@@ -418,6 +435,7 @@ void PreprocessingRecord::Defined(const Token &MacroNameTok,
 
 void PreprocessingRecord::SourceRangeSkipped(SourceRange Range,
                                              SourceLocation EndifLoc) {
+  assert(Range.isValid());
   SkippedRanges.emplace_back(Range.getBegin(), EndifLoc);
 }
 
@@ -497,5 +515,6 @@ size_t PreprocessingRecord::getTotalMemory() const {
   return BumpAlloc.getTotalMemory()
     + llvm::capacity_in_bytes(MacroDefinitions)
     + llvm::capacity_in_bytes(PreprocessedEntities)
-    + llvm::capacity_in_bytes(LoadedPreprocessedEntities);
+    + llvm::capacity_in_bytes(LoadedPreprocessedEntities)
+    + llvm::capacity_in_bytes(SkippedRanges);
 }
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index 7d789e780113..08469bc02530 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -773,8 +773,13 @@ void Preprocessor::Lex(Token &Result) {
     }
   } while (!ReturnedToken);
 
-  if (Result.is(tok::code_completion))
+  if (Result.is(tok::code_completion) && Result.getIdentifierInfo()) {
+    // Remember the identifier before code completion token.
     setCodeCompletionIdentifierInfo(Result.getIdentifierInfo());
+    // Set IdenfitierInfo to null to avoid confusing code that handles both
+    // identifiers and completion tokens.
+    Result.setIdentifierInfo(nullptr);
+  }
 
   LastTokenWasAt = Result.is(tok::at);
 }
diff --git a/lib/Lex/ScratchBuffer.cpp b/lib/Lex/ScratchBuffer.cpp
index e0f3966fce48..dc03e16daa8b 100644
--- a/lib/Lex/ScratchBuffer.cpp
+++ b/lib/Lex/ScratchBuffer.cpp
@@ -74,11 +74,11 @@ void ScratchBuffer::AllocScratchBuffer(unsigned RequestLen) {
 
   // Get scratch buffer. Zero-initialize it so it can be dumped into a PCH file
   // deterministically.
-  std::unique_ptr<llvm::MemoryBuffer> OwnBuf =
-      llvm::MemoryBuffer::getNewMemBuffer(RequestLen, "<scratch space>");
-  llvm::MemoryBuffer &Buf = *OwnBuf;
+  std::unique_ptr<llvm::WritableMemoryBuffer> OwnBuf =
+      llvm::WritableMemoryBuffer::getNewMemBuffer(RequestLen,
+                                                  "<scratch space>");
+  CurBuffer = OwnBuf->getBufferStart();
   FileID FID = SourceMgr.createFileID(std::move(OwnBuf));
   BufferStartLoc = SourceMgr.getLocForStartOfFile(FID);
-  CurBuffer = const_cast<char*>(Buf.getBufferStart());
   BytesUsed = 0;
 }
diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp
index 35bb6a9e5cc8..127e18439b8e 100644
--- a/lib/Parse/ParseDecl.cpp
+++ b/lib/Parse/ParseDecl.cpp
@@ -48,8 +48,8 @@ TypeResult Parser::ParseTypeName(SourceRange *Range,
                                  Decl **OwnedType,
                                  ParsedAttributes *Attrs) {
   DeclSpecContext DSC = getDeclSpecContextFromDeclaratorContext(Context);
-  if (DSC == DSC_normal)
-    DSC = DSC_type_specifier;
+  if (DSC == DeclSpecContext::DSC_normal)
+    DSC = DeclSpecContext::DSC_type_specifier;
 
   // Parse the common declaration-specifiers piece.
   DeclSpec DS(AttrFactory);
@@ -2201,7 +2201,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
       // FIXME: This check should be for a variable template instantiation only.
 
       // Check that this is a valid instantiation
-      if (D.getName().getKind() != UnqualifiedId::IK_TemplateId) {
+      if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) {
         // If the declarator-id is not a template-id, issue a diagnostic and
         // recover by ignoring the 'template' keyword.
         Diag(Tok, diag::err_template_defn_explicit_instantiation)
@@ -2399,7 +2399,7 @@ void Parser::ParseSpecifierQualifierList(DeclSpec &DS, AccessSpecifier AS,
   }
 
   // Issue diagnostic and remove constexpr specfier if present.
-  if (DS.isConstexprSpecified() && DSC != DSC_condition) {
+  if (DS.isConstexprSpecified() && DSC != DeclSpecContext::DSC_condition) {
     Diag(DS.getConstexprSpecLoc(), diag::err_typename_invalid_constexpr);
     DS.ClearConstexprSpec();
   }
@@ -2488,7 +2488,7 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
     // classes.
     if (ParsedType T = Actions.ActOnMSVCUnknownTypeName(
             *Tok.getIdentifierInfo(), Tok.getLocation(),
-            DSC == DSC_template_type_arg)) {
+            DSC == DeclSpecContext::DSC_template_type_arg)) {
       const char *PrevSpec;
       unsigned DiagID;
       DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec, DiagID, T,
@@ -2542,18 +2542,20 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
 
       // Parse this as a tag as if the missing tag were present.
       if (TagKind == tok::kw_enum)
-        ParseEnumSpecifier(Loc, DS, TemplateInfo, AS, DSC_normal);
+        ParseEnumSpecifier(Loc, DS, TemplateInfo, AS,
+                           DeclSpecContext::DSC_normal);
       else
         ParseClassSpecifier(TagKind, Loc, DS, TemplateInfo, AS,
-                            /*EnteringContext*/ false, DSC_normal, Attrs);
+                            /*EnteringContext*/ false,
+                            DeclSpecContext::DSC_normal, Attrs);
       return true;
     }
   }
 
   // Determine whether this identifier could plausibly be the name of something
   // being declared (with a missing type).
-  if (!isTypeSpecifier(DSC) &&
-      (!SS || DSC == DSC_top_level || DSC == DSC_class)) {
+  if (!isTypeSpecifier(DSC) && (!SS || DSC == DeclSpecContext::DSC_top_level ||
+                                DSC == DeclSpecContext::DSC_class)) {
     // Look ahead to the next token to try to figure out what this declaration
     // was supposed to be.
     switch (NextToken().getKind()) {
@@ -2577,7 +2579,8 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
 
       // If we're in a context where we could be declaring a constructor,
       // check whether this is a constructor declaration with a bogus name.
-      if (DSC == DSC_class || (DSC == DSC_top_level && SS)) {
+      if (DSC == DeclSpecContext::DSC_class ||
+          (DSC == DeclSpecContext::DSC_top_level && SS)) {
         IdentifierInfo *II = Tok.getIdentifierInfo();
         if (Actions.isCurrentClassNameTypo(II, SS)) {
           Diag(Loc, diag::err_constructor_bad_name)
@@ -2661,19 +2664,19 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
 Parser::DeclSpecContext
 Parser::getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context) {
   if (Context == DeclaratorContext::MemberContext)
-    return DSC_class;
+    return DeclSpecContext::DSC_class;
   if (Context == DeclaratorContext::FileContext)
-    return DSC_top_level;
+    return DeclSpecContext::DSC_top_level;
   if (Context == DeclaratorContext::TemplateParamContext)
-    return DSC_template_param;
+    return DeclSpecContext::DSC_template_param;
   if (Context == DeclaratorContext::TemplateTypeArgContext)
-    return DSC_template_type_arg;
+    return DeclSpecContext::DSC_template_type_arg;
   if (Context == DeclaratorContext::TrailingReturnContext)
-    return DSC_trailing;
+    return DeclSpecContext::DSC_trailing;
   if (Context == DeclaratorContext::AliasDeclContext ||
       Context == DeclaratorContext::AliasTemplateContext)
-    return DSC_alias_declaration;
-  return DSC_normal;
+    return DeclSpecContext::DSC_alias_declaration;
+  return DeclSpecContext::DSC_normal;
 }
 
 /// ParseAlignArgument - Parse the argument to an alignment-specifier.
@@ -2753,7 +2756,8 @@ Parser::DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS,
                                               LateParsedAttrList *LateAttrs) {
   assert(DS.hasTagDefinition() && "shouldn't call this");
 
-  bool EnteringContext = (DSContext == DSC_class || DSContext == DSC_top_level);
+  bool EnteringContext = (DSContext == DeclSpecContext::DSC_class ||
+                          DSContext == DeclSpecContext::DSC_top_level);
 
   if (getLangOpts().CPlusPlus &&
       Tok.isOneOf(tok::identifier, tok::coloncolon, tok::kw_decltype,
@@ -2887,7 +2891,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
     DS.SetRangeEnd(SourceLocation());
   }
 
-  bool EnteringContext = (DSContext == DSC_class || DSContext == DSC_top_level);
+  bool EnteringContext = (DSContext == DeclSpecContext::DSC_class ||
+                          DSContext == DeclSpecContext::DSC_top_level);
   bool AttrsLastTime = false;
   ParsedAttributesWithRange attrs(AttrFactory);
   // We use Sema's policy to get bool macros right.
@@ -2957,8 +2962,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
                                           Scope::FunctionPrototypeScope |
                                           Scope::AtCatchScope)) == 0;
         bool AllowNestedNameSpecifiers
-          = DSContext == DSC_top_level ||
-            (DSContext == DSC_class && DS.isFriendSpecified());
+          = DSContext == DeclSpecContext::DSC_top_level ||
+            (DSContext == DeclSpecContext::DSC_class && DS.isFriendSpecified());
 
         Actions.CodeCompleteDeclSpec(getCurScope(), DS,
                                      AllowNonIdentifiers,
@@ -2969,9 +2974,9 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       if (getCurScope()->getFnParent() || getCurScope()->getBlockParent())
         CCC = Sema::PCC_LocalDeclarationSpecifiers;
       else if (TemplateInfo.Kind != ParsedTemplateInfo::NonTemplate)
-        CCC = DSContext == DSC_class? Sema::PCC_MemberTemplate
-                                    : Sema::PCC_Template;
-      else if (DSContext == DSC_class)
+        CCC = DSContext == DeclSpecContext::DSC_class ? Sema::PCC_MemberTemplate
+                                                      : Sema::PCC_Template;
+      else if (DSContext == DeclSpecContext::DSC_class)
         CCC = Sema::PCC_Class;
       else if (CurParsedObjCImpl)
         CCC = Sema::PCC_ObjCImplementation;
@@ -3015,10 +3020,11 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
         // To improve diagnostics for this case, parse the declaration as a
         // constructor (and reject the extra template arguments later).
         TemplateIdAnnotation *TemplateId = takeTemplateIdAnnotation(Next);
-        if ((DSContext == DSC_top_level || DSContext == DSC_class) &&
+        if ((DSContext == DeclSpecContext::DSC_top_level ||
+             DSContext == DeclSpecContext::DSC_class) &&
             TemplateId->Name &&
             Actions.isCurrentClassName(*TemplateId->Name, getCurScope(), &SS) &&
-            isConstructorDeclarator(/*Unqualified*/false)) {
+            isConstructorDeclarator(/*Unqualified*/ false)) {
           // The user meant this to be an out-of-line constructor
           // definition, but template arguments are not allowed
           // there.  Just allow this as a constructor; we'll
@@ -3057,7 +3063,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       // Check whether this is a constructor declaration. If we're in a
       // context where the identifier could be a class name, and it has the
       // shape of a constructor declaration, process it as one.
-      if ((DSContext == DSC_top_level || DSContext == DSC_class) &&
+      if ((DSContext == DeclSpecContext::DSC_top_level ||
+           DSContext == DeclSpecContext::DSC_class) &&
           Actions.isCurrentClassName(*Next.getIdentifierInfo(), getCurScope(),
                                      &SS) &&
           isConstructorDeclarator(/*Unqualified*/ false))
@@ -3195,7 +3202,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       if (DS.isTypeAltiVecVector())
         goto DoneWithDeclSpec;
 
-      if (DSContext == DSC_objc_method_result && isObjCInstancetype()) {
+      if (DSContext == DeclSpecContext::DSC_objc_method_result &&
+          isObjCInstancetype()) {
         ParsedType TypeRep = Actions.ActOnObjCInstanceType(Loc);
         assert(TypeRep);
         isInvalid = DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec,
@@ -3229,7 +3237,7 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
 
       // If we're in a context where the identifier could be a class name,
       // check whether this is a constructor declaration.
-      if (getLangOpts().CPlusPlus && DSContext == DSC_class &&
+      if (getLangOpts().CPlusPlus && DSContext == DeclSpecContext::DSC_class &&
           Actions.isCurrentClassName(*Tok.getIdentifierInfo(), getCurScope()) &&
           isConstructorDeclarator(/*Unqualified*/true))
         goto DoneWithDeclSpec;
@@ -3237,7 +3245,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       // Likewise, if this is a context where the identifier could be a template
       // name, check whether this is a deduction guide declaration.
       if (getLangOpts().CPlusPlus17 &&
-          (DSContext == DSC_class || DSContext == DSC_top_level) &&
+          (DSContext == DeclSpecContext::DSC_class ||
+           DSContext == DeclSpecContext::DSC_top_level) &&
           Actions.isDeductionGuideName(getCurScope(), *Tok.getIdentifierInfo(),
                                        Tok.getLocation()) &&
           isConstructorDeclarator(/*Unqualified*/ true,
@@ -3283,7 +3292,7 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
       // If we're in a context where the template-id could be a
       // constructor name or specialization, check whether this is a
       // constructor declaration.
-      if (getLangOpts().CPlusPlus && DSContext == DSC_class &&
+      if (getLangOpts().CPlusPlus && DSContext == DeclSpecContext::DSC_class &&
           Actions.isCurrentClassName(*TemplateId->Name, getCurScope()) &&
           isConstructorDeclarator(TemplateId->SS.isEmpty()))
         goto DoneWithDeclSpec;
@@ -3453,7 +3462,7 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
 
     // friend
     case tok::kw_friend:
-      if (DSContext == DSC_class)
+      if (DSContext == DeclSpecContext::DSC_class)
         isInvalid = DS.SetFriendSpec(Loc, PrevSpec, DiagID);
       else {
         PrevSpec = ""; // not actually used by the diagnostic
@@ -4072,7 +4081,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
   SuppressAccessChecks diagsFromTag(*this, shouldDelayDiagsInTag);
 
   // Enum definitions should not be parsed in a trailing-return-type.
-  bool AllowDeclaration = DSC != DSC_trailing;
+  bool AllowDeclaration = DSC != DeclSpecContext::DSC_trailing;
 
   bool AllowFixedUnderlyingType = AllowDeclaration &&
     (getLangOpts().CPlusPlus11 || getLangOpts().MicrosoftExt ||
@@ -4298,14 +4307,14 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
   bool IsDependent = false;
   const char *PrevSpec = nullptr;
   unsigned DiagID;
-  Decl *TagDecl = Actions.ActOnTag(getCurScope(), DeclSpec::TST_enum, TUK,
-                                   StartLoc, SS, Name, NameLoc, attrs.getList(),
-                                   AS, DS.getModulePrivateSpecLoc(), TParams,
-                                   Owned, IsDependent, ScopedEnumKWLoc,
-                                   IsScopedUsingClassTag, BaseType,
-                                   DSC == DSC_type_specifier,
-                                   DSC == DSC_template_param ||
-                                   DSC == DSC_template_type_arg, &SkipBody);
+  Decl *TagDecl = Actions.ActOnTag(
+      getCurScope(), DeclSpec::TST_enum, TUK, StartLoc, SS, Name, NameLoc,
+      attrs.getList(), AS, DS.getModulePrivateSpecLoc(), TParams, Owned,
+      IsDependent, ScopedEnumKWLoc, IsScopedUsingClassTag, BaseType,
+      DSC == DeclSpecContext::DSC_type_specifier,
+      DSC == DeclSpecContext::DSC_template_param ||
+          DSC == DeclSpecContext::DSC_template_type_arg,
+      &SkipBody);
 
   if (SkipBody.ShouldSkip) {
     assert(TUK == Sema::TUK_Definition && "can only skip a definition");
diff --git a/lib/Parse/ParseDeclCXX.cpp b/lib/Parse/ParseDeclCXX.cpp
index c62580f4e24f..68b73ca1d25f 100644
--- a/lib/Parse/ParseDeclCXX.cpp
+++ b/lib/Parse/ParseDeclCXX.cpp
@@ -701,7 +701,7 @@ Parser::ParseUsingDeclaration(DeclaratorContext Context,
       // "typename" keyword is allowed for identifiers only,
       // because it may be a type definition.
       if (D.TypenameLoc.isValid() &&
-          D.Name.getKind() != UnqualifiedId::IK_Identifier) {
+          D.Name.getKind() != UnqualifiedIdKind::IK_Identifier) {
         Diag(D.Name.getSourceRange().getBegin(),
              diag::err_typename_identifiers_only)
             << FixItHint::CreateRemoval(SourceRange(D.TypenameLoc));
@@ -755,7 +755,7 @@ Decl *Parser::ParseAliasDeclarationAfterDeclarator(
   // Type alias templates cannot be specialized.
   int SpecKind = -1;
   if (TemplateInfo.Kind == ParsedTemplateInfo::Template &&
-      D.Name.getKind() == UnqualifiedId::IK_TemplateId)
+      D.Name.getKind() == UnqualifiedIdKind::IK_TemplateId)
     SpecKind = 0;
   if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitSpecialization)
     SpecKind = 1;
@@ -775,7 +775,7 @@ Decl *Parser::ParseAliasDeclarationAfterDeclarator(
   }
 
   // Name must be an identifier.
-  if (D.Name.getKind() != UnqualifiedId::IK_Identifier) {
+  if (D.Name.getKind() != UnqualifiedIdKind::IK_Identifier) {
     Diag(D.Name.StartLocation, diag::err_alias_declaration_not_identifier);
     // No removal fixit: can't recover from this.
     SkipUntil(tok::semi);
@@ -1614,14 +1614,15 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
   //   new struct s;
   // or
   //   &T::operator struct s;
-  // For these, DSC is DSC_type_specifier or DSC_alias_declaration.
+  // For these, DSC is DeclSpecContext::DSC_type_specifier or
+  // DeclSpecContext::DSC_alias_declaration.
 
   // If there are attributes after class name, parse them.
   MaybeParseCXX11Attributes(Attributes);
 
   const PrintingPolicy &Policy = Actions.getASTContext().getPrintingPolicy();
   Sema::TagUseKind TUK;
-  if (DSC == DSC_trailing)
+  if (DSC == DeclSpecContext::DSC_trailing)
     TUK = Sema::TUK_Reference;
   else if (Tok.is(tok::l_brace) ||
            (getLangOpts().CPlusPlus && Tok.is(tok::colon)) ||
@@ -1883,15 +1884,14 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
     stripTypeAttributesOffDeclSpec(attrs, DS, TUK);
 
     // Declaration or definition of a class type
-    TagOrTempResult = Actions.ActOnTag(getCurScope(), TagType, TUK, StartLoc,
-                                       SS, Name, NameLoc, attrs.getList(), AS,
-                                       DS.getModulePrivateSpecLoc(),
-                                       TParams, Owned, IsDependent,
-                                       SourceLocation(), false,
-                                       clang::TypeResult(),
-                                       DSC == DSC_type_specifier,
-                                       DSC == DSC_template_param ||
-                                       DSC == DSC_template_type_arg, &SkipBody);
+    TagOrTempResult = Actions.ActOnTag(
+        getCurScope(), TagType, TUK, StartLoc, SS, Name, NameLoc,
+        attrs.getList(), AS, DS.getModulePrivateSpecLoc(), TParams, Owned,
+        IsDependent, SourceLocation(), false, clang::TypeResult(),
+        DSC == DeclSpecContext::DSC_type_specifier,
+        DSC == DeclSpecContext::DSC_template_param ||
+            DSC == DeclSpecContext::DSC_template_type_arg,
+        &SkipBody);
 
     // If ActOnTag said the type was dependent, try again with the
     // less common call.
@@ -2561,7 +2561,7 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS,
   if (MalformedTypeSpec)
     DS.SetTypeSpecError();
 
-  ParseDeclarationSpecifiers(DS, TemplateInfo, AS, DSC_class,
+  ParseDeclarationSpecifiers(DS, TemplateInfo, AS, DeclSpecContext::DSC_class,
                              &CommonLateParsedAttrs);
 
   // Turn off colon protection that was set for declspec.
@@ -2571,7 +2571,7 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS,
   // may get this far before the problem becomes obvious.
   if (DS.hasTagDefinition() &&
       TemplateInfo.Kind == ParsedTemplateInfo::NonTemplate &&
-      DiagnoseMissingSemiAfterTagDefinition(DS, AS, DSC_class,
+      DiagnoseMissingSemiAfterTagDefinition(DS, AS, DeclSpecContext::DSC_class,
                                             &CommonLateParsedAttrs))
     return nullptr;
 
diff --git a/lib/Parse/ParseExprCXX.cpp b/lib/Parse/ParseExprCXX.cpp
index f11a6c3b043b..914f6f3613bb 100644
--- a/lib/Parse/ParseExprCXX.cpp
+++ b/lib/Parse/ParseExprCXX.cpp
@@ -292,8 +292,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS,
           break;
         }
 
-        if (TemplateName.getKind() != UnqualifiedId::IK_OperatorFunctionId &&
-            TemplateName.getKind() != UnqualifiedId::IK_LiteralOperatorId) {
+        if (TemplateName.getKind() != UnqualifiedIdKind::IK_OperatorFunctionId &&
+            TemplateName.getKind() != UnqualifiedIdKind::IK_LiteralOperatorId) {
           Diag(TemplateName.getSourceRange().getBegin(),
                diag::err_id_after_template_in_nested_name_spec)
             << TemplateName.getSourceRange();
@@ -1672,9 +1672,9 @@ Parser::ParseCXXTypeConstructExpression(const DeclSpec &DS) {
     if (Init.isInvalid())
       return Init;
     Expr *InitList = Init.get();
-    return Actions.ActOnCXXTypeConstructExpr(TypeRep, SourceLocation(),
-                                             MultiExprArg(&InitList, 1),
-                                             SourceLocation());
+    return Actions.ActOnCXXTypeConstructExpr(
+        TypeRep, InitList->getLocStart(), MultiExprArg(&InitList, 1),
+        InitList->getLocEnd(), /*ListInitialization=*/true);
   } else {
     BalancedDelimiterTracker T(*this, tok::l_paren);
     T.consumeOpen();
@@ -1702,9 +1702,9 @@ Parser::ParseCXXTypeConstructExpression(const DeclSpec &DS) {
 
     assert((Exprs.size() == 0 || Exprs.size()-1 == CommaLocs.size())&&
            "Unexpected number of commas!");
-    return Actions.ActOnCXXTypeConstructExpr(TypeRep, T.getOpenLocation(), 
-                                             Exprs,
-                                             T.getCloseLocation());
+    return Actions.ActOnCXXTypeConstructExpr(TypeRep, T.getOpenLocation(),
+                                             Exprs, T.getCloseLocation(),
+                                             /*ListInitialization=*/false);
   }
 }
 
@@ -1782,7 +1782,7 @@ Sema::ConditionResult Parser::ParseCXXCondition(StmtResult *InitStmt,
   // type-specifier-seq
   DeclSpec DS(AttrFactory);
   DS.takeAttributesFrom(attrs);
-  ParseSpecifierQualifierList(DS, AS_none, DSC_condition);
+  ParseSpecifierQualifierList(DS, AS_none, DeclSpecContext::DSC_condition);
 
   // declarator
   Declarator DeclaratorInfo(DS, DeclaratorContext::ConditionContext);
@@ -1983,7 +1983,7 @@ void Parser::ParseCXXSimpleTypeSpecifier(DeclSpec &DS) {
 ///     type-specifier type-specifier-seq[opt]
 ///
 bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) {
-  ParseSpecifierQualifierList(DS, AS_none, DSC_type_specifier);
+  ParseSpecifierQualifierList(DS, AS_none, DeclSpecContext::DSC_type_specifier);
   DS.Finish(Actions, Actions.getASTContext().getPrintingPolicy());
   return false;
 }
@@ -2034,9 +2034,9 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
   TemplateTy Template;
   TemplateNameKind TNK = TNK_Non_template;
   switch (Id.getKind()) {
-  case UnqualifiedId::IK_Identifier:
-  case UnqualifiedId::IK_OperatorFunctionId:
-  case UnqualifiedId::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_Identifier:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
     if (AssumeTemplateId) {
       // We defer the injected-class-name checks until we've found whether
       // this template-id is used to form a nested-name-specifier or not.
@@ -2059,11 +2059,11 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
         // parse correctly as a template, so suggest the keyword 'template'
         // before 'getAs' and treat this as a dependent template name.
         std::string Name;
-        if (Id.getKind() == UnqualifiedId::IK_Identifier)
+        if (Id.getKind() == UnqualifiedIdKind::IK_Identifier)
           Name = Id.Identifier->getName();
         else {
           Name = "operator ";
-          if (Id.getKind() == UnqualifiedId::IK_OperatorFunctionId)
+          if (Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId)
             Name += getOperatorSpelling(Id.OperatorFunctionId.Operator);
           else
             Name += Id.Identifier->getName();
@@ -2080,7 +2080,7 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
     }
     break;
       
-  case UnqualifiedId::IK_ConstructorName: {
+  case UnqualifiedIdKind::IK_ConstructorName: {
     UnqualifiedId TemplateName;
     bool MemberOfUnknownSpecialization;
     TemplateName.setIdentifier(Name, NameLoc);
@@ -2091,7 +2091,7 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
     break;
   }
       
-  case UnqualifiedId::IK_DestructorName: {
+  case UnqualifiedIdKind::IK_DestructorName: {
     UnqualifiedId TemplateName;
     bool MemberOfUnknownSpecialization;
     TemplateName.setIdentifier(Name, NameLoc);
@@ -2130,18 +2130,20 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
                                true, LAngleLoc, TemplateArgs, RAngleLoc))
     return true;
   
-  if (Id.getKind() == UnqualifiedId::IK_Identifier ||
-      Id.getKind() == UnqualifiedId::IK_OperatorFunctionId ||
-      Id.getKind() == UnqualifiedId::IK_LiteralOperatorId) {
+  if (Id.getKind() == UnqualifiedIdKind::IK_Identifier ||
+      Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId ||
+      Id.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId) {
     // Form a parsed representation of the template-id to be stored in the
     // UnqualifiedId.
 
     // FIXME: Store name for literal operator too.
     IdentifierInfo *TemplateII =
-        Id.getKind() == UnqualifiedId::IK_Identifier ? Id.Identifier : nullptr;
-    OverloadedOperatorKind OpKind = Id.getKind() == UnqualifiedId::IK_Identifier
-                                        ? OO_None
-                                        : Id.OperatorFunctionId.Operator;
+        Id.getKind() == UnqualifiedIdKind::IK_Identifier ? Id.Identifier
+                                                         : nullptr;
+    OverloadedOperatorKind OpKind =
+        Id.getKind() == UnqualifiedIdKind::IK_Identifier
+            ? OO_None
+            : Id.OperatorFunctionId.Operator;
 
     TemplateIdAnnotation *TemplateId = TemplateIdAnnotation::Create(
         SS, TemplateKWLoc, Id.StartLocation, TemplateII, OpKind, Template, TNK,
@@ -2163,7 +2165,7 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS,
   if (Type.isInvalid())
     return true;
   
-  if (Id.getKind() == UnqualifiedId::IK_ConstructorName)
+  if (Id.getKind() == UnqualifiedIdKind::IK_ConstructorName)
     Id.setConstructorName(Type.get(), NameLoc, RAngleLoc);
   else
     Id.setDestructorName(Id.StartLocation, Type.get(), RAngleLoc);
@@ -2557,8 +2559,8 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext,
     // 
     //   template-id:
     //     operator-function-id < template-argument-list[opt] >
-    if ((Result.getKind() == UnqualifiedId::IK_OperatorFunctionId ||
-         Result.getKind() == UnqualifiedId::IK_LiteralOperatorId) &&
+    if ((Result.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId ||
+         Result.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId) &&
         (TemplateSpecified || Tok.is(tok::less)))
       return ParseUnqualifiedIdTemplateId(SS, TemplateKWLoc,
                                           nullptr, SourceLocation(),
diff --git a/lib/Parse/ParseObjc.cpp b/lib/Parse/ParseObjc.cpp
index a446fd7aa127..688376ca28e6 100644
--- a/lib/Parse/ParseObjc.cpp
+++ b/lib/Parse/ParseObjc.cpp
@@ -1265,9 +1265,9 @@ ParsedType Parser::ParseObjCTypeName(ObjCDeclSpec &DS,
     // Parse an abstract declarator.
     DeclSpec declSpec(AttrFactory);
     declSpec.setObjCQualifiers(&DS);
-    DeclSpecContext dsContext = DSC_normal;
+    DeclSpecContext dsContext = DeclSpecContext::DSC_normal;
     if (context == DeclaratorContext::ObjCResultContext)
-      dsContext = DSC_objc_method_result;
+      dsContext = DeclSpecContext::DSC_objc_method_result;
     ParseSpecifierQualifierList(declSpec, AS_none, dsContext);
     Declarator declarator(declSpec, context);
     ParseDeclarator(declarator);
diff --git a/lib/Parse/ParseOpenMP.cpp b/lib/Parse/ParseOpenMP.cpp
index 628c1f134ac5..f3edf7ca51f4 100644
--- a/lib/Parse/ParseOpenMP.cpp
+++ b/lib/Parse/ParseOpenMP.cpp
@@ -1205,11 +1205,13 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
                                      OpenMPClauseKind CKind, bool FirstClause) {
   OMPClause *Clause = nullptr;
   bool ErrorFound = false;
+  bool WrongDirective = false;
   // Check if clause is allowed for the given directive.
   if (CKind != OMPC_unknown && !isAllowedClauseForDirective(DKind, CKind)) {
     Diag(Tok, diag::err_omp_unexpected_clause) << getOpenMPClauseName(CKind)
                                                << getOpenMPDirectiveName(DKind);
     ErrorFound = true;
+    WrongDirective = true;
   }
 
   switch (CKind) {
@@ -1253,9 +1255,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     }
 
     if (CKind == OMPC_ordered && PP.LookAhead(/*N=*/0).isNot(tok::l_paren))
-      Clause = ParseOpenMPClause(CKind);
+      Clause = ParseOpenMPClause(CKind, WrongDirective);
     else
-      Clause = ParseOpenMPSingleExprClause(CKind);
+      Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
     break;
   case OMPC_default:
   case OMPC_proc_bind:
@@ -1270,7 +1272,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
       ErrorFound = true;
     }
 
-    Clause = ParseOpenMPSimpleClause(CKind);
+    Clause = ParseOpenMPSimpleClause(CKind, WrongDirective);
     break;
   case OMPC_schedule:
   case OMPC_dist_schedule:
@@ -1287,7 +1289,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     LLVM_FALLTHROUGH;
 
   case OMPC_if:
-    Clause = ParseOpenMPSingleExprWithArgClause(CKind);
+    Clause = ParseOpenMPSingleExprWithArgClause(CKind, WrongDirective);
     break;
   case OMPC_nowait:
   case OMPC_untied:
@@ -1310,7 +1312,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
       ErrorFound = true;
     }
 
-    Clause = ParseOpenMPClause(CKind);
+    Clause = ParseOpenMPClause(CKind, WrongDirective);
     break;
   case OMPC_private:
   case OMPC_firstprivate:
@@ -1330,7 +1332,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
   case OMPC_from:
   case OMPC_use_device_ptr:
   case OMPC_is_device_ptr:
-    Clause = ParseOpenMPVarListClause(DKind, CKind);
+    Clause = ParseOpenMPVarListClause(DKind, CKind, WrongDirective);
     break;
   case OMPC_unknown:
     Diag(Tok, diag::warn_omp_extra_tokens_at_eol)
@@ -1339,8 +1341,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     break;
   case OMPC_threadprivate:
   case OMPC_uniform:
-    Diag(Tok, diag::err_omp_unexpected_clause) << getOpenMPClauseName(CKind)
-                                               << getOpenMPDirectiveName(DKind);
+    if (!WrongDirective)
+      Diag(Tok, diag::err_omp_unexpected_clause)
+          << getOpenMPClauseName(CKind) << getOpenMPDirectiveName(DKind);
     SkipUntil(tok::comma, tok::annot_pragma_openmp_end, StopBeforeMatch);
     break;
   }
@@ -1400,7 +1403,8 @@ ExprResult Parser::ParseOpenMPParensExpr(StringRef ClauseName,
 ///    hint-clause:
 ///      'hint' '(' expression ')'
 ///
-OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind) {
+OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind,
+                                               bool ParseOnly) {
   SourceLocation Loc = ConsumeToken();
   SourceLocation LLoc = Tok.getLocation();
   SourceLocation RLoc;
@@ -1410,6 +1414,8 @@ OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind) {
   if (Val.isInvalid())
     return nullptr;
 
+  if (ParseOnly)
+    return nullptr;
   return Actions.ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, LLoc, RLoc);
 }
 
@@ -1421,7 +1427,8 @@ OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind) {
 ///    proc_bind-clause:
 ///         'proc_bind' '(' 'master' | 'close' | 'spread' ')
 ///
-OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind) {
+OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind,
+                                           bool ParseOnly) {
   SourceLocation Loc = Tok.getLocation();
   SourceLocation LOpen = ConsumeToken();
   // Parse '('.
@@ -1440,6 +1447,8 @@ OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind) {
   // Parse ')'.
   T.consumeClose();
 
+  if (ParseOnly)
+    return nullptr;
   return Actions.ActOnOpenMPSimpleClause(Kind, Type, TypeLoc, LOpen, Loc,
                                          Tok.getLocation());
 }
@@ -1470,10 +1479,12 @@ OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind) {
 ///    nogroup-clause:
 ///         'nogroup'
 ///
-OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind) {
+OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) {
   SourceLocation Loc = Tok.getLocation();
   ConsumeAnyToken();
 
+  if (ParseOnly)
+    return nullptr;
   return Actions.ActOnOpenMPClause(Kind, Loc, Tok.getLocation());
 }
 
@@ -1491,7 +1502,8 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind) {
 ///    defaultmap:
 ///      'defaultmap' '(' modifier ':' kind ')'
 ///
-OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind) {
+OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind,
+                                                      bool ParseOnly) {
   SourceLocation Loc = ConsumeToken();
   SourceLocation DelimLoc;
   // Parse '('.
@@ -1613,6 +1625,8 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind) {
   if (NeedAnExpression && Val.isInvalid())
     return nullptr;
 
+  if (ParseOnly)
+    return nullptr;
   return Actions.ActOnOpenMPSingleExprWithArgClause(
       Kind, Arg, Val.get(), Loc, T.getOpenLocation(), KLoc, DelimLoc,
       T.getCloseLocation());
@@ -1940,7 +1954,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
 ///  modifier(list)
 /// where modifier is 'val' (C) or 'ref', 'val' or 'uval'(C++).
 OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
-                                            OpenMPClauseKind Kind) {
+                                            OpenMPClauseKind Kind,
+                                            bool ParseOnly) {
   SourceLocation Loc = Tok.getLocation();
   SourceLocation LOpen = ConsumeToken();
   SmallVector<Expr *, 4> Vars;
@@ -1949,6 +1964,8 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
   if (ParseOpenMPVarList(DKind, Kind, Vars, Data))
     return nullptr;
 
+  if (ParseOnly)
+    return nullptr;
   return Actions.ActOnOpenMPVarListClause(
       Kind, Vars, Data.TailExpr, Loc, LOpen, Data.ColonLoc, Tok.getLocation(),
       Data.ReductionIdScopeSpec, Data.ReductionId, Data.DepKind, Data.LinKind,
diff --git a/lib/Parse/ParsePragma.cpp b/lib/Parse/ParsePragma.cpp
index 198d5c6e9cb0..8152176484df 100644
--- a/lib/Parse/ParsePragma.cpp
+++ b/lib/Parse/ParsePragma.cpp
@@ -95,6 +95,44 @@ struct PragmaFPContractHandler : public PragmaHandler {
                     Token &FirstToken) override;
 };
 
+// Pragma STDC implementations.
+
+/// PragmaSTDC_FENV_ACCESSHandler - "\#pragma STDC FENV_ACCESS ...".
+struct PragmaSTDC_FENV_ACCESSHandler : public PragmaHandler {
+  PragmaSTDC_FENV_ACCESSHandler() : PragmaHandler("FENV_ACCESS") {}
+
+  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
+                    Token &Tok) override {
+    tok::OnOffSwitch OOS;
+    if (PP.LexOnOffSwitch(OOS))
+     return;
+    if (OOS == tok::OOS_ON)
+      PP.Diag(Tok, diag::warn_stdc_fenv_access_not_supported);
+  }
+};
+
+/// PragmaSTDC_CX_LIMITED_RANGEHandler - "\#pragma STDC CX_LIMITED_RANGE ...".
+struct PragmaSTDC_CX_LIMITED_RANGEHandler : public PragmaHandler {
+  PragmaSTDC_CX_LIMITED_RANGEHandler() : PragmaHandler("CX_LIMITED_RANGE") {}
+
+  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
+                    Token &Tok) override {
+    tok::OnOffSwitch OOS;
+    PP.LexOnOffSwitch(OOS);
+  }
+};
+
+/// PragmaSTDC_UnknownHandler - "\#pragma STDC ...".
+struct PragmaSTDC_UnknownHandler : public PragmaHandler {
+  PragmaSTDC_UnknownHandler() = default;
+
+  void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
+                    Token &UnknownTok) override {
+    // C99 6.10.6p2, unknown forms are not allowed.
+    PP.Diag(UnknownTok, diag::ext_stdc_pragma_ignored);
+  }
+};
+
 struct PragmaFPHandler : public PragmaHandler {
   PragmaFPHandler() : PragmaHandler("fp") {}
   void HandlePragma(Preprocessor &PP, PragmaIntroducerKind Introducer,
@@ -233,6 +271,15 @@ void Parser::initializePragmaHandlers() {
   FPContractHandler.reset(new PragmaFPContractHandler());
   PP.AddPragmaHandler("STDC", FPContractHandler.get());
 
+  STDCFENVHandler.reset(new PragmaSTDC_FENV_ACCESSHandler());
+  PP.AddPragmaHandler("STDC", STDCFENVHandler.get());
+
+  STDCCXLIMITHandler.reset(new PragmaSTDC_CX_LIMITED_RANGEHandler());
+  PP.AddPragmaHandler("STDC", STDCCXLIMITHandler.get());
+
+  STDCUnknownHandler.reset(new PragmaSTDC_UnknownHandler());
+  PP.AddPragmaHandler("STDC", STDCUnknownHandler.get());
+
   PCSectionHandler.reset(new PragmaClangSectionHandler(Actions));
   PP.AddPragmaHandler("clang", PCSectionHandler.get());
 
@@ -371,6 +418,15 @@ void Parser::resetPragmaHandlers() {
   PP.RemovePragmaHandler("STDC", FPContractHandler.get());
   FPContractHandler.reset();
 
+  PP.RemovePragmaHandler("STDC", STDCFENVHandler.get());
+  STDCFENVHandler.reset();
+
+  PP.RemovePragmaHandler("STDC", STDCCXLIMITHandler.get());
+  STDCCXLIMITHandler.reset();
+
+  PP.RemovePragmaHandler("STDC", STDCUnknownHandler.get());
+  STDCUnknownHandler.reset();
+
   PP.RemovePragmaHandler("clang", OptimizeHandler.get());
   OptimizeHandler.reset();
 
diff --git a/lib/Parse/ParseTemplate.cpp b/lib/Parse/ParseTemplate.cpp
index 6c3018aa9a84..611c0779b160 100644
--- a/lib/Parse/ParseTemplate.cpp
+++ b/lib/Parse/ParseTemplate.cpp
@@ -271,7 +271,8 @@ Parser::ParseSingleDeclarationAfterTemplate(
     }
 
     if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation) {
-      if (DeclaratorInfo.getName().getKind() != UnqualifiedId::IK_TemplateId) {
+      if (DeclaratorInfo.getName().getKind() !=
+          UnqualifiedIdKind::IK_TemplateId) {
         // If the declarator-id is not a template-id, issue a diagnostic and
         // recover by ignoring the 'template' keyword.
         Diag(Tok, diag::err_template_defn_explicit_instantiation) << 0;
@@ -676,7 +677,7 @@ Parser::ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position) {
   // declarators (parts of declarators?) are accepted for parameters.
   DeclSpec DS(AttrFactory);
   ParseDeclarationSpecifiers(DS, ParsedTemplateInfo(), AS_none,
-                             DSC_template_param);
+                             DeclSpecContext::DSC_template_param);
 
   // Parse this as a typename.
   Declarator ParamDecl(DS, DeclaratorContext::TemplateParamContext);
@@ -1015,12 +1016,12 @@ bool Parser::AnnotateTemplateIdToken(TemplateTy Template, TemplateNameKind TNK,
     Tok.setKind(tok::annot_template_id);
     
     IdentifierInfo *TemplateII =
-        TemplateName.getKind() == UnqualifiedId::IK_Identifier
+        TemplateName.getKind() == UnqualifiedIdKind::IK_Identifier
             ? TemplateName.Identifier
             : nullptr;
 
     OverloadedOperatorKind OpKind =
-        TemplateName.getKind() == UnqualifiedId::IK_Identifier
+        TemplateName.getKind() == UnqualifiedIdKind::IK_Identifier
             ? OO_None
             : TemplateName.OperatorFunctionId.Operator;
 
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index 8b62f1211253..a6f966eda1b9 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp
@@ -919,12 +919,13 @@ Parser::ParseDeclOrFunctionDefInternal(ParsedAttributesWithRange &attrs,
                                        AccessSpecifier AS) {
   MaybeParseMicrosoftAttributes(DS.getAttributes());
   // Parse the common declaration-specifiers piece.
-  ParseDeclarationSpecifiers(DS, ParsedTemplateInfo(), AS, DSC_top_level);
+  ParseDeclarationSpecifiers(DS, ParsedTemplateInfo(), AS,
+                             DeclSpecContext::DSC_top_level);
 
   // If we had a free-standing type definition with a missing semicolon, we
   // may get this far before the problem becomes obvious.
-  if (DS.hasTagDefinition() &&
-      DiagnoseMissingSemiAfterTagDefinition(DS, AS, DSC_top_level))
+  if (DS.hasTagDefinition() && DiagnoseMissingSemiAfterTagDefinition(
+                                   DS, AS, DeclSpecContext::DSC_top_level))
     return nullptr;
 
   // C99 6.7.2.3p6: Handle "struct-or-union identifier;", "enum { X };"
diff --git a/lib/Rewrite/HTMLRewrite.cpp b/lib/Rewrite/HTMLRewrite.cpp
index 618c0179f100..3a086537b4a9 100644
--- a/lib/Rewrite/HTMLRewrite.cpp
+++ b/lib/Rewrite/HTMLRewrite.cpp
@@ -210,9 +210,9 @@ static void AddLineNumber(RewriteBuffer &RB, unsigned LineNo,
   SmallString<256> Str;
   llvm::raw_svector_ostream OS(Str);
 
-  OS << "<tr><td class=\"num\" id=\"LN"
-     << LineNo << "\">"
-     << LineNo << "</td><td class=\"line\">";
+  OS << "<tr class=\"codeline\" data-linenumber=\"" << LineNo << "\">"
+     << "<td class=\"num\" id=\"LN" << LineNo << "\">" << LineNo
+     << "</td><td class=\"line\">";
 
   if (B == E) { // Handle empty lines.
     OS << " </td></tr>";
@@ -263,7 +263,10 @@ void html::AddLineNumbers(Rewriter& R, FileID FID) {
   }
 
   // Add one big table tag that surrounds all of the code.
-  RB.InsertTextBefore(0, "<table class=\"code\">\n");
+  std::string s;
+  llvm::raw_string_ostream os(s);
+  os << "<table class=\"code\" data-fileid=\"" << FID.getHashValue() << "\">\n";
+  RB.InsertTextBefore(0, os.str());
   RB.InsertTextAfter(FileEnd - FileBeg, "</table>");
 }
 
@@ -285,78 +288,128 @@ void html::AddHeaderFooterInternalBuiltinCSS(Rewriter &R, FileID FID,
   if (!title.empty())
     os << "<title>" << html::EscapeText(title) << "</title>\n";
 
-  os << "<style type=\"text/css\">\n"
-      " body { color:#000000; background-color:#ffffff }\n"
-      " body { font-family:Helvetica, sans-serif; font-size:10pt }\n"
-      " h1 { font-size:14pt }\n"
-      " .FileName { margin-top: 5px; margin-bottom: 5px; display: inline; }\n"
-      " .FileNav { margin-left: 5px; margin-right: 5px; display: inline; }\n"
-      " .FileNav a { text-decoration:none; font-size: larger; }\n"
-      " .divider { margin-top: 30px; margin-bottom: 30px; height: 15px; }\n"
-      " .divider { background-color: gray; }\n"
-      " .code { border-collapse:collapse; width:100%; }\n"
-      " .code { font-family: \"Monospace\", monospace; font-size:10pt }\n"
-      " .code { line-height: 1.2em }\n"
-      " .comment { color: green; font-style: oblique }\n"
-      " .keyword { color: blue }\n"
-      " .string_literal { color: red }\n"
-      " .directive { color: darkmagenta }\n"
-      // Macro expansions.
-      " .expansion { display: none; }\n"
-      " .macro:hover .expansion { display: block; border: 2px solid #FF0000; "
-          "padding: 2px; background-color:#FFF0F0; font-weight: normal; "
-          "  -webkit-border-radius:5px;  -webkit-box-shadow:1px 1px 7px #000; "
-          "  border-radius:5px;  box-shadow:1px 1px 7px #000; "
-          "position: absolute; top: -1em; left:10em; z-index: 1 } \n"
-      " .macro { color: darkmagenta; background-color:LemonChiffon;"
-             // Macros are position: relative to provide base for expansions.
-             " position: relative }\n"
-      " .num { width:2.5em; padding-right:2ex; background-color:#eeeeee }\n"
-      " .num { text-align:right; font-size:8pt }\n"
-      " .num { color:#444444 }\n"
-      " .line { padding-left: 1ex; border-left: 3px solid #ccc }\n"
-      " .line { white-space: pre }\n"
-      " .msg { -webkit-box-shadow:1px 1px 7px #000 }\n"
-      " .msg { box-shadow:1px 1px 7px #000 }\n"
-      " .msg { -webkit-border-radius:5px }\n"
-      " .msg { border-radius:5px }\n"
-      " .msg { font-family:Helvetica, sans-serif; font-size:8pt }\n"
-      " .msg { float:left }\n"
-      " .msg { padding:0.25em 1ex 0.25em 1ex }\n"
-      " .msg { margin-top:10px; margin-bottom:10px }\n"
-      " .msg { font-weight:bold }\n"
-      " .msg { max-width:60em; word-wrap: break-word; white-space: pre-wrap }\n"
-      " .msgT { padding:0x; spacing:0x }\n"
-      " .msgEvent { background-color:#fff8b4; color:#000000 }\n"
-      " .msgControl { background-color:#bbbbbb; color:#000000 }\n"
-      " .msgNote { background-color:#ddeeff; color:#000000 }\n"
-      " .mrange { background-color:#dfddf3 }\n"
-      " .mrange { border-bottom:1px solid #6F9DBE }\n"
-      " .PathIndex { font-weight: bold; padding:0px 5px; "
-        "margin-right:5px; }\n"
-      " .PathIndex { -webkit-border-radius:8px }\n"
-      " .PathIndex { border-radius:8px }\n"
-      " .PathIndexEvent { background-color:#bfba87 }\n"
-      " .PathIndexControl { background-color:#8c8c8c }\n"
-      " .PathNav a { text-decoration:none; font-size: larger }\n"
-      " .CodeInsertionHint { font-weight: bold; background-color: #10dd10 }\n"
-      " .CodeRemovalHint { background-color:#de1010 }\n"
-      " .CodeRemovalHint { border-bottom:1px solid #6F9DBE }\n"
-      " .selected{ background-color:orange !important; }\n"
-      " table.simpletable {\n"
-      "   padding: 5px;\n"
-      "   font-size:12pt;\n"
-      "   margin:20px;\n"
-      "   border-collapse: collapse; border-spacing: 0px;\n"
-      " }\n"
-      " td.rowname {\n"
-      "   text-align: right;\n"
-      "   vertical-align: top;\n"
-      "   font-weight: bold;\n"
-      "   color:#444444;\n"
-      "   padding-right:2ex;\n"
-      " }\n"
-      "</style>\n</head>\n<body>";
+  os << R"<<<(
+<style type="text/css">
+body { color:#000000; background-color:#ffffff }
+body { font-family:Helvetica, sans-serif; font-size:10pt }
+h1 { font-size:14pt }
+.FileName { margin-top: 5px; margin-bottom: 5px; display: inline; }
+.FileNav { margin-left: 5px; margin-right: 5px; display: inline; }
+.FileNav a { text-decoration:none; font-size: larger; }
+.divider { margin-top: 30px; margin-bottom: 30px; height: 15px; }
+.divider { background-color: gray; }
+.code { border-collapse:collapse; width:100%; }
+.code { font-family: "Monospace", monospace; font-size:10pt }
+.code { line-height: 1.2em }
+.comment { color: green; font-style: oblique }
+.keyword { color: blue }
+.string_literal { color: red }
+.directive { color: darkmagenta }
+/* Macro expansions. */
+.expansion { display: none; }
+.macro:hover .expansion {
+  display: block;
+  border: 2px solid #FF0000;
+  padding: 2px;
+  background-color:#FFF0F0;
+  font-weight: normal;
+  -webkit-border-radius:5px;
+  -webkit-box-shadow:1px 1px 7px #000;
+  border-radius:5px;
+  box-shadow:1px 1px 7px #000;
+  position: absolute;
+  top: -1em;
+  left:10em;
+  z-index: 1
+}
+
+#tooltiphint {
+  position: fixed;
+  width: 50em;
+  margin-left: -25em;
+  left: 50%;
+  padding: 10px;
+  border: 1px solid #b0b0b0;
+  border-radius: 2px;
+  box-shadow: 1px 1px 7px black;
+  background-color: #c0c0c0;
+  z-index: 2;
+}
+.macro {
+  color: darkmagenta;
+  background-color:LemonChiffon;
+  /* Macros are position: relative to provide base for expansions. */
+  position: relative;
+}
+
+.num { width:2.5em; padding-right:2ex; background-color:#eeeeee }
+.num { text-align:right; font-size:8pt }
+.num { color:#444444 }
+.line { padding-left: 1ex; border-left: 3px solid #ccc }
+.line { white-space: pre }
+.msg { -webkit-box-shadow:1px 1px 7px #000 }
+.msg { box-shadow:1px 1px 7px #000 }
+.msg { -webkit-border-radius:5px }
+.msg { border-radius:5px }
+.msg { font-family:Helvetica, sans-serif; font-size:8pt }
+.msg { float:left }
+.msg { padding:0.25em 1ex 0.25em 1ex }
+.msg { margin-top:10px; margin-bottom:10px }
+.msg { font-weight:bold }
+.msg { max-width:60em; word-wrap: break-word; white-space: pre-wrap }
+.msgT { padding:0x; spacing:0x }
+.msgEvent { background-color:#fff8b4; color:#000000 }
+.msgControl { background-color:#bbbbbb; color:#000000 }
+.msgNote { background-color:#ddeeff; color:#000000 }
+.mrange { background-color:#dfddf3 }
+.mrange { border-bottom:1px solid #6F9DBE }
+.PathIndex { font-weight: bold; padding:0px 5px; margin-right:5px; }
+.PathIndex { -webkit-border-radius:8px }
+.PathIndex { border-radius:8px }
+.PathIndexEvent { background-color:#bfba87 }
+.PathIndexControl { background-color:#8c8c8c }
+.PathNav a { text-decoration:none; font-size: larger }
+.CodeInsertionHint { font-weight: bold; background-color: #10dd10 }
+.CodeRemovalHint { background-color:#de1010 }
+.CodeRemovalHint { border-bottom:1px solid #6F9DBE }
+.selected{ background-color:orange !important; }
+
+table.simpletable {
+  padding: 5px;
+  font-size:12pt;
+  margin:20px;
+  border-collapse: collapse; border-spacing: 0px;
+}
+td.rowname {
+  text-align: right;
+  vertical-align: top;
+  font-weight: bold;
+  color:#444444;
+  padding-right:2ex;
+}
+
+/* Hidden text. */
+input.spoilerhider + label {
+  cursor: pointer;
+  text-decoration: underline;
+  display: block;
+}
+input.spoilerhider {
+ display: none;
+}
+input.spoilerhider ~ .spoiler {
+  overflow: hidden;
+  margin: 10px auto 0;
+  height: 0;
+  opacity: 0;
+}
+input.spoilerhider:checked + label + .spoiler{
+  height: auto;
+  opacity: 1;
+}
+</style>
+</head>
+<body>)<<<";
 
   // Generate header
   R.InsertTextBefore(StartLoc, os.str());
diff --git a/lib/Sema/AnalysisBasedWarnings.cpp b/lib/Sema/AnalysisBasedWarnings.cpp
index 0033edf326ac..e67f94c6383e 100644
--- a/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/lib/Sema/AnalysisBasedWarnings.cpp
@@ -287,48 +287,21 @@ enum ThrowState {
   FoundPathWithNoThrowOutFunction,
 };
 
-static bool isThrowCaught(const CXXThrowExpr *Throw,
-                          const CXXCatchStmt *Catch) {
-  const Type *CaughtType = Catch->getCaughtType().getTypePtrOrNull();
-  if (!CaughtType)
-    return true;
-  const Type *ThrowType = nullptr;
-  if (Throw->getSubExpr())
-    ThrowType = Throw->getSubExpr()->getType().getTypePtrOrNull();
-  if (!ThrowType)
-    return false;
-  if (ThrowType->isReferenceType())
-    ThrowType = ThrowType->castAs<ReferenceType>()
-                    ->getPointeeType()
-                    ->getUnqualifiedDesugaredType();
-  if (CaughtType->isReferenceType())
-    CaughtType = CaughtType->castAs<ReferenceType>()
-                     ->getPointeeType()
-                     ->getUnqualifiedDesugaredType();
-  if (ThrowType->isPointerType() && CaughtType->isPointerType()) {
-    ThrowType = ThrowType->getPointeeType()->getUnqualifiedDesugaredType();
-    CaughtType = CaughtType->getPointeeType()->getUnqualifiedDesugaredType();
-  }
-  if (CaughtType == ThrowType)
-    return true;
-  const CXXRecordDecl *CaughtAsRecordType =
-      CaughtType->getAsCXXRecordDecl();
-  const CXXRecordDecl *ThrowTypeAsRecordType = ThrowType->getAsCXXRecordDecl();
-  if (CaughtAsRecordType && ThrowTypeAsRecordType)
-    return ThrowTypeAsRecordType->isDerivedFrom(CaughtAsRecordType);
-  return false;
-}
-
-static bool isThrowCaughtByHandlers(const CXXThrowExpr *CE,
+static bool isThrowCaughtByHandlers(Sema &S,
+                                    const CXXThrowExpr *CE,
                                     const CXXTryStmt *TryStmt) {
   for (unsigned H = 0, E = TryStmt->getNumHandlers(); H < E; ++H) {
-    if (isThrowCaught(CE, TryStmt->getHandler(H)))
+    QualType Caught = TryStmt->getHandler(H)->getCaughtType();
+    if (Caught.isNull() || // catch (...) catches everything
+        (CE->getSubExpr() && // throw; is only caught by ...
+         S.handlerCanCatch(Caught, CE->getSubExpr()->getType())))
       return true;
   }
   return false;
 }
 
-static bool doesThrowEscapePath(CFGBlock Block, SourceLocation &OpLoc) {
+static bool doesThrowEscapePath(Sema &S, CFGBlock Block,
+                                SourceLocation &OpLoc) {
   for (const auto &B : Block) {
     if (B.getKind() != CFGElement::Statement)
       continue;
@@ -342,7 +315,7 @@ static bool doesThrowEscapePath(CFGBlock Block, SourceLocation &OpLoc) {
         continue;
       if (const auto *Terminator =
               dyn_cast_or_null<CXXTryStmt>(I->getTerminator()))
-        if (isThrowCaughtByHandlers(CE, Terminator))
+        if (isThrowCaughtByHandlers(S, CE, Terminator))
           return false;
     }
     return true;
@@ -350,8 +323,8 @@ static bool doesThrowEscapePath(CFGBlock Block, SourceLocation &OpLoc) {
   return false;
 }
 
-static bool hasThrowOutNonThrowingFunc(SourceLocation &OpLoc, CFG *BodyCFG) {
-
+static bool hasThrowOutNonThrowingFunc(Sema &S, SourceLocation &OpLoc,
+                                       CFG *BodyCFG) {
   unsigned ExitID = BodyCFG->getExit().getBlockID();
 
   SmallVector<ThrowState, 16> States(BodyCFG->getNumBlockIDs(),
@@ -369,7 +342,7 @@ static bool hasThrowOutNonThrowingFunc(SourceLocation &OpLoc, CFG *BodyCFG) {
       if (ExitID == ID)
         continue;
 
-      if (doesThrowEscapePath(*CurBlock, OpLoc))
+      if (doesThrowEscapePath(S, *CurBlock, OpLoc))
         CurState = FoundPathForThrow;
     }
 
@@ -419,7 +392,7 @@ static void checkThrowInNonThrowingFunc(Sema &S, const FunctionDecl *FD,
   if (BodyCFG->getExit().pred_empty())
     return;
   SourceLocation OpLoc;
-  if (hasThrowOutNonThrowingFunc(OpLoc, BodyCFG))
+  if (hasThrowOutNonThrowingFunc(S, OpLoc, BodyCFG))
     EmitDiagForCXXThrowInNonThrowingFunc(S, OpLoc, FD);
 }
 
diff --git a/lib/Sema/CoroutineStmtBuilder.h b/lib/Sema/CoroutineStmtBuilder.h
index 33a368d92ff4..3b7ee7afcb70 100644
--- a/lib/Sema/CoroutineStmtBuilder.h
+++ b/lib/Sema/CoroutineStmtBuilder.h
@@ -51,9 +51,6 @@ class CoroutineStmtBuilder : public CoroutineBodyStmt::CtorArgs {
   /// name lookup.
   bool buildDependentStatements();
 
-  /// \brief Build just parameter moves. To use for rebuilding in TreeTransform.
-  bool buildParameterMoves();
-
   bool isInvalid() const { return !this->IsValid; }
 
 private:
@@ -65,7 +62,6 @@ class CoroutineStmtBuilder : public CoroutineBodyStmt::CtorArgs {
   bool makeReturnObject();
   bool makeGroDeclAndReturnStmt();
   bool makeReturnOnAllocFailure();
-  bool makeParamMoves();
 };
 
 } // end namespace clang
diff --git a/lib/Sema/DeclSpec.cpp b/lib/Sema/DeclSpec.cpp
index b537b95baf22..2fad5a18ba6b 100644
--- a/lib/Sema/DeclSpec.cpp
+++ b/lib/Sema/DeclSpec.cpp
@@ -30,7 +30,7 @@ using namespace clang;
 
 void UnqualifiedId::setTemplateId(TemplateIdAnnotation *TemplateId) {
   assert(TemplateId && "NULL template-id annotation?");
-  Kind = IK_TemplateId;
+  Kind = UnqualifiedIdKind::IK_TemplateId;
   this->TemplateId = TemplateId;
   StartLocation = TemplateId->TemplateNameLoc;
   EndLocation = TemplateId->RAngleLoc;
@@ -38,7 +38,7 @@ void UnqualifiedId::setTemplateId(TemplateIdAnnotation *TemplateId) {
 
 void UnqualifiedId::setConstructorTemplateId(TemplateIdAnnotation *TemplateId) {
   assert(TemplateId && "NULL template-id annotation?");
-  Kind = IK_ConstructorTemplateId;
+  Kind = UnqualifiedIdKind::IK_ConstructorTemplateId;
   this->TemplateId = TemplateId;
   StartLocation = TemplateId->TemplateNameLoc;
   EndLocation = TemplateId->RAngleLoc;
@@ -389,14 +389,14 @@ bool Declarator::isDeclarationOfFunction() const {
 bool Declarator::isStaticMember() {
   assert(getContext() == DeclaratorContext::MemberContext);
   return getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static ||
-         (getName().Kind == UnqualifiedId::IK_OperatorFunctionId &&
+         (getName().Kind == UnqualifiedIdKind::IK_OperatorFunctionId &&
           CXXMethodDecl::isStaticOverloadedOperator(
               getName().OperatorFunctionId.Operator));
 }
 
 bool Declarator::isCtorOrDtor() {
-  return (getName().getKind() == UnqualifiedId::IK_ConstructorName) ||
-         (getName().getKind() == UnqualifiedId::IK_DestructorName);
+  return (getName().getKind() == UnqualifiedIdKind::IK_ConstructorName) ||
+         (getName().getKind() == UnqualifiedIdKind::IK_DestructorName);
 }
 
 bool DeclSpec::hasTagDefinition() const {
@@ -1281,7 +1281,7 @@ bool DeclSpec::isMissingDeclaratorOk() {
 void UnqualifiedId::setOperatorFunctionId(SourceLocation OperatorLoc, 
                                           OverloadedOperatorKind Op,
                                           SourceLocation SymbolLocations[3]) {
-  Kind = IK_OperatorFunctionId;
+  Kind = UnqualifiedIdKind::IK_OperatorFunctionId;
   StartLocation = OperatorLoc;
   EndLocation = OperatorLoc;
   OperatorFunctionId.Operator = Op;
diff --git a/lib/Sema/MultiplexExternalSemaSource.cpp b/lib/Sema/MultiplexExternalSemaSource.cpp
index 77ace0cfa579..46238fb3e40b 100644
--- a/lib/Sema/MultiplexExternalSemaSource.cpp
+++ b/lib/Sema/MultiplexExternalSemaSource.cpp
@@ -164,6 +164,13 @@ void MultiplexExternalSemaSource::PrintStats() {
     Sources[i]->PrintStats();
 }
 
+Module *MultiplexExternalSemaSource::getModule(unsigned ID) {
+  for (size_t i = 0; i < Sources.size(); ++i)
+    if (auto M = Sources[i]->getModule(ID))
+      return M;
+  return nullptr;
+}
+
 bool MultiplexExternalSemaSource::layoutRecordType(const RecordDecl *Record,
                                                    uint64_t &Size, 
                                                    uint64_t &Alignment,
diff --git a/lib/Sema/Scope.cpp b/lib/Sema/Scope.cpp
index ae5b181c6728..eae5a328bfa2 100644
--- a/lib/Sema/Scope.cpp
+++ b/lib/Sema/Scope.cpp
@@ -143,72 +143,43 @@ void Scope::dumpImpl(raw_ostream &OS) const {
   if (HasFlags)
     OS << "Flags: ";
 
-  while (Flags) {
-    if (Flags & FnScope) {
-      OS << "FnScope";
-      Flags &= ~FnScope;
-    } else if (Flags & BreakScope) {
-      OS << "BreakScope";
-      Flags &= ~BreakScope;
-    } else if (Flags & ContinueScope) {
-      OS << "ContinueScope";
-      Flags &= ~ContinueScope;
-    } else if (Flags & DeclScope) {
-      OS << "DeclScope";
-      Flags &= ~DeclScope;
-    } else if (Flags & ControlScope) {
-      OS << "ControlScope";
-      Flags &= ~ControlScope;
-    } else if (Flags & ClassScope) {
-      OS << "ClassScope";
-      Flags &= ~ClassScope;
-    } else if (Flags & BlockScope) {
-      OS << "BlockScope";
-      Flags &= ~BlockScope;
-    } else if (Flags & TemplateParamScope) {
-      OS << "TemplateParamScope";
-      Flags &= ~TemplateParamScope;
-    } else if (Flags & FunctionPrototypeScope) {
-      OS << "FunctionPrototypeScope";
-      Flags &= ~FunctionPrototypeScope;
-    } else if (Flags & FunctionDeclarationScope) {
-      OS << "FunctionDeclarationScope";
-      Flags &= ~FunctionDeclarationScope;
-    } else if (Flags & AtCatchScope) {
-      OS << "AtCatchScope";
-      Flags &= ~AtCatchScope;
-    } else if (Flags & ObjCMethodScope) {
-      OS << "ObjCMethodScope";
-      Flags &= ~ObjCMethodScope;
-    } else if (Flags & SwitchScope) {
-      OS << "SwitchScope";
-      Flags &= ~SwitchScope;
-    } else if (Flags & TryScope) {
-      OS << "TryScope";
-      Flags &= ~TryScope;
-    } else if (Flags & FnTryCatchScope) {
-      OS << "FnTryCatchScope";
-      Flags &= ~FnTryCatchScope;
-    } else if (Flags & SEHTryScope) {
-      OS << "SEHTryScope";
-      Flags &= ~SEHTryScope;
-    } else if (Flags & SEHExceptScope) {
-      OS << "SEHExceptScope";
-      Flags &= ~SEHExceptScope;
-    } else if (Flags & OpenMPDirectiveScope) {
-      OS << "OpenMPDirectiveScope";
-      Flags &= ~OpenMPDirectiveScope;
-    } else if (Flags & OpenMPLoopDirectiveScope) {
-      OS << "OpenMPLoopDirectiveScope";
-      Flags &= ~OpenMPLoopDirectiveScope;
-    } else if (Flags & OpenMPSimdDirectiveScope) {
-      OS << "OpenMPSimdDirectiveScope";
-      Flags &= ~OpenMPSimdDirectiveScope;
+  std::pair<unsigned, const char *> FlagInfo[] = {
+      {FnScope, "FnScope"},
+      {BreakScope, "BreakScope"},
+      {ContinueScope, "ContinueScope"},
+      {DeclScope, "DeclScope"},
+      {ControlScope, "ControlScope"},
+      {ClassScope, "ClassScope"},
+      {BlockScope, "BlockScope"},
+      {TemplateParamScope, "TemplateParamScope"},
+      {FunctionPrototypeScope, "FunctionPrototypeScope"},
+      {FunctionDeclarationScope, "FunctionDeclarationScope"},
+      {AtCatchScope, "AtCatchScope"},
+      {ObjCMethodScope, "ObjCMethodScope"},
+      {SwitchScope, "SwitchScope"},
+      {TryScope, "TryScope"},
+      {FnTryCatchScope, "FnTryCatchScope"},
+      {OpenMPDirectiveScope, "OpenMPDirectiveScope"},
+      {OpenMPLoopDirectiveScope, "OpenMPLoopDirectiveScope"},
+      {OpenMPSimdDirectiveScope, "OpenMPSimdDirectiveScope"},
+      {EnumScope, "EnumScope"},
+      {SEHTryScope, "SEHTryScope"},
+      {SEHExceptScope, "SEHExceptScope"},
+      {SEHFilterScope, "SEHFilterScope"},
+      {CompoundStmtScope, "CompoundStmtScope"},
+      {ClassInheritanceScope, "ClassInheritanceScope"}};
+
+  for (auto Info : FlagInfo) {
+    if (Flags & Info.first) {
+      OS << Info.second;
+      Flags &= ~Info.first;
+      if (Flags)
+        OS << " | ";
     }
-
-    if (Flags)
-      OS << " | ";
   }
+
+  assert(Flags == 0 && "Unknown scope flags");
+
   if (HasFlags)
     OS << '\n';
 
diff --git a/lib/Sema/ScopeInfo.cpp b/lib/Sema/ScopeInfo.cpp
index b309a36a30a3..62a83ccb70aa 100644
--- a/lib/Sema/ScopeInfo.cpp
+++ b/lib/Sema/ScopeInfo.cpp
@@ -43,6 +43,7 @@ void FunctionScopeInfo::Clear() {
   // Coroutine state
   FirstCoroutineStmtLoc = SourceLocation();
   CoroutinePromise = nullptr;
+  CoroutineParameterMoves.clear();
   NeedsCoroutineSuspends = true;
   CoroutineSuspends.first = nullptr;
   CoroutineSuspends.second = nullptr;
diff --git a/lib/Sema/Sema.cpp b/lib/Sema/Sema.cpp
index 4e57e5ef81c6..e07f60989d8f 100644
--- a/lib/Sema/Sema.cpp
+++ b/lib/Sema/Sema.cpp
@@ -1647,6 +1647,12 @@ static void noteOverloads(Sema &S, const UnresolvedSetImpl &Overloads,
     }
 
     NamedDecl *Fn = (*It)->getUnderlyingDecl();
+    // Don't print overloads for non-default multiversioned functions.
+    if (const auto *FD = Fn->getAsFunction()) {
+      if (FD->isMultiVersion() &&
+          !FD->getAttr<TargetAttr>()->isDefaultVersion())
+        continue;
+    }
     S.Diag(Fn->getLocation(), diag::note_possible_target_of_call);
     ++ShownOverloads;
   }
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index 803f87b3c568..3a6988c60261 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -1353,6 +1353,7 @@ bool Sema::CheckNeonBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   switch (BuiltinID) {
 #define GET_NEON_OVERLOAD_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_OVERLOAD_CHECK
   }
 
@@ -1404,6 +1405,7 @@ bool Sema::CheckNeonBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
     return false;
 #define GET_NEON_IMMEDIATE_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_IMMEDIATE_CHECK
   }
 
@@ -2361,13 +2363,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_cmpss_mask:
     i = 2; l = 0; u = 31;
     break;
-  case X86::BI__builtin_ia32_xabort:
-    i = 0; l = -128; u = 255;
-    break;
-  case X86::BI__builtin_ia32_pshufw:
-  case X86::BI__builtin_ia32_aeskeygenassist128:
-    i = 1; l = -128; u = 255;
-    break;
   case X86::BI__builtin_ia32_vcvtps2ph:
   case X86::BI__builtin_ia32_vcvtps2ph_mask:
   case X86::BI__builtin_ia32_vcvtps2ph256:
@@ -2405,27 +2400,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_fpclassss_mask:
     i = 1; l = 0; u = 255;
     break;
-  case X86::BI__builtin_ia32_palignr:
-  case X86::BI__builtin_ia32_insertps128:
-  case X86::BI__builtin_ia32_dpps:
-  case X86::BI__builtin_ia32_dppd:
-  case X86::BI__builtin_ia32_dpps256:
-  case X86::BI__builtin_ia32_mpsadbw128:
-  case X86::BI__builtin_ia32_mpsadbw256:
-  case X86::BI__builtin_ia32_pcmpistrm128:
-  case X86::BI__builtin_ia32_pcmpistri128:
-  case X86::BI__builtin_ia32_pcmpistria128:
-  case X86::BI__builtin_ia32_pcmpistric128:
-  case X86::BI__builtin_ia32_pcmpistrio128:
-  case X86::BI__builtin_ia32_pcmpistris128:
-  case X86::BI__builtin_ia32_pcmpistriz128:
-  case X86::BI__builtin_ia32_pclmulqdq128:
-  case X86::BI__builtin_ia32_vperm2f128_pd256:
-  case X86::BI__builtin_ia32_vperm2f128_ps256:
-  case X86::BI__builtin_ia32_vperm2f128_si256:
-  case X86::BI__builtin_ia32_permti256:
-    i = 2; l = -128; u = 255;
-    break;
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
   case X86::BI__builtin_ia32_palignr512_mask:
@@ -2438,6 +2412,24 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_dbpsadbw128_mask:
   case X86::BI__builtin_ia32_dbpsadbw256_mask:
   case X86::BI__builtin_ia32_dbpsadbw512_mask:
+  case X86::BI__builtin_ia32_vpshldd128_mask:
+  case X86::BI__builtin_ia32_vpshldd256_mask:
+  case X86::BI__builtin_ia32_vpshldd512_mask:
+  case X86::BI__builtin_ia32_vpshldq128_mask:
+  case X86::BI__builtin_ia32_vpshldq256_mask:
+  case X86::BI__builtin_ia32_vpshldq512_mask:
+  case X86::BI__builtin_ia32_vpshldw128_mask:
+  case X86::BI__builtin_ia32_vpshldw256_mask:
+  case X86::BI__builtin_ia32_vpshldw512_mask:
+  case X86::BI__builtin_ia32_vpshrdd128_mask:
+  case X86::BI__builtin_ia32_vpshrdd256_mask:
+  case X86::BI__builtin_ia32_vpshrdd512_mask:
+  case X86::BI__builtin_ia32_vpshrdq128_mask:
+  case X86::BI__builtin_ia32_vpshrdq256_mask:
+  case X86::BI__builtin_ia32_vpshrdq512_mask:
+  case X86::BI__builtin_ia32_vpshrdw128_mask:
+  case X86::BI__builtin_ia32_vpshrdw256_mask:
+  case X86::BI__builtin_ia32_vpshrdw512_mask:
     i = 2; l = 0; u = 255;
     break;
   case X86::BI__builtin_ia32_fixupimmpd512_mask:
@@ -2480,15 +2472,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_scatterpfqps:
     i = 4; l = 2; u = 3;
     break;
-  case X86::BI__builtin_ia32_pcmpestrm128:
-  case X86::BI__builtin_ia32_pcmpestri128:
-  case X86::BI__builtin_ia32_pcmpestria128:
-  case X86::BI__builtin_ia32_pcmpestric128:
-  case X86::BI__builtin_ia32_pcmpestrio128:
-  case X86::BI__builtin_ia32_pcmpestris128:
-  case X86::BI__builtin_ia32_pcmpestriz128:
-    i = 4; l = -128; u = 255;
-    break;
   case X86::BI__builtin_ia32_rndscalesd_round_mask:
   case X86::BI__builtin_ia32_rndscaless_round_mask:
     i = 4; l = 0; u = 255;
diff --git a/lib/Sema/SemaCodeComplete.cpp b/lib/Sema/SemaCodeComplete.cpp
index 2acc896d53d7..8fffa465ec9b 100644
--- a/lib/Sema/SemaCodeComplete.cpp
+++ b/lib/Sema/SemaCodeComplete.cpp
@@ -318,6 +318,11 @@ namespace {
     /// \brief Ignore this declaration, if it is seen again.
     void Ignore(const Decl *D) { AllDeclsFound.insert(D->getCanonicalDecl()); }
 
+    /// \brief Add a visited context.
+    void addVisitedContext(DeclContext *Ctx) {
+      CompletionContext.addVisitedContext(Ctx);
+    }
+
     /// \name Name lookup predicates
     ///
     /// These predicates can be passed to the name lookup functions to filter the
@@ -838,6 +843,12 @@ void ResultBuilder::MaybeAddConstructorResults(Result R) {
   }
 }
 
+static bool isConstructor(const Decl *ND) {
+  if (const auto *Tmpl = dyn_cast<FunctionTemplateDecl>(ND))
+    ND = Tmpl->getTemplatedDecl();
+  return isa<CXXConstructorDecl>(ND);
+}
+
 void ResultBuilder::MaybeAddResult(Result R, DeclContext *CurContext) {
   assert(!ShadowMaps.empty() && "Must enter into a results scope");
   
@@ -865,7 +876,7 @@ void ResultBuilder::MaybeAddResult(Result R, DeclContext *CurContext) {
     return;
       
   // C++ constructors are never found by name lookup.
-  if (isa<CXXConstructorDecl>(R.Declaration))
+  if (isConstructor(R.Declaration))
     return;
 
   ShadowMap &SMap = ShadowMaps.back();
@@ -978,7 +989,7 @@ void ResultBuilder::AddResult(Result R, DeclContext *CurContext,
     return;
   
   // C++ constructors are never found by name lookup.
-  if (isa<CXXConstructorDecl>(R.Declaration))
+  if (isConstructor(R.Declaration))
     return;
 
   if (Hiding && CheckHiddenResult(R, CurContext, Hiding))
@@ -1280,7 +1291,7 @@ namespace {
   class CodeCompletionDeclConsumer : public VisibleDeclConsumer {
     ResultBuilder &Results;
     DeclContext *CurContext;
-    
+
   public:
     CodeCompletionDeclConsumer(ResultBuilder &Results, DeclContext *CurContext)
       : Results(Results), CurContext(CurContext) { }
@@ -1295,6 +1306,10 @@ namespace {
                                    false, Accessible);
       Results.AddResult(Result, CurContext, Hiding, InBaseClass);
     }
+
+    void EnteredContext(DeclContext* Ctx) override {
+      Results.addVisitedContext(Ctx);
+    }
   };
 }
 
@@ -2136,7 +2151,7 @@ static void AddResultTypeChunk(ASTContext &Context,
 
   // Skip constructors and conversion functions, which have their return types
   // built into their names.
-  if (isa<CXXConstructorDecl>(ND) || isa<CXXConversionDecl>(ND))
+  if (isConstructor(ND) || isa<CXXConversionDecl>(ND))
     return;
 
   // Determine the type of the declaration (if it has a type).
@@ -3524,7 +3539,8 @@ void Sema::CodeCompleteOrdinaryName(Scope *S,
   
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
 
   AddOrdinaryNameResults(CompletionContext, S, *this, Results);
   Results.ExitScope();
@@ -3599,7 +3615,8 @@ void Sema::CodeCompleteDeclSpec(Scope *S, DeclSpec &DS,
       Results.setFilter(&ResultBuilder::IsImpossibleToSatisfy);
       CodeCompletionDeclConsumer Consumer(Results, CurContext);
       LookupVisibleDecls(S, LookupNestedNameSpecifierName, Consumer,
-                         CodeCompleter->includeGlobals());
+                         CodeCompleter->includeGlobals(),
+                         CodeCompleter->loadExternal());
       Results.setFilter(nullptr);
     }
   }
@@ -3669,8 +3686,9 @@ void Sema::CodeCompleteExpression(Scope *S,
   
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
-  
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
+
   Results.EnterNewScope();
   AddOrdinaryNameResults(PCC_Expression, S, *this, Results);
   Results.ExitScope();
@@ -3939,7 +3957,8 @@ static void AddRecordMembersCompletionResults(Sema &SemaRef,
   CodeCompletionDeclConsumer Consumer(Results, SemaRef.CurContext);
   SemaRef.LookupVisibleDecls(RD, Sema::LookupMemberName, Consumer,
                              SemaRef.CodeCompleter->includeGlobals(),
-                             /*IncludeDependentBases=*/true);
+                             /*IncludeDependentBases=*/true,
+                             SemaRef.CodeCompleter->loadExternal());
 
   if (SemaRef.getLangOpts().CPlusPlus) {
     if (!Results.empty()) {
@@ -4049,8 +4068,9 @@ void Sema::CodeCompleteMemberReferenceExpr(Scope *S, Expr *Base,
     if (Class) {
       CodeCompletionDeclConsumer Consumer(Results, CurContext);
       Results.setFilter(&ResultBuilder::IsObjCIvar);
-      LookupVisibleDecls(Class, LookupMemberName, Consumer,
-                         CodeCompleter->includeGlobals());
+      LookupVisibleDecls(
+          Class, LookupMemberName, Consumer, CodeCompleter->includeGlobals(),
+          /*IncludeDependentBases=*/false, CodeCompleter->loadExternal());
     }
   }
   
@@ -4124,12 +4144,15 @@ void Sema::CodeCompleteTag(Scope *S, unsigned TagSpec) {
   // First pass: look for tags.
   Results.setFilter(Filter);
   LookupVisibleDecls(S, LookupTagName, Consumer,
-                     CodeCompleter->includeGlobals());
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
 
   if (CodeCompleter->includeGlobals()) {
     // Second pass: look for nested name specifiers.
     Results.setFilter(&ResultBuilder::IsNestedNameSpecifier);
-    LookupVisibleDecls(S, LookupNestedNameSpecifierName, Consumer);
+    LookupVisibleDecls(S, LookupNestedNameSpecifierName, Consumer,
+                       CodeCompleter->includeGlobals(),
+                       CodeCompleter->loadExternal());
   }
   
   HandleCodeCompleteResults(this, CodeCompleter, Results.getCompletionContext(),
@@ -4540,8 +4563,9 @@ void Sema::CodeCompleteAfterIf(Scope *S) {
   
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
-  
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
+
   AddOrdinaryNameResults(PCC_Statement, S, *this, Results);
   
   // "else" block
@@ -4649,7 +4673,8 @@ void Sema::CodeCompleteQualifiedId(Scope *S, CXXScopeSpec &SS,
     CodeCompletionDeclConsumer Consumer(Results, CurContext);
     LookupVisibleDecls(Ctx, LookupOrdinaryName, Consumer,
                        /*IncludeGlobalScope=*/true,
-                       /*IncludeDependentBases=*/true);
+                       /*IncludeDependentBases=*/true,
+                       CodeCompleter->loadExternal());
   }
 
   auto CC = Results.getCompletionContext();
@@ -4677,7 +4702,8 @@ void Sema::CodeCompleteUsing(Scope *S) {
   // nested-name-specifier.
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
   Results.ExitScope();
   
   HandleCodeCompleteResults(this, CodeCompleter, 
@@ -4698,7 +4724,8 @@ void Sema::CodeCompleteUsingDirective(Scope *S) {
   Results.EnterNewScope();
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
   Results.ExitScope();
   HandleCodeCompleteResults(this, CodeCompleter, 
                             CodeCompletionContext::CCC_Namespace,
@@ -4764,7 +4791,8 @@ void Sema::CodeCompleteNamespaceAliasDecl(Scope *S)  {
                         &ResultBuilder::IsNamespaceOrAlias);
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
   HandleCodeCompleteResults(this, CodeCompleter, 
                             Results.getCompletionContext(),
                             Results.data(),Results.size());
@@ -4791,8 +4819,9 @@ void Sema::CodeCompleteOperatorName(Scope *S) {
   Results.allowNestedNameSpecifiers();
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
-  
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
+
   // Add any type specifiers
   AddTypeSpecifierResults(getLangOpts(), Results);
   Results.ExitScope();
@@ -5620,8 +5649,9 @@ void Sema::CodeCompleteObjCPassingType(Scope *S, ObjCDeclSpec &DS,
   Results.setFilter(&ResultBuilder::IsOrdinaryNonValueName);
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
-  
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
+
   if (CodeCompleter->includeMacros())
     AddMacroResults(PP, Results, false);
 
@@ -5834,8 +5864,9 @@ void Sema::CodeCompleteObjCMessageReceiver(Scope *S) {
   CodeCompletionDeclConsumer Consumer(Results, CurContext);
   Results.EnterNewScope();
   LookupVisibleDecls(S, LookupOrdinaryName, Consumer,
-                     CodeCompleter->includeGlobals());
-  
+                     CodeCompleter->includeGlobals(),
+                     CodeCompleter->loadExternal());
+
   // If we are in an Objective-C method inside a class that has a superclass,
   // add "super" as an option.
   if (ObjCMethodDecl *Method = getCurMethodDecl())
@@ -7942,8 +7973,9 @@ void Sema::GatherGlobalCodeCompletions(CodeCompletionAllocator &Allocator,
   if (!CodeCompleter || CodeCompleter->includeGlobals()) {
     CodeCompletionDeclConsumer Consumer(Builder, 
                                         Context.getTranslationUnitDecl());
-    LookupVisibleDecls(Context.getTranslationUnitDecl(), LookupAnyName, 
-                       Consumer);
+    LookupVisibleDecls(Context.getTranslationUnitDecl(), LookupAnyName,
+                       Consumer,
+                       !CodeCompleter || CodeCompleter->loadExternal());
   }
   
   if (!CodeCompleter || CodeCompleter->includeMacros())
diff --git a/lib/Sema/SemaCoroutine.cpp b/lib/Sema/SemaCoroutine.cpp
index e6b640f878c2..f05a94ebe0ef 100644
--- a/lib/Sema/SemaCoroutine.cpp
+++ b/lib/Sema/SemaCoroutine.cpp
@@ -494,9 +494,67 @@ VarDecl *Sema::buildCoroutinePromise(SourceLocation Loc) {
   CheckVariableDeclarationType(VD);
   if (VD->isInvalidDecl())
     return nullptr;
-  ActOnUninitializedDecl(VD);
+
+  auto *ScopeInfo = getCurFunction();
+  // Build a list of arguments, based on the coroutine functions arguments,
+  // that will be passed to the promise type's constructor.
+  llvm::SmallVector<Expr *, 4> CtorArgExprs;
+  auto &Moves = ScopeInfo->CoroutineParameterMoves;
+  for (auto *PD : FD->parameters()) {
+    if (PD->getType()->isDependentType())
+      continue;
+
+    auto RefExpr = ExprEmpty();
+    auto Move = Moves.find(PD);
+    if (Move != Moves.end()) {
+      // If a reference to the function parameter exists in the coroutine
+      // frame, use that reference.
+      auto *MoveDecl =
+          cast<VarDecl>(cast<DeclStmt>(Move->second)->getSingleDecl());
+      RefExpr = BuildDeclRefExpr(MoveDecl, MoveDecl->getType(),
+                                 ExprValueKind::VK_LValue, FD->getLocation());
+    } else {
+      // If the function parameter doesn't exist in the coroutine frame, it
+      // must be a scalar value. Use it directly.
+      assert(!PD->getType()->getAsCXXRecordDecl() &&
+             "Non-scalar types should have been moved and inserted into the "
+             "parameter moves map");
+      RefExpr =
+          BuildDeclRefExpr(PD, PD->getOriginalType().getNonReferenceType(),
+                           ExprValueKind::VK_LValue, FD->getLocation());
+    }
+
+    if (RefExpr.isInvalid())
+      return nullptr;
+    CtorArgExprs.push_back(RefExpr.get());
+  }
+
+  // Create an initialization sequence for the promise type using the
+  // constructor arguments, wrapped in a parenthesized list expression.
+  Expr *PLE = new (Context) ParenListExpr(Context, FD->getLocation(),
+                                          CtorArgExprs, FD->getLocation());
+  InitializedEntity Entity = InitializedEntity::InitializeVariable(VD);
+  InitializationKind Kind = InitializationKind::CreateForInit(
+      VD->getLocation(), /*DirectInit=*/true, PLE);
+  InitializationSequence InitSeq(*this, Entity, Kind, CtorArgExprs,
+                                 /*TopLevelOfInitList=*/false,
+                                 /*TreatUnavailableAsInvalid=*/false);
+
+  // Attempt to initialize the promise type with the arguments.
+  // If that fails, fall back to the promise type's default constructor.
+  if (InitSeq) {
+    ExprResult Result = InitSeq.Perform(*this, Entity, Kind, CtorArgExprs);
+    if (Result.isInvalid()) {
+      VD->setInvalidDecl();
+    } else if (Result.get()) {
+      VD->setInit(MaybeCreateExprWithCleanups(Result.get()));
+      VD->setInitStyle(VarDecl::CallInit);
+      CheckCompleteVariableDeclaration(VD);
+    }
+  } else
+    ActOnUninitializedDecl(VD);
+
   FD->addDecl(VD);
-  assert(!VD->isInvalidDecl());
   return VD;
 }
 
@@ -518,6 +576,9 @@ static FunctionScopeInfo *checkCoroutineContext(Sema &S, SourceLocation Loc,
   if (ScopeInfo->CoroutinePromise)
     return ScopeInfo;
 
+  if (!S.buildCoroutineParameterMoves(Loc))
+    return nullptr;
+
   ScopeInfo->CoroutinePromise = S.buildCoroutinePromise(Loc);
   if (!ScopeInfo->CoroutinePromise)
     return nullptr;
@@ -861,6 +922,11 @@ CoroutineStmtBuilder::CoroutineStmtBuilder(Sema &S, FunctionDecl &FD,
           !Fn.CoroutinePromise ||
           Fn.CoroutinePromise->getType()->isDependentType()) {
   this->Body = Body;
+
+  for (auto KV : Fn.CoroutineParameterMoves)
+    this->ParamMovesVector.push_back(KV.second);
+  this->ParamMoves = this->ParamMovesVector;
+
   if (!IsPromiseDependentType) {
     PromiseRecordDecl = Fn.CoroutinePromise->getType()->getAsCXXRecordDecl();
     assert(PromiseRecordDecl && "Type should have already been checked");
@@ -870,7 +936,7 @@ CoroutineStmtBuilder::CoroutineStmtBuilder(Sema &S, FunctionDecl &FD,
 
 bool CoroutineStmtBuilder::buildStatements() {
   assert(this->IsValid && "coroutine already invalid");
-  this->IsValid = makeReturnObject() && makeParamMoves();
+  this->IsValid = makeReturnObject();
   if (this->IsValid && !IsPromiseDependentType)
     buildDependentStatements();
   return this->IsValid;
@@ -886,12 +952,6 @@ bool CoroutineStmtBuilder::buildDependentStatements() {
   return this->IsValid;
 }
 
-bool CoroutineStmtBuilder::buildParameterMoves() {
-  assert(this->IsValid && "coroutine already invalid");
-  assert(this->ParamMoves.empty() && "param moves already built");
-  return this->IsValid = makeParamMoves();
-}
-
 bool CoroutineStmtBuilder::makePromiseStmt() {
   // Form a declaration statement for the promise declaration, so that AST
   // visitors can more easily find it.
@@ -1304,47 +1364,50 @@ static Expr *castForMoving(Sema &S, Expr *E, QualType T = QualType()) {
       .get();
 }
 
-
 /// \brief Build a variable declaration for move parameter.
 static VarDecl *buildVarDecl(Sema &S, SourceLocation Loc, QualType Type,
                              IdentifierInfo *II) {
   TypeSourceInfo *TInfo = S.Context.getTrivialTypeSourceInfo(Type, Loc);
-  VarDecl *Decl =
-      VarDecl::Create(S.Context, S.CurContext, Loc, Loc, II, Type, TInfo, SC_None);
+  VarDecl *Decl = VarDecl::Create(S.Context, S.CurContext, Loc, Loc, II, Type,
+                                  TInfo, SC_None);
   Decl->setImplicit();
   return Decl;
 }
 
-bool CoroutineStmtBuilder::makeParamMoves() {
-  for (auto *paramDecl : FD.parameters()) {
-    auto Ty = paramDecl->getType();
-    if (Ty->isDependentType())
+// Build statements that move coroutine function parameters to the coroutine
+// frame, and store them on the function scope info.
+bool Sema::buildCoroutineParameterMoves(SourceLocation Loc) {
+  assert(isa<FunctionDecl>(CurContext) && "not in a function scope");
+  auto *FD = cast<FunctionDecl>(CurContext);
+
+  auto *ScopeInfo = getCurFunction();
+  assert(ScopeInfo->CoroutineParameterMoves.empty() &&
+         "Should not build parameter moves twice");
+
+  for (auto *PD : FD->parameters()) {
+    if (PD->getType()->isDependentType())
       continue;
 
-    // No need to copy scalars, llvm will take care of them.
-    if (Ty->getAsCXXRecordDecl()) {
-      ExprResult ParamRef =
-          S.BuildDeclRefExpr(paramDecl, paramDecl->getType(),
-                             ExprValueKind::VK_LValue, Loc); // FIXME: scope?
-      if (ParamRef.isInvalid())
+    // No need to copy scalars, LLVM will take care of them.
+    if (PD->getType()->getAsCXXRecordDecl()) {
+      ExprResult PDRefExpr = BuildDeclRefExpr(
+          PD, PD->getType(), ExprValueKind::VK_LValue, Loc); // FIXME: scope?
+      if (PDRefExpr.isInvalid())
         return false;
 
-      Expr *RCast = castForMoving(S, ParamRef.get());
+      Expr *CExpr = castForMoving(*this, PDRefExpr.get());
 
-      auto D = buildVarDecl(S, Loc, Ty, paramDecl->getIdentifier());
-      S.AddInitializerToDecl(D, RCast, /*DirectInit=*/true);
+      auto D = buildVarDecl(*this, Loc, PD->getType(), PD->getIdentifier());
+      AddInitializerToDecl(D, CExpr, /*DirectInit=*/true);
 
       // Convert decl to a statement.
-      StmtResult Stmt = S.ActOnDeclStmt(S.ConvertDeclToDeclGroup(D), Loc, Loc);
+      StmtResult Stmt = ActOnDeclStmt(ConvertDeclToDeclGroup(D), Loc, Loc);
       if (Stmt.isInvalid())
         return false;
 
-      ParamMovesVector.push_back(Stmt.get());
+      ScopeInfo->CoroutineParameterMoves.insert(std::make_pair(PD, Stmt.get()));
     }
   }
-
-  // Convert to ArrayRef in CtorArgs structure that builder inherits from.
-  ParamMoves = ParamMovesVector;
   return true;
 }
 
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 86555907438e..1a7d9a84306f 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -2495,7 +2495,7 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
   else if (const auto *UA = dyn_cast<UuidAttr>(Attr))
     NewAttr = S.mergeUuidAttr(D, UA->getRange(), AttrSpellingListIndex,
                               UA->getGuid());
-  else if (Attr->duplicatesAllowed() || !DeclHasAttr(D, Attr))
+  else if (Attr->shouldInheritEvenIfAlreadyPresent() || !DeclHasAttr(D, Attr))
     NewAttr = cast<InheritableAttr>(Attr->clone(S.Context));
 
   if (NewAttr) {
@@ -2929,6 +2929,48 @@ static bool hasIdenticalPassObjectSizeAttrs(const FunctionDecl *A,
   return std::equal(A->param_begin(), A->param_end(), B->param_begin(), AttrEq);
 }
 
+/// If necessary, adjust the semantic declaration context for a qualified
+/// declaration to name the correct inline namespace within the qualifier.
+static void adjustDeclContextForDeclaratorDecl(DeclaratorDecl *NewD,
+                                               DeclaratorDecl *OldD) {
+  // The only case where we need to update the DeclContext is when
+  // redeclaration lookup for a qualified name finds a declaration
+  // in an inline namespace within the context named by the qualifier:
+  //
+  //   inline namespace N { int f(); }
+  //   int ::f(); // Sema DC needs adjusting from :: to N::.
+  //
+  // For unqualified declarations, the semantic context *can* change
+  // along the redeclaration chain (for local extern declarations,
+  // extern "C" declarations, and friend declarations in particular).
+  if (!NewD->getQualifier())
+    return;
+
+  // NewD is probably already in the right context.
+  auto *NamedDC = NewD->getDeclContext()->getRedeclContext();
+  auto *SemaDC = OldD->getDeclContext()->getRedeclContext();
+  if (NamedDC->Equals(SemaDC))
+    return;
+
+  assert((NamedDC->InEnclosingNamespaceSetOf(SemaDC) ||
+          NewD->isInvalidDecl() || OldD->isInvalidDecl()) &&
+         "unexpected context for redeclaration");
+
+  auto *LexDC = NewD->getLexicalDeclContext();
+  auto FixSemaDC = [=](NamedDecl *D) {
+    if (!D)
+      return;
+    D->setDeclContext(SemaDC);
+    D->setLexicalDeclContext(LexDC);
+  };
+
+  FixSemaDC(NewD);
+  if (auto *FD = dyn_cast<FunctionDecl>(NewD))
+    FixSemaDC(FD->getDescribedFunctionTemplate());
+  else if (auto *VD = dyn_cast<VarDecl>(NewD))
+    FixSemaDC(VD->getDescribedVarTemplate());
+}
+
 /// MergeFunctionDecl - We just parsed a function 'New' from
 /// declarator D which has the same name and scope as a previous
 /// declaration 'Old'.  Figure out how to resolve this situation,
@@ -3953,6 +3995,7 @@ void Sema::MergeVarDecl(VarDecl *New, LookupResult &Previous) {
   New->setPreviousDecl(Old);
   if (NewTemplate)
     NewTemplate->setPreviousDecl(OldTemplate);
+  adjustDeclContextForDeclaratorDecl(New, Old);
 
   // Inherit access appropriately.
   New->setAccess(Old->getAccess());
@@ -4897,13 +4940,13 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
 
   switch (Name.getKind()) {
 
-  case UnqualifiedId::IK_ImplicitSelfParam:
-  case UnqualifiedId::IK_Identifier:
+  case UnqualifiedIdKind::IK_ImplicitSelfParam:
+  case UnqualifiedIdKind::IK_Identifier:
     NameInfo.setName(Name.Identifier);
     NameInfo.setLoc(Name.StartLocation);
     return NameInfo;
 
-  case UnqualifiedId::IK_DeductionGuideName: {
+  case UnqualifiedIdKind::IK_DeductionGuideName: {
     // C++ [temp.deduct.guide]p3:
     //   The simple-template-id shall name a class template specialization.
     //   The template-name shall be the same identifier as the template-name
@@ -4931,7 +4974,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
     return NameInfo;
   }
 
-  case UnqualifiedId::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
     NameInfo.setName(Context.DeclarationNames.getCXXOperatorName(
                                            Name.OperatorFunctionId.Operator));
     NameInfo.setLoc(Name.StartLocation);
@@ -4941,14 +4984,14 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
       = Name.EndLocation.getRawEncoding();
     return NameInfo;
 
-  case UnqualifiedId::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
     NameInfo.setName(Context.DeclarationNames.getCXXLiteralOperatorName(
                                                            Name.Identifier));
     NameInfo.setLoc(Name.StartLocation);
     NameInfo.setCXXLiteralOperatorNameLoc(Name.EndLocation);
     return NameInfo;
 
-  case UnqualifiedId::IK_ConversionFunctionId: {
+  case UnqualifiedIdKind::IK_ConversionFunctionId: {
     TypeSourceInfo *TInfo;
     QualType Ty = GetTypeFromParser(Name.ConversionFunctionId, &TInfo);
     if (Ty.isNull())
@@ -4960,7 +5003,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
     return NameInfo;
   }
 
-  case UnqualifiedId::IK_ConstructorName: {
+  case UnqualifiedIdKind::IK_ConstructorName: {
     TypeSourceInfo *TInfo;
     QualType Ty = GetTypeFromParser(Name.ConstructorName, &TInfo);
     if (Ty.isNull())
@@ -4972,7 +5015,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
     return NameInfo;
   }
 
-  case UnqualifiedId::IK_ConstructorTemplateId: {
+  case UnqualifiedIdKind::IK_ConstructorTemplateId: {
     // In well-formed code, we can only have a constructor
     // template-id that refers to the current context, so go there
     // to find the actual type being constructed.
@@ -4995,7 +5038,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
     return NameInfo;
   }
 
-  case UnqualifiedId::IK_DestructorName: {
+  case UnqualifiedIdKind::IK_DestructorName: {
     TypeSourceInfo *TInfo;
     QualType Ty = GetTypeFromParser(Name.DestructorName, &TInfo);
     if (Ty.isNull())
@@ -5007,7 +5050,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
     return NameInfo;
   }
 
-  case UnqualifiedId::IK_TemplateId: {
+  case UnqualifiedIdKind::IK_TemplateId: {
     TemplateName TName = Name.TemplateId->Template.get();
     SourceLocation TNameLoc = Name.TemplateId->TemplateNameLoc;
     return Context.getNameForTemplate(TName, TNameLoc);
@@ -5670,8 +5713,8 @@ Sema::ActOnTypedefDeclarator(Scope* S, Declarator& D, DeclContext* DC,
     Diag(D.getDeclSpec().getConstexprSpecLoc(), diag::err_invalid_constexpr)
       << 1;
 
-  if (D.getName().Kind != UnqualifiedId::IK_Identifier) {
-    if (D.getName().Kind == UnqualifiedId::IK_DeductionGuideName)
+  if (D.getName().Kind != UnqualifiedIdKind::IK_Identifier) {
+    if (D.getName().Kind == UnqualifiedIdKind::IK_DeductionGuideName)
       Diag(D.getName().StartLocation,
            diag::err_deduction_guide_invalid_specifier)
           << "typedef";
@@ -6425,7 +6468,7 @@ NamedDecl *Sema::ActOnVariableDeclarator(
     TemplateParams = MatchTemplateParametersToScopeSpecifier(
         D.getDeclSpec().getLocStart(), D.getIdentifierLoc(),
         D.getCXXScopeSpec(),
-        D.getName().getKind() == UnqualifiedId::IK_TemplateId
+        D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId
             ? D.getName().TemplateId
             : nullptr,
         TemplateParamLists,
@@ -6433,7 +6476,7 @@ NamedDecl *Sema::ActOnVariableDeclarator(
 
     if (TemplateParams) {
       if (!TemplateParams->size() &&
-          D.getName().getKind() != UnqualifiedId::IK_TemplateId) {
+          D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) {
         // There is an extraneous 'template<>' for this variable. Complain
         // about it, but allow the declaration of the variable.
         Diag(TemplateParams->getTemplateLoc(),
@@ -6443,7 +6486,7 @@ NamedDecl *Sema::ActOnVariableDeclarator(
                          TemplateParams->getRAngleLoc());
         TemplateParams = nullptr;
       } else {
-        if (D.getName().getKind() == UnqualifiedId::IK_TemplateId) {
+        if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) {
           // This is an explicit specialization or a partial specialization.
           // FIXME: Check that we can declare a specialization here.
           IsVariableTemplateSpecialization = true;
@@ -6464,9 +6507,9 @@ NamedDecl *Sema::ActOnVariableDeclarator(
         }
       }
     } else {
-      assert(
-          (Invalid || D.getName().getKind() != UnqualifiedId::IK_TemplateId) &&
-          "should have a 'template<>' for this decl");
+      assert((Invalid ||
+              D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) &&
+             "should have a 'template<>' for this decl");
     }
 
     if (IsVariableTemplateSpecialization) {
@@ -8261,7 +8304,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
             MatchTemplateParametersToScopeSpecifier(
                 D.getDeclSpec().getLocStart(), D.getIdentifierLoc(),
                 D.getCXXScopeSpec(),
-                D.getName().getKind() == UnqualifiedId::IK_TemplateId
+                D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId
                     ? D.getName().TemplateId
                     : nullptr,
                 TemplateParamLists, isFriend, isMemberSpecialization,
@@ -8318,7 +8361,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
           // and clearly the user wants a template specialization.  So
           // we need to insert '<>' after the name.
           SourceLocation InsertLoc;
-          if (D.getName().getKind() != UnqualifiedId::IK_TemplateId) {
+          if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) {
             InsertLoc = D.getName().getSourceRange().getEnd();
             InsertLoc = getLocForEndOfToken(InsertLoc);
           }
@@ -8719,7 +8762,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
 
     // If the declarator is a template-id, translate the parser's template
     // argument list into our AST format.
-    if (D.getName().getKind() == UnqualifiedId::IK_TemplateId) {
+    if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) {
       TemplateIdAnnotation *TemplateId = D.getName().TemplateId;
       TemplateArgs.setLAngleLoc(TemplateId->LAngleLoc);
       TemplateArgs.setRAngleLoc(TemplateId->RAngleLoc);
@@ -9114,6 +9157,359 @@ bool Sema::shouldLinkDependentDeclWithPrevious(Decl *D, Decl *PrevDecl) {
            D->getFriendObjectKind() != Decl::FOK_None);
 }
 
+/// \brief Check the target attribute of the function for MultiVersion
+/// validity.
+///
+/// Returns true if there was an error, false otherwise.
+static bool CheckMultiVersionValue(Sema &S, const FunctionDecl *FD) {
+  const auto *TA = FD->getAttr<TargetAttr>();
+  assert(TA && "MultiVersion Candidate requires a target attribute");
+  TargetAttr::ParsedTargetAttr ParseInfo = TA->parse();
+  const TargetInfo &TargetInfo = S.Context.getTargetInfo();
+  enum ErrType { Feature = 0, Architecture = 1 };
+
+  if (!ParseInfo.Architecture.empty() &&
+      !TargetInfo.validateCpuIs(ParseInfo.Architecture)) {
+    S.Diag(FD->getLocation(), diag::err_bad_multiversion_option)
+        << Architecture << ParseInfo.Architecture;
+    return true;
+  }
+
+  for (const auto &Feat : ParseInfo.Features) {
+    auto BareFeat = StringRef{Feat}.substr(1);
+    if (Feat[0] == '-') {
+      S.Diag(FD->getLocation(), diag::err_bad_multiversion_option)
+          << Feature << ("no-" + BareFeat).str();
+      return true;
+    }
+
+    if (!TargetInfo.validateCpuSupports(BareFeat) ||
+        !TargetInfo.isValidFeatureName(BareFeat)) {
+      S.Diag(FD->getLocation(), diag::err_bad_multiversion_option)
+          << Feature << BareFeat;
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool CheckMultiVersionAdditionalRules(Sema &S, const FunctionDecl *OldFD,
+                                             const FunctionDecl *NewFD,
+                                             bool CausesMV) {
+  enum DoesntSupport {
+    FuncTemplates = 0,
+    VirtFuncs = 1,
+    DeducedReturn = 2,
+    Constructors = 3,
+    Destructors = 4,
+    DeletedFuncs = 5,
+    DefaultedFuncs = 6
+  };
+  enum Different {
+    CallingConv = 0,
+    ReturnType = 1,
+    ConstexprSpec = 2,
+    InlineSpec = 3,
+    StorageClass = 4,
+    Linkage = 5
+  };
+
+  // For now, disallow all other attributes.  These should be opt-in, but
+  // an analysis of all of them is a future FIXME.
+  if (CausesMV && OldFD &&
+      std::distance(OldFD->attr_begin(), OldFD->attr_end()) != 1) {
+    S.Diag(OldFD->getLocation(), diag::err_multiversion_no_other_attrs);
+    S.Diag(NewFD->getLocation(), diag::note_multiversioning_caused_here);
+    return true;
+  }
+
+  if (std::distance(NewFD->attr_begin(), NewFD->attr_end()) != 1) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_no_other_attrs);
+    return true;
+  }
+
+  if (NewFD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_doesnt_support)
+        << FuncTemplates;
+    return true;
+  }
+
+
+  if (const auto *NewCXXFD = dyn_cast<CXXMethodDecl>(NewFD)) {
+    if (NewCXXFD->isVirtual()) {
+      S.Diag(NewCXXFD->getLocation(), diag::err_multiversion_doesnt_support)
+          << VirtFuncs;
+      return true;
+    }
+
+    if (const auto *NewCXXCtor = dyn_cast<CXXConstructorDecl>(NewFD)) {
+      S.Diag(NewCXXCtor->getLocation(), diag::err_multiversion_doesnt_support)
+          << Constructors;
+      return true;
+    }
+
+    if (const auto *NewCXXDtor = dyn_cast<CXXDestructorDecl>(NewFD)) {
+      S.Diag(NewCXXDtor->getLocation(), diag::err_multiversion_doesnt_support)
+          << Destructors;
+      return true;
+    }
+  }
+
+  if (NewFD->isDeleted()) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_doesnt_support)
+      << DeletedFuncs;
+  }
+  if (NewFD->isDefaulted()) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_doesnt_support)
+      << DefaultedFuncs;
+  }
+
+  QualType NewQType = S.getASTContext().getCanonicalType(NewFD->getType());
+  const auto *NewType = cast<FunctionType>(NewQType);
+  QualType NewReturnType = NewType->getReturnType();
+
+  if (NewReturnType->isUndeducedType()) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_doesnt_support)
+        << DeducedReturn;
+    return true;
+  }
+
+  // Only allow transition to MultiVersion if it hasn't been used.
+  if (OldFD && CausesMV && OldFD->isUsed(false)) {
+    S.Diag(NewFD->getLocation(), diag::err_multiversion_after_used);
+    return true;
+  }
+
+  // Ensure the return type is identical.
+  if (OldFD) {
+    QualType OldQType = S.getASTContext().getCanonicalType(OldFD->getType());
+    const auto *OldType = cast<FunctionType>(OldQType);
+    FunctionType::ExtInfo OldTypeInfo = OldType->getExtInfo();
+    FunctionType::ExtInfo NewTypeInfo = NewType->getExtInfo();
+
+    if (OldTypeInfo.getCC() != NewTypeInfo.getCC()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff) << CallingConv;
+      return true;
+    }
+
+    QualType OldReturnType = OldType->getReturnType();
+
+    if (OldReturnType != NewReturnType) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff) << ReturnType;
+      return true;
+    }
+
+    if (OldFD->isConstexpr() != NewFD->isConstexpr()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff)
+          << ConstexprSpec;
+      return true;
+    }
+
+    if (OldFD->isInlineSpecified() != NewFD->isInlineSpecified()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff) << InlineSpec;
+      return true;
+    }
+
+    if (OldFD->getStorageClass() != NewFD->getStorageClass()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff) << StorageClass;
+      return true;
+    }
+
+    if (OldFD->isExternC() != NewFD->isExternC()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_diff) << Linkage;
+      return true;
+    }
+
+    if (S.CheckEquivalentExceptionSpec(
+            OldFD->getType()->getAs<FunctionProtoType>(), OldFD->getLocation(),
+            NewFD->getType()->getAs<FunctionProtoType>(), NewFD->getLocation()))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Check the validity of a mulitversion function declaration.
+/// Also sets the multiversion'ness' of the function itself.
+///
+/// This sets NewFD->isInvalidDecl() to true if there was an error.
+///
+/// Returns true if there was an error, false otherwise.
+static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
+                                      bool &Redeclaration, NamedDecl *&OldDecl,
+                                      bool &MergeTypeWithPrevious,
+                                      LookupResult &Previous) {
+  const auto *NewTA = NewFD->getAttr<TargetAttr>();
+  if (NewFD->isMain()) {
+    if (NewTA && NewTA->isDefaultVersion()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_not_allowed_on_main);
+      NewFD->isInvalidDecl();
+      return true;
+    }
+    return false;
+  }
+
+  // If there is no matching previous decl, only 'default' can
+  // cause MultiVersioning.
+  if (!OldDecl) {
+    if (NewTA && NewTA->isDefaultVersion()) {
+      if (!NewFD->getType()->getAs<FunctionProtoType>()) {
+        S.Diag(NewFD->getLocation(), diag::err_multiversion_noproto);
+        NewFD->setInvalidDecl();
+        return true;
+      }
+      if (CheckMultiVersionAdditionalRules(S, nullptr, NewFD, true)) {
+        NewFD->setInvalidDecl();
+        return true;
+      }
+      if (!S.getASTContext().getTargetInfo().supportsMultiVersioning()) {
+        S.Diag(NewFD->getLocation(), diag::err_multiversion_not_supported);
+        return true;
+      }
+
+      NewFD->setIsMultiVersion();
+    }
+    return false;
+  }
+
+  if (OldDecl->getDeclContext()->getRedeclContext() !=
+      NewFD->getDeclContext()->getRedeclContext())
+    return false;
+
+  FunctionDecl *OldFD = OldDecl->getAsFunction();
+  // Unresolved 'using' statements (the other way OldDecl can be not a function)
+  // likely cannot cause a problem here.
+  if (!OldFD)
+    return false;
+
+  if (!OldFD->isMultiVersion() && !NewTA)
+    return false;
+
+  if (OldFD->isMultiVersion() && !NewTA) {
+    S.Diag(NewFD->getLocation(), diag::err_target_required_in_redecl);
+    NewFD->setInvalidDecl();
+    return true;
+  }
+
+  TargetAttr::ParsedTargetAttr NewParsed = NewTA->parse();
+  // Sort order doesn't matter, it just needs to be consistent.
+  std::sort(NewParsed.Features.begin(), NewParsed.Features.end());
+
+  const auto *OldTA = OldFD->getAttr<TargetAttr>();
+  if (!OldFD->isMultiVersion()) {
+    // If the old decl is NOT MultiVersioned yet, and we don't cause that
+    // to change, this is a simple redeclaration.
+    if (!OldTA || OldTA->getFeaturesStr() == NewTA->getFeaturesStr())
+      return false;
+
+    // Otherwise, this decl causes MultiVersioning.
+    if (!S.getASTContext().getTargetInfo().supportsMultiVersioning()) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_not_supported);
+      S.Diag(OldFD->getLocation(), diag::note_previous_declaration);
+      return true;
+    }
+
+    if (!OldFD->getType()->getAs<FunctionProtoType>()) {
+      S.Diag(OldFD->getLocation(), diag::err_multiversion_noproto);
+      S.Diag(NewFD->getLocation(), diag::note_multiversioning_caused_here);
+      NewFD->setInvalidDecl();
+      return true;
+    }
+
+    if (CheckMultiVersionValue(S, NewFD)) {
+      NewFD->setInvalidDecl();
+      return true;
+    }
+
+    if (CheckMultiVersionValue(S, OldFD)) {
+      S.Diag(NewFD->getLocation(), diag::note_multiversioning_caused_here);
+      NewFD->setInvalidDecl();
+      return true;
+    }
+
+    TargetAttr::ParsedTargetAttr OldParsed =
+        OldTA->parse(std::less<std::string>());
+
+    if (OldParsed == NewParsed) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_duplicate);
+      S.Diag(OldFD->getLocation(), diag::note_previous_declaration);
+      NewFD->setInvalidDecl();
+      return true;
+    }
+
+    for (const auto *FD : OldFD->redecls()) {
+      const auto *CurTA = FD->getAttr<TargetAttr>();
+      if (!CurTA || CurTA->isInherited()) {
+        S.Diag(FD->getLocation(), diag::err_target_required_in_redecl);
+        S.Diag(NewFD->getLocation(), diag::note_multiversioning_caused_here);
+        NewFD->setInvalidDecl();
+        return true;
+      }
+    }
+
+    if (CheckMultiVersionAdditionalRules(S, OldFD, NewFD, true)) {
+      NewFD->setInvalidDecl();
+      return true;
+    }
+
+    OldFD->setIsMultiVersion();
+    NewFD->setIsMultiVersion();
+    Redeclaration = false;
+    MergeTypeWithPrevious = false;
+    OldDecl = nullptr;
+    Previous.clear();
+    return false;
+  }
+
+  bool UseMemberUsingDeclRules =
+      S.CurContext->isRecord() && !NewFD->getFriendObjectKind();
+
+  // Next, check ALL non-overloads to see if this is a redeclaration of a
+  // previous member of the MultiVersion set.
+  for (NamedDecl *ND : Previous) {
+    FunctionDecl *CurFD = ND->getAsFunction();
+    if (!CurFD)
+      continue;
+    if (S.IsOverload(NewFD, CurFD, UseMemberUsingDeclRules))
+      continue;
+
+    const auto *CurTA = CurFD->getAttr<TargetAttr>();
+    if (CurTA->getFeaturesStr() == NewTA->getFeaturesStr()) {
+      NewFD->setIsMultiVersion();
+      Redeclaration = true;
+      OldDecl = ND;
+      return false;
+    }
+
+    TargetAttr::ParsedTargetAttr CurParsed =
+        CurTA->parse(std::less<std::string>());
+
+    if (CurParsed == NewParsed) {
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_duplicate);
+      S.Diag(CurFD->getLocation(), diag::note_previous_declaration);
+      NewFD->setInvalidDecl();
+      return true;
+    }
+  }
+
+  // Else, this is simply a non-redecl case.
+  if (CheckMultiVersionValue(S, NewFD)) {
+    NewFD->setInvalidDecl();
+    return true;
+  }
+
+  if (CheckMultiVersionAdditionalRules(S, OldFD, NewFD, false)) {
+    NewFD->setInvalidDecl();
+    return true;
+  }
+
+  NewFD->setIsMultiVersion();
+  Redeclaration = false;
+  MergeTypeWithPrevious = false;
+  OldDecl = nullptr;
+  Previous.clear();
+  return false;
+}
+
 /// \brief Perform semantic checking of a new function declaration.
 ///
 /// Performs semantic analysis of the new function declaration
@@ -9201,6 +9597,10 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
     }
   }
 
+  if (CheckMultiVersionFunction(*this, NewFD, Redeclaration, OldDecl,
+                                MergeTypeWithPrevious, Previous))
+    return Redeclaration;
+
   // C++11 [dcl.constexpr]p8:
   //   A constexpr specifier for a non-static member function that is not
   //   a constructor declares that member function to be const.
@@ -9252,12 +9652,13 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
 
     if (FunctionTemplateDecl *OldTemplateDecl
                                   = dyn_cast<FunctionTemplateDecl>(OldDecl)) {
-      NewFD->setPreviousDeclaration(OldTemplateDecl->getTemplatedDecl());
+      auto *OldFD = OldTemplateDecl->getTemplatedDecl();
+      NewFD->setPreviousDeclaration(OldFD);
+      adjustDeclContextForDeclaratorDecl(NewFD, OldFD);
       FunctionTemplateDecl *NewTemplateDecl
         = NewFD->getDescribedFunctionTemplate();
       assert(NewTemplateDecl && "Template/non-template mismatch");
-      if (CXXMethodDecl *Method
-            = dyn_cast<CXXMethodDecl>(NewTemplateDecl->getTemplatedDecl())) {
+      if (auto *Method = dyn_cast<CXXMethodDecl>(NewFD)) {
         Method->setAccess(OldTemplateDecl->getAccess());
         NewTemplateDecl->setAccess(OldTemplateDecl->getAccess());
       }
@@ -9270,22 +9671,22 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
         assert(OldTemplateDecl->isMemberSpecialization());
         // Explicit specializations of a member template do not inherit deleted
         // status from the parent member template that they are specializing.
-        if (OldTemplateDecl->getTemplatedDecl()->isDeleted()) {
-          FunctionDecl *const OldTemplatedDecl =
-              OldTemplateDecl->getTemplatedDecl();
+        if (OldFD->isDeleted()) {
           // FIXME: This assert will not hold in the presence of modules.
-          assert(OldTemplatedDecl->getCanonicalDecl() == OldTemplatedDecl);
+          assert(OldFD->getCanonicalDecl() == OldFD);
           // FIXME: We need an update record for this AST mutation.
-          OldTemplatedDecl->setDeletedAsWritten(false);
+          OldFD->setDeletedAsWritten(false);
         }
       }
 
     } else {
       if (shouldLinkDependentDeclWithPrevious(NewFD, OldDecl)) {
+        auto *OldFD = cast<FunctionDecl>(OldDecl);
         // This needs to happen first so that 'inline' propagates.
-        NewFD->setPreviousDeclaration(cast<FunctionDecl>(OldDecl));
+        NewFD->setPreviousDeclaration(OldFD);
+        adjustDeclContextForDeclaratorDecl(NewFD, OldFD);
         if (isa<CXXMethodDecl>(NewFD))
-          NewFD->setAccess(OldDecl->getAccess());
+          NewFD->setAccess(OldFD->getAccess());
       }
     }
   } else if (!getLangOpts().CPlusPlus && MayNeedOverloadableChecks &&
@@ -10947,6 +11348,8 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
   if (var->isThisDeclarationADefinition() &&
       var->getDeclContext()->getRedeclContext()->isFileContext() &&
       var->isExternallyVisible() && var->hasLinkage() &&
+      !var->isInline() && !var->getDescribedVarTemplate() &&
+      !isTemplateInstantiation(var->getTemplateSpecializationKind()) &&
       !getDiagnostics().isIgnored(diag::warn_missing_variable_declarations,
                                   var->getLocation())) {
     // Find a previous declaration that's not a definition.
diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp
index 21fe46ad9dd1..fdaf4b1fadd3 100644
--- a/lib/Sema/SemaDeclAttr.cpp
+++ b/lib/Sema/SemaDeclAttr.cpp
@@ -5554,7 +5554,7 @@ static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
   // Attribute can only be applied to function types.
   if (!isa<FunctionDecl>(D)) {
     S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type)
-      << Attr.getName() << /* function */0;
+      << Attr.getName() << ExpectedFunction;
     return;
   }
 
diff --git a/lib/Sema/SemaDeclCXX.cpp b/lib/Sema/SemaDeclCXX.cpp
index 1be187803063..28a88d5bb8b0 100644
--- a/lib/Sema/SemaDeclCXX.cpp
+++ b/lib/Sema/SemaDeclCXX.cpp
@@ -2417,9 +2417,16 @@ bool Sema::AttachBaseSpecifiers(CXXRecordDecl *Class,
   // Attach the remaining base class specifiers to the derived class.
   Class->setBases(Bases.data(), NumGoodBases);
 
+  // Check that the only base classes that are duplicate are virtual.
   for (unsigned idx = 0; idx < NumGoodBases; ++idx) {
     // Check whether this direct base is inaccessible due to ambiguity.
     QualType BaseType = Bases[idx]->getType();
+
+    // Skip all dependent types in templates being used as base specifiers.
+    // Checks below assume that the base specifier is a CXXRecord.
+    if (BaseType->isDependentType())
+      continue;
+
     CanQualType CanonicalBase = Context.getCanonicalType(BaseType)
       .getUnqualifiedType();
 
@@ -3582,9 +3589,12 @@ void Sema::ActOnFinishCXXInClassMemberInitializer(Decl *D,
   ExprResult Init = InitExpr;
   if (!FD->getType()->isDependentType() && !InitExpr->isTypeDependent()) {
     InitializedEntity Entity = InitializedEntity::InitializeMember(FD);
-    InitializationKind Kind = FD->getInClassInitStyle() == ICIS_ListInit
-        ? InitializationKind::CreateDirectList(InitExpr->getLocStart())
-        : InitializationKind::CreateCopy(InitExpr->getLocStart(), InitLoc);
+    InitializationKind Kind =
+        FD->getInClassInitStyle() == ICIS_ListInit
+            ? InitializationKind::CreateDirectList(InitExpr->getLocStart(),
+                                                   InitExpr->getLocStart(),
+                                                   InitExpr->getLocEnd())
+            : InitializationKind::CreateCopy(InitExpr->getLocStart(), InitLoc);
     InitializationSequence Seq(*this, Entity, Kind, InitExpr);
     Init = Seq.Perform(*this, Entity, Kind, InitExpr);
     if (Init.isInvalid()) {
@@ -3979,9 +3989,10 @@ Sema::BuildMemberInitializer(ValueDecl *Member, Expr *Init,
                    : InitializedEntity::InitializeMember(IndirectMember,
                                                          nullptr);
     InitializationKind Kind =
-      InitList ? InitializationKind::CreateDirectList(IdLoc)
-               : InitializationKind::CreateDirect(IdLoc, InitRange.getBegin(),
-                                                  InitRange.getEnd());
+        InitList ? InitializationKind::CreateDirectList(
+                       IdLoc, Init->getLocStart(), Init->getLocEnd())
+                 : InitializationKind::CreateDirect(IdLoc, InitRange.getBegin(),
+                                                    InitRange.getEnd());
 
     InitializationSequence InitSeq(*this, MemberEntity, Kind, Args);
     ExprResult MemberInit = InitSeq.Perform(*this, MemberEntity, Kind, Args,
@@ -4033,9 +4044,10 @@ Sema::BuildDelegatingInitializer(TypeSourceInfo *TInfo, Expr *Init,
   InitializedEntity DelegationEntity = InitializedEntity::InitializeDelegation(
                                      QualType(ClassDecl->getTypeForDecl(), 0));
   InitializationKind Kind =
-    InitList ? InitializationKind::CreateDirectList(NameLoc)
-             : InitializationKind::CreateDirect(NameLoc, InitRange.getBegin(),
-                                                InitRange.getEnd());
+      InitList ? InitializationKind::CreateDirectList(
+                     NameLoc, Init->getLocStart(), Init->getLocEnd())
+               : InitializationKind::CreateDirect(NameLoc, InitRange.getBegin(),
+                                                  InitRange.getEnd());
   InitializationSequence InitSeq(*this, DelegationEntity, Kind, Args);
   ExprResult DelegationInit = InitSeq.Perform(*this, DelegationEntity, Kind,
                                               Args, nullptr);
@@ -4167,9 +4179,9 @@ Sema::BuildBaseInitializer(QualType BaseType, TypeSourceInfo *BaseTInfo,
   InitializedEntity BaseEntity =
     InitializedEntity::InitializeBase(Context, BaseSpec, VirtualBaseSpec);
   InitializationKind Kind =
-    InitList ? InitializationKind::CreateDirectList(BaseLoc)
-             : InitializationKind::CreateDirect(BaseLoc, InitRange.getBegin(),
-                                                InitRange.getEnd());
+      InitList ? InitializationKind::CreateDirectList(BaseLoc)
+               : InitializationKind::CreateDirect(BaseLoc, InitRange.getBegin(),
+                                                  InitRange.getEnd());
   InitializationSequence InitSeq(*this, BaseEntity, Kind, Args);
   ExprResult BaseInit = InitSeq.Perform(*this, BaseEntity, Kind, Args, nullptr);
   if (BaseInit.isInvalid())
@@ -8986,15 +8998,15 @@ Decl *Sema::ActOnUsingDeclaration(Scope *S,
   }
 
   switch (Name.getKind()) {
-  case UnqualifiedId::IK_ImplicitSelfParam:
-  case UnqualifiedId::IK_Identifier:
-  case UnqualifiedId::IK_OperatorFunctionId:
-  case UnqualifiedId::IK_LiteralOperatorId:
-  case UnqualifiedId::IK_ConversionFunctionId:
+  case UnqualifiedIdKind::IK_ImplicitSelfParam:
+  case UnqualifiedIdKind::IK_Identifier:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_ConversionFunctionId:
     break;
 
-  case UnqualifiedId::IK_ConstructorName:
-  case UnqualifiedId::IK_ConstructorTemplateId:
+  case UnqualifiedIdKind::IK_ConstructorName:
+  case UnqualifiedIdKind::IK_ConstructorTemplateId:
     // C++11 inheriting constructors.
     Diag(Name.getLocStart(),
          getLangOpts().CPlusPlus11 ?
@@ -9006,17 +9018,17 @@ Decl *Sema::ActOnUsingDeclaration(Scope *S,
 
     return nullptr;
 
-  case UnqualifiedId::IK_DestructorName:
+  case UnqualifiedIdKind::IK_DestructorName:
     Diag(Name.getLocStart(), diag::err_using_decl_destructor)
       << SS.getRange();
     return nullptr;
 
-  case UnqualifiedId::IK_TemplateId:
+  case UnqualifiedIdKind::IK_TemplateId:
     Diag(Name.getLocStart(), diag::err_using_decl_template_id)
       << SourceRange(Name.TemplateId->LAngleLoc, Name.TemplateId->RAngleLoc);
     return nullptr;
 
-  case UnqualifiedId::IK_DeductionGuideName:
+  case UnqualifiedIdKind::IK_DeductionGuideName:
     llvm_unreachable("cannot parse qualified deduction guide name");
   }
 
@@ -10040,7 +10052,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S,
     Previous.clear();
   }
 
-  assert(Name.Kind == UnqualifiedId::IK_Identifier &&
+  assert(Name.Kind == UnqualifiedIdKind::IK_Identifier &&
          "name in alias declaration must be an identifier");
   TypeAliasDecl *NewTD = TypeAliasDecl::Create(Context, CurContext, UsingLoc,
                                                Name.StartLocation,
@@ -10990,11 +11002,11 @@ buildMemcpyForAssignmentOp(Sema &S, SourceLocation Loc, QualType T,
   Expr *From = FromB.build(S, Loc);
   From = new (S.Context) UnaryOperator(From, UO_AddrOf,
                          S.Context.getPointerType(From->getType()),
-                         VK_RValue, OK_Ordinary, Loc);
+                         VK_RValue, OK_Ordinary, Loc, false);
   Expr *To = ToB.build(S, Loc);
   To = new (S.Context) UnaryOperator(To, UO_AddrOf,
                        S.Context.getPointerType(To->getType()),
-                       VK_RValue, OK_Ordinary, Loc);
+                       VK_RValue, OK_Ordinary, Loc, false);
 
   const Type *E = T->getBaseElementTypeUnsafe();
   bool NeedsCollectableMemCpy =
@@ -11230,16 +11242,18 @@ buildSingleCopyAssignRecursively(Sema &S, SourceLocation Loc, QualType T,
   Expr *Comparison
     = new (S.Context) BinaryOperator(IterationVarRefRVal.build(S, Loc),
                      IntegerLiteral::Create(S.Context, Upper, SizeType, Loc),
-                                     BO_NE, S.Context.BoolTy,
-                                     VK_RValue, OK_Ordinary, Loc, FPOptions());
-
-  // Create the pre-increment of the iteration variable.
-  Expr *Increment
-    = new (S.Context) UnaryOperator(IterationVarRef.build(S, Loc), UO_PreInc,
-                                    SizeType, VK_LValue, OK_Ordinary, Loc);
-
-  // Construct the loop that copies all elements of this array.
-  return S.ActOnForStmt(
+                                     BO_NE, S.Context.BoolTy,
+                                     VK_RValue, OK_Ordinary, Loc, FPOptions());
+
+  // Create the pre-increment of the iteration variable. We can determine
+  // whether the increment will overflow based on the value of the array
+  // bound.
+  Expr *Increment = new (S.Context)
+      UnaryOperator(IterationVarRef.build(S, Loc), UO_PreInc, SizeType,
+                    VK_LValue, OK_Ordinary, Loc, Upper.isMaxValue());
+
+  // Construct the loop that copies all elements of this array.
+  return S.ActOnForStmt(
       Loc, Loc, InitStmt,
       S.ActOnCondition(nullptr, Loc, Comparison, Sema::ConditionKind::Boolean),
       S.MakeFullDiscardedValueExpr(Increment), Loc, Copy.get());
@@ -12215,30 +12229,27 @@ void Sema::DefineImplicitLambdaToFunctionPointerConversion(
                             SourceLocation CurrentLocation,
                             CXXConversionDecl *Conv) {
   SynthesizedFunctionScope Scope(*this, Conv);
+  assert(!Conv->getReturnType()->isUndeducedType());
 
   CXXRecordDecl *Lambda = Conv->getParent();
-  CXXMethodDecl *CallOp = Lambda->getLambdaCallOperator();
-  // If we are defining a specialization of a conversion to function-ptr
-  // cache the deduced template arguments for this specialization
-  // so that we can use them to retrieve the corresponding call-operator
-  // and static-invoker.
-  const TemplateArgumentList *DeducedTemplateArgs = nullptr;
-
-  // Retrieve the corresponding call-operator specialization.
-  if (Lambda->isGenericLambda()) {
-    assert(Conv->isFunctionTemplateSpecialization());
-    FunctionTemplateDecl *CallOpTemplate =
-        CallOp->getDescribedFunctionTemplate();
-    DeducedTemplateArgs = Conv->getTemplateSpecializationArgs();
-    void *InsertPos = nullptr;
-    FunctionDecl *CallOpSpec = CallOpTemplate->findSpecialization(
-                                                DeducedTemplateArgs->asArray(),
-                                                InsertPos);
-    assert(CallOpSpec &&
-          "Conversion operator must have a corresponding call operator");
-    CallOp = cast<CXXMethodDecl>(CallOpSpec);
+  FunctionDecl *CallOp = Lambda->getLambdaCallOperator();
+  FunctionDecl *Invoker = Lambda->getLambdaStaticInvoker();
+
+  if (auto *TemplateArgs = Conv->getTemplateSpecializationArgs()) {
+    CallOp = InstantiateFunctionDeclaration(
+        CallOp->getDescribedFunctionTemplate(), TemplateArgs, CurrentLocation);
+    if (!CallOp)
+      return;
+
+    Invoker = InstantiateFunctionDeclaration(
+        Invoker->getDescribedFunctionTemplate(), TemplateArgs, CurrentLocation);
+    if (!Invoker)
+      return;
   }
 
+  if (CallOp->isInvalidDecl())
+    return;
+
   // Mark the call operator referenced (and add to pending instantiations
   // if necessary).
   // For both the conversion and static-invoker template specializations
@@ -12246,39 +12257,24 @@ void Sema::DefineImplicitLambdaToFunctionPointerConversion(
   // to the PendingInstantiations.
   MarkFunctionReferenced(CurrentLocation, CallOp);
 
-  // Retrieve the static invoker...
-  CXXMethodDecl *Invoker = Lambda->getLambdaStaticInvoker();
-  // ... and get the corresponding specialization for a generic lambda.
-  if (Lambda->isGenericLambda()) {
-    assert(DeducedTemplateArgs &&
-      "Must have deduced template arguments from Conversion Operator");
-    FunctionTemplateDecl *InvokeTemplate =
-                          Invoker->getDescribedFunctionTemplate();
-    void *InsertPos = nullptr;
-    FunctionDecl *InvokeSpec = InvokeTemplate->findSpecialization(
-                                                DeducedTemplateArgs->asArray(),
-                                                InsertPos);
-    assert(InvokeSpec &&
-      "Must have a corresponding static invoker specialization");
-    Invoker = cast<CXXMethodDecl>(InvokeSpec);
-  }
+  // Fill in the __invoke function with a dummy implementation. IR generation
+  // will fill in the actual details. Update its type in case it contained
+  // an 'auto'.
+  Invoker->markUsed(Context);
+  Invoker->setReferenced();
+  Invoker->setType(Conv->getReturnType()->getPointeeType());
+  Invoker->setBody(new (Context) CompoundStmt(Conv->getLocation()));
+
   // Construct the body of the conversion function { return __invoke; }.
   Expr *FunctionRef = BuildDeclRefExpr(Invoker, Invoker->getType(),
-                                        VK_LValue, Conv->getLocation()).get();
+                                       VK_LValue, Conv->getLocation()).get();
   assert(FunctionRef && "Can't refer to __invoke function?");
   Stmt *Return = BuildReturnStmt(Conv->getLocation(), FunctionRef).get();
   Conv->setBody(CompoundStmt::Create(Context, Return, Conv->getLocation(),
                                      Conv->getLocation()));
-
   Conv->markUsed(Context);
   Conv->setReferenced();
 
-  // Fill in the __invoke function with a dummy implementation. IR generation
-  // will fill in the actual details.
-  Invoker->markUsed(Context);
-  Invoker->setReferenced();
-  Invoker->setBody(new (Context) CompoundStmt(Conv->getLocation()));
-
   if (ASTMutationListener *L = getASTMutationListener()) {
     L->CompletedImplicitDefinition(Conv);
     L->CompletedImplicitDefinition(Invoker);
@@ -13799,7 +13795,8 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D,
     //   elaborated-type-specifier, the lookup to determine whether
     //   the entity has been previously declared shall not consider
     //   any scopes outside the innermost enclosing namespace.
-    bool isTemplateId = D.getName().getKind() == UnqualifiedId::IK_TemplateId;
+    bool isTemplateId =
+        D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId;
 
     // Find the appropriate context according to the above.
     DC = CurContext;
@@ -13910,24 +13907,24 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D,
   if (!DC->isRecord()) {
     int DiagArg = -1;
     switch (D.getName().getKind()) {
-    case UnqualifiedId::IK_ConstructorTemplateId:
-    case UnqualifiedId::IK_ConstructorName:
+    case UnqualifiedIdKind::IK_ConstructorTemplateId:
+    case UnqualifiedIdKind::IK_ConstructorName:
       DiagArg = 0;
       break;
-    case UnqualifiedId::IK_DestructorName:
+    case UnqualifiedIdKind::IK_DestructorName:
       DiagArg = 1;
       break;
-    case UnqualifiedId::IK_ConversionFunctionId:
+    case UnqualifiedIdKind::IK_ConversionFunctionId:
       DiagArg = 2;
       break;
-    case UnqualifiedId::IK_DeductionGuideName:
+    case UnqualifiedIdKind::IK_DeductionGuideName:
       DiagArg = 3;
       break;
-    case UnqualifiedId::IK_Identifier:
-    case UnqualifiedId::IK_ImplicitSelfParam:
-    case UnqualifiedId::IK_LiteralOperatorId:
-    case UnqualifiedId::IK_OperatorFunctionId:
-    case UnqualifiedId::IK_TemplateId:
+    case UnqualifiedIdKind::IK_Identifier:
+    case UnqualifiedIdKind::IK_ImplicitSelfParam:
+    case UnqualifiedIdKind::IK_LiteralOperatorId:
+    case UnqualifiedIdKind::IK_OperatorFunctionId:
+    case UnqualifiedIdKind::IK_TemplateId:
       break;
     }
     // This implies that it has to be an operator or function.
diff --git a/lib/Sema/SemaDeclObjC.cpp b/lib/Sema/SemaDeclObjC.cpp
index ff3c07e938a8..d44b5f357df1 100644
--- a/lib/Sema/SemaDeclObjC.cpp
+++ b/lib/Sema/SemaDeclObjC.cpp
@@ -156,23 +156,23 @@ void Sema::CheckObjCMethodOverride(ObjCMethodDecl *NewMethod,
       Diag(Overridden->getLocation(), 
            diag::note_related_result_type_overridden);
   }
-  if (getLangOpts().ObjCAutoRefCount) {
-    Diags.setSeverity(diag::warn_nsreturns_retained_attribute_mismatch,
-                      diag::Severity::Error, SourceLocation());
-    Diags.setSeverity(diag::warn_nsconsumed_attribute_mismatch,
-                      diag::Severity::Error, SourceLocation());
-  }
 
   if ((NewMethod->hasAttr<NSReturnsRetainedAttr>() !=
        Overridden->hasAttr<NSReturnsRetainedAttr>())) {
     Diag(NewMethod->getLocation(),
-         diag::warn_nsreturns_retained_attribute_mismatch) << 1;
+         getLangOpts().ObjCAutoRefCount
+             ? diag::err_nsreturns_retained_attribute_mismatch
+             : diag::warn_nsreturns_retained_attribute_mismatch)
+        << 1;
     Diag(Overridden->getLocation(), diag::note_previous_decl) << "method";
   }
   if ((NewMethod->hasAttr<NSReturnsNotRetainedAttr>() !=
        Overridden->hasAttr<NSReturnsNotRetainedAttr>())) {
     Diag(NewMethod->getLocation(),
-         diag::warn_nsreturns_retained_attribute_mismatch) << 0;
+         getLangOpts().ObjCAutoRefCount
+             ? diag::err_nsreturns_retained_attribute_mismatch
+             : diag::warn_nsreturns_retained_attribute_mismatch)
+        << 0;
     Diag(Overridden->getLocation(), diag::note_previous_decl)  << "method";
   }
 
@@ -185,7 +185,10 @@ void Sema::CheckObjCMethodOverride(ObjCMethodDecl *NewMethod,
     ParmVarDecl *newDecl = (*ni);
     if (newDecl->hasAttr<NSConsumedAttr>() !=
         oldDecl->hasAttr<NSConsumedAttr>()) {
-      Diag(newDecl->getLocation(), diag::warn_nsconsumed_attribute_mismatch);
+      Diag(newDecl->getLocation(),
+           getLangOpts().ObjCAutoRefCount
+               ? diag::err_nsconsumed_attribute_mismatch
+               : diag::warn_nsconsumed_attribute_mismatch);
       Diag(oldDecl->getLocation(), diag::note_previous_decl) << "parameter";
     }
 
diff --git a/lib/Sema/SemaExceptionSpec.cpp b/lib/Sema/SemaExceptionSpec.cpp
index 67d1b02d1fca..ebe93750a792 100644
--- a/lib/Sema/SemaExceptionSpec.cpp
+++ b/lib/Sema/SemaExceptionSpec.cpp
@@ -626,6 +626,90 @@ bool Sema::CheckEquivalentExceptionSpec(const PartialDiagnostic &DiagID,
                                           New, NewLoc);
 }
 
+bool Sema::handlerCanCatch(QualType HandlerType, QualType ExceptionType) {
+  // [except.handle]p3:
+  //   A handler is a match for an exception object of type E if:
+
+  // HandlerType must be ExceptionType or derived from it, or pointer or
+  // reference to such types.
+  const ReferenceType *RefTy = HandlerType->getAs<ReferenceType>();
+  if (RefTy)
+    HandlerType = RefTy->getPointeeType();
+
+  //   -- the handler is of type cv T or cv T& and E and T are the same type
+  if (Context.hasSameUnqualifiedType(ExceptionType, HandlerType))
+    return true;
+
+  // FIXME: ObjC pointer types?
+  if (HandlerType->isPointerType() || HandlerType->isMemberPointerType()) {
+    if (RefTy && (!HandlerType.isConstQualified() ||
+                  HandlerType.isVolatileQualified()))
+      return false;
+
+    // -- the handler is of type cv T or const T& where T is a pointer or
+    //    pointer to member type and E is std::nullptr_t
+    if (ExceptionType->isNullPtrType())
+      return true;
+
+    // -- the handler is of type cv T or const T& where T is a pointer or
+    //    pointer to member type and E is a pointer or pointer to member type
+    //    that can be converted to T by one or more of
+    //    -- a qualification conversion
+    //    -- a function pointer conversion
+    bool LifetimeConv;
+    QualType Result;
+    // FIXME: Should we treat the exception as catchable if a lifetime
+    // conversion is required?
+    if (IsQualificationConversion(ExceptionType, HandlerType, false,
+                                  LifetimeConv) ||
+        IsFunctionConversion(ExceptionType, HandlerType, Result))
+      return true;
+
+    //    -- a standard pointer conversion [...]
+    if (!ExceptionType->isPointerType() || !HandlerType->isPointerType())
+      return false;
+
+    // Handle the "qualification conversion" portion.
+    Qualifiers EQuals, HQuals;
+    ExceptionType = Context.getUnqualifiedArrayType(
+        ExceptionType->getPointeeType(), EQuals);
+    HandlerType = Context.getUnqualifiedArrayType(
+        HandlerType->getPointeeType(), HQuals);
+    if (!HQuals.compatiblyIncludes(EQuals))
+      return false;
+
+    if (HandlerType->isVoidType() && ExceptionType->isObjectType())
+      return true;
+
+    // The only remaining case is a derived-to-base conversion.
+  }
+
+  //   -- the handler is of type cg T or cv T& and T is an unambiguous public
+  //      base class of E
+  if (!ExceptionType->isRecordType() || !HandlerType->isRecordType())
+    return false;
+  CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
+                     /*DetectVirtual=*/false);
+  if (!IsDerivedFrom(SourceLocation(), ExceptionType, HandlerType, Paths) ||
+      Paths.isAmbiguous(Context.getCanonicalType(HandlerType)))
+    return false;
+
+  // Do this check from a context without privileges.
+  switch (CheckBaseClassAccess(SourceLocation(), HandlerType, ExceptionType,
+                               Paths.front(),
+                               /*Diagnostic*/ 0,
+                               /*ForceCheck*/ true,
+                               /*ForceUnprivileged*/ true)) {
+  case AR_accessible: return true;
+  case AR_inaccessible: return false;
+  case AR_dependent:
+    llvm_unreachable("access check dependent for unprivileged context");
+  case AR_delayed:
+    llvm_unreachable("access check delayed in non-declaration");
+  }
+  llvm_unreachable("unexpected access check result");
+}
+
 /// CheckExceptionSpecSubset - Check whether the second function type's
 /// exception specification is a subset (or equivalent) of the first function
 /// type. This is used by override and pointer assignment checks.
@@ -722,75 +806,23 @@ bool Sema::CheckExceptionSpecSubset(const PartialDiagnostic &DiagID,
          "Exception spec subset: non-dynamic case slipped through.");
 
   // Neither contains everything or nothing. Do a proper comparison.
-  for (const auto &SubI : Subset->exceptions()) {
-    // Take one type from the subset.
-    QualType CanonicalSubT = Context.getCanonicalType(SubI);
-    // Unwrap pointers and references so that we can do checks within a class
-    // hierarchy. Don't unwrap member pointers; they don't have hierarchy
-    // conversions on the pointee.
-    bool SubIsPointer = false;
-    if (const ReferenceType *RefTy = CanonicalSubT->getAs<ReferenceType>())
-      CanonicalSubT = RefTy->getPointeeType();
-    if (const PointerType *PtrTy = CanonicalSubT->getAs<PointerType>()) {
-      CanonicalSubT = PtrTy->getPointeeType();
-      SubIsPointer = true;
-    }
-    bool SubIsClass = CanonicalSubT->isRecordType();
-    CanonicalSubT = CanonicalSubT.getLocalUnqualifiedType();
-
-    CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
-                       /*DetectVirtual=*/false);
+  for (QualType SubI : Subset->exceptions()) {
+    if (const ReferenceType *RefTy = SubI->getAs<ReferenceType>())
+      SubI = RefTy->getPointeeType();
 
-    bool Contained = false;
     // Make sure it's in the superset.
-    for (const auto &SuperI : Superset->exceptions()) {
-      QualType CanonicalSuperT = Context.getCanonicalType(SuperI);
-      // SubT must be SuperT or derived from it, or pointer or reference to
-      // such types.
-      if (const ReferenceType *RefTy = CanonicalSuperT->getAs<ReferenceType>())
-        CanonicalSuperT = RefTy->getPointeeType();
-      if (SubIsPointer) {
-        if (const PointerType *PtrTy = CanonicalSuperT->getAs<PointerType>())
-          CanonicalSuperT = PtrTy->getPointeeType();
-        else {
-          continue;
-        }
-      }
-      CanonicalSuperT = CanonicalSuperT.getLocalUnqualifiedType();
-      // If the types are the same, move on to the next type in the subset.
-      if (CanonicalSubT == CanonicalSuperT) {
+    bool Contained = false;
+    for (QualType SuperI : Superset->exceptions()) {
+      // [except.spec]p5:
+      //   the target entity shall allow at least the exceptions allowed by the
+      //   source
+      //
+      // We interpret this as meaning that a handler for some target type would
+      // catch an exception of each source type.
+      if (handlerCanCatch(SuperI, SubI)) {
         Contained = true;
         break;
       }
-
-      // Otherwise we need to check the inheritance.
-      if (!SubIsClass || !CanonicalSuperT->isRecordType())
-        continue;
-
-      Paths.clear();
-      if (!IsDerivedFrom(SubLoc, CanonicalSubT, CanonicalSuperT, Paths))
-        continue;
-
-      if (Paths.isAmbiguous(Context.getCanonicalType(CanonicalSuperT)))
-        continue;
-
-      // Do this check from a context without privileges.
-      switch (CheckBaseClassAccess(SourceLocation(),
-                                   CanonicalSuperT, CanonicalSubT,
-                                   Paths.front(),
-                                   /*Diagnostic*/ 0,
-                                   /*ForceCheck*/ true,
-                                   /*ForceUnprivileged*/ true)) {
-      case AR_accessible: break;
-      case AR_inaccessible: continue;
-      case AR_dependent:
-        llvm_unreachable("access check dependent for unprivileged context");
-      case AR_delayed:
-        llvm_unreachable("access check delayed in non-declaration");
-      }
-
-      Contained = true;
-      break;
     }
     if (!Contained) {
       Diag(SubLoc, DiagID);
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 3812bc67448e..23b3a0ccebf8 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -1092,13 +1092,12 @@ static bool unsupportedTypeConversion(const Sema &S, QualType LHSType,
   Float128AndLongDouble |= (LHSElemType == S.Context.LongDoubleTy &&
                             RHSElemType == S.Context.Float128Ty);
 
-  /* We've handled the situation where __float128 and long double have the same
-     representation. The only other allowable conversion is if long double is
-     really just double.
-  */
+  // We've handled the situation where __float128 and long double have the same
+  // representation. We allow all conversions for all possible long double types
+  // except PPC's double double.
   return Float128AndLongDouble &&
-    (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) !=
-     &llvm::APFloat::IEEEdouble());
+    (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) ==
+     &llvm::APFloat::PPCDoubleDouble());
 }
 
 typedef ExprResult PerformCastFn(Sema &S, Expr *operand, QualType toType);
@@ -1711,7 +1710,7 @@ Sema::DecomposeUnqualifiedId(const UnqualifiedId &Id,
                              TemplateArgumentListInfo &Buffer,
                              DeclarationNameInfo &NameInfo,
                              const TemplateArgumentListInfo *&TemplateArgs) {
-  if (Id.getKind() == UnqualifiedId::IK_TemplateId) {
+  if (Id.getKind() == UnqualifiedIdKind::IK_TemplateId) {
     Buffer.setLAngleLoc(Id.TemplateId->LAngleLoc);
     Buffer.setRAngleLoc(Id.TemplateId->RAngleLoc);
 
@@ -2070,9 +2069,10 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
                                       IsAddressOfOperand, TemplateArgs);
 
   // Perform the required lookup.
-  LookupResult R(*this, NameInfo, 
-                 (Id.getKind() == UnqualifiedId::IK_ImplicitSelfParam) 
-                  ? LookupObjCImplicitSelfParam : LookupOrdinaryName);
+  LookupResult R(*this, NameInfo,
+                 (Id.getKind() == UnqualifiedIdKind::IK_ImplicitSelfParam)
+                     ? LookupObjCImplicitSelfParam
+                     : LookupOrdinaryName);
   if (TemplateArgs) {
     // Lookup the template name again to correctly establish the context in
     // which it was found. This is really unfortunate as we already did the
@@ -2243,7 +2243,7 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     // In C++1y, if this is a variable template id, then check it
     // in BuildTemplateIdExpr().
     // The single lookup result must be a variable template declaration.
-    if (Id.getKind() == UnqualifiedId::IK_TemplateId && Id.TemplateId &&
+    if (Id.getKind() == UnqualifiedIdKind::IK_TemplateId && Id.TemplateId &&
         Id.TemplateId->Kind == TNK_Var_template) {
       assert(R.getAsSingle<VarTemplateDecl>() &&
              "There should only be one declaration found.");
@@ -2401,7 +2401,7 @@ Sema::LookupInObjCMethod(LookupResult &Lookup, Scope *S,
       IdentifierInfo &II = Context.Idents.get("self");
       UnqualifiedId SelfName;
       SelfName.setIdentifier(&II, SourceLocation());
-      SelfName.setKind(UnqualifiedId::IK_ImplicitSelfParam);
+      SelfName.setKind(UnqualifiedIdKind::IK_ImplicitSelfParam);
       CXXScopeSpec SelfScopeSpec;
       SourceLocation TemplateKWLoc;
       ExprResult SelfExpr = ActOnIdExpression(S, SelfScopeSpec, TemplateKWLoc,
@@ -9299,16 +9299,6 @@ QualType Sema::CheckShiftOperands(ExprResult &LHS, ExprResult &RHS,
   return LHSType;
 }
 
-static bool IsWithinTemplateSpecialization(Decl *D) {
-  if (DeclContext *DC = D->getDeclContext()) {
-    if (isa<ClassTemplateSpecializationDecl>(DC))
-      return true;
-    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(DC))
-      return FD->isFunctionTemplateSpecialization();
-  }
-  return false;
-}
-
 /// If two different enums are compared, raise a warning.
 static void checkEnumComparison(Sema &S, SourceLocation Loc, Expr *LHS,
                                 Expr *RHS) {
@@ -9586,137 +9576,181 @@ static void diagnoseLogicalNotOnLHSofCheck(Sema &S, ExprResult &LHS,
 // Get the decl for a simple expression: a reference to a variable,
 // an implicit C++ field reference, or an implicit ObjC ivar reference.
 static ValueDecl *getCompareDecl(Expr *E) {
-  if (DeclRefExpr* DR = dyn_cast<DeclRefExpr>(E))
+  if (DeclRefExpr *DR = dyn_cast<DeclRefExpr>(E))
     return DR->getDecl();
-  if (ObjCIvarRefExpr* Ivar = dyn_cast<ObjCIvarRefExpr>(E)) {
+  if (ObjCIvarRefExpr *Ivar = dyn_cast<ObjCIvarRefExpr>(E)) {
     if (Ivar->isFreeIvar())
       return Ivar->getDecl();
   }
-  if (MemberExpr* Mem = dyn_cast<MemberExpr>(E)) {
+  if (MemberExpr *Mem = dyn_cast<MemberExpr>(E)) {
     if (Mem->isImplicitAccess())
       return Mem->getMemberDecl();
   }
   return nullptr;
 }
 
+/// Diagnose some forms of syntactically-obvious tautological comparison.
+static void diagnoseTautologicalComparison(Sema &S, SourceLocation Loc,
+                                           Expr *LHS, Expr *RHS,
+                                           BinaryOperatorKind Opc) {
+  Expr *LHSStripped = LHS->IgnoreParenImpCasts();
+  Expr *RHSStripped = RHS->IgnoreParenImpCasts();
+
+  QualType LHSType = LHS->getType();
+  if (LHSType->hasFloatingRepresentation() ||
+      (LHSType->isBlockPointerType() && !BinaryOperator::isEqualityOp(Opc)) ||
+      LHS->getLocStart().isMacroID() || RHS->getLocStart().isMacroID() ||
+      S.inTemplateInstantiation())
+    return;
+
+  // For non-floating point types, check for self-comparisons of the form
+  // x == x, x != x, x < x, etc.  These always evaluate to a constant, and
+  // often indicate logic errors in the program.
+  //
+  // NOTE: Don't warn about comparison expressions resulting from macro
+  // expansion. Also don't warn about comparisons which are only self
+  // comparisons within a template instantiation. The warnings should catch
+  // obvious cases in the definition of the template anyways. The idea is to
+  // warn when the typed comparison operator will always evaluate to the same
+  // result.
+  ValueDecl *DL = getCompareDecl(LHSStripped);
+  ValueDecl *DR = getCompareDecl(RHSStripped);
+  if (DL && DR && declaresSameEntity(DL, DR)) {
+    StringRef Result;
+    switch (Opc) {
+    case BO_EQ: case BO_LE: case BO_GE:
+      Result = "true";
+      break;
+    case BO_NE: case BO_LT: case BO_GT:
+      Result = "false";
+      break;
+    case BO_Cmp:
+      Result = "'std::strong_ordering::equal'";
+      break;
+    default:
+      break;
+    }
+    S.DiagRuntimeBehavior(Loc, nullptr,
+                          S.PDiag(diag::warn_comparison_always)
+                              << 0 /*self-comparison*/ << !Result.empty()
+                              << Result);
+  } else if (DL && DR &&
+             DL->getType()->isArrayType() && DR->getType()->isArrayType() &&
+             !DL->isWeak() && !DR->isWeak()) {
+    // What is it always going to evaluate to?
+    StringRef Result;
+    switch(Opc) {
+    case BO_EQ: // e.g. array1 == array2
+      Result = "false";
+      break;
+    case BO_NE: // e.g. array1 != array2
+      Result = "true";
+      break;
+    default: // e.g. array1 <= array2
+      // The best we can say is 'a constant'
+      break;
+    }
+    S.DiagRuntimeBehavior(Loc, nullptr,
+                          S.PDiag(diag::warn_comparison_always)
+                              << 1 /*array comparison*/
+                              << !Result.empty() << Result);
+  }
+
+  if (isa<CastExpr>(LHSStripped))
+    LHSStripped = LHSStripped->IgnoreParenCasts();
+  if (isa<CastExpr>(RHSStripped))
+    RHSStripped = RHSStripped->IgnoreParenCasts();
+
+  // Warn about comparisons against a string constant (unless the other
+  // operand is null); the user probably wants strcmp.
+  Expr *LiteralString = nullptr;
+  Expr *LiteralStringStripped = nullptr;
+  if ((isa<StringLiteral>(LHSStripped) || isa<ObjCEncodeExpr>(LHSStripped)) &&
+      !RHSStripped->isNullPointerConstant(S.Context,
+                                          Expr::NPC_ValueDependentIsNull)) {
+    LiteralString = LHS;
+    LiteralStringStripped = LHSStripped;
+  } else if ((isa<StringLiteral>(RHSStripped) ||
+              isa<ObjCEncodeExpr>(RHSStripped)) &&
+             !LHSStripped->isNullPointerConstant(S.Context,
+                                          Expr::NPC_ValueDependentIsNull)) {
+    LiteralString = RHS;
+    LiteralStringStripped = RHSStripped;
+  }
+
+  if (LiteralString) {
+    S.DiagRuntimeBehavior(Loc, nullptr,
+                          S.PDiag(diag::warn_stringcompare)
+                              << isa<ObjCEncodeExpr>(LiteralStringStripped)
+                              << LiteralString->getSourceRange());
+  }
+}
+
+static QualType checkArithmeticOrEnumeralCompare(Sema &S, ExprResult &LHS,
+                                                 ExprResult &RHS,
+                                                 SourceLocation Loc,
+                                                 BinaryOperatorKind Opc) {
+  // C99 6.5.8p3 / C99 6.5.9p4
+  QualType Type = S.UsualArithmeticConversions(LHS, RHS);
+  if (LHS.isInvalid() || RHS.isInvalid())
+    return QualType();
+  if (Type.isNull())
+    return S.InvalidOperands(Loc, LHS, RHS);
+  assert(Type->isArithmeticType() || Type->isEnumeralType());
+
+  checkEnumComparison(S, Loc, LHS.get(), RHS.get());
+
+  enum { StrongEquality, PartialOrdering, StrongOrdering } Ordering;
+  if (Type->isAnyComplexType())
+    Ordering = StrongEquality;
+  else if (Type->isFloatingType())
+    Ordering = PartialOrdering;
+  else
+    Ordering = StrongOrdering;
+
+  if (Ordering == StrongEquality && BinaryOperator::isRelationalOp(Opc))
+    return S.InvalidOperands(Loc, LHS, RHS);
+
+  // Check for comparisons of floating point operands using != and ==.
+  if (Type->hasFloatingRepresentation() && BinaryOperator::isEqualityOp(Opc))
+    S.CheckFloatComparison(Loc, LHS.get(), RHS.get());
+
+  // The result of comparisons is 'bool' in C++, 'int' in C.
+  // FIXME: For BO_Cmp, return the relevant comparison category type.
+  return S.Context.getLogicalOperationType();
+}
+
 // C99 6.5.8, C++ [expr.rel]
 QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
                                     SourceLocation Loc, BinaryOperatorKind Opc,
                                     bool IsRelational) {
+  // Comparisons expect an rvalue, so convert to rvalue before any
+  // type-related checks.
+  LHS = DefaultFunctionArrayLvalueConversion(LHS.get());
+  if (LHS.isInvalid())
+    return QualType();
+  RHS = DefaultFunctionArrayLvalueConversion(RHS.get());
+  if (RHS.isInvalid())
+    return QualType();
+
   checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/true);
 
   // Handle vector comparisons separately.
   if (LHS.get()->getType()->isVectorType() ||
       RHS.get()->getType()->isVectorType())
-    return CheckVectorCompareOperands(LHS, RHS, Loc, IsRelational);
+    return CheckVectorCompareOperands(LHS, RHS, Loc, Opc);
 
-  QualType LHSType = LHS.get()->getType();
-  QualType RHSType = RHS.get()->getType();
-
-  Expr *LHSStripped = LHS.get()->IgnoreParenImpCasts();
-  Expr *RHSStripped = RHS.get()->IgnoreParenImpCasts();
-
-  checkEnumComparison(*this, Loc, LHS.get(), RHS.get());
   diagnoseLogicalNotOnLHSofCheck(*this, LHS, RHS, Loc, Opc);
+  diagnoseTautologicalComparison(*this, Loc, LHS.get(), RHS.get(), Opc);
 
-  if (!LHSType->hasFloatingRepresentation() &&
-      !(LHSType->isBlockPointerType() && IsRelational) &&
-      !LHS.get()->getLocStart().isMacroID() &&
-      !RHS.get()->getLocStart().isMacroID() &&
-      !inTemplateInstantiation()) {
-    // For non-floating point types, check for self-comparisons of the form
-    // x == x, x != x, x < x, etc.  These always evaluate to a constant, and
-    // often indicate logic errors in the program.
-    //
-    // NOTE: Don't warn about comparison expressions resulting from macro
-    // expansion. Also don't warn about comparisons which are only self
-    // comparisons within a template specialization. The warnings should catch
-    // obvious cases in the definition of the template anyways. The idea is to
-    // warn when the typed comparison operator will always evaluate to the same
-    // result.
-    ValueDecl *DL = getCompareDecl(LHSStripped);
-    ValueDecl *DR = getCompareDecl(RHSStripped);
-    if (DL && DR && DL == DR && !IsWithinTemplateSpecialization(DL)) {
-      DiagRuntimeBehavior(Loc, nullptr, PDiag(diag::warn_comparison_always)
-                          << 0 // self-
-                          << (Opc == BO_EQ
-                              || Opc == BO_LE
-                              || Opc == BO_GE));
-    } else if (DL && DR && LHSType->isArrayType() && RHSType->isArrayType() &&
-               !DL->getType()->isReferenceType() &&
-               !DR->getType()->isReferenceType()) {
-        // what is it always going to eval to?
-        char always_evals_to;
-        switch(Opc) {
-        case BO_EQ: // e.g. array1 == array2
-          always_evals_to = 0; // false
-          break;
-        case BO_NE: // e.g. array1 != array2
-          always_evals_to = 1; // true
-          break;
-        default:
-          // best we can say is 'a constant'
-          always_evals_to = 2; // e.g. array1 <= array2
-          break;
-        }
-        DiagRuntimeBehavior(Loc, nullptr, PDiag(diag::warn_comparison_always)
-                            << 1 // array
-                            << always_evals_to);
-    }
-
-    if (isa<CastExpr>(LHSStripped))
-      LHSStripped = LHSStripped->IgnoreParenCasts();
-    if (isa<CastExpr>(RHSStripped))
-      RHSStripped = RHSStripped->IgnoreParenCasts();
-
-    // Warn about comparisons against a string constant (unless the other
-    // operand is null), the user probably wants strcmp.
-    Expr *literalString = nullptr;
-    Expr *literalStringStripped = nullptr;
-    if ((isa<StringLiteral>(LHSStripped) || isa<ObjCEncodeExpr>(LHSStripped)) &&
-        !RHSStripped->isNullPointerConstant(Context,
-                                            Expr::NPC_ValueDependentIsNull)) {
-      literalString = LHS.get();
-      literalStringStripped = LHSStripped;
-    } else if ((isa<StringLiteral>(RHSStripped) ||
-                isa<ObjCEncodeExpr>(RHSStripped)) &&
-               !LHSStripped->isNullPointerConstant(Context,
-                                            Expr::NPC_ValueDependentIsNull)) {
-      literalString = RHS.get();
-      literalStringStripped = RHSStripped;
-    }
-
-    if (literalString) {
-      DiagRuntimeBehavior(Loc, nullptr,
-        PDiag(diag::warn_stringcompare)
-          << isa<ObjCEncodeExpr>(literalStringStripped)
-          << literalString->getSourceRange());
-    }
-  }
-
-  // C99 6.5.8p3 / C99 6.5.9p4
-  UsualArithmeticConversions(LHS, RHS);
-  if (LHS.isInvalid() || RHS.isInvalid())
-    return QualType();
-
-  LHSType = LHS.get()->getType();
-  RHSType = RHS.get()->getType();
+  QualType LHSType = LHS.get()->getType();
+  QualType RHSType = RHS.get()->getType();
+  if ((LHSType->isArithmeticType() || LHSType->isEnumeralType()) &&
+      (RHSType->isArithmeticType() || RHSType->isEnumeralType()))
+    return checkArithmeticOrEnumeralCompare(*this, LHS, RHS, Loc, Opc);
 
-  // The result of comparisons is 'bool' in C++, 'int' in C.
   QualType ResultTy = Context.getLogicalOperationType();
 
-  if (IsRelational) {
-    if (LHSType->isRealType() && RHSType->isRealType())
-      return ResultTy;
-  } else {
-    // Check for comparisons of floating point operands using != and ==.
-    if (LHSType->hasFloatingRepresentation())
-      CheckFloatComparison(Loc, LHS.get(), RHS.get());
-
-    if (LHSType->isArithmeticType() && RHSType->isArithmeticType())
-      return ResultTy;
-  }
-
   const Expr::NullPointerConstantKind LHSNullKind =
       LHS.get()->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull);
   const Expr::NullPointerConstantKind RHSNullKind =
@@ -9890,13 +9924,6 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
       else
         return ResultTy;
     }
-
-    // Handle scoped enumeration types specifically, since they don't promote
-    // to integers.
-    if (LHS.get()->getType()->isEnumeralType() &&
-        Context.hasSameUnqualifiedType(LHS.get()->getType(),
-                                       RHS.get()->getType()))
-      return ResultTy;
   }
 
   // Handle block pointer types.
@@ -10100,7 +10127,7 @@ QualType Sema::GetSignedVectorType(QualType V) {
 /// types.
 QualType Sema::CheckVectorCompareOperands(ExprResult &LHS, ExprResult &RHS,
                                           SourceLocation Loc,
-                                          bool IsRelational) {
+                                          BinaryOperatorKind Opc) {
   // Check to make sure we're operating on vectors of the same type and width,
   // Allowing one side to be a scalar of element type.
   QualType vType = CheckVectorOperands(LHS, RHS, Loc, /*isCompAssign*/false,
@@ -10120,22 +10147,12 @@ QualType Sema::CheckVectorCompareOperands(ExprResult &LHS, ExprResult &RHS,
   // For non-floating point types, check for self-comparisons of the form
   // x == x, x != x, x < x, etc.  These always evaluate to a constant, and
   // often indicate logic errors in the program.
-  if (!LHSType->hasFloatingRepresentation() && !inTemplateInstantiation()) {
-    if (DeclRefExpr* DRL
-          = dyn_cast<DeclRefExpr>(LHS.get()->IgnoreParenImpCasts()))
-      if (DeclRefExpr* DRR
-            = dyn_cast<DeclRefExpr>(RHS.get()->IgnoreParenImpCasts()))
-        if (DRL->getDecl() == DRR->getDecl())
-          DiagRuntimeBehavior(Loc, nullptr,
-                              PDiag(diag::warn_comparison_always)
-                                << 0 // self-
-                                << 2 // "a constant"
-                              );
-  }
+  diagnoseTautologicalComparison(*this, Loc, LHS.get(), RHS.get(), Opc);
 
   // Check for comparisons of floating point operands using != and ==.
-  if (!IsRelational && LHSType->hasFloatingRepresentation()) {
-    assert (RHS.get()->getType()->hasFloatingRepresentation());
+  if (BinaryOperator::isEqualityOp(Opc) &&
+      LHSType->hasFloatingRepresentation()) {
+    assert(RHS.get()->getType()->hasFloatingRepresentation());
     CheckFloatComparison(Loc, LHS.get(), RHS.get());
   }
 
@@ -11583,8 +11600,8 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
     // C++11 5.17p9:
     //   The meaning of x = {v} [...] is that of x = T(v) [...]. The meaning
     //   of x = {} is x = T().
-    InitializationKind Kind =
-        InitializationKind::CreateDirectList(RHSExpr->getLocStart());
+    InitializationKind Kind = InitializationKind::CreateDirectList(
+        RHSExpr->getLocStart(), RHSExpr->getLocStart(), RHSExpr->getLocEnd());
     InitializedEntity Entity =
         InitializedEntity::InitializeTemporary(LHSExpr->getType());
     InitializationSequence InitSeq(*this, Entity, Kind, RHSExpr);
@@ -12160,6 +12177,16 @@ ExprResult Sema::BuildBinOp(Scope *S, SourceLocation OpLoc,
   return CreateBuiltinBinOp(OpLoc, Opc, LHSExpr, RHSExpr);
 }
 
+static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
+  if (T.isNull() || T->isDependentType())
+    return false;
+
+  if (!T->isPromotableIntegerType())
+    return true;
+
+  return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
+}
+
 ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                       UnaryOperatorKind Opc,
                                       Expr *InputExpr) {
@@ -12167,6 +12194,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
   ExprValueKind VK = VK_RValue;
   ExprObjectKind OK = OK_Ordinary;
   QualType resultType;
+  bool CanOverflow = false;
+
   bool ConvertHalfVec = false;
   if (getLangOpts().OpenCL) {
     QualType Ty = InputExpr->getType();
@@ -12192,6 +12221,7 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                                 Opc == UO_PostInc,
                                                 Opc == UO_PreInc ||
                                                 Opc == UO_PreDec);
+    CanOverflow = isOverflowingIntegerType(Context, resultType);
     break;
   case UO_AddrOf:
     resultType = CheckAddressOfOperand(Input, OpLoc);
@@ -12205,6 +12235,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
   }
   case UO_Plus:
   case UO_Minus:
+    CanOverflow = Opc == UO_Minus &&
+                  isOverflowingIntegerType(Context, Input.get()->getType());
     Input = UsualUnaryConversions(Input.get());
     if (Input.isInvalid()) return ExprError();
     // Unary plus and minus require promoting an operand of half vector to a
@@ -12241,6 +12273,7 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
     if (Input.isInvalid())
       return ExprError();
     resultType = Input.get()->getType();
+
     if (resultType->isDependentType())
       break;
     // C99 6.5.3.3p1. We allow complex int and float as a GCC extension.
@@ -12355,7 +12388,7 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
     CheckArrayAccess(Input.get());
 
   auto *UO = new (Context)
-      UnaryOperator(Input.get(), Opc, resultType, VK, OK, OpLoc);
+      UnaryOperator(Input.get(), Opc, resultType, VK, OK, OpLoc, CanOverflow);
   // Convert the result back to a half vector.
   if (ConvertHalfVec)
     return convertVector(UO, Context.HalfTy, *this);
diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp
index cff9fbbf491b..f0994d7bcdce 100644
--- a/lib/Sema/SemaExprCXX.cpp
+++ b/lib/Sema/SemaExprCXX.cpp
@@ -356,7 +356,7 @@ ParsedType Sema::getDestructorTypeForDecltype(const DeclSpec &DS,
 
 bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS,
                                   const UnqualifiedId &Name) {
-  assert(Name.getKind() == UnqualifiedId::IK_LiteralOperatorId);
+  assert(Name.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId);
 
   if (!SS.isValid())
     return false;
@@ -1244,11 +1244,16 @@ bool Sema::isThisOutsideMemberFunctionBody(QualType BaseType) {
   return Class && Class->isBeingDefined();
 }
 
+/// Parse construction of a specified type.
+/// Can be interpreted either as function-style casting ("int(x)")
+/// or class type construction ("ClassType(x,y,z)")
+/// or creation of a value-initialized type ("int()").
 ExprResult
 Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
-                                SourceLocation LParenLoc,
+                                SourceLocation LParenOrBraceLoc,
                                 MultiExprArg exprs,
-                                SourceLocation RParenLoc) {
+                                SourceLocation RParenOrBraceLoc,
+                                bool ListInitialization) {
   if (!TypeRep)
     return ExprError();
 
@@ -1257,7 +1262,8 @@ Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
   if (!TInfo)
     TInfo = Context.getTrivialTypeSourceInfo(Ty, SourceLocation());
 
-  auto Result = BuildCXXTypeConstructExpr(TInfo, LParenLoc, exprs, RParenLoc);
+  auto Result = BuildCXXTypeConstructExpr(TInfo, LParenOrBraceLoc, exprs,
+                                          RParenOrBraceLoc, ListInitialization);
   // Avoid creating a non-type-dependent expression that contains typos.
   // Non-type-dependent expressions are liable to be discarded without
   // checking for embedded typos.
@@ -1267,38 +1273,40 @@ Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
   return Result;
 }
 
-/// ActOnCXXTypeConstructExpr - Parse construction of a specified type.
-/// Can be interpreted either as function-style casting ("int(x)")
-/// or class type construction ("ClassType(x,y,z)")
-/// or creation of a value-initialized type ("int()").
 ExprResult
 Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
-                                SourceLocation LParenLoc,
+                                SourceLocation LParenOrBraceLoc,
                                 MultiExprArg Exprs,
-                                SourceLocation RParenLoc) {
+                                SourceLocation RParenOrBraceLoc,
+                                bool ListInitialization) {
   QualType Ty = TInfo->getType();
   SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc();
 
   if (Ty->isDependentType() || CallExpr::hasAnyTypeDependentArguments(Exprs)) {
-    return CXXUnresolvedConstructExpr::Create(Context, TInfo, LParenLoc, Exprs,
-                                              RParenLoc);
+    // FIXME: CXXUnresolvedConstructExpr does not model list-initialization
+    // directly. We work around this by dropping the locations of the braces.
+    SourceRange Locs = ListInitialization
+                           ? SourceRange()
+                           : SourceRange(LParenOrBraceLoc, RParenOrBraceLoc);
+    return CXXUnresolvedConstructExpr::Create(Context, TInfo, Locs.getBegin(),
+                                              Exprs, Locs.getEnd());
   }
 
-  bool ListInitialization = LParenLoc.isInvalid();
   assert((!ListInitialization ||
           (Exprs.size() == 1 && isa<InitListExpr>(Exprs[0]))) &&
          "List initialization must have initializer list as expression.");
-  SourceRange FullRange = SourceRange(TyBeginLoc,
-      ListInitialization ? Exprs[0]->getSourceRange().getEnd() : RParenLoc);
+  SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc);
 
   InitializedEntity Entity = InitializedEntity::InitializeTemporary(TInfo);
   InitializationKind Kind =
       Exprs.size()
           ? ListInitialization
-                ? InitializationKind::CreateDirectList(TyBeginLoc)
-                : InitializationKind::CreateDirect(TyBeginLoc, LParenLoc,
-                                                   RParenLoc)
-          : InitializationKind::CreateValue(TyBeginLoc, LParenLoc, RParenLoc);
+                ? InitializationKind::CreateDirectList(
+                      TyBeginLoc, LParenOrBraceLoc, RParenOrBraceLoc)
+                : InitializationKind::CreateDirect(TyBeginLoc, LParenOrBraceLoc,
+                                                   RParenOrBraceLoc)
+          : InitializationKind::CreateValue(TyBeginLoc, LParenOrBraceLoc,
+                                            RParenOrBraceLoc);
 
   // C++1z [expr.type.conv]p1:
   //   If the type is a placeholder for a deduced class type, [...perform class
@@ -1319,7 +1327,8 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
   if (Exprs.size() == 1 && !ListInitialization &&
       !isa<InitListExpr>(Exprs[0])) {
     Expr *Arg = Exprs[0];
-    return BuildCXXFunctionalCastExpr(TInfo, Ty, LParenLoc, Arg, RParenLoc);
+    return BuildCXXFunctionalCastExpr(TInfo, Ty, LParenOrBraceLoc, Arg,
+                                      RParenOrBraceLoc);
   }
 
   //   For an expression of the form T(), T shall not be an array type.
@@ -1367,9 +1376,12 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
     // CXXTemporaryObjectExpr. It's also weird that the functional cast
     // is sometimes handled by initialization and sometimes not.
     QualType ResultType = Result.get()->getType();
+    SourceRange Locs = ListInitialization
+                           ? SourceRange()
+                           : SourceRange(LParenOrBraceLoc, RParenOrBraceLoc);
     Result = CXXFunctionalCastExpr::Create(
-        Context, ResultType, Expr::getValueKindForType(Ty), TInfo,
-        CK_NoOp, Result.get(), /*Path=*/nullptr, LParenLoc, RParenLoc);
+        Context, ResultType, Expr::getValueKindForType(Ty), TInfo, CK_NoOp,
+        Result.get(), /*Path=*/nullptr, Locs.getBegin(), Locs.getEnd());
   }
 
   return Result;
@@ -1728,7 +1740,9 @@ Sema::BuildCXXNew(SourceRange Range, bool UseGlobal,
   //     - Otherwise, the new-initializer is interpreted according to the
   //       initialization rules of 8.5 for direct-initialization.
         : initStyle == CXXNewExpr::ListInit
-            ? InitializationKind::CreateDirectList(TypeRange.getBegin())
+            ? InitializationKind::CreateDirectList(TypeRange.getBegin(),
+                                                   Initializer->getLocStart(),
+                                                   Initializer->getLocEnd())
             : InitializationKind::CreateDirect(TypeRange.getBegin(),
                                                DirectInitRange.getBegin(),
                                                DirectInitRange.getEnd());
@@ -4645,11 +4659,14 @@ static bool evaluateTypeTrait(Sema &S, TypeTrait Kind, SourceLocation KWLoc,
   if (Kind <= UTT_Last)
     return EvaluateUnaryTypeTrait(S, Kind, KWLoc, Args[0]->getType());
 
-  if (Kind <= BTT_Last)
+  // Evaluate BTT_ReferenceBindsToTemporary alongside the IsConstructible
+  // traits to avoid duplication.
+  if (Kind <= BTT_Last && Kind != BTT_ReferenceBindsToTemporary)
     return EvaluateBinaryTypeTrait(S, Kind, Args[0]->getType(),
                                    Args[1]->getType(), RParenLoc);
 
   switch (Kind) {
+  case clang::BTT_ReferenceBindsToTemporary:
   case clang::TT_IsConstructible:
   case clang::TT_IsNothrowConstructible:
   case clang::TT_IsTriviallyConstructible: {
@@ -4726,6 +4743,13 @@ static bool evaluateTypeTrait(Sema &S, TypeTrait Kind, SourceLocation KWLoc,
     if (Kind == clang::TT_IsConstructible)
       return true;
 
+    if (Kind == clang::BTT_ReferenceBindsToTemporary) {
+      if (!T->isReferenceType())
+        return false;
+
+      return !Init.isDirectReferenceBinding();
+    }
+
     if (Kind == clang::TT_IsNothrowConstructible)
       return S.canThrow(Result.get()) == CT_Cannot;
 
@@ -6716,11 +6740,11 @@ ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base,
                                            SourceLocation CCLoc,
                                            SourceLocation TildeLoc,
                                            UnqualifiedId &SecondTypeName) {
-  assert((FirstTypeName.getKind() == UnqualifiedId::IK_TemplateId ||
-          FirstTypeName.getKind() == UnqualifiedId::IK_Identifier) &&
+  assert((FirstTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId ||
+          FirstTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) &&
          "Invalid first type name in pseudo-destructor");
-  assert((SecondTypeName.getKind() == UnqualifiedId::IK_TemplateId ||
-          SecondTypeName.getKind() == UnqualifiedId::IK_Identifier) &&
+  assert((SecondTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId ||
+          SecondTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) &&
          "Invalid second type name in pseudo-destructor");
 
   QualType ObjectType;
@@ -6742,7 +6766,7 @@ ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base,
   QualType DestructedType;
   TypeSourceInfo *DestructedTypeInfo = nullptr;
   PseudoDestructorTypeStorage Destructed;
-  if (SecondTypeName.getKind() == UnqualifiedId::IK_Identifier) {
+  if (SecondTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) {
     ParsedType T = getTypeName(*SecondTypeName.Identifier,
                                SecondTypeName.StartLocation,
                                S, &SS, true, false, ObjectTypePtrForLookup,
@@ -6800,9 +6824,9 @@ ExprResult Sema::ActOnPseudoDestructorExpr(Scope *S, Expr *Base,
   // Convert the name of the scope type (the type prior to '::') into a type.
   TypeSourceInfo *ScopeTypeInfo = nullptr;
   QualType ScopeType;
-  if (FirstTypeName.getKind() == UnqualifiedId::IK_TemplateId ||
+  if (FirstTypeName.getKind() == UnqualifiedIdKind::IK_TemplateId ||
       FirstTypeName.Identifier) {
-    if (FirstTypeName.getKind() == UnqualifiedId::IK_Identifier) {
+    if (FirstTypeName.getKind() == UnqualifiedIdKind::IK_Identifier) {
       ParsedType T = getTypeName(*FirstTypeName.Identifier,
                                  FirstTypeName.StartLocation,
                                  S, &SS, true, false, ObjectTypePtrForLookup,
diff --git a/lib/Sema/SemaExprMember.cpp b/lib/Sema/SemaExprMember.cpp
index dd516ea3b428..f57f79e62efe 100644
--- a/lib/Sema/SemaExprMember.cpp
+++ b/lib/Sema/SemaExprMember.cpp
@@ -1707,7 +1707,7 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base,
 
   // Warn about the explicit constructor calls Microsoft extension.
   if (getLangOpts().MicrosoftExt &&
-      Id.getKind() == UnqualifiedId::IK_ConstructorName)
+      Id.getKind() == UnqualifiedIdKind::IK_ConstructorName)
     Diag(Id.getSourceRange().getBegin(),
          diag::ext_ms_explicit_constructor_call);
 
diff --git a/lib/Sema/SemaExprObjC.cpp b/lib/Sema/SemaExprObjC.cpp
index cd0c2c47ae4c..6a725c485d57 100644
--- a/lib/Sema/SemaExprObjC.cpp
+++ b/lib/Sema/SemaExprObjC.cpp
@@ -4276,9 +4276,9 @@ Expr *Sema::stripARCUnbridgedCast(Expr *e) {
   } else if (UnaryOperator *uo = dyn_cast<UnaryOperator>(e)) {
     assert(uo->getOpcode() == UO_Extension);
     Expr *sub = stripARCUnbridgedCast(uo->getSubExpr());
-    return new (Context) UnaryOperator(sub, UO_Extension, sub->getType(),
-                                   sub->getValueKind(), sub->getObjectKind(),
-                                       uo->getOperatorLoc());
+    return new (Context)
+        UnaryOperator(sub, UO_Extension, sub->getType(), sub->getValueKind(),
+                      sub->getObjectKind(), uo->getOperatorLoc(), false);
   } else if (GenericSelectionExpr *gse = dyn_cast<GenericSelectionExpr>(e)) {
     assert(!gse->isResultDependent());
 
diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp
index 011051da58e5..464a311ea9d0 100644
--- a/lib/Sema/SemaInit.cpp
+++ b/lib/Sema/SemaInit.cpp
@@ -3533,8 +3533,8 @@ static bool TryInitializerListConstruction(Sema &S,
       clang::ArrayType::Normal, 0);
   InitializedEntity HiddenArray =
       InitializedEntity::InitializeTemporary(ArrayType);
-  InitializationKind Kind =
-      InitializationKind::CreateDirectList(List->getExprLoc());
+  InitializationKind Kind = InitializationKind::CreateDirectList(
+      List->getExprLoc(), List->getLocStart(), List->getLocEnd());
   TryListInitialization(S, HiddenArray, Kind, List, Sequence,
                         TreatUnavailableAsInvalid);
   if (Sequence)
@@ -6031,10 +6031,7 @@ PerformConstructorInitialization(Sema &S,
     TypeSourceInfo *TSInfo = Entity.getTypeSourceInfo();
     if (!TSInfo)
       TSInfo = S.Context.getTrivialTypeSourceInfo(Entity.getType(), Loc);
-    SourceRange ParenOrBraceRange =
-      (Kind.getKind() == InitializationKind::IK_DirectList)
-      ? SourceRange(LBraceLoc, RBraceLoc)
-      : Kind.getParenRange();
+    SourceRange ParenOrBraceRange = Kind.getParenOrBraceRange();
 
     if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(
             Step.Function.FoundDecl.getDecl())) {
@@ -6068,7 +6065,7 @@ PerformConstructorInitialization(Sema &S,
     if (IsListInitialization)
       ParenOrBraceRange = SourceRange(LBraceLoc, RBraceLoc);
     else if (Kind.getKind() == InitializationKind::IK_Direct)
-      ParenOrBraceRange = Kind.getParenRange();
+      ParenOrBraceRange = Kind.getParenOrBraceRange();
 
     // If the entity allows NRVO, mark the construction as elidable
     // unconditionally.
@@ -6594,7 +6591,7 @@ InitializationSequence::Perform(Sema &S,
     if (Kind.getKind() == InitializationKind::IK_Direct &&
         !Kind.isExplicitCast()) {
       // Rebuild the ParenListExpr.
-      SourceRange ParenRange = Kind.getParenRange();
+      SourceRange ParenRange = Kind.getParenOrBraceRange();
       return S.ActOnParenListExpr(ParenRange.getBegin(), ParenRange.getEnd(),
                                   Args);
     }
@@ -7114,14 +7111,17 @@ InitializationSequence::Perform(Sema &S,
       bool IsStdInitListInit =
           Step->Kind == SK_StdInitializerListConstructorCall;
       Expr *Source = CurInit.get();
+      SourceRange Range = Kind.hasParenOrBraceRange()
+                              ? Kind.getParenOrBraceRange()
+                              : SourceRange();
       CurInit = PerformConstructorInitialization(
           S, UseTemporary ? TempEntity : Entity, Kind,
           Source ? MultiExprArg(Source) : Args, *Step,
           ConstructorInitRequiresZeroInit,
           /*IsListInitialization*/ IsStdInitListInit,
           /*IsStdInitListInitialization*/ IsStdInitListInit,
-          /*LBraceLoc*/ SourceLocation(),
-          /*RBraceLoc*/ SourceLocation());
+          /*LBraceLoc*/ Range.getBegin(),
+          /*RBraceLoc*/ Range.getEnd());
       break;
     }
 
diff --git a/lib/Sema/SemaLambda.cpp b/lib/Sema/SemaLambda.cpp
index cbfc330ca60b..19d2de719728 100644
--- a/lib/Sema/SemaLambda.cpp
+++ b/lib/Sema/SemaLambda.cpp
@@ -1167,6 +1167,24 @@ void Sema::ActOnLambdaError(SourceLocation StartLoc, Scope *CurScope,
   PopFunctionScopeInfo();
 }
 
+QualType Sema::getLambdaConversionFunctionResultType(
+    const FunctionProtoType *CallOpProto) {
+  // The function type inside the pointer type is the same as the call
+  // operator with some tweaks. The calling convention is the default free
+  // function convention, and the type qualifications are lost.
+  const FunctionProtoType::ExtProtoInfo CallOpExtInfo = 
+      CallOpProto->getExtProtoInfo();   
+  FunctionProtoType::ExtProtoInfo InvokerExtInfo = CallOpExtInfo;
+  CallingConv CC = Context.getDefaultCallingConvention(
+      CallOpProto->isVariadic(), /*IsCXXMethod=*/false);
+  InvokerExtInfo.ExtInfo = InvokerExtInfo.ExtInfo.withCallingConv(CC);
+  InvokerExtInfo.TypeQuals = 0;
+  assert(InvokerExtInfo.RefQualifier == RQ_None && 
+      "Lambda's call operator should not have a reference qualifier");
+  return Context.getFunctionType(CallOpProto->getReturnType(),
+                                 CallOpProto->getParamTypes(), InvokerExtInfo);
+}
+
 /// \brief Add a lambda's conversion to function pointer, as described in
 /// C++11 [expr.prim.lambda]p6.
 static void addFunctionPointerConversion(Sema &S,
@@ -1182,25 +1200,9 @@ static void addFunctionPointerConversion(Sema &S,
     return;
 
   // Add the conversion to function pointer.
-  const FunctionProtoType *CallOpProto = 
-      CallOperator->getType()->getAs<FunctionProtoType>();
-  const FunctionProtoType::ExtProtoInfo CallOpExtInfo = 
-      CallOpProto->getExtProtoInfo();   
-  QualType PtrToFunctionTy;
-  QualType InvokerFunctionTy;
-  {
-    FunctionProtoType::ExtProtoInfo InvokerExtInfo = CallOpExtInfo;
-    CallingConv CC = S.Context.getDefaultCallingConvention(
-        CallOpProto->isVariadic(), /*IsCXXMethod=*/false);
-    InvokerExtInfo.ExtInfo = InvokerExtInfo.ExtInfo.withCallingConv(CC);
-    InvokerExtInfo.TypeQuals = 0;
-    assert(InvokerExtInfo.RefQualifier == RQ_None && 
-        "Lambda's call operator should not have a reference qualifier");
-    InvokerFunctionTy =
-        S.Context.getFunctionType(CallOpProto->getReturnType(),
-                                  CallOpProto->getParamTypes(), InvokerExtInfo);
-    PtrToFunctionTy = S.Context.getPointerType(InvokerFunctionTy);
-  }
+  QualType InvokerFunctionTy = S.getLambdaConversionFunctionResultType(
+      CallOperator->getType()->castAs<FunctionProtoType>());
+  QualType PtrToFunctionTy = S.Context.getPointerType(InvokerFunctionTy);
 
   // Create the type of the conversion function.
   FunctionProtoType::ExtProtoInfo ConvExtInfo(
@@ -1357,19 +1359,8 @@ static void addBlockPointerConversion(Sema &S,
                                       SourceRange IntroducerRange,
                                       CXXRecordDecl *Class,
                                       CXXMethodDecl *CallOperator) {
-  const FunctionProtoType *Proto =
-      CallOperator->getType()->getAs<FunctionProtoType>();
-
-  // The function type inside the block pointer type is the same as the call
-  // operator with some tweaks. The calling convention is the default free
-  // function convention, and the type qualifications are lost.
-  FunctionProtoType::ExtProtoInfo BlockEPI = Proto->getExtProtoInfo();
-  BlockEPI.ExtInfo =
-      BlockEPI.ExtInfo.withCallingConv(S.Context.getDefaultCallingConvention(
-          Proto->isVariadic(), /*IsCXXMethod=*/false));
-  BlockEPI.TypeQuals = 0;
-  QualType FunctionTy = S.Context.getFunctionType(
-      Proto->getReturnType(), Proto->getParamTypes(), BlockEPI);
+  QualType FunctionTy = S.getLambdaConversionFunctionResultType(
+      CallOperator->getType()->castAs<FunctionProtoType>());
   QualType BlockPtrTy = S.Context.getBlockPointerType(FunctionTy);
 
   FunctionProtoType::ExtProtoInfo ConversionEPI(
diff --git a/lib/Sema/SemaLookup.cpp b/lib/Sema/SemaLookup.cpp
index a9db973851df..dd54fb4ad9e5 100644
--- a/lib/Sema/SemaLookup.cpp
+++ b/lib/Sema/SemaLookup.cpp
@@ -1656,7 +1656,8 @@ bool Sema::shouldLinkPossiblyHiddenDecl(LookupResult &R, const NamedDecl *New) {
 ///
 /// \returns D, or a visible previous declaration of D, whichever is more recent
 /// and visible. If no declaration of D is visible, returns null.
-static NamedDecl *findAcceptableDecl(Sema &SemaRef, NamedDecl *D) {
+static NamedDecl *findAcceptableDecl(Sema &SemaRef, NamedDecl *D,
+                                     unsigned IDNS) {
   assert(!LookupResult::isVisible(SemaRef, D) && "not in slow case");
 
   for (auto RD : D->redecls()) {
@@ -1668,7 +1669,8 @@ static NamedDecl *findAcceptableDecl(Sema &SemaRef, NamedDecl *D) {
     // FIXME: This is wrong in the case where the previous declaration is not
     // visible in the same scope as D. This needs to be done much more
     // carefully.
-    if (LookupResult::isVisible(SemaRef, ND))
+    if (ND->isInIdentifierNamespace(IDNS) &&
+        LookupResult::isVisible(SemaRef, ND))
       return ND;
   }
 
@@ -1693,14 +1695,15 @@ NamedDecl *LookupResult::getAcceptableDeclSlow(NamedDecl *D) const {
     auto *Key = ND->getCanonicalDecl();
     if (auto *Acceptable = getSema().VisibleNamespaceCache.lookup(Key))
       return Acceptable;
-    auto *Acceptable =
-        isVisible(getSema(), Key) ? Key : findAcceptableDecl(getSema(), Key);
+    auto *Acceptable = isVisible(getSema(), Key)
+                           ? Key
+                           : findAcceptableDecl(getSema(), Key, IDNS);
     if (Acceptable)
       getSema().VisibleNamespaceCache.insert(std::make_pair(Key, Acceptable));
     return Acceptable;
   }
 
-  return findAcceptableDecl(getSema(), D);
+  return findAcceptableDecl(getSema(), D, IDNS);
 }
 
 /// @brief Perform unqualified name lookup starting from a given
@@ -3329,6 +3332,23 @@ void Sema::ArgumentDependentLookup(DeclarationName Name, SourceLocation Loc,
     //        lookup (11.4).
     DeclContext::lookup_result R = NS->lookup(Name);
     for (auto *D : R) {
+      auto *Underlying = D;
+      if (auto *USD = dyn_cast<UsingShadowDecl>(D))
+        Underlying = USD->getTargetDecl();
+
+      if (!isa<FunctionDecl>(Underlying) &&
+          !isa<FunctionTemplateDecl>(Underlying))
+        continue;
+
+      if (!isVisible(D)) {
+        D = findAcceptableDecl(
+            *this, D, (Decl::IDNS_Ordinary | Decl::IDNS_OrdinaryFriend));
+        if (!D)
+          continue;
+        if (auto *USD = dyn_cast<UsingShadowDecl>(D))
+          Underlying = USD->getTargetDecl();
+      }
+
       // If the only declaration here is an ordinary friend, consider
       // it only if it was declared in an associated classes.
       if ((D->getIdentifierNamespace() & Decl::IDNS_Ordinary) == 0) {
@@ -3350,22 +3370,6 @@ void Sema::ArgumentDependentLookup(DeclarationName Name, SourceLocation Loc,
           continue;
       }
 
-      auto *Underlying = D;
-      if (auto *USD = dyn_cast<UsingShadowDecl>(D))
-        Underlying = USD->getTargetDecl();
-
-      if (!isa<FunctionDecl>(Underlying) &&
-          !isa<FunctionTemplateDecl>(Underlying))
-        continue;
-
-      if (!isVisible(D)) {
-        D = findAcceptableDecl(*this, D);
-        if (!D)
-          continue;
-        if (auto *USD = dyn_cast<UsingShadowDecl>(D))
-          Underlying = USD->getTargetDecl();
-      }
-
       // FIXME: Preserve D as the FoundDecl.
       Result.insert(Underlying);
     }
@@ -3494,7 +3498,8 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
                                bool InBaseClass,
                                VisibleDeclConsumer &Consumer,
                                VisibleDeclsRecord &Visited,
-                               bool IncludeDependentBases = false) {
+                               bool IncludeDependentBases,
+                               bool LoadExternal) {
   if (!Ctx)
     return;
 
@@ -3502,6 +3507,8 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
   if (Visited.visitedContext(Ctx->getPrimaryContext()))
     return;
 
+  Consumer.EnteredContext(Ctx);
+
   // Outside C++, lookup results for the TU live on identifiers.
   if (isa<TranslationUnitDecl>(Ctx) &&
       !Result.getSema().getLangOpts().CPlusPlus) {
@@ -3509,11 +3516,12 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
     auto &Idents = S.Context.Idents;
 
     // Ensure all external identifiers are in the identifier table.
-    if (IdentifierInfoLookup *External = Idents.getExternalIdentifierLookup()) {
-      std::unique_ptr<IdentifierIterator> Iter(External->getIdentifiers());
-      for (StringRef Name = Iter->Next(); !Name.empty(); Name = Iter->Next())
-        Idents.get(Name);
-    }
+    if (LoadExternal)
+      if (IdentifierInfoLookup *External = Idents.getExternalIdentifierLookup()) {
+        std::unique_ptr<IdentifierIterator> Iter(External->getIdentifiers());
+        for (StringRef Name = Iter->Next(); !Name.empty(); Name = Iter->Next())
+          Idents.get(Name);
+      }
 
     // Walk all lookup results in the TU for each identifier.
     for (const auto &Ident : Idents) {
@@ -3535,8 +3543,13 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
   if (CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(Ctx))
     Result.getSema().ForceDeclarationOfImplicitMembers(Class);
 
+  // We sometimes skip loading namespace-level results (they tend to be huge).
+  bool Load = LoadExternal ||
+              !(isa<TranslationUnitDecl>(Ctx) || isa<NamespaceDecl>(Ctx));
   // Enumerate all of the results in this context.
-  for (DeclContextLookupResult R : Ctx->lookups()) {
+  for (DeclContextLookupResult R :
+       Load ? Ctx->lookups()
+            : Ctx->noload_lookups(/*PreserveInternalState=*/false)) {
     for (auto *D : R) {
       if (auto *ND = Result.getAcceptableDecl(D)) {
         Consumer.FoundDecl(ND, Visited.checkHidden(ND), Ctx, InBaseClass);
@@ -3553,7 +3566,7 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
         continue;
       LookupVisibleDecls(I->getNominatedNamespace(), Result,
                          QualifiedNameLookup, InBaseClass, Consumer, Visited,
-                         IncludeDependentBases);
+                         IncludeDependentBases, LoadExternal);
     }
   }
 
@@ -3610,7 +3623,7 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
       // Find results in this base class (and its bases).
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(RD, Result, QualifiedNameLookup, true, Consumer,
-                         Visited, IncludeDependentBases);
+                         Visited, IncludeDependentBases, LoadExternal);
     }
   }
 
@@ -3619,22 +3632,23 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
     // Traverse categories.
     for (auto *Cat : IFace->visible_categories()) {
       ShadowContextRAII Shadow(Visited);
-      LookupVisibleDecls(Cat, Result, QualifiedNameLookup, false,
-                         Consumer, Visited);
+      LookupVisibleDecls(Cat, Result, QualifiedNameLookup, false, Consumer,
+                         Visited, IncludeDependentBases, LoadExternal);
     }
 
     // Traverse protocols.
     for (auto *I : IFace->all_referenced_protocols()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(I, Result, QualifiedNameLookup, false, Consumer,
-                         Visited);
+                         Visited, IncludeDependentBases, LoadExternal);
     }
 
     // Traverse the superclass.
     if (IFace->getSuperClass()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(IFace->getSuperClass(), Result, QualifiedNameLookup,
-                         true, Consumer, Visited);
+                         true, Consumer, Visited, IncludeDependentBases,
+                         LoadExternal);
     }
 
     // If there is an implementation, traverse it. We do this to find
@@ -3642,26 +3656,28 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
     if (IFace->getImplementation()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(IFace->getImplementation(), Result,
-                         QualifiedNameLookup, InBaseClass, Consumer, Visited);
+                         QualifiedNameLookup, InBaseClass, Consumer, Visited,
+                         IncludeDependentBases, LoadExternal);
     }
   } else if (ObjCProtocolDecl *Protocol = dyn_cast<ObjCProtocolDecl>(Ctx)) {
     for (auto *I : Protocol->protocols()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(I, Result, QualifiedNameLookup, false, Consumer,
-                         Visited);
+                         Visited, IncludeDependentBases, LoadExternal);
     }
   } else if (ObjCCategoryDecl *Category = dyn_cast<ObjCCategoryDecl>(Ctx)) {
     for (auto *I : Category->protocols()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(I, Result, QualifiedNameLookup, false, Consumer,
-                         Visited);
+                         Visited, IncludeDependentBases, LoadExternal);
     }
 
     // If there is an implementation, traverse it.
     if (Category->getImplementation()) {
       ShadowContextRAII Shadow(Visited);
       LookupVisibleDecls(Category->getImplementation(), Result,
-                         QualifiedNameLookup, true, Consumer, Visited);
+                         QualifiedNameLookup, true, Consumer, Visited,
+                         IncludeDependentBases, LoadExternal);
     }
   }
 }
@@ -3669,7 +3685,8 @@ static void LookupVisibleDecls(DeclContext *Ctx, LookupResult &Result,
 static void LookupVisibleDecls(Scope *S, LookupResult &Result,
                                UnqualUsingDirectiveSet &UDirs,
                                VisibleDeclConsumer &Consumer,
-                               VisibleDeclsRecord &Visited) {
+                               VisibleDeclsRecord &Visited,
+                               bool LoadExternal) {
   if (!S)
     return;
 
@@ -3708,7 +3725,8 @@ static void LookupVisibleDecls(Scope *S, LookupResult &Result,
                                   Result.getNameLoc(), Sema::LookupMemberName);
           if (ObjCInterfaceDecl *IFace = Method->getClassInterface()) {
             LookupVisibleDecls(IFace, IvarResult, /*QualifiedNameLookup=*/false,
-                               /*InBaseClass=*/false, Consumer, Visited);
+                               /*InBaseClass=*/false, Consumer, Visited,
+                               /*IncludeDependentBases=*/false, LoadExternal);
           }
         }
 
@@ -3722,7 +3740,8 @@ static void LookupVisibleDecls(Scope *S, LookupResult &Result,
         continue;
 
       LookupVisibleDecls(Ctx, Result, /*QualifiedNameLookup=*/false,
-                         /*InBaseClass=*/false, Consumer, Visited);
+                         /*InBaseClass=*/false, Consumer, Visited,
+                         /*IncludeDependentBases=*/false, LoadExternal);
     }
   } else if (!S->getParent()) {
     // Look into the translation unit scope. We walk through the translation
@@ -3736,7 +3755,8 @@ static void LookupVisibleDecls(Scope *S, LookupResult &Result,
     // in DeclContexts unless we have to" optimization), we can eliminate this.
     Entity = Result.getSema().Context.getTranslationUnitDecl();
     LookupVisibleDecls(Entity, Result, /*QualifiedNameLookup=*/false,
-                       /*InBaseClass=*/false, Consumer, Visited);
+                       /*InBaseClass=*/false, Consumer, Visited,
+                       /*IncludeDependentBases=*/false, LoadExternal);
   }
 
   if (Entity) {
@@ -3745,17 +3765,19 @@ static void LookupVisibleDecls(Scope *S, LookupResult &Result,
     for (const UnqualUsingEntry &UUE : UDirs.getNamespacesFor(Entity))
       LookupVisibleDecls(const_cast<DeclContext *>(UUE.getNominatedNamespace()),
                          Result, /*QualifiedNameLookup=*/false,
-                         /*InBaseClass=*/false, Consumer, Visited);
+                         /*InBaseClass=*/false, Consumer, Visited,
+                         /*IncludeDependentBases=*/false, LoadExternal);
   }
 
   // Lookup names in the parent scope.
   ShadowContextRAII Shadow(Visited);
-  LookupVisibleDecls(S->getParent(), Result, UDirs, Consumer, Visited);
+  LookupVisibleDecls(S->getParent(), Result, UDirs, Consumer, Visited,
+                     LoadExternal);
 }
 
 void Sema::LookupVisibleDecls(Scope *S, LookupNameKind Kind,
                               VisibleDeclConsumer &Consumer,
-                              bool IncludeGlobalScope) {
+                              bool IncludeGlobalScope, bool LoadExternal) {
   // Determine the set of using directives available during
   // unqualified name lookup.
   Scope *Initial = S;
@@ -3776,13 +3798,13 @@ void Sema::LookupVisibleDecls(Scope *S, LookupNameKind Kind,
   if (!IncludeGlobalScope)
     Visited.visitedContext(Context.getTranslationUnitDecl());
   ShadowContextRAII Shadow(Visited);
-  ::LookupVisibleDecls(Initial, Result, UDirs, Consumer, Visited);
+  ::LookupVisibleDecls(Initial, Result, UDirs, Consumer, Visited, LoadExternal);
 }
 
 void Sema::LookupVisibleDecls(DeclContext *Ctx, LookupNameKind Kind,
                               VisibleDeclConsumer &Consumer,
                               bool IncludeGlobalScope,
-                              bool IncludeDependentBases) {
+                              bool IncludeDependentBases, bool LoadExternal) {
   LookupResult Result(*this, DeclarationName(), SourceLocation(), Kind);
   Result.setAllowHidden(Consumer.includeHiddenDecls());
   VisibleDeclsRecord Visited;
@@ -3791,7 +3813,7 @@ void Sema::LookupVisibleDecls(DeclContext *Ctx, LookupNameKind Kind,
   ShadowContextRAII Shadow(Visited);
   ::LookupVisibleDecls(Ctx, Result, /*QualifiedNameLookup=*/true,
                        /*InBaseClass=*/false, Consumer, Visited,
-                       IncludeDependentBases);
+                       IncludeDependentBases, LoadExternal);
 }
 
 /// LookupOrCreateLabel - Do a name lookup of a label with the specified name.
@@ -3865,17 +3887,13 @@ static void checkCorrectionVisibility(Sema &SemaRef, TypoCorrection &TC) {
   bool AnyVisibleDecls = !NewDecls.empty();
 
   for (/**/; DI != DE; ++DI) {
-    NamedDecl *VisibleDecl = *DI;
-    if (!LookupResult::isVisible(SemaRef, *DI))
-      VisibleDecl = findAcceptableDecl(SemaRef, *DI);
-
-    if (VisibleDecl) {
+    if (LookupResult::isVisible(SemaRef, *DI)) {
       if (!AnyVisibleDecls) {
         // Found a visible decl, discard all hidden ones.
         AnyVisibleDecls = true;
         NewDecls.clear();
       }
-      NewDecls.push_back(VisibleDecl);
+      NewDecls.push_back(*DI);
     } else if (!AnyVisibleDecls && !(*DI)->isModulePrivate())
       NewDecls.push_back(*DI);
   }
@@ -3945,8 +3963,7 @@ void TypoCorrectionConsumer::FoundDecl(NamedDecl *ND, NamedDecl *Hiding,
 
   // Only consider visible declarations and declarations from modules with
   // names that exactly match.
-  if (!LookupResult::isVisible(SemaRef, ND) && Name != Typo &&
-      !findAcceptableDecl(SemaRef, ND))
+  if (!LookupResult::isVisible(SemaRef, ND) && Name != Typo)
     return;
 
   FoundName(Name->getName());
diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp
index 24b58e8fd12b..14bf828c9ca3 100644
--- a/lib/Sema/SemaOpenMP.cpp
+++ b/lib/Sema/SemaOpenMP.cpp
@@ -2121,13 +2121,33 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
   case OMPD_target_parallel_for_simd:
   case OMPD_target_teams_distribute:
   case OMPD_target_teams_distribute_simd: {
+    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
+    QualType Args[] = {Context.VoidPtrTy.withConst().withRestrict()};
+    FunctionProtoType::ExtProtoInfo EPI;
+    EPI.Variadic = true;
+    QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
+    Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32Ty),
+        std::make_pair(".part_id.", Context.getPointerType(KmpInt32Ty)),
+        std::make_pair(".privates.", Context.VoidPtrTy.withConst()),
+        std::make_pair(".copy_fn.",
+                       Context.getPointerType(CopyFnType).withConst()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             Params);
+    // Mark this captured region as inlined, because we don't use outlined
+    // function directly.
+    getCurCapturedRegion()->TheCapturedDecl->addAttr(
+        AlwaysInlineAttr::CreateImplicit(
+            Context, AlwaysInlineAttr::Keyword_forceinline, SourceRange()));
     Sema::CapturedParamNameType ParamsTarget[] = {
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
     // Start a captured region for 'target' with no implicit parameters.
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
                              ParamsTarget);
-    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
     QualType KmpInt32PtrTy =
         Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
     Sema::CapturedParamNameType ParamsTeamsOrParallel[] = {
@@ -2141,6 +2161,33 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
                              ParamsTeamsOrParallel);
     break;
   }
+  case OMPD_target:
+  case OMPD_target_simd: {
+    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
+    QualType Args[] = {Context.VoidPtrTy.withConst().withRestrict()};
+    FunctionProtoType::ExtProtoInfo EPI;
+    EPI.Variadic = true;
+    QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
+    Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32Ty),
+        std::make_pair(".part_id.", Context.getPointerType(KmpInt32Ty)),
+        std::make_pair(".privates.", Context.VoidPtrTy.withConst()),
+        std::make_pair(".copy_fn.",
+                       Context.getPointerType(CopyFnType).withConst()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             Params);
+    // Mark this captured region as inlined, because we don't use outlined
+    // function directly.
+    getCurCapturedRegion()->TheCapturedDecl->addAttr(
+        AlwaysInlineAttr::CreateImplicit(
+            Context, AlwaysInlineAttr::Keyword_forceinline, SourceRange()));
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             std::make_pair(StringRef(), QualType()));
+    break;
+  }
   case OMPD_simd:
   case OMPD_for:
   case OMPD_for_simd:
@@ -2154,9 +2201,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
   case OMPD_distribute_simd:
   case OMPD_ordered:
   case OMPD_atomic:
-  case OMPD_target_data:
-  case OMPD_target:
-  case OMPD_target_simd: {
+  case OMPD_target_data: {
     Sema::CapturedParamNameType Params[] = {
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
@@ -2226,23 +2271,77 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
     break;
   }
   case OMPD_distribute_parallel_for_simd:
-  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for: {
+    QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
+    QualType KmpInt32PtrTy =
+        Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
+    Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32PtrTy),
+        std::make_pair(".bound_tid.", KmpInt32PtrTy),
+        std::make_pair(".previous.lb.", Context.getSizeType()),
+        std::make_pair(".previous.ub.", Context.getSizeType()),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             Params);
+    break;
+  }
   case OMPD_target_teams_distribute_parallel_for:
   case OMPD_target_teams_distribute_parallel_for_simd: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
     QualType KmpInt32PtrTy =
         Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
+
+    QualType Args[] = {Context.VoidPtrTy.withConst().withRestrict()};
+    FunctionProtoType::ExtProtoInfo EPI;
+    EPI.Variadic = true;
+    QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
     Sema::CapturedParamNameType Params[] = {
+        std::make_pair(".global_tid.", KmpInt32Ty),
+        std::make_pair(".part_id.", Context.getPointerType(KmpInt32Ty)),
+        std::make_pair(".privates.", Context.VoidPtrTy.withConst()),
+        std::make_pair(".copy_fn.",
+                       Context.getPointerType(CopyFnType).withConst()),
+        std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             Params);
+    // Mark this captured region as inlined, because we don't use outlined
+    // function directly.
+    getCurCapturedRegion()->TheCapturedDecl->addAttr(
+        AlwaysInlineAttr::CreateImplicit(
+            Context, AlwaysInlineAttr::Keyword_forceinline, SourceRange()));
+    Sema::CapturedParamNameType ParamsTarget[] = {
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    // Start a captured region for 'target' with no implicit parameters.
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             ParamsTarget);
+
+    Sema::CapturedParamNameType ParamsTeams[] = {
+        std::make_pair(".global_tid.", KmpInt32PtrTy),
+        std::make_pair(".bound_tid.", KmpInt32PtrTy),
+        std::make_pair(StringRef(), QualType()) // __context with shared vars
+    };
+    // Start a captured region for 'target' with no implicit parameters.
+    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
+                             ParamsTeams);
+
+    Sema::CapturedParamNameType ParamsParallel[] = {
         std::make_pair(".global_tid.", KmpInt32PtrTy),
         std::make_pair(".bound_tid.", KmpInt32PtrTy),
         std::make_pair(".previous.lb.", Context.getSizeType()),
         std::make_pair(".previous.ub.", Context.getSizeType()),
         std::make_pair(StringRef(), QualType()) // __context with shared vars
     };
+    // Start a captured region for 'teams' or 'parallel'.  Both regions have
+    // the same implicit parameters.
     ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
-                             Params);
+                             ParamsParallel);
     break;
   }
+
   case OMPD_teams_distribute_parallel_for:
   case OMPD_teams_distribute_parallel_for_simd: {
     QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1);
@@ -4764,7 +4863,7 @@ CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
   }
 
   // Loop condition (IV < NumIterations) or (IV <= UB) for worksharing loops.
-  SourceLocation CondLoc;
+  SourceLocation CondLoc = AStmt->getLocStart();
   ExprResult Cond =
       (isOpenMPWorksharingDirective(DKind) ||
        isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
@@ -4777,7 +4876,7 @@ CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
         SemaRef.BuildBinOp(CurScope, CondLoc, BO_LE, IV.get(), CombUB.get());
   }
   // Loop increment (IV = IV + 1)
-  SourceLocation IncLoc;
+  SourceLocation IncLoc = AStmt->getLocStart();
   ExprResult Inc =
       SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(),
                          SemaRef.ActOnIntegerConstant(IncLoc, 1).get());
@@ -4844,7 +4943,7 @@ CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
   // directive with for as IV = IV + ST; ensure upper bound expression based
   // on PrevUB instead of NumIterations - used to implement 'for' when found
   // in combination with 'distribute', like in 'distribute parallel for'
-  SourceLocation DistIncLoc;
+  SourceLocation DistIncLoc = AStmt->getLocStart();
   ExprResult DistCond, DistInc, PrevEUB;
   if (isOpenMPLoopBoundSharingDirective(DKind)) {
     DistCond = SemaRef.BuildBinOp(CurScope, CondLoc, BO_LE, IV.get(), UB.get());
@@ -4860,7 +4959,7 @@ CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
 
     // Build expression: UB = min(UB, prevUB) for #for in composite or combined
     // construct
-    SourceLocation DistEUBLoc;
+    SourceLocation DistEUBLoc = AStmt->getLocStart();
     ExprResult IsUBGreater =
         SemaRef.BuildBinOp(CurScope, DistEUBLoc, BO_GT, UB.get(), PrevUB.get());
     ExprResult CondOp = SemaRef.ActOnConditionalOp(
@@ -6320,13 +6419,23 @@ StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef<OMPClause *> Clauses,
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
+  for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target);
+       ThisCaptureLevel > 1; --ThisCaptureLevel) {
+    CS = cast<CapturedStmt>(CS->getCapturedStmt());
+    // 1.2.2 OpenMP Language Terminology
+    // Structured block - An executable statement with a single entry at the
+    // top and a single exit at the bottom.
+    // The point of exit cannot be a branch out of the structured block.
+    // longjmp() and throw() must not violate the entry/exit criteria.
+    CS->getCapturedDecl()->setNothrow();
+  }
 
   // OpenMP [2.16, Nesting of Regions]
   // If specified, a teams construct must be contained within a target
   // construct. That target construct must contain no statements or directives
   // outside of the teams construct.
   if (DSAStack->hasInnerTeamsRegion()) {
-    auto S = AStmt->IgnoreContainers(/*IgnoreCaptured*/ true);
+    Stmt *S = CS->IgnoreContainers(/*IgnoreCaptured=*/true);
     bool OMPTeamsFound = true;
     if (auto *CS = dyn_cast<CompoundStmt>(S)) {
       auto I = CS->body_begin();
@@ -6373,6 +6482,16 @@ Sema::ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
+  for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel);
+       ThisCaptureLevel > 1; --ThisCaptureLevel) {
+    CS = cast<CapturedStmt>(CS->getCapturedStmt());
+    // 1.2.2 OpenMP Language Terminology
+    // Structured block - An executable statement with a single entry at the
+    // top and a single exit at the bottom.
+    // The point of exit cannot be a branch out of the structured block.
+    // longjmp() and throw() must not violate the entry/exit criteria.
+    CS->getCapturedDecl()->setNothrow();
+  }
 
   getCurFunction()->setHasBranchProtectedScope();
 
@@ -7381,14 +7500,24 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective(
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
+  for (int ThisCaptureLevel =
+           getOpenMPCaptureLevels(OMPD_target_teams_distribute_parallel_for);
+       ThisCaptureLevel > 1; --ThisCaptureLevel) {
+    CS = cast<CapturedStmt>(CS->getCapturedStmt());
+    // 1.2.2 OpenMP Language Terminology
+    // Structured block - An executable statement with a single entry at the
+    // top and a single exit at the bottom.
+    // The point of exit cannot be a branch out of the structured block.
+    // longjmp() and throw() must not violate the entry/exit criteria.
+    CS->getCapturedDecl()->setNothrow();
+  }
 
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
   auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute_parallel_for,
-      getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
+      OMPD_target_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
+      nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
@@ -7396,6 +7525,17 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective(
   assert((CurContext->isDependentContext() || B.builtAll()) &&
          "omp target teams distribute parallel for loop exprs were not built");
 
+  if (!CurContext->isDependentContext()) {
+    // Finalize the clauses that need pre-built expressions for CodeGen.
+    for (auto C : Clauses) {
+      if (auto *LC = dyn_cast<OMPLinearClause>(C))
+        if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
+                                     B.NumIterations, *this, CurScope,
+                                     DSAStack))
+          return StmtError();
+    }
+  }
+
   getCurFunction()->setHasBranchProtectedScope();
   return OMPTargetTeamsDistributeParallelForDirective::Create(
       Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
@@ -7416,15 +7556,26 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
+  for (int ThisCaptureLevel = getOpenMPCaptureLevels(
+           OMPD_target_teams_distribute_parallel_for_simd);
+       ThisCaptureLevel > 1; --ThisCaptureLevel) {
+    CS = cast<CapturedStmt>(CS->getCapturedStmt());
+    // 1.2.2 OpenMP Language Terminology
+    // Structured block - An executable statement with a single entry at the
+    // top and a single exit at the bottom.
+    // The point of exit cannot be a branch out of the structured block.
+    // longjmp() and throw() must not violate the entry/exit criteria.
+    CS->getCapturedDecl()->setNothrow();
+  }
 
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute_parallel_for_simd,
-      getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  auto NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, CS, *this,
+                      *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -7612,13 +7763,18 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     case OMPD_target_parallel:
     case OMPD_target_parallel_for:
     case OMPD_target_parallel_for_simd:
-    case OMPD_target_teams_distribute_parallel_for:
-    case OMPD_target_teams_distribute_parallel_for_simd:
       // If this clause applies to the nested 'parallel' region, capture within
       // the 'target' region, otherwise do not capture.
       if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
         CaptureRegion = OMPD_target;
       break;
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
+      // If this clause applies to the nested 'parallel' region, capture within
+      // the 'teams' region, otherwise do not capture.
+      if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
+        CaptureRegion = OMPD_teams;
+      break;
     case OMPD_teams_distribute_parallel_for:
     case OMPD_teams_distribute_parallel_for_simd:
       CaptureRegion = OMPD_teams;
@@ -7682,12 +7838,12 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     case OMPD_target_parallel:
     case OMPD_target_parallel_for:
     case OMPD_target_parallel_for_simd:
-    case OMPD_target_teams_distribute_parallel_for:
-    case OMPD_target_teams_distribute_parallel_for_simd:
       CaptureRegion = OMPD_target;
       break;
     case OMPD_teams_distribute_parallel_for:
     case OMPD_teams_distribute_parallel_for_simd:
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
       CaptureRegion = OMPD_teams;
       break;
     case OMPD_parallel:
@@ -7870,20 +8026,16 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     break;
   case OMPC_schedule:
     switch (DKind) {
-    case OMPD_target_parallel_for:
-    case OMPD_target_parallel_for_simd:
-    case OMPD_target_teams_distribute_parallel_for:
-    case OMPD_target_teams_distribute_parallel_for_simd:
-      CaptureRegion = OMPD_target;
-      break;
-    case OMPD_teams_distribute_parallel_for:
-    case OMPD_teams_distribute_parallel_for_simd:
-      CaptureRegion = OMPD_teams;
-      break;
     case OMPD_parallel_for:
     case OMPD_parallel_for_simd:
     case OMPD_distribute_parallel_for:
     case OMPD_distribute_parallel_for_simd:
+    case OMPD_teams_distribute_parallel_for:
+    case OMPD_teams_distribute_parallel_for_simd:
+    case OMPD_target_parallel_for:
+    case OMPD_target_parallel_for_simd:
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
       CaptureRegion = OMPD_parallel;
       break;
     case OMPD_for:
@@ -7941,18 +8093,14 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     case OMPD_teams_distribute_parallel_for_simd:
     case OMPD_teams_distribute:
     case OMPD_teams_distribute_simd:
-      CaptureRegion = OMPD_teams;
-      break;
     case OMPD_target_teams_distribute_parallel_for:
     case OMPD_target_teams_distribute_parallel_for_simd:
     case OMPD_target_teams_distribute:
     case OMPD_target_teams_distribute_simd:
-      CaptureRegion = OMPD_target;
+      CaptureRegion = OMPD_teams;
       break;
     case OMPD_distribute_parallel_for:
     case OMPD_distribute_parallel_for_simd:
-      CaptureRegion = OMPD_parallel;
-      break;
     case OMPD_distribute:
     case OMPD_distribute_simd:
       // Do not capture thread_limit-clause expressions.
@@ -8007,19 +8155,19 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
     case OMPD_target_update:
     case OMPD_target_enter_data:
     case OMPD_target_exit_data:
-      CaptureRegion = OMPD_task;
-      break;
+    case OMPD_target:
+    case OMPD_target_simd:
     case OMPD_target_teams:
+    case OMPD_target_parallel:
     case OMPD_target_teams_distribute:
     case OMPD_target_teams_distribute_simd:
+    case OMPD_target_parallel_for:
+    case OMPD_target_parallel_for_simd:
     case OMPD_target_teams_distribute_parallel_for:
     case OMPD_target_teams_distribute_parallel_for_simd:
+      CaptureRegion = OMPD_task;
+      break;
     case OMPD_target_data:
-    case OMPD_target:
-    case OMPD_target_simd:
-    case OMPD_target_parallel:
-    case OMPD_target_parallel_for:
-    case OMPD_target_parallel_for_simd:
       // Do not capture device-clause expressions.
       break;
     case OMPD_teams_distribute_parallel_for:
@@ -9092,7 +9240,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     // A list item cannot appear in both a map clause and a data-sharing
     // attribute clause on the same construct
     if (CurrDir == OMPD_target || CurrDir == OMPD_target_parallel ||
-        CurrDir == OMPD_target_teams || 
+        CurrDir == OMPD_target_teams ||
         CurrDir == OMPD_target_teams_distribute ||
         CurrDir == OMPD_target_teams_distribute_parallel_for ||
         CurrDir == OMPD_target_teams_distribute_parallel_for_simd ||
@@ -11377,8 +11525,8 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc,
     HelperValStmt = buildPreInits(Context, Captures);
   }
 
-  return new (Context)
-      OMPDeviceClause(ValExpr, HelperValStmt, StartLoc, LParenLoc, EndLoc);
+  return new (Context) OMPDeviceClause(ValExpr, HelperValStmt, CaptureRegion,
+                                       StartLoc, LParenLoc, EndLoc);
 }
 
 static bool CheckTypeMappable(SourceLocation SL, SourceRange SR, Sema &SemaRef,
diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp
index 2144845f4dd3..1b07ec60ce91 100644
--- a/lib/Sema/SemaOverload.cpp
+++ b/lib/Sema/SemaOverload.cpp
@@ -1475,10 +1475,8 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType,
     const auto *ToFPT = cast<FunctionProtoType>(ToFn);
     if (FromFPT->isNothrow(Context) && !ToFPT->isNothrow(Context)) {
       FromFn = cast<FunctionType>(
-          Context.getFunctionType(FromFPT->getReturnType(),
-                                  FromFPT->getParamTypes(),
-                                  FromFPT->getExtProtoInfo().withExceptionSpec(
-                                      FunctionProtoType::ExceptionSpecInfo()))
+          Context.getFunctionTypeWithExceptionSpec(QualType(FromFPT, 0),
+                                                   EST_None)
                  .getTypePtr());
       Changed = true;
     }
@@ -1766,8 +1764,8 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType,
                                    (FromType == S.Context.LongDoubleTy &&
                                     ToType == S.Context.Float128Ty));
       if (Float128AndLongDouble &&
-          (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) !=
-           &llvm::APFloat::IEEEdouble()))
+          (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) ==
+           &llvm::APFloat::PPCDoubleDouble()))
         return false;
     }
     // Floating point conversions (C++ 4.8).
@@ -5145,7 +5143,8 @@ Sema::PerformObjectArgumentInitialization(Expr *From,
       *this, From->getLocStart(), From->getType(), FromClassification, Method,
       Method->getParent());
   if (ICS.isBad()) {
-    if (ICS.Bad.Kind == BadConversionSequence::bad_qualifiers) {
+    switch (ICS.Bad.Kind) {
+    case BadConversionSequence::bad_qualifiers: {
       Qualifiers FromQs = FromRecordType.getQualifiers();
       Qualifiers ToQs = DestType.getQualifiers();
       unsigned CVR = FromQs.getCVRQualifiers() & ~ToQs.getCVRQualifiers();
@@ -5158,10 +5157,28 @@ Sema::PerformObjectArgumentInitialization(Expr *From,
           << Method->getDeclName();
         return ExprError();
       }
+      break;
+    }
+
+    case BadConversionSequence::lvalue_ref_to_rvalue:
+    case BadConversionSequence::rvalue_ref_to_lvalue: {
+      bool IsRValueQualified =
+        Method->getRefQualifier() == RefQualifierKind::RQ_RValue;
+      Diag(From->getLocStart(), diag::err_member_function_call_bad_ref)
+        << Method->getDeclName() << FromClassification.isRValue()
+        << IsRValueQualified;
+      Diag(Method->getLocation(), diag::note_previous_decl)
+        << Method->getDeclName();
+      return ExprError();
+    }
+
+    case BadConversionSequence::no_conversion:
+    case BadConversionSequence::unrelated_class:
+      break;
     }
 
     return Diag(From->getLocStart(),
-                diag::err_implicit_object_parameter_init)
+                diag::err_member_function_call_bad_type)
        << ImplicitParamRecordType << FromRecordType << From->getSourceRange();
   }
 
@@ -5940,6 +5957,13 @@ Sema::AddOverloadCandidate(FunctionDecl *Function,
   Candidate.IgnoreObjectArgument = false;
   Candidate.ExplicitCallArguments = Args.size();
 
+  if (Function->isMultiVersion() &&
+      !Function->getAttr<TargetAttr>()->isDefaultVersion()) {
+    Candidate.Viable = false;
+    Candidate.FailureKind = ovl_non_default_multiversion_function;
+    return;
+  }
+
   if (Constructor) {
     // C++ [class.copy]p3:
     //   A member function template is never instantiated to perform the copy
@@ -6564,6 +6588,12 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
     Candidate.DeductionFailure.Data = FailedAttr;
     return;
   }
+
+  if (Method->isMultiVersion() &&
+      !Method->getAttr<TargetAttr>()->isDefaultVersion()) {
+    Candidate.Viable = false;
+    Candidate.FailureKind = ovl_non_default_multiversion_function;
+  }
 }
 
 /// \brief Add a C++ member function template as a candidate to the candidate
@@ -6967,6 +6997,12 @@ Sema::AddConversionCandidate(CXXConversionDecl *Conversion,
     Candidate.DeductionFailure.Data = FailedAttr;
     return;
   }
+
+  if (Conversion->isMultiVersion() &&
+      !Conversion->getAttr<TargetAttr>()->isDefaultVersion()) {
+    Candidate.Viable = false;
+    Candidate.FailureKind = ovl_non_default_multiversion_function;
+  }
 }
 
 /// \brief Adds a conversion function template specialization
@@ -9375,6 +9411,8 @@ void Sema::NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn,
                                  QualType DestType, bool TakingAddress) {
   if (TakingAddress && !checkAddressOfCandidateIsAvailable(*this, Fn))
     return;
+  if (Fn->isMultiVersion() && !Fn->getAttr<TargetAttr>()->isDefaultVersion())
+    return;
 
   std::string FnDesc;
   OverloadCandidateKind K = ClassifyOverloadCandidate(*this, Found, Fn, FnDesc);
@@ -10176,6 +10214,9 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
     assert(!Available);
     break;
   }
+  case ovl_non_default_multiversion_function:
+    // Do nothing, these should simply be ignored.
+    break;
   }
 }
 
@@ -10493,9 +10534,8 @@ static void CompleteNonViableCandidate(Sema &S, OverloadCandidate *Cand,
   }
 }
 
-/// PrintOverloadCandidates - When overload resolution fails, prints
-/// diagnostic messages containing the candidates in the candidate
-/// set.
+/// When overload resolution fails, prints diagnostic messages containing the
+/// candidates in the candidate set.
 void OverloadCandidateSet::NoteCandidates(
     Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef<Expr *> Args,
     StringRef Opc, SourceLocation OpLoc,
@@ -10912,6 +10952,12 @@ class AddressOfFunctionResolver {
         if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext))
           if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl))
             return false;
+      if (FunDecl->isMultiVersion()) {
+        const auto *TA = FunDecl->getAttr<TargetAttr>();
+        assert(TA && "Multiversioned functions require a target attribute");
+        if (!TA->isDefaultVersion())
+          return false;
+      }
 
       // If any candidate has a placeholder return type, trigger its deduction
       // now.
@@ -11309,9 +11355,6 @@ Sema::ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl,
   return Matched;
 }
 
-
-
-
 // Resolve and fix an overloaded expression that can be resolved
 // because it identifies a single function template specialization.
 //
@@ -11978,7 +12021,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
   if (Input->isTypeDependent()) {
     if (Fns.empty())
       return new (Context) UnaryOperator(Input, Opc, Context.DependentTy,
-                                         VK_RValue, OK_Ordinary, OpLoc);
+                                         VK_RValue, OK_Ordinary, OpLoc, false);
 
     CXXRecordDecl *NamingClass = nullptr; // lookup ignores member operators
     UnresolvedLookupExpr *Fn
@@ -13508,7 +13551,7 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
 
         return new (Context) UnaryOperator(SubExpr, UO_AddrOf, MemPtrType,
                                            VK_RValue, OK_Ordinary,
-                                           UnOp->getOperatorLoc());
+                                           UnOp->getOperatorLoc(), false);
       }
     }
     Expr *SubExpr = FixOverloadedFunctionReference(UnOp->getSubExpr(),
@@ -13519,7 +13562,7 @@ Expr *Sema::FixOverloadedFunctionReference(Expr *E, DeclAccessPair Found,
     return new (Context) UnaryOperator(SubExpr, UO_AddrOf,
                                      Context.getPointerType(SubExpr->getType()),
                                        VK_RValue, OK_Ordinary,
-                                       UnOp->getOperatorLoc());
+                                       UnOp->getOperatorLoc(), false);
   }
 
   // C++ [except.spec]p17:
diff --git a/lib/Sema/SemaPseudoObject.cpp b/lib/Sema/SemaPseudoObject.cpp
index 58980be64a30..f09a6f7158fb 100644
--- a/lib/Sema/SemaPseudoObject.cpp
+++ b/lib/Sema/SemaPseudoObject.cpp
@@ -132,7 +132,8 @@ namespace {
                                              uop->getType(),
                                              uop->getValueKind(),
                                              uop->getObjectKind(),
-                                             uop->getOperatorLoc());
+                                             uop->getOperatorLoc(),
+                                             uop->canOverflow());
       }
 
       if (GenericSelectionExpr *gse = dyn_cast<GenericSelectionExpr>(e)) {
@@ -524,15 +525,18 @@ PseudoOpBuilder::buildIncDecOperation(Scope *Sc, SourceLocation opcLoc,
   addSemanticExpr(result.get());
   if (UnaryOperator::isPrefix(opcode) && !captureSetValueAsResult() &&
       !result.get()->getType()->isVoidType() &&
-      (result.get()->isTypeDependent() || CanCaptureValue(result.get())))
-    setResultToLastSemantic();
-
-  UnaryOperator *syntactic =
-    new (S.Context) UnaryOperator(syntacticOp, opcode, resultType,
-                                  VK_LValue, OK_Ordinary, opcLoc);
-  return complete(syntactic);
-}
-
+      (result.get()->isTypeDependent() || CanCaptureValue(result.get())))
+    setResultToLastSemantic();
+
+  UnaryOperator *syntactic = new (S.Context) UnaryOperator(
+      syntacticOp, opcode, resultType, VK_LValue, OK_Ordinary, opcLoc,
+      !resultType->isDependentType()
+          ? S.Context.getTypeSize(resultType) >=
+                S.Context.getTypeSize(S.Context.IntTy)
+          : false);
+  return complete(syntactic);
+}
+
 
 //===----------------------------------------------------------------------===//
 //  Objective-C @property and implicit property references
@@ -1550,7 +1554,7 @@ ExprResult Sema::checkPseudoObjectIncDec(Scope *Sc, SourceLocation opcLoc,
   // Do nothing if the operand is dependent.
   if (op->isTypeDependent())
     return new (Context) UnaryOperator(op, opcode, Context.DependentTy,
-                                       VK_RValue, OK_Ordinary, opcLoc);
+                                       VK_RValue, OK_Ordinary, opcLoc, false);
 
   assert(UnaryOperator::isIncrementDecrementOp(opcode));
   Expr *opaqueRef = op->IgnoreParens();
@@ -1630,15 +1634,15 @@ static Expr *stripOpaqueValuesFromPseudoObjectRef(Sema &S, Expr *E) {
 /// capable of rebuilding a tree without stripping implicit
 /// operations.
 Expr *Sema::recreateSyntacticForm(PseudoObjectExpr *E) {
-  Expr *syntax = E->getSyntacticForm();
-  if (UnaryOperator *uop = dyn_cast<UnaryOperator>(syntax)) {
-    Expr *op = stripOpaqueValuesFromPseudoObjectRef(*this, uop->getSubExpr());
-    return new (Context) UnaryOperator(op, uop->getOpcode(), uop->getType(),
-                                       uop->getValueKind(), uop->getObjectKind(),
-                                       uop->getOperatorLoc());
-  } else if (CompoundAssignOperator *cop
-               = dyn_cast<CompoundAssignOperator>(syntax)) {
-    Expr *lhs = stripOpaqueValuesFromPseudoObjectRef(*this, cop->getLHS());
+  Expr *syntax = E->getSyntacticForm();
+  if (UnaryOperator *uop = dyn_cast<UnaryOperator>(syntax)) {
+    Expr *op = stripOpaqueValuesFromPseudoObjectRef(*this, uop->getSubExpr());
+    return new (Context) UnaryOperator(
+        op, uop->getOpcode(), uop->getType(), uop->getValueKind(),
+        uop->getObjectKind(), uop->getOperatorLoc(), uop->canOverflow());
+  } else if (CompoundAssignOperator *cop
+               = dyn_cast<CompoundAssignOperator>(syntax)) {
+    Expr *lhs = stripOpaqueValuesFromPseudoObjectRef(*this, cop->getLHS());
     Expr *rhs = cast<OpaqueValueExpr>(cop->getRHS())->getSourceExpr();
     return new (Context) CompoundAssignOperator(lhs, rhs, cop->getOpcode(),
                                                 cop->getType(),
diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp
index 4474d62949a2..1ebc36716a88 100644
--- a/lib/Sema/SemaStmt.cpp
+++ b/lib/Sema/SemaStmt.cpp
@@ -14,6 +14,7 @@
 #include "clang/Sema/SemaInternal.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
+#include "clang/AST/ASTLambda.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/DeclObjC.h"
@@ -3228,6 +3229,12 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD,
                                             SourceLocation ReturnLoc,
                                             Expr *&RetExpr,
                                             AutoType *AT) {
+  // If this is the conversion function for a lambda, we choose to deduce it
+  // type from the corresponding call operator, not from the synthesized return
+  // statement within it. See Sema::DeduceReturnType.
+  if (isLambdaConversionOperator(FD))
+    return false;
+
   TypeLoc OrigResultType = getReturnTypeLoc(FD);
   QualType Deduced;
 
diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp
index d94cb0d0f485..7c6af5793fc5 100644
--- a/lib/Sema/SemaTemplate.cpp
+++ b/lib/Sema/SemaTemplate.cpp
@@ -169,16 +169,16 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
   MemberOfUnknownSpecialization = false;
 
   switch (Name.getKind()) {
-  case UnqualifiedId::IK_Identifier:
+  case UnqualifiedIdKind::IK_Identifier:
     TName = DeclarationName(Name.Identifier);
     break;
 
-  case UnqualifiedId::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
     TName = Context.DeclarationNames.getCXXOperatorName(
                                               Name.OperatorFunctionId.Operator);
     break;
 
-  case UnqualifiedId::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
     TName = Context.DeclarationNames.getCXXLiteralOperatorName(Name.Identifier);
     break;
 
@@ -3557,7 +3557,7 @@ DeclResult Sema::ActOnVarTemplateSpecialization(
     TemplateParameterList *TemplateParams, StorageClass SC,
     bool IsPartialSpecialization) {
   // D must be variable template id.
-  assert(D.getName().getKind() == UnqualifiedId::IK_TemplateId &&
+  assert(D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId &&
          "Variable template specialization is declared with a template it.");
 
   TemplateIdAnnotation *TemplateId = D.getName().TemplateId;
@@ -4084,8 +4084,8 @@ TemplateNameKind Sema::ActOnDependentTemplateName(Scope *S,
       // We found something; return it.
       auto *LookupRD = dyn_cast<CXXRecordDecl>(LookupCtx);
       if (!AllowInjectedClassName && SS.isSet() && LookupRD &&
-          Name.getKind() == UnqualifiedId::IK_Identifier && Name.Identifier &&
-          LookupRD->getIdentifier() == Name.Identifier) {
+          Name.getKind() == UnqualifiedIdKind::IK_Identifier &&
+          Name.Identifier && LookupRD->getIdentifier() == Name.Identifier) {
         // C++14 [class.qual]p2:
         //   In a lookup in which function names are not ignored and the
         //   nested-name-specifier nominates a class C, if the name specified
@@ -4107,17 +4107,17 @@ TemplateNameKind Sema::ActOnDependentTemplateName(Scope *S,
   NestedNameSpecifier *Qualifier = SS.getScopeRep();
 
   switch (Name.getKind()) {
-  case UnqualifiedId::IK_Identifier:
+  case UnqualifiedIdKind::IK_Identifier:
     Result = TemplateTy::make(Context.getDependentTemplateName(Qualifier,
                                                               Name.Identifier));
     return TNK_Dependent_template_name;
 
-  case UnqualifiedId::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
     Result = TemplateTy::make(Context.getDependentTemplateName(Qualifier,
                                              Name.OperatorFunctionId.Operator));
     return TNK_Function_template;
 
-  case UnqualifiedId::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
     llvm_unreachable("literal operator id cannot have a dependent scope");
 
   default:
@@ -9044,7 +9044,7 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
         return true;
       }
 
-      if (D.getName().getKind() != UnqualifiedId::IK_TemplateId) {
+      if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) {
         // C++1y [temp.explicit]p3:
         //   If the explicit instantiation is for a variable, the unqualified-id
         //   in the declaration shall be a template-id.
@@ -9126,7 +9126,7 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
   // argument list into our AST format.
   bool HasExplicitTemplateArgs = false;
   TemplateArgumentListInfo TemplateArgs;
-  if (D.getName().getKind() == UnqualifiedId::IK_TemplateId) {
+  if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) {
     TemplateArgs = makeTemplateArgumentListInfo(*this, *D.getName().TemplateId);
     HasExplicitTemplateArgs = true;
   }
@@ -9292,7 +9292,7 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
   //
   // C++98 has the same restriction, just worded differently.
   FunctionTemplateDecl *FunTmpl = Specialization->getPrimaryTemplate();
-  if (D.getName().getKind() != UnqualifiedId::IK_TemplateId && !FunTmpl &&
+  if (D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId && !FunTmpl &&
       D.getCXXScopeSpec().isSet() &&
       !ScopeSpecifierHasTemplateId(D.getCXXScopeSpec()))
     Diag(D.getIdentifierLoc(),
diff --git a/lib/Sema/SemaTemplateDeduction.cpp b/lib/Sema/SemaTemplateDeduction.cpp
index f8ee60251698..d09cf9933ecf 100644
--- a/lib/Sema/SemaTemplateDeduction.cpp
+++ b/lib/Sema/SemaTemplateDeduction.cpp
@@ -502,6 +502,10 @@ DeduceTemplateArguments(Sema &S,
                         SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
   assert(Arg.isCanonical() && "Argument type must be canonical");
 
+  // Treat an injected-class-name as its underlying template-id.
+  if (auto *Injected = dyn_cast<InjectedClassNameType>(Arg))
+    Arg = Injected->getInjectedSpecializationType();
+
   // Check whether the template argument is a dependent template-id.
   if (const TemplateSpecializationType *SpecArg
         = dyn_cast<TemplateSpecializationType>(Arg)) {
@@ -3940,117 +3944,6 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(
   return TDK_Success;
 }
 
-/// \brief Given a function declaration (e.g. a generic lambda conversion
-///  function) that contains an 'auto' in its result type, substitute it
-///  with TypeToReplaceAutoWith.  Be careful to pass in the type you want
-///  to replace 'auto' with and not the actual result type you want
-///  to set the function to.
-static inline void
-SubstAutoWithinFunctionReturnType(FunctionDecl *F,
-                                    QualType TypeToReplaceAutoWith, Sema &S) {
-  assert(!TypeToReplaceAutoWith->getContainedAutoType());
-  QualType AutoResultType = F->getReturnType();
-  assert(AutoResultType->getContainedAutoType());
-  QualType DeducedResultType = S.SubstAutoType(AutoResultType,
-                                               TypeToReplaceAutoWith);
-  S.Context.adjustDeducedFunctionResultType(F, DeducedResultType);
-}
-
-/// \brief Given a specialized conversion operator of a generic lambda
-/// create the corresponding specializations of the call operator and
-/// the static-invoker. If the return type of the call operator is auto,
-/// deduce its return type and check if that matches the
-/// return type of the destination function ptr.
-
-static inline Sema::TemplateDeductionResult
-SpecializeCorrespondingLambdaCallOperatorAndInvoker(
-    CXXConversionDecl *ConversionSpecialized,
-    SmallVectorImpl<DeducedTemplateArgument> &DeducedArguments,
-    QualType ReturnTypeOfDestFunctionPtr,
-    TemplateDeductionInfo &TDInfo,
-    Sema &S) {
-
-  CXXRecordDecl *LambdaClass = ConversionSpecialized->getParent();
-  assert(LambdaClass && LambdaClass->isGenericLambda());
-
-  CXXMethodDecl *CallOpGeneric = LambdaClass->getLambdaCallOperator();
-  QualType CallOpResultType = CallOpGeneric->getReturnType();
-  const bool GenericLambdaCallOperatorHasDeducedReturnType =
-      CallOpResultType->getContainedAutoType();
-
-  FunctionTemplateDecl *CallOpTemplate =
-      CallOpGeneric->getDescribedFunctionTemplate();
-
-  FunctionDecl *CallOpSpecialized = nullptr;
-  // Use the deduced arguments of the conversion function, to specialize our
-  // generic lambda's call operator.
-  if (Sema::TemplateDeductionResult Result
-      = S.FinishTemplateArgumentDeduction(CallOpTemplate,
-                                          DeducedArguments,
-                                          0, CallOpSpecialized, TDInfo))
-    return Result;
-
-  // If we need to deduce the return type, do so (instantiates the callop).
-  if (GenericLambdaCallOperatorHasDeducedReturnType &&
-      CallOpSpecialized->getReturnType()->isUndeducedType())
-    S.DeduceReturnType(CallOpSpecialized,
-                       CallOpSpecialized->getPointOfInstantiation(),
-                       /*Diagnose*/ true);
-
-  // Check to see if the return type of the destination ptr-to-function
-  // matches the return type of the call operator.
-  if (!S.Context.hasSameType(CallOpSpecialized->getReturnType(),
-                             ReturnTypeOfDestFunctionPtr))
-    return Sema::TDK_NonDeducedMismatch;
-  // Since we have succeeded in matching the source and destination
-  // ptr-to-functions (now including return type), and have successfully
-  // specialized our corresponding call operator, we are ready to
-  // specialize the static invoker with the deduced arguments of our
-  // ptr-to-function.
-  FunctionDecl *InvokerSpecialized = nullptr;
-  FunctionTemplateDecl *InvokerTemplate = LambdaClass->
-                  getLambdaStaticInvoker()->getDescribedFunctionTemplate();
-
-#ifndef NDEBUG
-  Sema::TemplateDeductionResult LLVM_ATTRIBUTE_UNUSED Result =
-#endif
-    S.FinishTemplateArgumentDeduction(InvokerTemplate, DeducedArguments, 0,
-          InvokerSpecialized, TDInfo);
-  assert(Result == Sema::TDK_Success &&
-    "If the call operator succeeded so should the invoker!");
-  // Set the result type to match the corresponding call operator
-  // specialization's result type.
-  if (GenericLambdaCallOperatorHasDeducedReturnType &&
-      InvokerSpecialized->getReturnType()->isUndeducedType()) {
-    // Be sure to get the type to replace 'auto' with and not
-    // the full result type of the call op specialization
-    // to substitute into the 'auto' of the invoker and conversion
-    // function.
-    // For e.g.
-    //  int* (*fp)(int*) = [](auto* a) -> auto* { return a; };
-    // We don't want to subst 'int*' into 'auto' to get int**.
-
-    QualType TypeToReplaceAutoWith = CallOpSpecialized->getReturnType()
-                                         ->getContainedAutoType()
-                                         ->getDeducedType();
-    SubstAutoWithinFunctionReturnType(InvokerSpecialized,
-        TypeToReplaceAutoWith, S);
-    SubstAutoWithinFunctionReturnType(ConversionSpecialized,
-        TypeToReplaceAutoWith, S);
-  }
-
-  // Ensure that static invoker doesn't have a const qualifier.
-  // FIXME: When creating the InvokerTemplate in SemaLambda.cpp
-  // do not use the CallOperator's TypeSourceInfo which allows
-  // the const qualifier to leak through.
-  const FunctionProtoType *InvokerFPT = InvokerSpecialized->
-                  getType().getTypePtr()->castAs<FunctionProtoType>();
-  FunctionProtoType::ExtProtoInfo EPI = InvokerFPT->getExtProtoInfo();
-  EPI.TypeQuals = 0;
-  InvokerSpecialized->setType(S.Context.getFunctionType(
-      InvokerFPT->getReturnType(), InvokerFPT->getParamTypes(), EPI));
-  return Sema::TDK_Success;
-}
 /// \brief Deduce template arguments for a templated conversion
 /// function (C++ [temp.deduct.conv]) and, if successful, produce a
 /// conversion function template specialization.
@@ -4158,35 +4051,6 @@ Sema::DeduceTemplateArguments(FunctionTemplateDecl *ConversionTemplate,
       = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0,
                                         ConversionSpecialized, Info);
   Specialization = cast_or_null<CXXConversionDecl>(ConversionSpecialized);
-
-  // If the conversion operator is being invoked on a lambda closure to convert
-  // to a ptr-to-function, use the deduced arguments from the conversion
-  // function to specialize the corresponding call operator.
-  //   e.g., int (*fp)(int) = [](auto a) { return a; };
-  if (Result == TDK_Success && isLambdaConversionOperator(ConversionGeneric)) {
-
-    // Get the return type of the destination ptr-to-function we are converting
-    // to.  This is necessary for matching the lambda call operator's return
-    // type to that of the destination ptr-to-function's return type.
-    assert(A->isPointerType() &&
-        "Can only convert from lambda to ptr-to-function");
-    const FunctionType *ToFunType =
-        A->getPointeeType().getTypePtr()->getAs<FunctionType>();
-    const QualType DestFunctionPtrReturnType = ToFunType->getReturnType();
-
-    // Create the corresponding specializations of the call operator and
-    // the static-invoker; and if the return type is auto,
-    // deduce the return type and check if it matches the
-    // DestFunctionPtrReturnType.
-    // For instance:
-    //   auto L = [](auto a) { return f(a); };
-    //   int (*fp)(int) = L;
-    //   char (*fp2)(int) = L; <-- Not OK.
-
-    Result = SpecializeCorrespondingLambdaCallOperatorAndInvoker(
-        Specialization, Deduced, DestFunctionPtrReturnType,
-        Info, *this);
-  }
   return Result;
 }
 
@@ -4536,6 +4400,43 @@ bool Sema::DeduceReturnType(FunctionDecl *FD, SourceLocation Loc,
                             bool Diagnose) {
   assert(FD->getReturnType()->isUndeducedType());
 
+  // For a lambda's conversion operator, deduce any 'auto' or 'decltype(auto)'
+  // within the return type from the call operator's type.
+  if (isLambdaConversionOperator(FD)) {
+    CXXRecordDecl *Lambda = cast<CXXMethodDecl>(FD)->getParent();
+    FunctionDecl *CallOp = Lambda->getLambdaCallOperator();
+
+    // For a generic lambda, instantiate the call operator if needed. 
+    if (auto *Args = FD->getTemplateSpecializationArgs()) {
+      CallOp = InstantiateFunctionDeclaration(
+          CallOp->getDescribedFunctionTemplate(), Args, Loc);
+      if (!CallOp || CallOp->isInvalidDecl())
+        return true;
+
+      // We might need to deduce the return type by instantiating the definition
+      // of the operator() function.
+      if (CallOp->getReturnType()->isUndeducedType())
+        InstantiateFunctionDefinition(Loc, CallOp);
+    }
+
+    if (CallOp->isInvalidDecl())
+      return true;
+    assert(!CallOp->getReturnType()->isUndeducedType() &&
+           "failed to deduce lambda return type");
+
+    // Build the new return type from scratch.
+    QualType RetType = getLambdaConversionFunctionResultType(
+        CallOp->getType()->castAs<FunctionProtoType>());
+    if (FD->getReturnType()->getAs<PointerType>())
+      RetType = Context.getPointerType(RetType);
+    else {
+      assert(FD->getReturnType()->getAs<BlockPointerType>());
+      RetType = Context.getBlockPointerType(RetType);
+    }
+    Context.adjustDeducedFunctionResultType(FD, RetType);
+    return false;
+  }
+
   if (FD->getTemplateInstantiationPattern())
     InstantiateFunctionDefinition(Loc, FD);
 
diff --git a/lib/Sema/SemaTemplateInstantiateDecl.cpp b/lib/Sema/SemaTemplateInstantiateDecl.cpp
index d8af8f34530b..b8b74408cb82 100644
--- a/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -343,14 +343,6 @@ static void instantiateOMPDeclareSimdDeclAttr(
       Attr.getRange());
 }
 
-static bool DeclContainsAttr(const Decl *D, const Attr *NewAttr) {
-  if (!D->hasAttrs() || NewAttr->duplicatesAllowed())
-    return false;
-  return llvm::find_if(D->getAttrs(), [NewAttr](const Attr *Attr) {
-           return Attr->getKind() == NewAttr->getKind();
-         }) != D->getAttrs().end();
-}
-
 void Sema::InstantiateAttrsForDecl(
     const MultiLevelTemplateArgumentList &TemplateArgs, const Decl *Tmpl,
     Decl *New, LateInstantiatedAttrVec *LateAttrs,
@@ -365,7 +357,7 @@ void Sema::InstantiateAttrsForDecl(
 
       Attr *NewAttr = sema::instantiateTemplateAttributeForDecl(
           TmplAttr, Context, *this, TemplateArgs);
-      if (NewAttr && !DeclContainsAttr(New, NewAttr))
+      if (NewAttr)
         New->addAttr(NewAttr);
     }
   }
@@ -470,8 +462,7 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
 
       Attr *NewAttr = sema::instantiateTemplateAttribute(TmplAttr, Context,
                                                          *this, TemplateArgs);
-
-      if (NewAttr && !DeclContainsAttr(New, NewAttr))
+      if (NewAttr)
         New->addAttr(NewAttr);
     }
   }
@@ -3729,6 +3720,30 @@ TemplateDeclInstantiator::InitMethodInstantiation(CXXMethodDecl *New,
   return false;
 }
 
+/// Instantiate (or find existing instantiation of) a function template with a
+/// given set of template arguments.
+///
+/// Usually this should not be used, and template argument deduction should be
+/// used in its place.
+FunctionDecl *
+Sema::InstantiateFunctionDeclaration(FunctionTemplateDecl *FTD,
+                                     const TemplateArgumentList *Args,
+                                     SourceLocation Loc) {
+  FunctionDecl *FD = FTD->getTemplatedDecl();
+
+  sema::TemplateDeductionInfo Info(Loc);
+  InstantiatingTemplate Inst(
+      *this, Loc, FTD, Args->asArray(),
+      CodeSynthesisContext::ExplicitTemplateArgumentSubstitution, Info);
+  if (Inst.isInvalid())
+    return nullptr;
+
+  ContextRAII SavedContext(*this, FD);
+  MultiLevelTemplateArgumentList MArgs(*Args);
+
+  return cast_or_null<FunctionDecl>(SubstDecl(FD, FD->getParent(), MArgs));
+}
+
 /// In the MS ABI, we need to instantiate default arguments of dllexported
 /// default constructors along with the constructor definition. This allows IR
 /// gen to emit a constructor closure which calls the default constructor with
@@ -3812,7 +3827,8 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
       PendingInstantiations.push_back(
         std::make_pair(Function, PointOfInstantiation));
     } else if (TSK == TSK_ImplicitInstantiation) {
-      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred()) {
+      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred() &&
+          !getSourceManager().isInSystemHeader(PatternDecl->getLocStart())) {
         Diag(PointOfInstantiation, diag::warn_func_template_missing)
           << Function;
         Diag(PatternDecl->getLocation(), diag::note_forward_template_decl);
@@ -4135,7 +4151,8 @@ void Sema::BuildVariableInstantiation(
   // it right away if the type contains 'auto'.
   if ((!isa<VarTemplateSpecializationDecl>(NewVar) &&
        !InstantiatingVarTemplate &&
-       !(OldVar->isInline() && OldVar->isThisDeclarationADefinition())) ||
+       !(OldVar->isInline() && OldVar->isThisDeclarationADefinition() &&
+         !NewVar->isThisDeclarationADefinition())) ||
       NewVar->getType()->isUndeducedType())
     InstantiateVariableInitializer(NewVar, OldVar, TemplateArgs);
 
@@ -4347,7 +4364,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
         std::make_pair(Var, PointOfInstantiation));
     } else if (TSK == TSK_ImplicitInstantiation) {
       // Warn about missing definition at the end of translation unit.
-      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred()) {
+      if (AtEndOfTU && !getDiagnostics().hasErrorOccurred() &&
+          !getSourceManager().isInSystemHeader(PatternDecl->getLocStart())) {
         Diag(PointOfInstantiation, diag::warn_var_template_missing)
           << Var;
         Diag(PatternDecl->getLocation(), diag::note_forward_template_decl);
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index 65c37c4a068b..af060cf21b5f 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -2703,7 +2703,7 @@ static void diagnoseRedundantReturnTypeQualifiers(Sema &S, QualType RetTy,
   // If the qualifiers come from a conversion function type, don't diagnose
   // them -- they're not necessarily redundant, since such a conversion
   // operator can be explicitly called as "x.operator const int()".
-  if (D.getName().getKind() == UnqualifiedId::IK_ConversionFunctionId)
+  if (D.getName().getKind() == UnqualifiedIdKind::IK_ConversionFunctionId)
     return;
 
   // Just parens all the way out to the decl specifiers. Diagnose any qualifiers
@@ -2729,11 +2729,11 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
   TagDecl *OwnedTagDecl = nullptr;
 
   switch (D.getName().getKind()) {
-  case UnqualifiedId::IK_ImplicitSelfParam:
-  case UnqualifiedId::IK_OperatorFunctionId:
-  case UnqualifiedId::IK_Identifier:
-  case UnqualifiedId::IK_LiteralOperatorId:
-  case UnqualifiedId::IK_TemplateId:
+  case UnqualifiedIdKind::IK_ImplicitSelfParam:
+  case UnqualifiedIdKind::IK_OperatorFunctionId:
+  case UnqualifiedIdKind::IK_Identifier:
+  case UnqualifiedIdKind::IK_LiteralOperatorId:
+  case UnqualifiedIdKind::IK_TemplateId:
     T = ConvertDeclSpecToType(state);
 
     if (!D.isInvalidType() && D.getDeclSpec().isTypeSpecOwned()) {
@@ -2743,9 +2743,9 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
     }
     break;
 
-  case UnqualifiedId::IK_ConstructorName:
-  case UnqualifiedId::IK_ConstructorTemplateId:
-  case UnqualifiedId::IK_DestructorName:
+  case UnqualifiedIdKind::IK_ConstructorName:
+  case UnqualifiedIdKind::IK_ConstructorTemplateId:
+  case UnqualifiedIdKind::IK_DestructorName:
     // Constructors and destructors don't have return types. Use
     // "void" instead.
     T = SemaRef.Context.VoidTy;
@@ -2753,13 +2753,13 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
                      D.getDeclSpec().getAttributes().getList());
     break;
 
-  case UnqualifiedId::IK_DeductionGuideName:
+  case UnqualifiedIdKind::IK_DeductionGuideName:
     // Deduction guides have a trailing return type and no type in their
     // decl-specifier sequence. Use a placeholder return type for now.
     T = SemaRef.Context.DependentTy;
     break;
 
-  case UnqualifiedId::IK_ConversionFunctionId:
+  case UnqualifiedIdKind::IK_ConversionFunctionId:
     // The result type of a conversion function is the type that it
     // converts to.
     T = SemaRef.GetTypeFromParser(D.getName().ConversionFunctionId,
@@ -2917,7 +2917,7 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
     }
 
     SourceRange AutoRange = D.getDeclSpec().getTypeSpecTypeLoc();
-    if (D.getName().getKind() == UnqualifiedId::IK_ConversionFunctionId)
+    if (D.getName().getKind() == UnqualifiedIdKind::IK_ConversionFunctionId)
       AutoRange = D.getName().getSourceRange();
 
     if (Error != -1) {
@@ -4322,7 +4322,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
               << T << D.getSourceRange();
             D.setInvalidType(true);
           } else if (D.getName().getKind() ==
-                     UnqualifiedId::IK_DeductionGuideName) {
+                     UnqualifiedIdKind::IK_DeductionGuideName) {
             if (T != Context.DependentTy) {
               S.Diag(D.getDeclSpec().getLocStart(),
                      diag::err_deduction_guide_with_complex_decl)
@@ -4350,7 +4350,8 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
       // C99 6.7.5.3p1: The return type may not be a function or array type.
       // For conversion functions, we'll diagnose this particular error later.
       if (!D.isInvalidType() && (T->isArrayType() || T->isFunctionType()) &&
-          (D.getName().getKind() != UnqualifiedId::IK_ConversionFunctionId)) {
+          (D.getName().getKind() !=
+           UnqualifiedIdKind::IK_ConversionFunctionId)) {
         unsigned diagID = diag::err_func_returning_array_function;
         // Last processing chunk in block context means this function chunk
         // represents the block.
@@ -4766,7 +4767,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
         break;
       case DeclaratorChunk::Function: {
         const DeclaratorChunk::FunctionTypeInfo &FTI = DeclType.Fun;
-        if (FTI.NumParams == 0)
+        if (FTI.NumParams == 0 && !FTI.isVariadic)
           S.Diag(DeclType.Loc, diag::warn_strict_prototypes)
               << IsBlock
               << FixItHint::CreateInsertion(FTI.getRParenLoc(), "void");
@@ -4794,7 +4795,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
     // Core issue 547 also allows cv-qualifiers on function types that are
     // top-level template type arguments.
     enum { NonMember, Member, DeductionGuide } Kind = NonMember;
-    if (D.getName().getKind() == UnqualifiedId::IK_DeductionGuideName)
+    if (D.getName().getKind() == UnqualifiedIdKind::IK_DeductionGuideName)
       Kind = DeductionGuide;
     else if (!D.getCXXScopeSpec().isSet()) {
       if ((D.getContext() == DeclaratorContext::MemberContext ||
diff --git a/lib/Sema/TreeTransform.h b/lib/Sema/TreeTransform.h
index 96969ea87a13..c33a1fc2ecfa 100644
--- a/lib/Sema/TreeTransform.h
+++ b/lib/Sema/TreeTransform.h
@@ -2352,18 +2352,8 @@ class TreeTransform {
   /// Subclasses may override this routine to provide different behavior.
   ExprResult RebuildInitList(SourceLocation LBraceLoc,
                              MultiExprArg Inits,
-                             SourceLocation RBraceLoc,
-                             QualType ResultTy) {
-    ExprResult Result
-      = SemaRef.ActOnInitList(LBraceLoc, Inits, RBraceLoc);
-    if (Result.isInvalid() || ResultTy->isDependentType())
-      return Result;
-
-    // Patch in the result type we were given, which may have been computed
-    // when the initial InitListExpr was built.
-    InitListExpr *ILE = cast<InitListExpr>((Expr *)Result.get());
-    ILE->setType(ResultTy);
-    return Result;
+                             SourceLocation RBraceLoc) {
+    return SemaRef.ActOnInitList(LBraceLoc, Inits, RBraceLoc);
   }
 
   /// \brief Build a new designated initializer expression.
@@ -2591,10 +2581,11 @@ class TreeTransform {
   ExprResult RebuildCXXFunctionalCastExpr(TypeSourceInfo *TInfo,
                                           SourceLocation LParenLoc,
                                           Expr *Sub,
-                                          SourceLocation RParenLoc) {
+                                          SourceLocation RParenLoc,
+                                          bool ListInitialization) {
     return getSema().BuildCXXTypeConstructExpr(TInfo, LParenLoc,
-                                               MultiExprArg(&Sub, 1),
-                                               RParenLoc);
+                                               MultiExprArg(&Sub, 1), RParenLoc,
+                                               ListInitialization);
   }
 
   /// \brief Build a new C++ typeid(type) expression.
@@ -2694,8 +2685,8 @@ class TreeTransform {
   ExprResult RebuildCXXScalarValueInitExpr(TypeSourceInfo *TSInfo,
                                            SourceLocation LParenLoc,
                                            SourceLocation RParenLoc) {
-    return getSema().BuildCXXTypeConstructExpr(TSInfo, LParenLoc,
-                                               None, RParenLoc);
+    return getSema().BuildCXXTypeConstructExpr(
+        TSInfo, LParenLoc, None, RParenLoc, /*ListInitialization=*/false);
   }
 
   /// \brief Build a new C++ "new" expression.
@@ -2852,13 +2843,12 @@ class TreeTransform {
   /// By default, performs semantic analysis to build the new expression.
   /// Subclasses may override this routine to provide different behavior.
   ExprResult RebuildCXXTemporaryObjectExpr(TypeSourceInfo *TSInfo,
-                                           SourceLocation LParenLoc,
+                                           SourceLocation LParenOrBraceLoc,
                                            MultiExprArg Args,
-                                           SourceLocation RParenLoc) {
-    return getSema().BuildCXXTypeConstructExpr(TSInfo,
-                                               LParenLoc,
-                                               Args,
-                                               RParenLoc);
+                                           SourceLocation RParenOrBraceLoc,
+                                           bool ListInitialization) {
+    return getSema().BuildCXXTypeConstructExpr(
+        TSInfo, LParenOrBraceLoc, Args, RParenOrBraceLoc, ListInitialization);
   }
 
   /// \brief Build a new object-construction expression.
@@ -2868,11 +2858,10 @@ class TreeTransform {
   ExprResult RebuildCXXUnresolvedConstructExpr(TypeSourceInfo *TSInfo,
                                                SourceLocation LParenLoc,
                                                MultiExprArg Args,
-                                               SourceLocation RParenLoc) {
-    return getSema().BuildCXXTypeConstructExpr(TSInfo,
-                                               LParenLoc,
-                                               Args,
-                                               RParenLoc);
+                                               SourceLocation RParenLoc,
+                                               bool ListInitialization) {
+    return getSema().BuildCXXTypeConstructExpr(TSInfo, LParenLoc, Args,
+                                               RParenLoc, ListInitialization);
   }
 
   /// \brief Build a new member reference expression.
@@ -3394,11 +3383,10 @@ ExprResult TreeTransform<Derived>::TransformInitializer(Expr *Init,
                                   /*IsCall*/true, NewArgs, &ArgChanged))
     return ExprError();
 
-  // If this was list initialization, revert to list form.
+  // If this was list initialization, revert to syntactic list form.
   if (Construct->isListInitialization())
     return getDerived().RebuildInitList(Construct->getLocStart(), NewArgs,
-                                        Construct->getLocEnd(),
-                                        Construct->getType());
+                                        Construct->getLocEnd());
 
   // Build a ParenListExpr to represent anything else.
   SourceRange Parens = Construct->getParenOrBraceRange();
@@ -6956,6 +6944,8 @@ TreeTransform<Derived>::TransformCoroutineBodyStmt(CoroutineBodyStmt *S) {
 
   // The new CoroutinePromise object needs to be built and put into the current
   // FunctionScopeInfo before any transformations or rebuilding occurs.
+  if (!SemaRef.buildCoroutineParameterMoves(FD->getLocation()))
+    return StmtError();
   auto *Promise = SemaRef.buildCoroutinePromise(FD->getLocation());
   if (!Promise)
     return StmtError();
@@ -7046,8 +7036,6 @@ TreeTransform<Derived>::TransformCoroutineBodyStmt(CoroutineBodyStmt *S) {
       Builder.ReturnStmt = Res.get();
     }
   }
-  if (!Builder.buildParameterMoves())
-    return StmtError();
 
   return getDerived().RebuildCoroutineBodyStmt(Builder);
 }
@@ -7642,11 +7630,7 @@ StmtResult TreeTransform<Derived>::TransformOMPExecutableDirective(
     StmtResult Body;
     {
       Sema::CompoundScopeRAII CompoundScope(getSema());
-      int ThisCaptureLevel =
-          Sema::getOpenMPCaptureLevels(D->getDirectiveKind());
-      Stmt *CS = D->getAssociatedStmt();
-      while (--ThisCaptureLevel >= 0)
-        CS = cast<CapturedStmt>(CS)->getCapturedStmt();
+      Stmt *CS = D->getInnermostCapturedStmt()->getCapturedStmt();
       Body = getDerived().TransformStmt(CS);
     }
     AssociatedStmt =
@@ -9517,7 +9501,7 @@ TreeTransform<Derived>::TransformInitListExpr(InitListExpr *E) {
   }
 
   return getDerived().RebuildInitList(E->getLBraceLoc(), Inits,
-                                      E->getRBraceLoc(), E->getType());
+                                      E->getRBraceLoc());
 }
 
 template<typename Derived>
@@ -9951,7 +9935,8 @@ TreeTransform<Derived>::TransformCXXFunctionalCastExpr(
   return getDerived().RebuildCXXFunctionalCastExpr(Type,
                                                    E->getLParenLoc(),
                                                    SubExpr.get(),
-                                                   E->getRParenLoc());
+                                                   E->getRParenLoc(),
+                                                   E->isListInitialization());
 }
 
 template<typename Derived>
@@ -10867,11 +10852,12 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr(
     return SemaRef.MaybeBindToTemporary(E);
   }
 
-  // FIXME: Pass in E->isListInitialization().
-  return getDerived().RebuildCXXTemporaryObjectExpr(T,
-                                          /*FIXME:*/T->getTypeLoc().getEndLoc(),
-                                                    Args,
-                                                    E->getLocEnd());
+  // FIXME: We should just pass E->isListInitialization(), but we're not
+  // prepared to handle list-initialization without a child InitListExpr.
+  SourceLocation LParenLoc = T->getTypeLoc().getEndLoc();
+  return getDerived().RebuildCXXTemporaryObjectExpr(
+      T, LParenLoc, Args, E->getLocEnd(),
+      /*ListInitialization=*/LParenLoc.isInvalid());
 }
 
 template<typename Derived>
@@ -11157,10 +11143,8 @@ TreeTransform<Derived>::TransformCXXUnresolvedConstructExpr(
     return E;
 
   // FIXME: we're faking the locations of the commas
-  return getDerived().RebuildCXXUnresolvedConstructExpr(T,
-                                                        E->getLParenLoc(),
-                                                        Args,
-                                                        E->getRParenLoc());
+  return getDerived().RebuildCXXUnresolvedConstructExpr(
+      T, E->getLParenLoc(), Args, E->getRParenLoc(), E->isListInitialization());
 }
 
 template<typename Derived>
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index 4ed822e04f6c..b62c47c67f3e 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -3215,6 +3215,24 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
       break;
     }
 
+    case PPD_SKIPPED_RANGES: {
+      F.PreprocessedSkippedRangeOffsets = (const PPSkippedRange*)Blob.data();
+      assert(Blob.size() % sizeof(PPSkippedRange) == 0);
+      F.NumPreprocessedSkippedRanges = Blob.size() / sizeof(PPSkippedRange);
+
+      if (!PP.getPreprocessingRecord())
+        PP.createPreprocessingRecord();
+      if (!PP.getPreprocessingRecord()->getExternalSource())
+        PP.getPreprocessingRecord()->SetExternalSource(*this);
+      F.BasePreprocessedSkippedRangeID = PP.getPreprocessingRecord()
+          ->allocateSkippedRanges(F.NumPreprocessedSkippedRanges);
+      
+      if (F.NumPreprocessedSkippedRanges > 0)
+        GlobalSkippedRangeMap.insert(
+            std::make_pair(F.BasePreprocessedSkippedRangeID, &F));
+      break;
+    }
+
     case DECL_UPDATE_OFFSETS:
       if (Record.size() % 2 != 0) {
         Error("invalid DECL_UPDATE_OFFSETS block in AST file");
@@ -5387,6 +5405,20 @@ ASTReader::getModuleFileLevelDecls(ModuleFile &Mod) {
                          Mod.FileSortedDecls + Mod.NumFileSortedDecls));
 }
 
+SourceRange ASTReader::ReadSkippedRange(unsigned GlobalIndex) {
+  auto I = GlobalSkippedRangeMap.find(GlobalIndex);
+  assert(I != GlobalSkippedRangeMap.end() &&
+    "Corrupted global skipped range map");
+  ModuleFile *M = I->second;
+  unsigned LocalIndex = GlobalIndex - M->BasePreprocessedSkippedRangeID;
+  assert(LocalIndex < M->NumPreprocessedSkippedRanges);
+  PPSkippedRange RawRange = M->PreprocessedSkippedRangeOffsets[LocalIndex];
+  SourceRange Range(TranslateSourceLocation(*M, RawRange.getBegin()),
+                    TranslateSourceLocation(*M, RawRange.getEnd()));
+  assert(Range.isValid());
+  return Range;
+}
+
 PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
   PreprocessedEntityID PPID = Index+1;
   std::pair<ModuleFile *, unsigned> PPInfo = getModulePreprocessedEntity(Index);
@@ -10660,6 +10692,7 @@ void ASTReader::diagnoseOdrViolations() {
       Diagnosed = true;
       break;
     }
+    (void)Diagnosed;
     assert(Diagnosed && "Unable to emit ODR diagnostic.");
   }
 }
diff --git a/lib/Serialization/ASTReaderDecl.cpp b/lib/Serialization/ASTReaderDecl.cpp
index efbaf92a849a..f3eab2e6d786 100644
--- a/lib/Serialization/ASTReaderDecl.cpp
+++ b/lib/Serialization/ASTReaderDecl.cpp
@@ -792,6 +792,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
   FD->IsConstexpr = Record.readInt();
   FD->UsesSEHTry = Record.readInt();
   FD->HasSkippedBody = Record.readInt();
+  FD->IsMultiVersion = Record.readInt();
   FD->IsLateTemplateParsed = Record.readInt();
   FD->setCachedLinkage(Linkage(Record.readInt()));
   FD->EndRangeLoc = ReadSourceLocation();
@@ -1500,7 +1501,8 @@ void ASTDeclReader::VisitUsingPackDecl(UsingPackDecl *D) {
 void ASTDeclReader::VisitUsingShadowDecl(UsingShadowDecl *D) {
   RedeclarableResult Redecl = VisitRedeclarable(D);
   VisitNamedDecl(D);
-  D->setTargetDecl(ReadDeclAs<NamedDecl>());
+  D->Underlying = ReadDeclAs<NamedDecl>();
+  D->IdentifierNamespace = Record.readInt();
   D->UsingOrNextShadow = ReadDeclAs<NamedDecl>();
   UsingShadowDecl *Pattern = ReadDeclAs<UsingShadowDecl>();
   if (Pattern)
@@ -2817,6 +2819,21 @@ static bool isSameEntity(NamedDecl *X, NamedDecl *Y) {
                         CtorY->getInheritedConstructor().getConstructor()))
         return false;
     }
+
+    if (FuncX->isMultiVersion() != FuncY->isMultiVersion())
+      return false;
+
+    // Multiversioned functions with different feature strings are represented
+    // as separate declarations.
+    if (FuncX->isMultiVersion()) {
+      const auto *TAX = FuncX->getAttr<TargetAttr>();
+      const auto *TAY = FuncY->getAttr<TargetAttr>();
+      assert(TAX && TAY && "Multiversion Function without target attribute");
+
+      if (TAX->getFeaturesStr() != TAY->getFeaturesStr())
+        return false;
+    }
+
     ASTContext &C = FuncX->getASTContext();
     if (!C.hasSameType(FuncX->getType(), FuncY->getType())) {
       // We can get functions with different types on the redecl chain in C++17
diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp
index 6163b811c769..4938b0e9026a 100644
--- a/lib/Serialization/ASTReaderStmt.cpp
+++ b/lib/Serialization/ASTReaderStmt.cpp
@@ -550,12 +550,13 @@ void ASTStmtReader::VisitParenListExpr(ParenListExpr *E) {
 
 void ASTStmtReader::VisitUnaryOperator(UnaryOperator *E) {
   VisitExpr(E);
-  E->setSubExpr(Record.readSubExpr());
-  E->setOpcode((UnaryOperator::Opcode)Record.readInt());
-  E->setOperatorLoc(ReadSourceLocation());
-}
-
-void ASTStmtReader::VisitOffsetOfExpr(OffsetOfExpr *E) {
+  E->setSubExpr(Record.readSubExpr());
+  E->setOpcode((UnaryOperator::Opcode)Record.readInt());
+  E->setOperatorLoc(ReadSourceLocation());
+  E->setCanOverflow(Record.readInt());
+}
+
+void ASTStmtReader::VisitOffsetOfExpr(OffsetOfExpr *E) {
   VisitExpr(E);
   assert(E->getNumComponents() == Record.peekInt());
   Record.skipInts(1);
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index 1e72ced2ee36..d1bfe281e2fb 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -1104,6 +1104,7 @@ void ASTWriter::WriteBlockInfoBlock() {
   RECORD(UNUSED_FILESCOPED_DECLS);
   RECORD(PPD_ENTITIES_OFFSETS);
   RECORD(VTABLE_USES);
+  RECORD(PPD_SKIPPED_RANGES);
   RECORD(REFERENCED_SELECTOR_POOL);
   RECORD(TU_UPDATE_LEXICAL);
   RECORD(SEMA_DECL_REFS);
@@ -2726,6 +2727,26 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
     Stream.EmitRecordWithBlob(PPEOffsetAbbrev, Record,
                               bytes(PreprocessedEntityOffsets));
   }
+
+  // Write the skipped region table for the preprocessing record.
+  ArrayRef<SourceRange> SkippedRanges = PPRec.getSkippedRanges();
+  if (SkippedRanges.size() > 0) {
+    std::vector<PPSkippedRange> SerializedSkippedRanges;
+    SerializedSkippedRanges.reserve(SkippedRanges.size());
+    for (auto const& Range : SkippedRanges)
+      SerializedSkippedRanges.emplace_back(Range);
+
+    using namespace llvm;
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(PPD_SKIPPED_RANGES));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+    unsigned PPESkippedRangeAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+
+    Record.clear();
+    Record.push_back(PPD_SKIPPED_RANGES);
+    Stream.EmitRecordWithBlob(PPESkippedRangeAbbrev, Record,
+                              bytes(SerializedSkippedRanges));
+  }
 }
 
 unsigned ASTWriter::getLocalOrImportedSubmoduleID(Module *Mod) {
@@ -5516,7 +5537,9 @@ void ASTWriter::associateDeclWithFile(const Decl *D, DeclID ID) {
     return;
   // FIXME: ParmVarDecls that are part of a function type of a parameter of
   // a function/objc method, should not have TU as lexical context.
-  if (isa<ParmVarDecl>(D))
+  // TemplateTemplateParmDecls that are part of an alias template, should not
+  // have TU as lexical context.
+  if (isa<ParmVarDecl>(D) || isa<TemplateTemplateParmDecl>(D))
     return;
 
   SourceManager &SM = Context->getSourceManager();
diff --git a/lib/Serialization/ASTWriterDecl.cpp b/lib/Serialization/ASTWriterDecl.cpp
index bb72a3b383ea..c2c6fd44d78a 100644
--- a/lib/Serialization/ASTWriterDecl.cpp
+++ b/lib/Serialization/ASTWriterDecl.cpp
@@ -534,6 +534,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
   Record.push_back(D->IsConstexpr);
   Record.push_back(D->UsesSEHTry);
   Record.push_back(D->HasSkippedBody);
+  Record.push_back(D->IsMultiVersion);
   Record.push_back(D->IsLateTemplateParsed);
   Record.push_back(D->getLinkageInternal());
   Record.AddSourceLocation(D->getLocEnd());
@@ -1192,6 +1193,7 @@ void ASTDeclWriter::VisitUsingShadowDecl(UsingShadowDecl *D) {
   VisitRedeclarable(D);
   VisitNamedDecl(D);
   Record.AddDeclRef(D->getTargetDecl());
+  Record.push_back(D->getIdentifierNamespace());
   Record.AddDeclRef(D->UsingOrNextShadow);
   Record.AddDeclRef(Context.getInstantiatedFromUsingShadowDecl(D));
   Code = serialization::DECL_USING_SHADOW;
@@ -2071,6 +2073,7 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Constexpr
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // UsesSEHTry
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // SkippedBody
+  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // MultiVersion
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // LateParsed
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Linkage
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LocEnd
diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp
index c5f4495d2f01..6fd03725d43d 100644
--- a/lib/Serialization/ASTWriterStmt.cpp
+++ b/lib/Serialization/ASTWriterStmt.cpp
@@ -506,12 +506,13 @@ void ASTStmtWriter::VisitParenListExpr(ParenListExpr *E) {
 
 void ASTStmtWriter::VisitUnaryOperator(UnaryOperator *E) {
   VisitExpr(E);
-  Record.AddStmt(E->getSubExpr());
-  Record.push_back(E->getOpcode()); // FIXME: stable encoding
-  Record.AddSourceLocation(E->getOperatorLoc());
-  Code = serialization::EXPR_UNARY_OPERATOR;
-}
-
+  Record.AddStmt(E->getSubExpr());
+  Record.push_back(E->getOpcode()); // FIXME: stable encoding
+  Record.AddSourceLocation(E->getOperatorLoc());
+  Record.push_back(E->canOverflow());
+  Code = serialization::EXPR_UNARY_OPERATOR;
+}
+
 void ASTStmtWriter::VisitOffsetOfExpr(OffsetOfExpr *E) {
   VisitExpr(E);
   Record.push_back(E->getNumComponents());
diff --git a/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp b/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
index 90d5c0e36a47..e2a35c042663 100644
--- a/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
@@ -15,8 +15,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "ClangSACheckers.h"
+#include "clang/AST/ExprCXX.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 
 using namespace clang;
@@ -29,6 +31,11 @@ class AnalysisOrderChecker
                      check::PostStmt<CastExpr>,
                      check::PreStmt<ArraySubscriptExpr>,
                      check::PostStmt<ArraySubscriptExpr>,
+                     check::PreStmt<CXXNewExpr>,
+                     check::PostStmt<CXXNewExpr>,
+                     check::PreCall,
+                     check::PostCall,
+                     check::NewAllocator,
                      check::Bind,
                      check::RegionChanges> {
   bool isCallbackEnabled(AnalyzerOptions &Opts, StringRef CallbackName) const {
@@ -72,6 +79,40 @@ class AnalysisOrderChecker
       llvm::errs() << "PostStmt<ArraySubscriptExpr>\n";
   }
 
+  void checkPreStmt(const CXXNewExpr *NE, CheckerContext &C) const {
+    if (isCallbackEnabled(C, "PreStmtCXXNewExpr"))
+      llvm::errs() << "PreStmt<CXXNewExpr>\n";
+  }
+
+  void checkPostStmt(const CXXNewExpr *NE, CheckerContext &C) const {
+    if (isCallbackEnabled(C, "PostStmtCXXNewExpr"))
+      llvm::errs() << "PostStmt<CXXNewExpr>\n";
+  }
+
+  void checkPreCall(const CallEvent &Call, CheckerContext &C) const {
+    if (isCallbackEnabled(C, "PreCall")) {
+      llvm::errs() << "PreCall";
+      if (const NamedDecl *ND = dyn_cast_or_null<NamedDecl>(Call.getDecl()))
+        llvm::errs() << " (" << ND->getQualifiedNameAsString() << ')';
+      llvm::errs() << '\n';
+    }
+  }
+
+  void checkPostCall(const CallEvent &Call, CheckerContext &C) const {
+    if (isCallbackEnabled(C, "PostCall")) {
+      llvm::errs() << "PostCall";
+      if (const NamedDecl *ND = dyn_cast_or_null<NamedDecl>(Call.getDecl()))
+        llvm::errs() << " (" << ND->getQualifiedNameAsString() << ')';
+      llvm::errs() << '\n';
+    }
+  }
+
+  void checkNewAllocator(const CXXNewExpr *CNE, SVal Target,
+                         CheckerContext &C) const {
+    if (isCallbackEnabled(C, "NewAllocator"))
+      llvm::errs() << "NewAllocator\n";
+  }
+
   void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const {
     if (isCallbackEnabled(C, "Bind"))
       llvm::errs() << "Bind\n";
diff --git a/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index 371187747f03..b16e32ecdfc9 100644
--- a/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -436,8 +436,7 @@ void CFNumberChecker::checkPreStmt(const CallExpr *CE,
     return;
 
   // Get the value of the "theType" argument.
-  const LocationContext *LCtx = C.getLocationContext();
-  SVal TheTypeVal = state->getSVal(CE->getArg(1), LCtx);
+  SVal TheTypeVal = C.getSVal(CE->getArg(1));
 
   // FIXME: We really should allow ranges of valid theType values, and
   //   bifurcate the state appropriately.
@@ -457,7 +456,7 @@ void CFNumberChecker::checkPreStmt(const CallExpr *CE,
   // Look at the value of the integer being passed by reference.  Essentially
   // we want to catch cases where the value passed in is not equal to the
   // size of the type being created.
-  SVal TheValueExpr = state->getSVal(CE->getArg(2), LCtx);
+  SVal TheValueExpr = C.getSVal(CE->getArg(2));
 
   // FIXME: Eventually we should handle arbitrary locations.  We can do this
   //  by having an enhanced memory model that does low-level typing.
@@ -571,7 +570,7 @@ void CFRetainReleaseChecker::checkPreStmt(const CallExpr *CE,
 
   // Get the argument's value.
   const Expr *Arg = CE->getArg(0);
-  SVal ArgVal = state->getSVal(Arg, C.getLocationContext());
+  SVal ArgVal = C.getSVal(Arg);
   Optional<DefinedSVal> DefArgVal = ArgVal.getAs<DefinedSVal>();
   if (!DefArgVal)
     return;
@@ -977,8 +976,7 @@ assumeCollectionNonEmpty(CheckerContext &C, ProgramStateRef State,
   if (!State)
     return nullptr;
 
-  SymbolRef CollectionS =
-    State->getSVal(FCS->getCollection(), C.getLocationContext()).getAsSymbol();
+  SymbolRef CollectionS = C.getSVal(FCS->getCollection()).getAsSymbol();
   return assumeCollectionNonEmpty(C, State, CollectionS, Assumption);
 }
 
@@ -1206,7 +1204,7 @@ ProgramStateRef
 ObjCNonNilReturnValueChecker::assumeExprIsNonNull(const Expr *NonNullExpr,
                                                   ProgramStateRef State,
                                                   CheckerContext &C) const {
-  SVal Val = State->getSVal(NonNullExpr, C.getLocationContext());
+  SVal Val = C.getSVal(NonNullExpr);
   if (Optional<DefinedOrUnknownSVal> DV = Val.getAs<DefinedOrUnknownSVal>())
     return State->assume(*DV, true);
   return State;
diff --git a/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
index 097d4198800d..93c9bfa043ac 100644
--- a/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
@@ -43,7 +43,7 @@ bool BuiltinFunctionChecker::evalCall(const CallExpr *CE,
 
   case Builtin::BI__builtin_assume: {
     assert (CE->arg_begin() != CE->arg_end());
-    SVal ArgSVal = state->getSVal(CE->getArg(0), LCtx);
+    SVal ArgSVal = C.getSVal(CE->getArg(0));
     if (ArgSVal.isUndef())
       return true; // Return true to model purity.
 
@@ -68,7 +68,7 @@ bool BuiltinFunctionChecker::evalCall(const CallExpr *CE,
     // __builtin_addressof is going from a reference to a pointer, but those
     // are represented the same way in the analyzer.
     assert (CE->arg_begin() != CE->arg_end());
-    SVal X = state->getSVal(*(CE->arg_begin()), LCtx);
+    SVal X = C.getSVal(*(CE->arg_begin()));
     C.addTransition(state->BindExpr(CE, LCtx, X));
     return true;
   }
@@ -83,8 +83,7 @@ bool BuiltinFunctionChecker::evalCall(const CallExpr *CE,
     // Set the extent of the region in bytes. This enables us to use the
     // SVal of the argument directly. If we save the extent in bits, we
     // cannot represent values like symbol*8.
-    DefinedOrUnknownSVal Size =
-        state->getSVal(*(CE->arg_begin()), LCtx).castAs<DefinedOrUnknownSVal>();
+    auto Size = C.getSVal(*(CE->arg_begin())).castAs<DefinedOrUnknownSVal>();
 
     SValBuilder& svalBuilder = C.getSValBuilder();
     DefinedOrUnknownSVal Extent = R->getExtent(svalBuilder);
diff --git a/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 28ad7e9e5071..bd4033784ef2 100644
--- a/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -309,9 +309,19 @@ ProgramStateRef CStringChecker::CheckLocation(CheckerContext &C,
     if (!N)
       return nullptr;
 
+    CheckName Name;
+    // These checks are either enabled by the CString out-of-bounds checker
+    // explicitly or the "basic" CStringNullArg checker support that Malloc
+    // checker enables.
+    assert(Filter.CheckCStringOutOfBounds || Filter.CheckCStringNullArg);
+    if (Filter.CheckCStringOutOfBounds)
+      Name = Filter.CheckNameCStringOutOfBounds;
+    else
+      Name = Filter.CheckNameCStringNullArg;
+
     if (!BT_Bounds) {
       BT_Bounds.reset(new BuiltinBug(
-          Filter.CheckNameCStringOutOfBounds, "Out-of-bound array access",
+          Name, "Out-of-bound array access",
           "Byte string function accesses out-of-bound array element"));
     }
     BuiltinBug *BT = static_cast<BuiltinBug*>(BT_Bounds.get());
@@ -366,7 +376,7 @@ ProgramStateRef CStringChecker::CheckBufferAccess(CheckerContext &C,
   QualType PtrTy = Ctx.getPointerType(Ctx.CharTy);
 
   // Check that the first buffer is non-null.
-  SVal BufVal = state->getSVal(FirstBuf, LCtx);
+  SVal BufVal = C.getSVal(FirstBuf);
   state = checkNonNull(C, state, FirstBuf, BufVal);
   if (!state)
     return nullptr;
@@ -378,7 +388,7 @@ ProgramStateRef CStringChecker::CheckBufferAccess(CheckerContext &C,
   // Get the access length and make sure it is known.
   // FIXME: This assumes the caller has already checked that the access length
   // is positive. And that it's unsigned.
-  SVal LengthVal = state->getSVal(Size, LCtx);
+  SVal LengthVal = C.getSVal(Size);
   Optional<NonLoc> Length = LengthVal.getAs<NonLoc>();
   if (!Length)
     return state;
@@ -2149,7 +2159,7 @@ void CStringChecker::checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {
     if (!MR)
       continue;
 
-    SVal StrVal = state->getSVal(Init, C.getLocationContext());
+    SVal StrVal = C.getSVal(Init);
     assert(StrVal.isValid() && "Initializer string is unknown or undefined");
     DefinedOrUnknownSVal strLength =
         getCStringLength(C, state, Init, StrVal).castAs<DefinedOrUnknownSVal>();
diff --git a/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp b/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
index 3e178152d925..059553b21995 100644
--- a/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
@@ -101,7 +101,7 @@ void CastSizeChecker::checkPreStmt(const CastExpr *CE,CheckerContext &C) const {
     return;
 
   ProgramStateRef state = C.getState();
-  const MemRegion *R = state->getSVal(E, C.getLocationContext()).getAsRegion();
+  const MemRegion *R = C.getSVal(E).getAsRegion();
   if (!R)
     return;
 
diff --git a/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
index 2818c9d9fd4a..bfd4019ed8d7 100644
--- a/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
+++ b/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
@@ -535,7 +535,7 @@ void ObjCDeallocChecker::diagnoseMissingReleases(CheckerContext &C) const {
       continue;
 
     // Prevents diagnosing multiple times for the same instance variable
-    // at, for example, both a return and at the end of of the function.
+    // at, for example, both a return and at the end of the function.
     NewUnreleased = F.remove(NewUnreleased, IvarSymbol);
 
     if (State->getStateManager()
@@ -645,7 +645,7 @@ ObjCDeallocChecker::findPropertyOnDeallocatingInstance(
 bool ObjCDeallocChecker::diagnoseExtraRelease(SymbolRef ReleasedValue,
                                               const ObjCMethodCall &M,
                                               CheckerContext &C) const {
-  // Try to get the region from which the the released value was loaded.
+  // Try to get the region from which the released value was loaded.
   // Note that, unlike diagnosing for missing releases, here we don't track
   // values that must not be released in the state. This is because even if
   // these values escape, it is still an error under the rules of MRR to
diff --git a/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp b/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
index 6dbacad7f2ea..62831be26eb2 100644
--- a/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
+++ b/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
@@ -510,6 +510,17 @@ void WalkAST::checkCall_strcpy(const CallExpr *CE, const FunctionDecl *FD) {
   if (!checkCall_strCommon(CE, FD))
     return;
 
+  const auto *Target = CE->getArg(0)->IgnoreImpCasts(),
+             *Source = CE->getArg(1)->IgnoreImpCasts();
+  if (const auto *DeclRef = dyn_cast<DeclRefExpr>(Target))
+    if (const auto *Array = dyn_cast<ConstantArrayType>(DeclRef->getType())) {
+      uint64_t ArraySize = BR.getContext().getTypeSize(Array) / 8;
+      if (const auto *String = dyn_cast<StringLiteral>(Source)) {
+        if (ArraySize >= String->getLength() + 1)
+          return;
+      }
+    }
+
   // Issue a warning.
   PathDiagnosticLocation CELoc =
     PathDiagnosticLocation::createBegin(CE, BR.getSourceManager(), AC);
diff --git a/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp b/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
index 95b6c4d3775d..ab1e2ec1f54e 100644
--- a/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
+++ b/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
@@ -42,6 +42,7 @@ class CheckerDocumentation : public Checker< check::PreStmt<ReturnStmt>,
                                        check::PreCall,
                                        check::PostCall,
                                        check::BranchCondition,
+                                       check::NewAllocator,
                                        check::Location,
                                        check::Bind,
                                        check::DeadSymbols,
@@ -126,6 +127,22 @@ class CheckerDocumentation : public Checker< check::PreStmt<ReturnStmt>,
   /// \brief Pre-visit of the condition statement of a branch (such as IfStmt).
   void checkBranchCondition(const Stmt *Condition, CheckerContext &Ctx) const {}
 
+  /// \brief Post-visit the C++ operator new's allocation call.
+  ///
+  /// Execution of C++ operator new consists of the following phases: (1) call
+  /// default or overridden operator new() to allocate memory (2) cast the
+  /// return value of operator new() from void pointer type to class pointer
+  /// type, (3) assuming that the value is non-null, call the object's
+  /// constructor over this pointer, (4) declare that the value of the
+  /// new-expression is this pointer. This callback is called between steps
+  /// (2) and (3). Post-call for the allocator is called after step (1).
+  /// Pre-statement for the new-expression is called on step (4) when the value
+  /// of the expression is evaluated.
+  /// \param NE     The C++ new-expression that triggered the allocation.
+  /// \param Target The allocated region, casted to the class type.
+  void checkNewAllocator(const CXXNewExpr *NE, SVal Target,
+                         CheckerContext &) const {}
+
   /// \brief Called on a load from and a store to a location.
   ///
   /// The method will be called each time a location (pointer) value is
diff --git a/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp b/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
index 9e9939ae25c0..d9739373f18b 100644
--- a/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
@@ -105,7 +105,7 @@ void ChrootChecker::Chdir(CheckerContext &C, const CallExpr *CE) const {
 
   // After chdir("/"), enter the jail, set the enum value JAIL_ENTERED.
   const Expr *ArgExpr = CE->getArg(0);
-  SVal ArgVal = state->getSVal(ArgExpr, C.getLocationContext());
+  SVal ArgVal = C.getSVal(ArgExpr);
 
   if (const MemRegion *R = ArgVal.getAsRegion()) {
     R = R->StripCasts();
diff --git a/lib/StaticAnalyzer/Checkers/DeleteWithNonVirtualDtorChecker.cpp b/lib/StaticAnalyzer/Checkers/DeleteWithNonVirtualDtorChecker.cpp
index e04e2ab2c320..53632b41df4d 100644
--- a/lib/StaticAnalyzer/Checkers/DeleteWithNonVirtualDtorChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/DeleteWithNonVirtualDtorChecker.cpp
@@ -110,8 +110,6 @@ DeleteWithNonVirtualDtorChecker::DeleteBugVisitor::VisitNode(
   if (Satisfied)
     return nullptr;
 
-  ProgramStateRef State = N->getState();
-  const LocationContext *LC = N->getLocationContext();
   const Stmt *S = PathDiagnosticLocation::getStmt(N);
   if (!S)
     return nullptr;
@@ -128,7 +126,7 @@ DeleteWithNonVirtualDtorChecker::DeleteBugVisitor::VisitNode(
   }
 
   // Region associated with the current cast expression.
-  const MemRegion *M = State->getSVal(CastE, LC).getAsRegion();
+  const MemRegion *M = N->getSVal(CastE).getAsRegion();
   if (!M)
     return nullptr;
 
diff --git a/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
index 598502305633..68e5c0297436 100644
--- a/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
@@ -57,7 +57,7 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B,
   if (!B->getRHS()->getType()->isScalarType())
     return;
 
-  SVal Denom = C.getState()->getSVal(B->getRHS(), C.getLocationContext());
+  SVal Denom = C.getSVal(B->getRHS());
   Optional<DefinedSVal> DV = Denom.getAs<DefinedSVal>();
 
   // Divide-by-undefined handled in the generic checking for uses of
diff --git a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
index db9179e018a1..431c77c9c30a 100644
--- a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
+++ b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
@@ -562,7 +562,7 @@ void DynamicTypePropagation::checkPostStmt(const CastExpr *CE,
       DestObjectPtrType->isUnspecialized())
     return;
 
-  SymbolRef Sym = State->getSVal(CE, C.getLocationContext()).getAsSymbol();
+  SymbolRef Sym = C.getSVal(CE).getAsSymbol();
   if (!Sym)
     return;
 
diff --git a/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp b/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
index 3fe89f96a43b..059203fca730 100644
--- a/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
@@ -44,8 +44,7 @@ void FixedAddressChecker::checkPreStmt(const BinaryOperator *B,
   if (!T->isPointerType())
     return;
 
-  ProgramStateRef state = C.getState();
-  SVal RV = state->getSVal(B->getRHS(), C.getLocationContext());
+  SVal RV = C.getSVal(B->getRHS());
 
   if (!RV.isConstant() || RV.isZeroConstant())
     return;
diff --git a/lib/StaticAnalyzer/Checkers/GTestChecker.cpp b/lib/StaticAnalyzer/Checkers/GTestChecker.cpp
index f0be41b293e4..3ef95e673b87 100644
--- a/lib/StaticAnalyzer/Checkers/GTestChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/GTestChecker.cpp
@@ -161,7 +161,7 @@ void GTestChecker::modelAssertionResultCopyConstructor(
     const CXXConstructorCall *Call, CheckerContext &C) const {
   assert(Call->getNumArgs() == 1);
 
-  // The first parameter of the the copy constructor must be the other
+  // The first parameter of the copy constructor must be the other
   // instance to initialize this instances fields from.
   SVal OtherVal = Call->getArgSVal(0);
   SVal ThisVal = Call->getCXXThisVal();
diff --git a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index 43966656cd8d..800bbf991d03 100644
--- a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -468,7 +468,7 @@ bool GenericTaintChecker::checkPre(const CallExpr *CE, CheckerContext &C) const{
 Optional<SVal> GenericTaintChecker::getPointedToSVal(CheckerContext &C,
                                                      const Expr *Arg) {
   ProgramStateRef State = C.getState();
-  SVal AddrVal = State->getSVal(Arg->IgnoreParens(), C.getLocationContext());
+  SVal AddrVal = C.getSVal(Arg->IgnoreParens());
   if (AddrVal.isUnknownOrUndef())
     return None;
 
@@ -621,7 +621,7 @@ ProgramStateRef GenericTaintChecker::postRetTaint(const CallExpr *CE,
 
 bool GenericTaintChecker::isStdin(const Expr *E, CheckerContext &C) {
   ProgramStateRef State = C.getState();
-  SVal Val = State->getSVal(E, C.getLocationContext());
+  SVal Val = C.getSVal(E);
 
   // stdin is a pointer, so it would be a region.
   const MemRegion *MemReg = Val.getAsRegion();
diff --git a/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
index 0f9b749506fa..11f42229f53d 100644
--- a/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
@@ -355,12 +355,11 @@ void IteratorChecker::checkPostStmt(const MaterializeTemporaryExpr *MTE,
                                     CheckerContext &C) const {
   /* Transfer iterator state to temporary objects */
   auto State = C.getState();
-  const auto *LCtx = C.getLocationContext();
   const auto *Pos =
-      getIteratorPosition(State, State->getSVal(MTE->GetTemporaryExpr(), LCtx));
+      getIteratorPosition(State, C.getSVal(MTE->GetTemporaryExpr()));
   if (!Pos)
     return;
-  State = setIteratorPosition(State, State->getSVal(MTE, LCtx), *Pos);
+  State = setIteratorPosition(State, C.getSVal(MTE), *Pos);
   C.addTransition(State);
 }
 
diff --git a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
index 655ce33390c9..ed0ee9f3a15a 100644
--- a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
@@ -1017,8 +1017,7 @@ NonLocalizedStringBRVisitor::VisitNode(const ExplodedNode *Succ,
   if (!LiteralExpr)
     return nullptr;
 
-  ProgramStateRef State = Succ->getState();
-  SVal LiteralSVal = State->getSVal(LiteralExpr, Succ->getLocationContext());
+  SVal LiteralSVal = Succ->getSVal(LiteralExpr);
   if (LiteralSVal.getAsRegion() != NonLocalizedString)
     return nullptr;
 
diff --git a/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp b/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
index f8473dbd7647..8ba0bcbceeca 100644
--- a/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
@@ -202,7 +202,7 @@ static bool isBadDeallocationArgument(const MemRegion *Arg) {
 static SymbolRef getAsPointeeSymbol(const Expr *Expr,
                                     CheckerContext &C) {
   ProgramStateRef State = C.getState();
-  SVal ArgV = State->getSVal(Expr, C.getLocationContext());
+  SVal ArgV = C.getSVal(Expr);
 
   if (Optional<loc::MemRegionVal> X = ArgV.getAs<loc::MemRegionVal>()) {
     StoreManager& SM = C.getStoreManager();
@@ -297,7 +297,7 @@ void MacOSKeychainAPIChecker::checkPreStmt(const CallExpr *CE,
 
   // Check the argument to the deallocator.
   const Expr *ArgExpr = CE->getArg(paramIdx);
-  SVal ArgSVal = State->getSVal(ArgExpr, C.getLocationContext());
+  SVal ArgSVal = C.getSVal(ArgExpr);
 
   // Undef is reported by another checker.
   if (ArgSVal.isUndef())
@@ -426,8 +426,7 @@ void MacOSKeychainAPIChecker::checkPostStmt(const CallExpr *CE,
     // allocated value symbol, since our diagnostics depend on the value
     // returned by the call. Ex: Data should only be freed if noErr was
     // returned during allocation.)
-    SymbolRef RetStatusSymbol =
-      State->getSVal(CE, C.getLocationContext()).getAsSymbol();
+    SymbolRef RetStatusSymbol = C.getSVal(CE).getAsSymbol();
     C.getSymbolManager().addSymbolDependency(V, RetStatusSymbol);
 
     // Track the allocated value in the checker state.
diff --git a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index 851114004b96..a1a540c4b96e 100644
--- a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -162,6 +162,7 @@ class MallocChecker : public Checker<check::DeadSymbols,
                                      check::PreCall,
                                      check::PostStmt<CallExpr>,
                                      check::PostStmt<CXXNewExpr>,
+                                     check::NewAllocator,
                                      check::PreStmt<CXXDeleteExpr>,
                                      check::PostStmt<BlockExpr>,
                                      check::PostObjCMessage,
@@ -207,6 +208,8 @@ class MallocChecker : public Checker<check::DeadSymbols,
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
   void checkPostStmt(const CallExpr *CE, CheckerContext &C) const;
   void checkPostStmt(const CXXNewExpr *NE, CheckerContext &C) const;
+  void checkNewAllocator(const CXXNewExpr *NE, SVal Target,
+                         CheckerContext &C) const;
   void checkPreStmt(const CXXDeleteExpr *DE, CheckerContext &C) const;
   void checkPostObjCMessage(const ObjCMethodCall &Call, CheckerContext &C) const;
   void checkPostStmt(const BlockExpr *BE, CheckerContext &C) const;
@@ -281,10 +284,18 @@ class MallocChecker : public Checker<check::DeadSymbols,
   bool isStandardNewDelete(const FunctionDecl *FD, ASTContext &C) const;
   ///@}
 
+  /// \brief Process C++ operator new()'s allocation, which is the part of C++
+  /// new-expression that goes before the constructor.
+  void processNewAllocation(const CXXNewExpr *NE, CheckerContext &C,
+                            SVal Target) const;
+
   /// \brief Perform a zero-allocation check.
+  /// The optional \p RetVal parameter specifies the newly allocated pointer
+  /// value; if unspecified, the value of expression \p E is used.
   ProgramStateRef ProcessZeroAllocation(CheckerContext &C, const Expr *E,
                                         const unsigned AllocationSizeArg,
-                                        ProgramStateRef State) const;
+                                        ProgramStateRef State,
+                                        Optional<SVal> RetVal = None) const;
 
   ProgramStateRef MallocMemReturnsAttr(CheckerContext &C,
                                        const CallExpr *CE,
@@ -300,7 +311,7 @@ class MallocChecker : public Checker<check::DeadSymbols,
                                       AllocationFamily Family = AF_Malloc);
 
   static ProgramStateRef addExtentSize(CheckerContext &C, const CXXNewExpr *NE,
-                                       ProgramStateRef State);
+                                       ProgramStateRef State, SVal Target);
 
   // Check if this malloc() for special flags. At present that means M_ZERO or
   // __GFP_ZERO (in which case, treat it like calloc).
@@ -309,9 +320,12 @@ class MallocChecker : public Checker<check::DeadSymbols,
                       const ProgramStateRef &State) const;
 
   /// Update the RefState to reflect the new memory allocation.
+  /// The optional \p RetVal parameter specifies the newly allocated pointer
+  /// value; if unspecified, the value of expression \p E is used.
   static ProgramStateRef
   MallocUpdateRefState(CheckerContext &C, const Expr *E, ProgramStateRef State,
-                       AllocationFamily Family = AF_Malloc);
+                       AllocationFamily Family = AF_Malloc,
+                       Optional<SVal> RetVal = None);
 
   ProgramStateRef FreeMemAttr(CheckerContext &C, const CallExpr *CE,
                               const OwnershipAttr* Att,
@@ -758,7 +772,7 @@ llvm::Optional<ProgramStateRef> MallocChecker::performKernelMalloc(
     return None;
 
   const Expr *FlagsEx = CE->getArg(CE->getNumArgs() - 1);
-  const SVal V = State->getSVal(FlagsEx, C.getLocationContext());
+  const SVal V = C.getSVal(FlagsEx);
   if (!V.getAs<NonLoc>()) {
     // The case where 'V' can be a location can only be due to a bad header,
     // so in this case bail out.
@@ -949,13 +963,15 @@ void MallocChecker::checkPostStmt(const CallExpr *CE, CheckerContext &C) const {
 }
 
 // Performs a 0-sized allocations check.
-ProgramStateRef MallocChecker::ProcessZeroAllocation(CheckerContext &C,
-                                               const Expr *E,
-                                               const unsigned AllocationSizeArg,
-                                               ProgramStateRef State) const {
+ProgramStateRef MallocChecker::ProcessZeroAllocation(
+    CheckerContext &C, const Expr *E, const unsigned AllocationSizeArg,
+    ProgramStateRef State, Optional<SVal> RetVal) const {
   if (!State)
     return nullptr;
 
+  if (!RetVal)
+    RetVal = C.getSVal(E);
+
   const Expr *Arg = nullptr;
 
   if (const CallExpr *CE = dyn_cast<CallExpr>(E)) {
@@ -972,8 +988,7 @@ ProgramStateRef MallocChecker::ProcessZeroAllocation(CheckerContext &C,
 
   assert(Arg);
 
-  Optional<DefinedSVal> DefArgVal =
-      State->getSVal(Arg, C.getLocationContext()).getAs<DefinedSVal>();
+  Optional<DefinedSVal> DefArgVal = C.getSVal(Arg).getAs<DefinedSVal>();
 
   if (!DefArgVal)
     return State;
@@ -988,8 +1003,7 @@ ProgramStateRef MallocChecker::ProcessZeroAllocation(CheckerContext &C,
       State->assume(SvalBuilder.evalEQ(State, *DefArgVal, Zero));
 
   if (TrueState && !FalseState) {
-    SVal retVal = State->getSVal(E, C.getLocationContext());
-    SymbolRef Sym = retVal.getAsLocSymbol();
+    SymbolRef Sym = RetVal->getAsLocSymbol();
     if (!Sym)
       return State;
 
@@ -1050,9 +1064,9 @@ static bool treatUnusedNewEscaped(const CXXNewExpr *NE) {
   return false;
 }
 
-void MallocChecker::checkPostStmt(const CXXNewExpr *NE,
-                                  CheckerContext &C) const {
-
+void MallocChecker::processNewAllocation(const CXXNewExpr *NE,
+                                         CheckerContext &C,
+                                         SVal Target) const {
   if (NE->getNumPlacementArgs())
     for (CXXNewExpr::const_arg_iterator I = NE->placement_arg_begin(),
          E = NE->placement_arg_end(); I != E; ++I)
@@ -1072,37 +1086,48 @@ void MallocChecker::checkPostStmt(const CXXNewExpr *NE,
   // MallocUpdateRefState() instead of MallocMemAux() which breakes the
   // existing binding.
   State = MallocUpdateRefState(C, NE, State, NE->isArray() ? AF_CXXNewArray
-                                                           : AF_CXXNew);
-  State = addExtentSize(C, NE, State);
-  State = ProcessZeroAllocation(C, NE, 0, State);
+                                                           : AF_CXXNew, Target);
+  State = addExtentSize(C, NE, State, Target);
+  State = ProcessZeroAllocation(C, NE, 0, State, Target);
   C.addTransition(State);
 }
 
+void MallocChecker::checkPostStmt(const CXXNewExpr *NE,
+                                  CheckerContext &C) const {
+  if (!C.getAnalysisManager().getAnalyzerOptions().mayInlineCXXAllocator())
+    processNewAllocation(NE, C, C.getSVal(NE));
+}
+
+void MallocChecker::checkNewAllocator(const CXXNewExpr *NE, SVal Target,
+                                      CheckerContext &C) const {
+  if (!C.wasInlined)
+    processNewAllocation(NE, C, Target);
+}
+
 // Sets the extent value of the MemRegion allocated by
 // new expression NE to its size in Bytes.
 //
 ProgramStateRef MallocChecker::addExtentSize(CheckerContext &C,
                                              const CXXNewExpr *NE,
-                                             ProgramStateRef State) {
+                                             ProgramStateRef State,
+                                             SVal Target) {
   if (!State)
     return nullptr;
   SValBuilder &svalBuilder = C.getSValBuilder();
   SVal ElementCount;
-  const LocationContext *LCtx = C.getLocationContext();
   const SubRegion *Region;
   if (NE->isArray()) {
     const Expr *SizeExpr = NE->getArraySize();
-    ElementCount = State->getSVal(SizeExpr, C.getLocationContext());
+    ElementCount = C.getSVal(SizeExpr);
     // Store the extent size for the (symbolic)region
     // containing the elements.
-    Region = (State->getSVal(NE, LCtx))
-                 .getAsRegion()
+    Region = Target.getAsRegion()
                  ->getAs<SubRegion>()
-                 ->getSuperRegion()
+                 ->StripCasts()
                  ->getAs<SubRegion>();
   } else {
     ElementCount = svalBuilder.makeIntVal(1, true);
-    Region = (State->getSVal(NE, LCtx)).getAsRegion()->getAs<SubRegion>();
+    Region = Target.getAsRegion()->getAs<SubRegion>();
   }
   assert(Region);
 
@@ -1212,8 +1237,7 @@ ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
   if (!State)
     return nullptr;
 
-  return MallocMemAux(C, CE, State->getSVal(SizeEx, C.getLocationContext()),
-                      Init, State, Family);
+  return MallocMemAux(C, CE, C.getSVal(SizeEx), Init, State, Family);
 }
 
 ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
@@ -1263,18 +1287,22 @@ ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
 ProgramStateRef MallocChecker::MallocUpdateRefState(CheckerContext &C,
                                                     const Expr *E,
                                                     ProgramStateRef State,
-                                                    AllocationFamily Family) {
+                                                    AllocationFamily Family,
+                                                    Optional<SVal> RetVal) {
   if (!State)
     return nullptr;
 
   // Get the return value.
-  SVal retVal = State->getSVal(E, C.getLocationContext());
+  if (!RetVal)
+    RetVal = C.getSVal(E);
 
   // We expect the malloc functions to return a pointer.
-  if (!retVal.getAs<Loc>())
+  if (!RetVal->getAs<Loc>())
     return nullptr;
 
-  SymbolRef Sym = retVal.getAsLocSymbol();
+  SymbolRef Sym = RetVal->getAsLocSymbol();
+  // This is a return value of a function that was not inlined, such as malloc()
+  // or new(). We've checked that in the caller. Therefore, it must be a symbol.
   assert(Sym);
 
   // Set the symbol's state to Allocated.
@@ -1457,7 +1485,7 @@ ProgramStateRef MallocChecker::FreeMemAux(CheckerContext &C,
   if (!State)
     return nullptr;
 
-  SVal ArgVal = State->getSVal(ArgExpr, C.getLocationContext());
+  SVal ArgVal = C.getSVal(ArgExpr);
   if (!ArgVal.getAs<DefinedOrUnknownSVal>())
     return nullptr;
   DefinedOrUnknownSVal location = ArgVal.castAs<DefinedOrUnknownSVal>();
@@ -2084,8 +2112,7 @@ ProgramStateRef MallocChecker::ReallocMemAux(CheckerContext &C,
     return nullptr;
 
   const Expr *arg0Expr = CE->getArg(0);
-  const LocationContext *LCtx = C.getLocationContext();
-  SVal Arg0Val = State->getSVal(arg0Expr, LCtx);
+  SVal Arg0Val = C.getSVal(arg0Expr);
   if (!Arg0Val.getAs<DefinedOrUnknownSVal>())
     return nullptr;
   DefinedOrUnknownSVal arg0Val = Arg0Val.castAs<DefinedOrUnknownSVal>();
@@ -2099,7 +2126,7 @@ ProgramStateRef MallocChecker::ReallocMemAux(CheckerContext &C,
   const Expr *Arg1 = CE->getArg(1);
 
   // Get the value of the size argument.
-  SVal TotalSize = State->getSVal(Arg1, LCtx);
+  SVal TotalSize = C.getSVal(Arg1);
   if (SuffixWithN)
     TotalSize = evalMulForBufferSize(C, Arg1, CE->getArg(2));
   if (!TotalSize.getAs<DefinedOrUnknownSVal>())
@@ -2133,7 +2160,7 @@ ProgramStateRef MallocChecker::ReallocMemAux(CheckerContext &C,
   // Get the from and to pointer symbols as in toPtr = realloc(fromPtr, size).
   assert(!PrtIsNull);
   SymbolRef FromPtr = arg0Val.getAsSymbol();
-  SVal RetVal = State->getSVal(CE, LCtx);
+  SVal RetVal = C.getSVal(CE);
   SymbolRef ToPtr = RetVal.getAsSymbol();
   if (!FromPtr || !ToPtr)
     return nullptr;
@@ -2406,7 +2433,7 @@ void MallocChecker::checkPreStmt(const ReturnStmt *S, CheckerContext &C) const {
 
   // Check if we are returning a symbol.
   ProgramStateRef State = C.getState();
-  SVal RetVal = State->getSVal(E, C.getLocationContext());
+  SVal RetVal = C.getSVal(E);
   SymbolRef Sym = RetVal.getAsSymbol();
   if (!Sym)
     // If we are returning a field of the allocated struct or an array element,
@@ -2436,8 +2463,7 @@ void MallocChecker::checkPostStmt(const BlockExpr *BE,
 
   ProgramStateRef state = C.getState();
   const BlockDataRegion *R =
-    cast<BlockDataRegion>(state->getSVal(BE,
-                                         C.getLocationContext()).getAsRegion());
+    cast<BlockDataRegion>(C.getSVal(BE).getAsRegion());
 
   BlockDataRegion::referenced_vars_iterator I = R->referenced_vars_begin(),
                                             E = R->referenced_vars_end();
@@ -2900,8 +2926,13 @@ void ento::registerNewDeleteLeaksChecker(CheckerManager &mgr) {
       mgr.getCurrentCheckName();
   // We currently treat NewDeleteLeaks checker as a subchecker of NewDelete
   // checker.
-  if (!checker->ChecksEnabled[MallocChecker::CK_NewDeleteChecker])
+  if (!checker->ChecksEnabled[MallocChecker::CK_NewDeleteChecker]) {
     checker->ChecksEnabled[MallocChecker::CK_NewDeleteChecker] = true;
+    // FIXME: This does not set the correct name, but without this workaround
+    //        no name will be set at all.
+    checker->CheckNames[MallocChecker::CK_NewDeleteChecker] =
+        mgr.getCurrentCheckName();
+  }
 }
 
 #define REGISTER_CHECKER(name)                                                 \
diff --git a/lib/StaticAnalyzer/Checkers/MisusedMovedObjectChecker.cpp b/lib/StaticAnalyzer/Checkers/MisusedMovedObjectChecker.cpp
index 497978f07815..c6493fed8ac1 100644
--- a/lib/StaticAnalyzer/Checkers/MisusedMovedObjectChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/MisusedMovedObjectChecker.cpp
@@ -101,8 +101,6 @@ static ProgramStateRef removeFromState(ProgramStateRef State,
                                        const MemRegion *Region) {
   if (!Region)
     return State;
-  // Note: The isSubRegionOf function is not reflexive.
-  State = State->remove<TrackedRegionMap>(Region);
   for (auto &E : State->get<TrackedRegionMap>()) {
     if (E.first->isSubRegionOf(Region))
       State = State->remove<TrackedRegionMap>(E.first);
diff --git a/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
index fa9a317683ba..b348453d5e2b 100644
--- a/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
@@ -560,8 +560,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
   if (State->get<InvariantViolated>())
     return;
 
-  auto RetSVal =
-      State->getSVal(S, C.getLocationContext()).getAs<DefinedOrUnknownSVal>();
+  auto RetSVal = C.getSVal(S).getAs<DefinedOrUnknownSVal>();
   if (!RetSVal)
     return;
 
@@ -977,8 +976,7 @@ void NullabilityChecker::checkPostStmt(const ExplicitCastExpr *CE,
   if (DestNullability == Nullability::Unspecified)
     return;
 
-  auto RegionSVal =
-      State->getSVal(CE, C.getLocationContext()).getAs<DefinedOrUnknownSVal>();
+  auto RegionSVal = C.getSVal(CE).getAs<DefinedOrUnknownSVal>();
   const MemRegion *Region = getTrackRegion(*RegionSVal);
   if (!Region)
     return;
diff --git a/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp b/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
index cbaa5c23592d..b7339fe79f69 100644
--- a/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
@@ -39,7 +39,7 @@ void ObjCAtSyncChecker::checkPreStmt(const ObjCAtSynchronizedStmt *S,
 
   const Expr *Ex = S->getSynchExpr();
   ProgramStateRef state = C.getState();
-  SVal V = state->getSVal(Ex, C.getLocationContext());
+  SVal V = C.getSVal(Ex);
 
   // Uninitialized value used for the mutex?
   if (V.getAs<UndefinedVal>()) {
diff --git a/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp b/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
index 58ebf72660b6..fb05ca630b45 100644
--- a/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
@@ -39,7 +39,7 @@ class ObjCContainersChecker : public Checker< check::PreStmt<CallExpr>,
   }
 
   inline SymbolRef getArraySym(const Expr *E, CheckerContext &C) const {
-    SVal ArrayRef = C.getState()->getSVal(E, C.getLocationContext());
+    SVal ArrayRef = C.getSVal(E);
     SymbolRef ArraySym = ArrayRef.getAsSymbol();
     return ArraySym;
   }
@@ -66,13 +66,13 @@ REGISTER_MAP_WITH_PROGRAMSTATE(ArraySizeMap, SymbolRef, DefinedSVal)
 void ObjCContainersChecker::addSizeInfo(const Expr *Array, const Expr *Size,
                                         CheckerContext &C) const {
   ProgramStateRef State = C.getState();
-  SVal SizeV = State->getSVal(Size, C.getLocationContext());
+  SVal SizeV = C.getSVal(Size);
   // Undefined is reported by another checker.
   if (SizeV.isUnknownOrUndef())
     return;
 
   // Get the ArrayRef symbol.
-  SVal ArrayRef = State->getSVal(Array, C.getLocationContext());
+  SVal ArrayRef = C.getSVal(Array);
   SymbolRef ArraySym = ArrayRef.getAsSymbol();
   if (!ArraySym)
     return;
@@ -128,7 +128,7 @@ void ObjCContainersChecker::checkPreStmt(const CallExpr *CE,
 
     // Get the index.
     const Expr *IdxExpr = CE->getArg(1);
-    SVal IdxVal = State->getSVal(IdxExpr, C.getLocationContext());
+    SVal IdxVal = C.getSVal(IdxExpr);
     if (IdxVal.isUnknownOrUndef())
       return;
     DefinedSVal Idx = IdxVal.castAs<DefinedSVal>();
diff --git a/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp b/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
index ffa3a2700616..6184de9600c2 100644
--- a/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
@@ -132,7 +132,7 @@ static bool hasSelfFlag(SVal val, SelfFlagEnum flag, CheckerContext &C) {
 /// points to and is an object that did not come from the result of calling
 /// an initializer.
 static bool isInvalidSelf(const Expr *E, CheckerContext &C) {
-  SVal exprVal = C.getState()->getSVal(E, C.getLocationContext());
+  SVal exprVal = C.getSVal(E);
   if (!hasSelfFlag(exprVal, SelfFlag_Self, C))
     return false; // value did not come from 'self'.
   if (hasSelfFlag(exprVal, SelfFlag_InitRes, C))
@@ -183,7 +183,7 @@ void ObjCSelfInitChecker::checkPostObjCMessage(const ObjCMethodCall &Msg,
     // value out when we return from this method.
     state = state->set<CalledInit>(true);
 
-    SVal V = state->getSVal(Msg.getOriginExpr(), C.getLocationContext());
+    SVal V = C.getSVal(Msg.getOriginExpr());
     addSelfFlag(state, V, SelfFlag_InitRes, C);
     return;
   }
diff --git a/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp b/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
index 8caf6df4d970..4c4a8e3f0d0d 100644
--- a/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
@@ -154,8 +154,7 @@ void PointerArithChecker::reportPointerArithMisuse(const Expr *E,
     return;
 
   ProgramStateRef State = C.getState();
-  const MemRegion *Region =
-      State->getSVal(E, C.getLocationContext()).getAsRegion();
+  const MemRegion *Region = C.getSVal(E).getAsRegion();
   if (!Region)
     return;
   if (PointedNeeded)
@@ -227,7 +226,7 @@ void PointerArithChecker::checkPostStmt(const CallExpr *CE,
   if (AllocFunctions.count(FunI) == 0)
     return;
 
-  SVal SV = State->getSVal(CE, C.getLocationContext());
+  SVal SV = C.getSVal(CE);
   const MemRegion *Region = SV.getAsRegion();
   if (!Region)
     return;
@@ -248,7 +247,7 @@ void PointerArithChecker::checkPostStmt(const CXXNewExpr *NE,
   AllocKind Kind = getKindOfNewOp(NE, FD);
 
   ProgramStateRef State = C.getState();
-  SVal AllocedVal = State->getSVal(NE, C.getLocationContext());
+  SVal AllocedVal = C.getSVal(NE);
   const MemRegion *Region = AllocedVal.getAsRegion();
   if (!Region)
     return;
@@ -263,7 +262,7 @@ void PointerArithChecker::checkPostStmt(const CastExpr *CE,
 
   const Expr *CastedExpr = CE->getSubExpr();
   ProgramStateRef State = C.getState();
-  SVal CastedVal = State->getSVal(CastedExpr, C.getLocationContext());
+  SVal CastedVal = C.getSVal(CastedExpr);
 
   const MemRegion *Region = CastedVal.getAsRegion();
   if (!Region)
@@ -281,7 +280,7 @@ void PointerArithChecker::checkPreStmt(const CastExpr *CE,
 
   const Expr *CastedExpr = CE->getSubExpr();
   ProgramStateRef State = C.getState();
-  SVal CastedVal = State->getSVal(CastedExpr, C.getLocationContext());
+  SVal CastedVal = C.getSVal(CastedExpr);
 
   const MemRegion *Region = CastedVal.getAsRegion();
   if (!Region)
@@ -304,8 +303,7 @@ void PointerArithChecker::checkPreStmt(const UnaryOperator *UOp,
 
 void PointerArithChecker::checkPreStmt(const ArraySubscriptExpr *SubsExpr,
                                        CheckerContext &C) const {
-  ProgramStateRef State = C.getState();
-  SVal Idx = State->getSVal(SubsExpr->getIdx(), C.getLocationContext());
+  SVal Idx = C.getSVal(SubsExpr->getIdx());
 
   // Indexing with 0 is OK.
   if (Idx.isZeroConstant())
@@ -324,14 +322,14 @@ void PointerArithChecker::checkPreStmt(const BinaryOperator *BOp,
   ProgramStateRef State = C.getState();
 
   if (Rhs->getType()->isIntegerType() && Lhs->getType()->isPointerType()) {
-    SVal RHSVal = State->getSVal(Rhs, C.getLocationContext());
+    SVal RHSVal = C.getSVal(Rhs);
     if (State->isNull(RHSVal).isConstrainedTrue())
       return;
     reportPointerArithMisuse(Lhs, C, !BOp->isAdditiveOp());
   }
   // The int += ptr; case is not valid C++.
   if (Lhs->getType()->isIntegerType() && Rhs->getType()->isPointerType()) {
-    SVal LHSVal = State->getSVal(Lhs, C.getLocationContext());
+    SVal LHSVal = C.getSVal(Lhs);
     if (State->isNull(LHSVal).isConstrainedTrue())
       return;
     reportPointerArithMisuse(Rhs, C);
diff --git a/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
index 2d33ebc2610d..9aa5348e4c34 100644
--- a/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
@@ -39,10 +39,8 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
   if (B->getOpcode() != BO_Sub)
     return;
 
-  ProgramStateRef state = C.getState();
-  const LocationContext *LCtx = C.getLocationContext();
-  SVal LV = state->getSVal(B->getLHS(), LCtx);
-  SVal RV = state->getSVal(B->getRHS(), LCtx);
+  SVal LV = C.getSVal(B->getLHS());
+  SVal RV = C.getSVal(B->getRHS());
 
   const MemRegion *LR = LV.getAsRegion();
   const MemRegion *RR = RV.getAsRegion();
diff --git a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
index dab29be1c8fb..10ab952e069b 100644
--- a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
@@ -109,8 +109,6 @@ REGISTER_MAP_WITH_PROGRAMSTATE(DestroyRetVal, const MemRegion *, SymbolRef)
 
 void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
                                        CheckerContext &C) const {
-  ProgramStateRef state = C.getState();
-  const LocationContext *LCtx = C.getLocationContext();
   StringRef FName = C.getCalleeName(CE);
   if (FName.empty())
     return;
@@ -121,34 +119,31 @@ void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
   if (FName == "pthread_mutex_lock" ||
       FName == "pthread_rwlock_rdlock" ||
       FName == "pthread_rwlock_wrlock")
-    AcquireLock(C, CE, state->getSVal(CE->getArg(0), LCtx),
-                false, PthreadSemantics);
+    AcquireLock(C, CE, C.getSVal(CE->getArg(0)), false, PthreadSemantics);
   else if (FName == "lck_mtx_lock" ||
            FName == "lck_rw_lock_exclusive" ||
            FName == "lck_rw_lock_shared")
-    AcquireLock(C, CE, state->getSVal(CE->getArg(0), LCtx),
-                false, XNUSemantics);
+    AcquireLock(C, CE, C.getSVal(CE->getArg(0)), false, XNUSemantics);
   else if (FName == "pthread_mutex_trylock" ||
            FName == "pthread_rwlock_tryrdlock" ||
            FName == "pthread_rwlock_trywrlock")
-    AcquireLock(C, CE, state->getSVal(CE->getArg(0), LCtx),
+    AcquireLock(C, CE, C.getSVal(CE->getArg(0)),
                 true, PthreadSemantics);
   else if (FName == "lck_mtx_try_lock" ||
            FName == "lck_rw_try_lock_exclusive" ||
            FName == "lck_rw_try_lock_shared")
-    AcquireLock(C, CE, state->getSVal(CE->getArg(0), LCtx),
-                true, XNUSemantics);
+    AcquireLock(C, CE, C.getSVal(CE->getArg(0)), true, XNUSemantics);
   else if (FName == "pthread_mutex_unlock" ||
            FName == "pthread_rwlock_unlock" ||
            FName == "lck_mtx_unlock" ||
            FName == "lck_rw_done")
-    ReleaseLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
+    ReleaseLock(C, CE, C.getSVal(CE->getArg(0)));
   else if (FName == "pthread_mutex_destroy")
-    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), PthreadSemantics);
+    DestroyLock(C, CE, C.getSVal(CE->getArg(0)), PthreadSemantics);
   else if (FName == "lck_mtx_destroy")
-    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), XNUSemantics);
+    DestroyLock(C, CE, C.getSVal(CE->getArg(0)), XNUSemantics);
   else if (FName == "pthread_mutex_init")
-    InitLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
+    InitLock(C, CE, C.getSVal(CE->getArg(0)));
 }
 
 // When a lock is destroyed, in some semantics(like PthreadSemantics) we are not
@@ -232,7 +227,7 @@ void PthreadLockChecker::AcquireLock(CheckerContext &C, const CallExpr *CE,
   if (sym)
     state = resolvePossiblyDestroyedMutex(state, lockR, sym);
 
-  SVal X = state->getSVal(CE, C.getLocationContext());
+  SVal X = C.getSVal(CE);
   if (X.isUnknownOrUndef())
     return;
 
diff --git a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
index e47494a3e90b..bf5945c58cff 100644
--- a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp
@@ -1929,6 +1929,12 @@ static bool isNumericLiteralExpression(const Expr *E) {
          isa<CXXBoolLiteralExpr>(E);
 }
 
+static std::string describeRegion(const MemRegion *MR) {
+  // Once we support more storage locations for bindings,
+  // this would need to be improved.
+  return cast<VarRegion>(MR)->getDecl()->getName();
+}
+
 /// Returns true if this stack frame is for an Objective-C method that is a
 /// property getter or setter whose body has been synthesized by the analyzer.
 static bool isSynthesizedAccessor(const StackFrameContext *SFC) {
@@ -2395,7 +2401,7 @@ CFRefLeakReportVisitor::getEndPath(BugReporterContext &BRC,
 
   if (FirstBinding) {
     os << "object allocated and stored into '"
-       << FirstBinding->getString() << '\'';
+       << describeRegion(FirstBinding) << '\'';
   }
   else
     os << "allocated object";
@@ -2523,7 +2529,7 @@ void CFRefLeakReport::createDescription(CheckerContext &Ctx, bool GCEnabled, boo
   os << "of an object";
 
   if (AllocBinding) {
-    os << " stored into '" << AllocBinding->getString() << '\'';
+    os << " stored into '" << describeRegion(AllocBinding) << '\'';
     if (IncludeAllocationLine) {
       FullSourceLoc SL(AllocStmt->getLocStart(), Ctx.getSourceManager());
       os << " (allocated on line " << SL.getSpellingLineNumber() << ")";
@@ -2799,9 +2805,7 @@ void RetainCountChecker::checkPostStmt(const BlockExpr *BE,
     return;
 
   ProgramStateRef state = C.getState();
-  const BlockDataRegion *R =
-    cast<BlockDataRegion>(state->getSVal(BE,
-                                         C.getLocationContext()).getAsRegion());
+  auto *R = cast<BlockDataRegion>(C.getSVal(BE).getAsRegion());
 
   BlockDataRegion::referenced_vars_iterator I = R->referenced_vars_begin(),
                                             E = R->referenced_vars_end();
@@ -2851,7 +2855,7 @@ void RetainCountChecker::checkPostStmt(const CastExpr *CE,
   }
 
   ProgramStateRef state = C.getState();
-  SymbolRef Sym = state->getSVal(CE, C.getLocationContext()).getAsLocSymbol();
+  SymbolRef Sym = C.getSVal(CE).getAsLocSymbol();
   if (!Sym)
     return;
   const RefVal* T = getRefBinding(state, Sym);
@@ -2874,7 +2878,7 @@ void RetainCountChecker::processObjCLiterals(CheckerContext &C,
   ProgramStateRef state = C.getState();
   const ExplodedNode *pred = C.getPredecessor();
   for (const Stmt *Child : Ex->children()) {
-    SVal V = state->getSVal(Child, pred->getLocationContext());
+    SVal V = pred->getSVal(Child);
     if (SymbolRef sym = V.getAsSymbol())
       if (const RefVal* T = getRefBinding(state, sym)) {
         RefVal::Kind hasErr = (RefVal::Kind) 0;
@@ -2913,10 +2917,9 @@ void RetainCountChecker::checkPostStmt(const ObjCDictionaryLiteral *DL,
 void RetainCountChecker::checkPostStmt(const ObjCBoxedExpr *Ex,
                                        CheckerContext &C) const {
   const ExplodedNode *Pred = C.getPredecessor();
-  const LocationContext *LCtx = Pred->getLocationContext();
   ProgramStateRef State = Pred->getState();
 
-  if (SymbolRef Sym = State->getSVal(Ex, LCtx).getAsSymbol()) {
+  if (SymbolRef Sym = Pred->getSVal(Ex).getAsSymbol()) {
     QualType ResultTy = Ex->getType();
     State = setRefBinding(State, Sym,
                           RefVal::makeNotOwned(RetEffect::ObjC, ResultTy));
diff --git a/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp b/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
index 19fa0fb193cc..1952715a9b7c 100644
--- a/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
@@ -40,7 +40,7 @@ void ReturnPointerRangeChecker::checkPreStmt(const ReturnStmt *RS,
   if (!RetE)
     return;
 
-  SVal V = state->getSVal(RetE, C.getLocationContext());
+  SVal V = C.getSVal(RetE);
   const MemRegion *R = V.getAsRegion();
 
   const ElementRegion *ER = dyn_cast_or_null<ElementRegion>(R);
diff --git a/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
index 25975628c553..fb639adcc1cb 100644
--- a/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
@@ -255,8 +255,7 @@ void StackAddrEscapeChecker::checkPreStmt(const ReturnStmt *RS,
     return;
   RetE = RetE->IgnoreParens();
 
-  const LocationContext *LCtx = C.getLocationContext();
-  SVal V = C.getState()->getSVal(RetE, LCtx);
+  SVal V = C.getSVal(RetE);
   const MemRegion *R = V.getAsRegion();
   if (!R)
     return;
diff --git a/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 915514b42133..d77975559e3f 100644
--- a/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -242,22 +242,19 @@ void StreamChecker::Fclose(CheckerContext &C, const CallExpr *CE) const {
 
 void StreamChecker::Fread(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(3), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(3)), state, C))
     return;
 }
 
 void StreamChecker::Fwrite(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(3), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(3)), state, C))
     return;
 }
 
 void StreamChecker::Fseek(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!(state = CheckNullStream(state->getSVal(CE->getArg(0),
-                                               C.getLocationContext()), state, C)))
+  if (!(state = CheckNullStream(C.getSVal(CE->getArg(0)), state, C)))
     return;
   // Check the legality of the 'whence' argument of 'fseek'.
   SVal Whence = state->getSVal(CE->getArg(2), C.getLocationContext());
@@ -283,57 +280,49 @@ void StreamChecker::Fseek(CheckerContext &C, const CallExpr *CE) const {
 
 void StreamChecker::Ftell(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Rewind(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Fgetpos(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Fsetpos(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Clearerr(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Feof(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Ferror(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
 void StreamChecker::Fileno(CheckerContext &C, const CallExpr *CE) const {
   ProgramStateRef state = C.getState();
-  if (!CheckNullStream(state->getSVal(CE->getArg(0), C.getLocationContext()),
-                       state, C))
+  if (!CheckNullStream(C.getSVal(CE->getArg(0)), state, C))
     return;
 }
 
@@ -363,8 +352,7 @@ ProgramStateRef StreamChecker::CheckNullStream(SVal SV, ProgramStateRef state,
 ProgramStateRef StreamChecker::CheckDoubleClose(const CallExpr *CE,
                                                ProgramStateRef state,
                                                CheckerContext &C) const {
-  SymbolRef Sym =
-    state->getSVal(CE->getArg(0), C.getLocationContext()).getAsSymbol();
+  SymbolRef Sym = C.getSVal(CE->getArg(0)).getAsSymbol();
   if (!Sym)
     return state;
 
diff --git a/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp b/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
index 5268bbf5562e..b83dea772b34 100644
--- a/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
@@ -114,8 +114,7 @@ DivisionBRVisitor::VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred,
   if (!E)
     return nullptr;
 
-  ProgramStateRef State = Succ->getState();
-  SVal S = State->getSVal(E, Succ->getLocationContext());
+  SVal S = Succ->getSVal(E);
   if (ZeroSymbol == S.getAsSymbol() && SFC == Succ->getStackFrame()) {
     Satisfied = true;
 
diff --git a/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp b/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
index 0a274292aa39..934ee63318fa 100644
--- a/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
@@ -59,7 +59,7 @@ class UndefBranchChecker : public Checker<check::BranchCondition> {
 
 void UndefBranchChecker::checkBranchCondition(const Stmt *Condition,
                                               CheckerContext &Ctx) const {
-  SVal X = Ctx.getState()->getSVal(Condition, Ctx.getLocationContext());
+  SVal X = Ctx.getSVal(Condition);
   if (X.isUndef()) {
     // Generate a sink node, which implicitly marks both outgoing branches as
     // infeasible.
diff --git a/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp b/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
index 17fe8610da06..6a93c10c7644 100644
--- a/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
@@ -55,9 +55,7 @@ UndefCapturedBlockVarChecker::checkPostStmt(const BlockExpr *BE,
     return;
 
   ProgramStateRef state = C.getState();
-  const BlockDataRegion *R =
-    cast<BlockDataRegion>(state->getSVal(BE,
-                                         C.getLocationContext()).getAsRegion());
+  auto *R = cast<BlockDataRegion>(C.getSVal(BE).getAsRegion());
 
   BlockDataRegion::referenced_vars_iterator I = R->referenced_vars_begin(),
                                             E = R->referenced_vars_end();
diff --git a/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp b/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
index 172ce346f1ba..b9a93bedca2e 100644
--- a/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
@@ -37,12 +37,11 @@ class UndefResultChecker
 
 static bool isArrayIndexOutOfBounds(CheckerContext &C, const Expr *Ex) {
   ProgramStateRef state = C.getState();
-  const LocationContext *LCtx = C.getLocationContext();
 
   if (!isa<ArraySubscriptExpr>(Ex))
     return false;
 
-  SVal Loc = state->getSVal(Ex, LCtx);
+  SVal Loc = C.getSVal(Ex);
   if (!Loc.isValid())
     return false;
 
@@ -64,11 +63,18 @@ static bool isShiftOverflow(const BinaryOperator *B, CheckerContext &C) {
       B->getRHS(), C.getASTContext().getIntWidth(B->getLHS()->getType()));
 }
 
+static bool isLeftShiftResultUnrepresentable(const BinaryOperator *B,
+                                             CheckerContext &C) {
+  SValBuilder &SB = C.getSValBuilder();
+  ProgramStateRef State = C.getState();
+  const llvm::APSInt *LHS = SB.getKnownValue(State, C.getSVal(B->getLHS()));
+  const llvm::APSInt *RHS = SB.getKnownValue(State, C.getSVal(B->getRHS()));
+  return (unsigned)RHS->getZExtValue() > LHS->countLeadingZeros();
+}
+
 void UndefResultChecker::checkPostStmt(const BinaryOperator *B,
                                        CheckerContext &C) const {
-  ProgramStateRef state = C.getState();
-  const LocationContext *LCtx = C.getLocationContext();
-  if (state->getSVal(B, LCtx).isUndef()) {
+  if (C.getSVal(B).isUndef()) {
 
     // Do not report assignments of uninitialized values inside swap functions.
     // This should allow to swap partially uninitialized structs
@@ -92,11 +98,11 @@ void UndefResultChecker::checkPostStmt(const BinaryOperator *B,
     const Expr *Ex = nullptr;
     bool isLeft = true;
 
-    if (state->getSVal(B->getLHS(), LCtx).isUndef()) {
+    if (C.getSVal(B->getLHS()).isUndef()) {
       Ex = B->getLHS()->IgnoreParenCasts();
       isLeft = true;
     }
-    else if (state->getSVal(B->getRHS(), LCtx).isUndef()) {
+    else if (C.getSVal(B->getRHS()).isUndef()) {
       Ex = B->getRHS()->IgnoreParenCasts();
       isLeft = false;
     }
@@ -141,6 +147,19 @@ void UndefResultChecker::checkPostStmt(const BinaryOperator *B,
                  C.isNegative(B->getLHS())) {
         OS << "The result of the left shift is undefined because the left "
               "operand is negative";
+      } else if (B->getOpcode() == BinaryOperatorKind::BO_Shl &&
+                 isLeftShiftResultUnrepresentable(B, C)) {
+        ProgramStateRef State = C.getState();
+        SValBuilder &SB = C.getSValBuilder();
+        const llvm::APSInt *LHS =
+            SB.getKnownValue(State, C.getSVal(B->getLHS()));
+        const llvm::APSInt *RHS =
+            SB.getKnownValue(State, C.getSVal(B->getRHS()));
+        OS << "The result of the left shift is undefined due to shifting \'"
+           << LHS->getSExtValue() << "\' by \'" << RHS->getZExtValue()
+           << "\', which is unrepresentable in the unsigned version of "
+           << "the return type \'" << B->getLHS()->getType().getAsString()
+           << "\'";
       } else {
         OS << "The result of the '"
            << BinaryOperator::getOpcodeStr(B->getOpcode())
diff --git a/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp b/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
index c3dcf1fac197..52b4ca7d7704 100644
--- a/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
@@ -70,8 +70,7 @@ void UndefinedAssignmentChecker::checkBind(SVal location, SVal val,
 
     if (const BinaryOperator *B = dyn_cast<BinaryOperator>(StoreE)) {
       if (B->isCompoundAssignmentOp()) {
-        ProgramStateRef state = C.getState();
-        if (state->getSVal(B->getLHS(), C.getLocationContext()).isUndef()) {
+        if (C.getSVal(B->getLHS()).isUndef()) {
           str = "The left expression of the compound assignment is an "
                 "uninitialized value. The computed value will also be garbage";
           ex = B->getLHS();
diff --git a/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp b/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
index 7f9a00ff876d..a6b50dc37740 100644
--- a/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
@@ -194,7 +194,7 @@ void UnixAPIChecker::CheckOpenVariant(CheckerContext &C,
 
   // Now check if oflags has O_CREAT set.
   const Expr *oflagsEx = CE->getArg(FlagsArgIndex);
-  const SVal V = state->getSVal(oflagsEx, C.getLocationContext());
+  const SVal V = C.getSVal(oflagsEx);
   if (!V.getAs<NonLoc>()) {
     // The case where 'V' can be a location can only be due to a bad header,
     // so in this case bail out.
@@ -248,8 +248,7 @@ void UnixAPIChecker::CheckPthreadOnce(CheckerContext &C,
   // Check if the first argument is stack allocated.  If so, issue a warning
   // because that's likely to be bad news.
   ProgramStateRef state = C.getState();
-  const MemRegion *R =
-    state->getSVal(CE->getArg(0), C.getLocationContext()).getAsRegion();
+  const MemRegion *R = C.getSVal(CE->getArg(0)).getAsRegion();
   if (!R || !isa<StackSpaceRegion>(R->getMemorySpace()))
     return;
 
@@ -336,7 +335,7 @@ void UnixAPIChecker::BasicAllocationCheck(CheckerContext &C,
   ProgramStateRef state = C.getState();
   ProgramStateRef trueState = nullptr, falseState = nullptr;
   const Expr *arg = CE->getArg(sizeArg);
-  SVal argVal = state->getSVal(arg, C.getLocationContext());
+  SVal argVal = C.getSVal(arg);
 
   if (argVal.isUnknownOrUndef())
     return;
@@ -364,7 +363,7 @@ void UnixAPIChecker::CheckCallocZero(CheckerContext &C,
   unsigned int i;
   for (i = 0; i < nArgs; i++) {
     const Expr *arg = CE->getArg(i);
-    SVal argVal = state->getSVal(arg, C.getLocationContext());
+    SVal argVal = C.getSVal(arg);
     if (argVal.isUnknownOrUndef()) {
       if (i == 0)
         continue;
diff --git a/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
index 40217bdee892..0497ee7eef81 100644
--- a/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
@@ -94,7 +94,7 @@ void VLASizeChecker::checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {
   // FIXME: Handle multi-dimensional VLAs.
   const Expr *SE = VLA->getSizeExpr();
   ProgramStateRef state = C.getState();
-  SVal sizeV = state->getSVal(SE, C.getLocationContext());
+  SVal sizeV = C.getSVal(SE);
 
   if (sizeV.isUndef()) {
     reportBug(VLA_Garbage, SE, state, C);
diff --git a/lib/StaticAnalyzer/Checkers/ValistChecker.cpp b/lib/StaticAnalyzer/Checkers/ValistChecker.cpp
index 06c4ef71d80b..44147e521433 100644
--- a/lib/StaticAnalyzer/Checkers/ValistChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/ValistChecker.cpp
@@ -64,7 +64,7 @@ class ValistChecker : public Checker<check::PreCall, check::PreStmt<VAArgExpr>,
                                  CheckerContext &C) const;
   void reportLeakedVALists(const RegionVector &LeakedVALists, StringRef Msg1,
                            StringRef Msg2, CheckerContext &C, ExplodedNode *N,
-                           bool ForceReport = false) const;
+                           bool ReportUninit = false) const;
 
   void checkVAListStartCall(const CallEvent &Call, CheckerContext &C,
                             bool IsCopy) const;
@@ -189,7 +189,7 @@ void ValistChecker::checkPreStmt(const VAArgExpr *VAA,
                                  CheckerContext &C) const {
   ProgramStateRef State = C.getState();
   const Expr *VASubExpr = VAA->getSubExpr();
-  SVal VAListSVal = State->getSVal(VASubExpr, C.getLocationContext());
+  SVal VAListSVal = C.getSVal(VASubExpr);
   bool Symbolic;
   const MemRegion *VAList =
       getVAListAsRegion(VAListSVal, VASubExpr, Symbolic, C);
@@ -267,15 +267,19 @@ void ValistChecker::reportUninitializedAccess(const MemRegion *VAList,
 void ValistChecker::reportLeakedVALists(const RegionVector &LeakedVALists,
                                         StringRef Msg1, StringRef Msg2,
                                         CheckerContext &C, ExplodedNode *N,
-                                        bool ForceReport) const {
+                                        bool ReportUninit) const {
   if (!(ChecksEnabled[CK_Unterminated] ||
-        (ChecksEnabled[CK_Uninitialized] && ForceReport)))
+        (ChecksEnabled[CK_Uninitialized] && ReportUninit)))
     return;
   for (auto Reg : LeakedVALists) {
     if (!BT_leakedvalist) {
-      BT_leakedvalist.reset(new BugType(CheckNames[CK_Unterminated],
-                                        "Leaked va_list",
-                                        categories::MemoryError));
+      // FIXME: maybe creating a new check name for this type of bug is a better
+      // solution.
+      BT_leakedvalist.reset(
+          new BugType(CheckNames[CK_Unterminated].getName().empty()
+                          ? CheckNames[CK_Uninitialized]
+                          : CheckNames[CK_Unterminated],
+                      "Leaked va_list", categories::MemoryError));
       BT_leakedvalist->setSuppressOnSink(true);
     }
 
@@ -375,7 +379,7 @@ void ValistChecker::checkVAListEndCall(const CallEvent &Call,
 
 std::shared_ptr<PathDiagnosticPiece> ValistChecker::ValistBugVisitor::VisitNode(
     const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC,
-    BugReport &BR) {
+    BugReport &) {
   ProgramStateRef State = N->getState();
   ProgramStateRef StatePrev = PrevN->getState();
 
diff --git a/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp b/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
index 48e3e22af04a..26fea772d0a1 100644
--- a/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
+++ b/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
@@ -203,7 +203,7 @@ bool AnalyzerOptions::mayInlineTemplateFunctions() {
 bool AnalyzerOptions::mayInlineCXXAllocator() {
   return getBooleanOption(InlineCXXAllocator,
                           "c++-allocator-inlining",
-                          /*Default=*/false);
+                          /*Default=*/true);
 }
 
 bool AnalyzerOptions::mayInlineCXXContainerMethods() {
diff --git a/lib/StaticAnalyzer/Core/BasicValueFactory.cpp b/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
index ec7a7e9e4b1c..11658816ed25 100644
--- a/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
+++ b/lib/StaticAnalyzer/Core/BasicValueFactory.cpp
@@ -224,7 +224,6 @@ BasicValueFactory::evalAPSInt(BinaryOperator::Opcode Op,
       // FIXME: This logic should probably go higher up, where we can
       // test these conditions symbolically.
 
-      // FIXME: Expand these checks to include all undefined behavior.
       if (V1.isSigned() && V1.isNegative())
         return nullptr;
 
@@ -236,6 +235,9 @@ BasicValueFactory::evalAPSInt(BinaryOperator::Opcode Op,
       if (Amt >= V1.getBitWidth())
         return nullptr;
 
+      if (V1.isSigned() && Amt > V1.countLeadingZeros())
+          return nullptr;
+
       return &getValue( V1.operator<<( (unsigned) Amt ));
     }
 
@@ -244,8 +246,6 @@ BasicValueFactory::evalAPSInt(BinaryOperator::Opcode Op,
       // FIXME: This logic should probably go higher up, where we can
       // test these conditions symbolically.
 
-      // FIXME: Expand these checks to include all undefined behavior.
-
       if (V2.isSigned() && V2.isNegative())
         return nullptr;
 
diff --git a/lib/StaticAnalyzer/Core/BugReporter.cpp b/lib/StaticAnalyzer/Core/BugReporter.cpp
index dc284888eb03..1d48bbcf604e 100644
--- a/lib/StaticAnalyzer/Core/BugReporter.cpp
+++ b/lib/StaticAnalyzer/Core/BugReporter.cpp
@@ -229,7 +229,6 @@ adjustCallLocations(PathPieces &Pieces,
     PathDiagnosticCallPiece *Call = dyn_cast<PathDiagnosticCallPiece>(I->get());
 
     if (!Call) {
-      assert((*I)->getLocation().asLocation().isValid());
       continue;
     }
 
@@ -3510,6 +3509,66 @@ void BugReporter::FlushReport(BugReportEquivClass& EQ) {
   }
 }
 
+/// Insert all lines participating in the function signature \p Signature
+/// into \p ExecutedLines.
+static void populateExecutedLinesWithFunctionSignature(
+    const Decl *Signature, SourceManager &SM,
+    std::unique_ptr<FilesToLineNumsMap> &ExecutedLines) {
+
+  SourceRange SignatureSourceRange;
+  const Stmt* Body = Signature->getBody();
+  if (auto FD = dyn_cast<FunctionDecl>(Signature)) {
+    SignatureSourceRange = FD->getSourceRange();
+  } else if (auto OD = dyn_cast<ObjCMethodDecl>(Signature)) {
+    SignatureSourceRange = OD->getSourceRange();
+  } else {
+    return;
+  }
+  SourceLocation Start = SignatureSourceRange.getBegin();
+  SourceLocation End = Body ? Body->getSourceRange().getBegin()
+    : SignatureSourceRange.getEnd();
+  unsigned StartLine = SM.getExpansionLineNumber(Start);
+  unsigned EndLine = SM.getExpansionLineNumber(End);
+
+  FileID FID = SM.getFileID(SM.getExpansionLoc(Start));
+  for (unsigned Line = StartLine; Line <= EndLine; Line++)
+    ExecutedLines->operator[](FID.getHashValue()).insert(Line);
+}
+
+/// \return all executed lines including function signatures on the path
+/// starting from \p N.
+static std::unique_ptr<FilesToLineNumsMap>
+findExecutedLines(SourceManager &SM, const ExplodedNode *N) {
+  auto ExecutedLines = llvm::make_unique<FilesToLineNumsMap>();
+
+  while (N) {
+    if (N->getFirstPred() == nullptr) {
+
+      // First node: show signature of the entrance point.
+      const Decl *D = N->getLocationContext()->getDecl();
+      populateExecutedLinesWithFunctionSignature(D, SM, ExecutedLines);
+
+    } else if (auto CE = N->getLocationAs<CallEnter>()) {
+
+      // Inlined function: show signature.
+      const Decl* D = CE->getCalleeContext()->getDecl();
+      populateExecutedLinesWithFunctionSignature(D, SM, ExecutedLines);
+
+    } else if (const Stmt *S = PathDiagnosticLocation::getStmt(N)) {
+
+      // Otherwise: show lines associated with the processed statement.
+      SourceLocation Loc = S->getSourceRange().getBegin();
+      SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc);
+      FileID FID = SM.getFileID(ExpansionLoc);
+      unsigned LineNo = SM.getExpansionLineNumber(ExpansionLoc);
+      ExecutedLines->operator[](FID.getHashValue()).insert(LineNo);
+    }
+
+    N = N->getFirstPred();
+  }
+  return ExecutedLines;
+}
+
 void BugReporter::FlushReport(BugReport *exampleReport,
                               PathDiagnosticConsumer &PD,
                               ArrayRef<BugReport*> bugReports) {
@@ -3518,13 +3577,13 @@ void BugReporter::FlushReport(BugReport *exampleReport,
   // Probably doesn't make a difference in practice.
   BugType& BT = exampleReport->getBugType();
 
-  std::unique_ptr<PathDiagnostic> D(new PathDiagnostic(
+  auto D = llvm::make_unique<PathDiagnostic>(
       exampleReport->getBugType().getCheckName(),
       exampleReport->getDeclWithIssue(), exampleReport->getBugType().getName(),
       exampleReport->getDescription(),
       exampleReport->getShortDescription(/*Fallback=*/false), BT.getCategory(),
-      exampleReport->getUniqueingLocation(),
-      exampleReport->getUniqueingDecl()));
+      exampleReport->getUniqueingLocation(), exampleReport->getUniqueingDecl(),
+      findExecutedLines(getSourceManager(), exampleReport->getErrorNode()));
 
   if (exampleReport->isPathSensitive()) {
     // Generate the full path diagnostic, using the generation scheme
diff --git a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 972f4c5f3da2..7484fcd18925 100644
--- a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -235,7 +235,7 @@ class ReturnVisitor : public BugReporterVisitorImpl<ReturnVisitor> {
 
     // Check the return value.
     ProgramStateRef State = Node->getState();
-    SVal RetVal = State->getSVal(S, Node->getLocationContext());
+    SVal RetVal = Node->getSVal(S);
 
     // Handle cases where a reference is returned and then immediately used.
     if (cast<Expr>(S)->isGLValue())
@@ -480,6 +480,127 @@ static bool isInitializationOfVar(const ExplodedNode *N, const VarRegion *VR) {
   return FrameSpace->getStackFrame() == LCtx->getCurrentStackFrame();
 }
 
+/// Show diagnostics for initializing or declaring a region \p R with a bad value.
+void showBRDiagnostics(const char *action,
+    llvm::raw_svector_ostream& os,
+    const MemRegion *R,
+    SVal V,
+    const DeclStmt *DS) {
+  if (R->canPrintPretty()) {
+    R->printPretty(os);
+    os << " ";
+  }
+
+  if (V.getAs<loc::ConcreteInt>()) {
+    bool b = false;
+    if (R->isBoundable()) {
+      if (const TypedValueRegion *TR = dyn_cast<TypedValueRegion>(R)) {
+        if (TR->getValueType()->isObjCObjectPointerType()) {
+          os << action << "nil";
+          b = true;
+        }
+      }
+    }
+    if (!b)
+      os << action << "a null pointer value";
+
+  } else if (auto CVal = V.getAs<nonloc::ConcreteInt>()) {
+    os << action << CVal->getValue();
+  } else if (DS) {
+    if (V.isUndef()) {
+      if (isa<VarRegion>(R)) {
+        const VarDecl *VD = cast<VarDecl>(DS->getSingleDecl());
+        if (VD->getInit()) {
+          os << (R->canPrintPretty() ? "initialized" : "Initializing")
+            << " to a garbage value";
+        } else {
+          os << (R->canPrintPretty() ? "declared" : "Declaring")
+            << " without an initial value";
+        }
+      }
+    } else {
+      os << (R->canPrintPretty() ? "initialized" : "Initialized")
+        << " here";
+    }
+  }
+}
+
+/// Display diagnostics for passing bad region as a parameter.
+static void showBRParamDiagnostics(llvm::raw_svector_ostream& os,
+    const VarRegion *VR,
+    SVal V) {
+  const auto *Param = cast<ParmVarDecl>(VR->getDecl());
+
+  os << "Passing ";
+
+  if (V.getAs<loc::ConcreteInt>()) {
+    if (Param->getType()->isObjCObjectPointerType())
+      os << "nil object reference";
+    else
+      os << "null pointer value";
+  } else if (V.isUndef()) {
+    os << "uninitialized value";
+  } else if (auto CI = V.getAs<nonloc::ConcreteInt>()) {
+    os << "the value " << CI->getValue();
+  } else {
+    os << "value";
+  }
+
+  // Printed parameter indexes are 1-based, not 0-based.
+  unsigned Idx = Param->getFunctionScopeIndex() + 1;
+  os << " via " << Idx << llvm::getOrdinalSuffix(Idx) << " parameter";
+  if (VR->canPrintPretty()) {
+    os << " ";
+    VR->printPretty(os);
+  }
+}
+
+/// Show default diagnostics for storing bad region.
+static void showBRDefaultDiagnostics(llvm::raw_svector_ostream& os,
+    const MemRegion *R,
+    SVal V) {
+  if (V.getAs<loc::ConcreteInt>()) {
+    bool b = false;
+    if (R->isBoundable()) {
+      if (const TypedValueRegion *TR = dyn_cast<TypedValueRegion>(R)) {
+        if (TR->getValueType()->isObjCObjectPointerType()) {
+          os << "nil object reference stored";
+          b = true;
+        }
+      }
+    }
+    if (!b) {
+      if (R->canPrintPretty())
+        os << "Null pointer value stored";
+      else
+        os << "Storing null pointer value";
+    }
+
+  } else if (V.isUndef()) {
+    if (R->canPrintPretty())
+      os << "Uninitialized value stored";
+    else
+      os << "Storing uninitialized value";
+
+  } else if (auto CV = V.getAs<nonloc::ConcreteInt>()) {
+    if (R->canPrintPretty())
+      os << "The value " << CV->getValue() << " is assigned";
+    else
+      os << "Assigning " << CV->getValue();
+
+  } else {
+    if (R->canPrintPretty())
+      os << "Value assigned";
+    else
+      os << "Assigning value";
+  }
+
+  if (R->canPrintPretty()) {
+    os << " to ";
+    R->printPretty(os);
+  }
+}
+
 std::shared_ptr<PathDiagnosticPiece>
 FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ,
                                   const ExplodedNode *Pred,
@@ -596,7 +717,7 @@ FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ,
       if (VR) {
         // See if we can get the BlockVarRegion.
         ProgramStateRef State = StoreSite->getState();
-        SVal V = State->getSVal(S, PS->getLocationContext());
+        SVal V = StoreSite->getSVal(S);
         if (const BlockDataRegion *BDR =
               dyn_cast_or_null<BlockDataRegion>(V.getAsRegion())) {
           if (const VarRegion *OriginalR = BDR->getOriginalRegion(VR)) {
@@ -608,122 +729,16 @@ FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ,
         }
       }
     }
+    if (action)
+      showBRDiagnostics(action, os, R, V, DS);
 
-    if (action) {
-      if (R->canPrintPretty()) {
-        R->printPretty(os);
-        os << " ";
-      }
-
-      if (V.getAs<loc::ConcreteInt>()) {
-        bool b = false;
-        if (R->isBoundable()) {
-          if (const TypedValueRegion *TR = dyn_cast<TypedValueRegion>(R)) {
-            if (TR->getValueType()->isObjCObjectPointerType()) {
-              os << action << "nil";
-              b = true;
-            }
-          }
-        }
-
-        if (!b)
-          os << action << "a null pointer value";
-      } else if (Optional<nonloc::ConcreteInt> CVal =
-                     V.getAs<nonloc::ConcreteInt>()) {
-        os << action << CVal->getValue();
-      }
-      else if (DS) {
-        if (V.isUndef()) {
-          if (isa<VarRegion>(R)) {
-            const VarDecl *VD = cast<VarDecl>(DS->getSingleDecl());
-            if (VD->getInit()) {
-              os << (R->canPrintPretty() ? "initialized" : "Initializing")
-                 << " to a garbage value";
-            } else {
-              os << (R->canPrintPretty() ? "declared" : "Declaring")
-                 << " without an initial value";
-            }
-          }
-        }
-        else {
-          os << (R->canPrintPretty() ? "initialized" : "Initialized")
-             << " here";
-        }
-      }
-    }
   } else if (StoreSite->getLocation().getAs<CallEnter>()) {
-    if (const VarRegion *VR = dyn_cast<VarRegion>(R)) {
-      const ParmVarDecl *Param = cast<ParmVarDecl>(VR->getDecl());
-
-      os << "Passing ";
-
-      if (V.getAs<loc::ConcreteInt>()) {
-        if (Param->getType()->isObjCObjectPointerType())
-          os << "nil object reference";
-        else
-          os << "null pointer value";
-      } else if (V.isUndef()) {
-        os << "uninitialized value";
-      } else if (Optional<nonloc::ConcreteInt> CI =
-                     V.getAs<nonloc::ConcreteInt>()) {
-        os << "the value " << CI->getValue();
-      } else {
-        os << "value";
-      }
-
-      // Printed parameter indexes are 1-based, not 0-based.
-      unsigned Idx = Param->getFunctionScopeIndex() + 1;
-      os << " via " << Idx << llvm::getOrdinalSuffix(Idx) << " parameter";
-      if (R->canPrintPretty()) {
-        os << " ";
-        R->printPretty(os);
-      }
-    }
+    if (const VarRegion *VR = dyn_cast<VarRegion>(R))
+      showBRParamDiagnostics(os, VR, V);
   }
 
-  if (os.str().empty()) {
-    if (V.getAs<loc::ConcreteInt>()) {
-      bool b = false;
-      if (R->isBoundable()) {
-        if (const TypedValueRegion *TR = dyn_cast<TypedValueRegion>(R)) {
-          if (TR->getValueType()->isObjCObjectPointerType()) {
-            os << "nil object reference stored";
-            b = true;
-          }
-        }
-      }
-      if (!b) {
-        if (R->canPrintPretty())
-          os << "Null pointer value stored";
-        else
-          os << "Storing null pointer value";
-      }
-
-    } else if (V.isUndef()) {
-      if (R->canPrintPretty())
-        os << "Uninitialized value stored";
-      else
-        os << "Storing uninitialized value";
-
-    } else if (Optional<nonloc::ConcreteInt> CV =
-                   V.getAs<nonloc::ConcreteInt>()) {
-      if (R->canPrintPretty())
-        os << "The value " << CV->getValue() << " is assigned";
-      else
-        os << "Assigning " << CV->getValue();
-
-    } else {
-      if (R->canPrintPretty())
-        os << "Value assigned";
-      else
-        os << "Assigning value";
-    }
-
-    if (R->canPrintPretty()) {
-      os << " to ";
-      R->printPretty(os);
-    }
-  }
+  if (os.str().empty())
+    showBRDefaultDiagnostics(os, R, V);
 
   // Construct a new PathDiagnosticPiece.
   ProgramPoint P = StoreSite->getLocation();
@@ -839,6 +854,13 @@ const char *SuppressInlineDefensiveChecksVisitor::getTag() {
   return "IDCVisitor";
 }
 
+/// \return name of the macro inside the location \p Loc.
+static StringRef getMacroName(SourceLocation Loc,
+    BugReporterContext &BRC) {
+  return Lexer::getImmediateMacroName(
+      Loc, BRC.getSourceManager(), BRC.getASTContext().getLangOpts());
+}
+
 std::shared_ptr<PathDiagnosticPiece>
 SuppressInlineDefensiveChecksVisitor::VisitNode(const ExplodedNode *Succ,
                                                 const ExplodedNode *Pred,
@@ -878,9 +900,6 @@ SuppressInlineDefensiveChecksVisitor::VisitNode(const ExplodedNode *Succ,
     if (!BugPoint)
       return nullptr;
 
-    SourceLocation BugLoc = BugPoint->getStmt()->getLocStart();
-    if (BugLoc.isMacroID())
-      return nullptr;
 
     ProgramPoint CurPoint = Succ->getLocation();
     const Stmt *CurTerminatorStmt = nullptr;
@@ -902,14 +921,14 @@ SuppressInlineDefensiveChecksVisitor::VisitNode(const ExplodedNode *Succ,
 
     SourceLocation TerminatorLoc = CurTerminatorStmt->getLocStart();
     if (TerminatorLoc.isMacroID()) {
-      const SourceManager &SMgr = BRC.getSourceManager();
-      std::pair<FileID, unsigned> TLInfo = SMgr.getDecomposedLoc(TerminatorLoc);
-      SrcMgr::SLocEntry SE = SMgr.getSLocEntry(TLInfo.first);
-      const SrcMgr::ExpansionInfo &EInfo = SE.getExpansion();
-      if (EInfo.isFunctionMacroExpansion()) {
+      SourceLocation BugLoc = BugPoint->getStmt()->getLocStart();
+
+      // Suppress reports unless we are in that same macro.
+      if (!BugLoc.isMacroID() ||
+          getMacroName(BugLoc, BRC) != getMacroName(TerminatorLoc, BRC)) {
         BR.markInvalid("Suppress Macro IDC", CurLC);
-        return nullptr;
       }
+      return nullptr;
     }
   }
   return nullptr;
@@ -978,6 +997,65 @@ static const Expr *peelOffOuterExpr(const Expr *Ex,
   return Ex;
 }
 
+/// Walk through nodes until we get one that matches the statement exactly.
+/// Alternately, if we hit a known lvalue for the statement, we know we've
+/// gone too far (though we can likely track the lvalue better anyway).
+static const ExplodedNode* findNodeForStatement(const ExplodedNode *N,
+                                                const Stmt *S,
+                                                const Expr *Inner) {
+  do {
+    const ProgramPoint &pp = N->getLocation();
+    if (auto ps = pp.getAs<StmtPoint>()) {
+      if (ps->getStmt() == S || ps->getStmt() == Inner)
+        break;
+    } else if (auto CEE = pp.getAs<CallExitEnd>()) {
+      if (CEE->getCalleeContext()->getCallSite() == S ||
+          CEE->getCalleeContext()->getCallSite() == Inner)
+        break;
+    }
+    N = N->getFirstPred();
+  } while (N);
+  return N;
+}
+
+/// Find the ExplodedNode where the lvalue (the value of 'Ex')
+/// was computed.
+static const ExplodedNode* findNodeForExpression(const ExplodedNode *N,
+    const Expr *Inner) {
+  while (N) {
+    if (auto P = N->getLocation().getAs<PostStmt>()) {
+      if (P->getStmt() == Inner)
+        break;
+    }
+    N = N->getFirstPred();
+  }
+  assert(N && "Unable to find the lvalue node.");
+  return N;
+
+}
+
+/// Performing operator `&' on an lvalue expression is essentially a no-op.
+/// Then, if we are taking addresses of fields or elements, these are also
+/// unlikely to matter.
+static const Expr* peelOfOuterAddrOf(const Expr* Ex) {
+  Ex = Ex->IgnoreParenCasts();
+
+  // FIXME: There's a hack in our Store implementation that always computes
+  // field offsets around null pointers as if they are always equal to 0.
+  // The idea here is to report accesses to fields as null dereferences
+  // even though the pointer value that's being dereferenced is actually
+  // the offset of the field rather than exactly 0.
+  // See the FIXME in StoreManager's getLValueFieldOrIvar() method.
+  // This code interacts heavily with this hack; otherwise the value
+  // would not be null at all for most fields, so we'd be unable to track it.
+  if (const auto *Op = dyn_cast<UnaryOperator>(Ex))
+    if (Op->getOpcode() == UO_AddrOf && Op->getSubExpr()->isLValue())
+      if (const Expr *DerefEx = bugreporter::getDerefExpr(Op->getSubExpr()))
+        return DerefEx;
+  return Ex;
+
+}
+
 bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
                                         const Stmt *S,
                                         BugReport &report, bool IsArg,
@@ -985,52 +1063,23 @@ bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
   if (!S || !N)
     return false;
 
-  if (const Expr *Ex = dyn_cast<Expr>(S))
+  if (const auto *Ex = dyn_cast<Expr>(S))
     S = peelOffOuterExpr(Ex, N);
 
   const Expr *Inner = nullptr;
-  if (const Expr *Ex = dyn_cast<Expr>(S)) {
+  if (const auto *Ex = dyn_cast<Expr>(S)) {
+    Ex = peelOfOuterAddrOf(Ex);
     Ex = Ex->IgnoreParenCasts();
 
-    // Performing operator `&' on an lvalue expression is essentially a no-op.
-    // Then, if we are taking addresses of fields or elements, these are also
-    // unlikely to matter.
-    // FIXME: There's a hack in our Store implementation that always computes
-    // field offsets around null pointers as if they are always equal to 0.
-    // The idea here is to report accesses to fields as null dereferences
-    // even though the pointer value that's being dereferenced is actually
-    // the offset of the field rather than exactly 0.
-    // See the FIXME in StoreManager's getLValueFieldOrIvar() method.
-    // This code interacts heavily with this hack; otherwise the value
-    // would not be null at all for most fields, so we'd be unable to track it.
-    if (const auto *Op = dyn_cast<UnaryOperator>(Ex))
-      if (Op->getOpcode() == UO_AddrOf && Op->getSubExpr()->isLValue())
-        if (const Expr *DerefEx = getDerefExpr(Op->getSubExpr()))
-          Ex = DerefEx;
-
-    if (Ex && (ExplodedGraph::isInterestingLValueExpr(Ex) || CallEvent::isCallStmt(Ex)))
+    if (Ex && (ExplodedGraph::isInterestingLValueExpr(Ex)
+          || CallEvent::isCallStmt(Ex)))
       Inner = Ex;
   }
 
   if (IsArg && !Inner) {
     assert(N->getLocation().getAs<CallEnter>() && "Tracking arg but not at call");
   } else {
-    // Walk through nodes until we get one that matches the statement exactly.
-    // Alternately, if we hit a known lvalue for the statement, we know we've
-    // gone too far (though we can likely track the lvalue better anyway).
-    do {
-      const ProgramPoint &pp = N->getLocation();
-      if (Optional<StmtPoint> ps = pp.getAs<StmtPoint>()) {
-        if (ps->getStmt() == S || ps->getStmt() == Inner)
-          break;
-      } else if (Optional<CallExitEnd> CEE = pp.getAs<CallExitEnd>()) {
-        if (CEE->getCalleeContext()->getCallSite() == S ||
-            CEE->getCalleeContext()->getCallSite() == Inner)
-          break;
-      }
-      N = N->getFirstPred();
-    } while (N);
-
+    N = findNodeForStatement(N, S, Inner);
     if (!N)
       return false;
   }
@@ -1041,48 +1090,35 @@ bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
   // At this point in the path, the receiver should be live since we are at the
   // message send expr. If it is nil, start tracking it.
   if (const Expr *Receiver = NilReceiverBRVisitor::getNilReceiver(S, N))
-    trackNullOrUndefValue(N, Receiver, report, false, EnableNullFPSuppression);
-
+    trackNullOrUndefValue(N, Receiver, report, /* IsArg=*/ false,
+        EnableNullFPSuppression);
 
   // See if the expression we're interested refers to a variable.
   // If so, we can track both its contents and constraints on its value.
   if (Inner && ExplodedGraph::isInterestingLValueExpr(Inner)) {
-    const MemRegion *R = nullptr;
-
-    // Find the ExplodedNode where the lvalue (the value of 'Ex')
-    // was computed.  We need this for getting the location value.
-    const ExplodedNode *LVNode = N;
-    while (LVNode) {
-      if (Optional<PostStmt> P = LVNode->getLocation().getAs<PostStmt>()) {
-        if (P->getStmt() == Inner)
-          break;
-      }
-      LVNode = LVNode->getFirstPred();
-    }
-    assert(LVNode && "Unable to find the lvalue node.");
+    const ExplodedNode *LVNode = findNodeForExpression(N, Inner);
     ProgramStateRef LVState = LVNode->getState();
-    SVal LVal = LVState->getSVal(Inner, LVNode->getLocationContext());
-
-    if (LVState->isNull(LVal).isConstrainedTrue()) {
-      // In case of C++ references, we want to differentiate between a null
-      // reference and reference to null pointer.
-      // If the LVal is null, check if we are dealing with null reference.
-      // For those, we want to track the location of the reference.
-      if (const MemRegion *RR = getLocationRegionIfReference(Inner, N))
-        R = RR;
-    } else {
-      R = LVState->getSVal(Inner, LVNode->getLocationContext()).getAsRegion();
-
-      // If this is a C++ reference to a null pointer, we are tracking the
-      // pointer. In addition, we should find the store at which the reference
-      // got initialized.
-      if (const MemRegion *RR = getLocationRegionIfReference(Inner, N)) {
-        if (Optional<KnownSVal> KV = LVal.getAs<KnownSVal>())
-          report.addVisitor(llvm::make_unique<FindLastStoreBRVisitor>(
+    SVal LVal = LVNode->getSVal(Inner);
+
+    const MemRegion *RR = getLocationRegionIfReference(Inner, N);
+    bool LVIsNull = LVState->isNull(LVal).isConstrainedTrue();
+
+    // If this is a C++ reference to a null pointer, we are tracking the
+    // pointer. In addition, we should find the store at which the reference
+    // got initialized.
+    if (RR && !LVIsNull) {
+      if (auto KV = LVal.getAs<KnownSVal>())
+        report.addVisitor(llvm::make_unique<FindLastStoreBRVisitor>(
               *KV, RR, EnableNullFPSuppression));
-      }
     }
 
+    // In case of C++ references, we want to differentiate between a null
+    // reference and reference to null pointer.
+    // If the LVal is null, check if we are dealing with null reference.
+    // For those, we want to track the location of the reference.
+    const MemRegion *R = (RR && LVIsNull) ? RR :
+        LVNode->getSVal(Inner).getAsRegion();
+
     if (R) {
       // Mark both the variable region and its contents as interesting.
       SVal V = LVState->getRawSVal(loc::MemRegionVal(R));
@@ -1094,21 +1130,21 @@ bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
       // If the contents are symbolic, find out when they became null.
       if (V.getAsLocSymbol(/*IncludeBaseRegions*/ true))
         report.addVisitor(llvm::make_unique<TrackConstraintBRVisitor>(
-            V.castAs<DefinedSVal>(), false));
+              V.castAs<DefinedSVal>(), false));
 
       // Add visitor, which will suppress inline defensive checks.
-      if (Optional<DefinedSVal> DV = V.getAs<DefinedSVal>()) {
+      if (auto DV = V.getAs<DefinedSVal>()) {
         if (!DV->isZeroConstant() && LVState->isNull(*DV).isConstrainedTrue() &&
             EnableNullFPSuppression) {
           report.addVisitor(
               llvm::make_unique<SuppressInlineDefensiveChecksVisitor>(*DV,
-                                                                      LVNode));
+                LVNode));
         }
       }
 
-      if (Optional<KnownSVal> KV = V.getAs<KnownSVal>())
+      if (auto KV = V.getAs<KnownSVal>())
         report.addVisitor(llvm::make_unique<FindLastStoreBRVisitor>(
-            *KV, R, EnableNullFPSuppression));
+              *KV, R, EnableNullFPSuppression));
       return true;
     }
   }
@@ -1119,7 +1155,7 @@ bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
 
   // If the value came from an inlined function call, we should at least make
   // sure that function isn't pruned in our output.
-  if (const Expr *E = dyn_cast<Expr>(S))
+  if (const auto *E = dyn_cast<Expr>(S))
     S = E->IgnoreParenCasts();
 
   ReturnVisitor::addVisitorIfNecessary(N, S, report, EnableNullFPSuppression);
@@ -1128,29 +1164,29 @@ bool bugreporter::trackNullOrUndefValue(const ExplodedNode *N,
   // base value that was dereferenced.
   // assert(!V.isUnknownOrUndef());
   // Is it a symbolic value?
-  if (Optional<loc::MemRegionVal> L = V.getAs<loc::MemRegionVal>()) {
+  if (auto L = V.getAs<loc::MemRegionVal>()) {
+    report.addVisitor(llvm::make_unique<UndefOrNullArgVisitor>(L->getRegion()));
+
     // At this point we are dealing with the region's LValue.
     // However, if the rvalue is a symbolic region, we should track it as well.
     // Try to use the correct type when looking up the value.
     SVal RVal;
-    if (const Expr *E = dyn_cast<Expr>(S))
+    if (const auto *E = dyn_cast<Expr>(S))
       RVal = state->getRawSVal(L.getValue(), E->getType());
     else
       RVal = state->getSVal(L->getRegion());
 
-    report.addVisitor(llvm::make_unique<UndefOrNullArgVisitor>(L->getRegion()));
-    if (Optional<KnownSVal> KV = RVal.getAs<KnownSVal>())
+    if (auto KV = RVal.getAs<KnownSVal>())
       report.addVisitor(llvm::make_unique<FindLastStoreBRVisitor>(
-          *KV, L->getRegion(), EnableNullFPSuppression));
+            *KV, L->getRegion(), EnableNullFPSuppression));
 
     const MemRegion *RegionRVal = RVal.getAsRegion();
     if (RegionRVal && isa<SymbolicRegion>(RegionRVal)) {
       report.markInteresting(RegionRVal);
       report.addVisitor(llvm::make_unique<TrackConstraintBRVisitor>(
-          loc::MemRegionVal(RegionRVal), false));
+            loc::MemRegionVal(RegionRVal), false));
     }
   }
-
   return true;
 }
 
@@ -1161,7 +1197,7 @@ const Expr *NilReceiverBRVisitor::getNilReceiver(const Stmt *S,
     return nullptr;
   if (const Expr *Receiver = ME->getInstanceReceiver()) {
     ProgramStateRef state = N->getState();
-    SVal V = state->getSVal(Receiver, N->getLocationContext());
+    SVal V = N->getSVal(Receiver);
     if (state->isNull(V).isConstrainedTrue())
       return Receiver;
   }
@@ -1217,8 +1253,7 @@ void FindLastStoreBRVisitor::registerStatementVarDecls(BugReport &BR,
     const Stmt *Head = WorkList.front();
     WorkList.pop_front();
 
-    ProgramStateRef state = N->getState();
-    ProgramStateManager &StateMgr = state->getStateManager();
+    ProgramStateManager &StateMgr = N->getState()->getStateManager();
 
     if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(Head)) {
       if (const VarDecl *VD = dyn_cast<VarDecl>(DR->getDecl())) {
@@ -1226,7 +1261,7 @@ void FindLastStoreBRVisitor::registerStatementVarDecls(BugReport &BR,
         StateMgr.getRegionManager().getVarRegion(VD, N->getLocationContext());
 
         // What did we load?
-        SVal V = state->getSVal(S, N->getLocationContext());
+        SVal V = N->getSVal(S);
 
         if (V.getAs<loc::ConcreteInt>() || V.getAs<nonloc::ConcreteInt>()) {
           // Register a new visitor with the BugReport.
@@ -1796,7 +1831,7 @@ UndefOrNullArgVisitor::VisitNode(const ExplodedNode *N,
     const MemRegion *ArgReg = Call->getArgSVal(Idx).getAsRegion();
 
     // Are we tracking the argument or its subregion?
-    if ( !ArgReg || (ArgReg != R && !R->isSubRegionOf(ArgReg->StripCasts())))
+    if ( !ArgReg || !R->isSubRegionOf(ArgReg->StripCasts()))
       continue;
 
     // Check the function parameter type.
diff --git a/lib/StaticAnalyzer/Core/CallEvent.cpp b/lib/StaticAnalyzer/Core/CallEvent.cpp
index 776369be9dba..76b9630c3b5a 100644
--- a/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -672,8 +672,13 @@ SVal CXXConstructorCall::getCXXThisVal() const {
 
 void CXXConstructorCall::getExtraInvalidatedValues(ValueList &Values,
                            RegionAndSymbolInvalidationTraits *ETraits) const {
-  if (Data)
-    Values.push_back(loc::MemRegionVal(static_cast<const MemRegion *>(Data)));
+  if (Data) {
+    loc::MemRegionVal MV(static_cast<const MemRegion *>(Data));
+    if (SymbolRef Sym = MV.getAsSymbol(true))
+      ETraits->setTrait(Sym,
+                        RegionAndSymbolInvalidationTraits::TK_SuppressEscape);
+    Values.push_back(MV);
+  }
 }
 
 void CXXConstructorCall::getInitialStackFrameContents(
diff --git a/lib/StaticAnalyzer/Core/CheckerContext.cpp b/lib/StaticAnalyzer/Core/CheckerContext.cpp
index 61cbf3854bb2..6cf931abbddd 100644
--- a/lib/StaticAnalyzer/Core/CheckerContext.cpp
+++ b/lib/StaticAnalyzer/Core/CheckerContext.cpp
@@ -20,9 +20,8 @@ using namespace clang;
 using namespace ento;
 
 const FunctionDecl *CheckerContext::getCalleeDecl(const CallExpr *CE) const {
-  ProgramStateRef State = getState();
   const Expr *Callee = CE->getCallee();
-  SVal L = State->getSVal(Callee, Pred->getLocationContext());
+  SVal L = Pred->getSVal(Callee);
   return L.getAsFunctionDecl();
 }
 
diff --git a/lib/StaticAnalyzer/Core/CheckerManager.cpp b/lib/StaticAnalyzer/Core/CheckerManager.cpp
index 49f3edef2a2d..f3de2e405000 100644
--- a/lib/StaticAnalyzer/Core/CheckerManager.cpp
+++ b/lib/StaticAnalyzer/Core/CheckerManager.cpp
@@ -469,6 +469,41 @@ void CheckerManager::runCheckersForBranchCondition(const Stmt *Condition,
   expandGraphWithCheckers(C, Dst, Src);
 }
 
+namespace {
+  struct CheckNewAllocatorContext {
+    typedef std::vector<CheckerManager::CheckNewAllocatorFunc> CheckersTy;
+    const CheckersTy &Checkers;
+    const CXXNewExpr *NE;
+    SVal Target;
+    bool WasInlined;
+    ExprEngine &Eng;
+
+    CheckersTy::const_iterator checkers_begin() { return Checkers.begin(); }
+    CheckersTy::const_iterator checkers_end() { return Checkers.end(); }
+
+    CheckNewAllocatorContext(const CheckersTy &Checkers, const CXXNewExpr *NE,
+                             SVal Target, bool WasInlined, ExprEngine &Eng)
+        : Checkers(Checkers), NE(NE), Target(Target), WasInlined(WasInlined),
+          Eng(Eng) {}
+
+    void runChecker(CheckerManager::CheckNewAllocatorFunc checkFn,
+                    NodeBuilder &Bldr, ExplodedNode *Pred) {
+      ProgramPoint L = PostAllocatorCall(NE, Pred->getLocationContext());
+      CheckerContext C(Bldr, Eng, Pred, L, WasInlined);
+      checkFn(NE, Target, C);
+    }
+  };
+}
+
+void CheckerManager::runCheckersForNewAllocator(
+    const CXXNewExpr *NE, SVal Target, ExplodedNodeSet &Dst, ExplodedNode *Pred,
+    ExprEngine &Eng, bool WasInlined) {
+  ExplodedNodeSet Src;
+  Src.insert(Pred);
+  CheckNewAllocatorContext C(NewAllocatorCheckers, NE, Target, WasInlined, Eng);
+  expandGraphWithCheckers(C, Dst, Src);
+}
+
 /// \brief Run checkers for live symbols.
 void CheckerManager::runCheckersForLiveSymbols(ProgramStateRef state,
                                                SymbolReaper &SymReaper) {
@@ -711,6 +746,10 @@ void CheckerManager::_registerForBranchCondition(
   BranchConditionCheckers.push_back(checkfn);
 }
 
+void CheckerManager::_registerForNewAllocator(CheckNewAllocatorFunc checkfn) {
+  NewAllocatorCheckers.push_back(checkfn);
+}
+
 void CheckerManager::_registerForLiveSymbols(CheckLiveSymbolsFunc checkfn) {
   LiveSymbolsCheckers.push_back(checkfn);
 }
diff --git a/lib/StaticAnalyzer/Core/CoreEngine.cpp b/lib/StaticAnalyzer/Core/CoreEngine.cpp
index e2e9ddf5048e..2fdd310fa31a 100644
--- a/lib/StaticAnalyzer/Core/CoreEngine.cpp
+++ b/lib/StaticAnalyzer/Core/CoreEngine.cpp
@@ -37,8 +37,6 @@ STATISTIC(NumPathsExplored,
 // Worklist classes for exploration of reachable states.
 //===----------------------------------------------------------------------===//
 
-WorkList::Visitor::~Visitor() {}
-
 namespace {
 class DFS : public WorkList {
   SmallVector<WorkListUnit,20> Stack;
@@ -57,15 +55,6 @@ class DFS : public WorkList {
     Stack.pop_back(); // This technically "invalidates" U, but we are fine.
     return U;
   }
-
-  bool visitItemsInWorkList(Visitor &V) override {
-    for (SmallVectorImpl<WorkListUnit>::iterator
-         I = Stack.begin(), E = Stack.end(); I != E; ++I) {
-      if (V.visit(*I))
-        return true;
-    }
-    return false;
-  }
 };
 
 class BFS : public WorkList {
@@ -85,14 +74,6 @@ class BFS : public WorkList {
     return U;
   }
 
-  bool visitItemsInWorkList(Visitor &V) override {
-    for (std::deque<WorkListUnit>::iterator
-         I = Queue.begin(), E = Queue.end(); I != E; ++I) {
-      if (V.visit(*I))
-        return true;
-    }
-    return false;
-  }
 };
 
 } // end anonymous namespace
@@ -135,20 +116,6 @@ namespace {
       Queue.pop_front();
       return U;
     }
-    bool visitItemsInWorkList(Visitor &V) override {
-      for (SmallVectorImpl<WorkListUnit>::iterator
-           I = Stack.begin(), E = Stack.end(); I != E; ++I) {
-        if (V.visit(*I))
-          return true;
-      }
-      for (std::deque<WorkListUnit>::iterator
-           I = Queue.begin(), E = Queue.end(); I != E; ++I) {
-        if (V.visit(*I))
-          return true;
-      }
-      return false;
-    }
-
   };
 } // end anonymous namespace
 
@@ -275,7 +242,8 @@ void CoreEngine::dispatchWorkItem(ExplodedNode* Pred, ProgramPoint Loc,
              Loc.getAs<PostInitializer>() ||
              Loc.getAs<PostImplicitCall>() ||
              Loc.getAs<CallExitEnd>() ||
-             Loc.getAs<LoopExit>());
+             Loc.getAs<LoopExit>() ||
+             Loc.getAs<PostAllocatorCall>());
       HandlePostStmt(WU.getBlock(), WU.getIndex(), Pred);
       break;
   }
diff --git a/lib/StaticAnalyzer/Core/ExprEngine.cpp b/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 3be37e7ae301..4a179c5cc46c 100644
--- a/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -63,6 +63,21 @@ typedef std::pair<const CXXBindTemporaryExpr *, const StackFrameContext *>
 REGISTER_TRAIT_WITH_PROGRAMSTATE(InitializedTemporariesSet,
                                  llvm::ImmutableSet<CXXBindTemporaryContext>)
 
+typedef llvm::ImmutableMap<std::pair<const CXXNewExpr *,
+                           const LocationContext *>, SVal>
+    CXXNewAllocatorValuesMap;
+
+// Keeps track of return values of various operator new() calls between
+// evaluation of the inlined operator new(), through the constructor call,
+// to the actual evaluation of the CXXNewExpr.
+// TODO: Refactor the key for this trait into a LocationContext sub-class,
+// which would be put on the stack of location contexts before operator new()
+// is evaluated, and removed from the stack when the whole CXXNewExpr
+// is fully evaluated.
+// Probably do something similar to the previous trait as well.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(CXXNewAllocatorValues,
+                                 CXXNewAllocatorValuesMap)
+
 //===----------------------------------------------------------------------===//
 // Engine construction and deletion.
 //===----------------------------------------------------------------------===//
@@ -308,6 +323,43 @@ ExprEngine::createTemporaryRegionIfNeeded(ProgramStateRef State,
   return State;
 }
 
+ProgramStateRef
+ExprEngine::setCXXNewAllocatorValue(ProgramStateRef State,
+                                    const CXXNewExpr *CNE,
+                                    const LocationContext *CallerLC, SVal V) {
+  assert(!State->get<CXXNewAllocatorValues>(std::make_pair(CNE, CallerLC)) &&
+         "Allocator value already set!");
+  return State->set<CXXNewAllocatorValues>(std::make_pair(CNE, CallerLC), V);
+}
+
+SVal ExprEngine::getCXXNewAllocatorValue(ProgramStateRef State,
+                                         const CXXNewExpr *CNE,
+                                         const LocationContext *CallerLC) {
+  return *State->get<CXXNewAllocatorValues>(std::make_pair(CNE, CallerLC));
+}
+
+ProgramStateRef
+ExprEngine::clearCXXNewAllocatorValue(ProgramStateRef State,
+                                      const CXXNewExpr *CNE,
+                                      const LocationContext *CallerLC) {
+  return State->remove<CXXNewAllocatorValues>(std::make_pair(CNE, CallerLC));
+}
+
+bool ExprEngine::areCXXNewAllocatorValuesClear(ProgramStateRef State,
+                                               const LocationContext *FromLC,
+                                               const LocationContext *ToLC) {
+  const LocationContext *LC = FromLC;
+  do {
+    for (auto I : State->get<CXXNewAllocatorValues>())
+      if (I.first.second == LC)
+        return false;
+
+    LC = LC->getParent();
+    assert(LC && "ToLC must be a parent of FromLC!");
+  } while (LC != ToLC);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Top-level transfer function logic (Dispatcher).
 //===----------------------------------------------------------------------===//
@@ -429,6 +481,13 @@ void ExprEngine::removeDead(ExplodedNode *Pred, ExplodedNodeSet &Out,
   const StackFrameContext *SFC = LC ? LC->getCurrentStackFrame() : nullptr;
   SymbolReaper SymReaper(SFC, ReferenceStmt, SymMgr, getStoreManager());
 
+  for (auto I : CleanedState->get<CXXNewAllocatorValues>()) {
+    if (SymbolRef Sym = I.second.getAsSymbol())
+      SymReaper.markLive(Sym);
+    if (const MemRegion *MR = I.second.getAsRegion())
+      SymReaper.markElementIndicesLive(MR);
+  }
+
   getCheckerManager().runCheckersForLiveSymbols(CleanedState, SymReaper);
 
   // Create a state in which dead bindings are removed from the environment
@@ -1265,8 +1324,16 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
 
     case Stmt::CXXNewExprClass: {
       Bldr.takeNodes(Pred);
+
+      ExplodedNodeSet PreVisit;
+      getCheckerManager().runCheckersForPreStmt(PreVisit, Pred, S, *this);
+
       ExplodedNodeSet PostVisit;
-      VisitCXXNewExpr(cast<CXXNewExpr>(S), Pred, PostVisit);
+      for (ExplodedNodeSet::iterator i = PreVisit.begin(),
+                                     e = PreVisit.end(); i != e ; ++i) {
+        VisitCXXNewExpr(cast<CXXNewExpr>(S), *i, PostVisit);
+      }
+
       getCheckerManager().runCheckersForPostStmt(Dst, PostVisit, S, *this);
       Bldr.addNodes(Dst);
       break;
@@ -2151,9 +2218,17 @@ void ExprEngine::VisitArraySubscriptExpr(const ArraySubscriptExpr *A,
     ProgramStateRef state = Node->getState();
 
     if (IsGLValueLike) {
-      SVal V = state->getLValue(A->getType(),
-          state->getSVal(Idx, LCtx),
-          state->getSVal(Base, LCtx));
+      QualType T = A->getType();
+
+      // One of the forbidden LValue types! We still need to have sensible
+      // symbolic locations to represent this stuff. Note that arithmetic on
+      // void pointers is a GCC extension.
+      if (T->isVoidType())
+        T = getContext().CharTy;
+
+      SVal V = state->getLValue(T,
+                                state->getSVal(Idx, LCtx),
+                                state->getSVal(Base, LCtx));
       Bldr.generateNode(A, Node, state->BindExpr(A, LCtx, V), nullptr,
           ProgramPoint::PostLValueKind);
     } else if (IsVectorType) {
@@ -2863,6 +2938,8 @@ struct DOTGraphTraits<ExplodedNode*> :
           Out << "\\lPostStore\\l";
         else if (Loc.getAs<PostLValue>())
           Out << "\\lPostLValue\\l";
+        else if (Loc.getAs<PostAllocatorCall>())
+          Out << "\\lPostAllocatorCall\\l";
 
         break;
       }
diff --git a/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index dad93111966f..9dc50df93f28 100644
--- a/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -115,14 +115,33 @@ ExprEngine::getRegionForConstructedObject(const CXXConstructExpr *CE,
 
   if (auto Elem = findElementDirectlyInitializedByCurrentConstructor()) {
     if (Optional<CFGStmt> StmtElem = Elem->getAs<CFGStmt>()) {
-      auto *DS = cast<DeclStmt>(StmtElem->getStmt());
-      if (const auto *Var = dyn_cast<VarDecl>(DS->getSingleDecl())) {
-        if (Var->getInit() && Var->getInit()->IgnoreImplicit() == CE) {
-          SVal LValue = State->getLValue(Var, LCtx);
-          QualType Ty = Var->getType();
-          LValue = makeZeroElementRegion(State, LValue, Ty);
-          return LValue.getAsRegion();
+      if (const CXXNewExpr *CNE = dyn_cast<CXXNewExpr>(StmtElem->getStmt())) {
+        if (AMgr.getAnalyzerOptions().mayInlineCXXAllocator()) {
+          // TODO: Detect when the allocator returns a null pointer.
+          // Constructor shall not be called in this case.
+          if (const SubRegion *MR = dyn_cast_or_null<SubRegion>(
+                  getCXXNewAllocatorValue(State, CNE, LCtx).getAsRegion())) {
+            if (CNE->isArray()) {
+              // TODO: This code exists only to trigger the suppression for
+              // array constructors. In fact, we need to call the constructor
+              // for every allocated element, not just the first one!
+              return getStoreManager().GetElementZeroRegion(
+                  MR, CNE->getType()->getPointeeType());
+            }
+            return MR;
+          }
         }
+      } else if (auto *DS = dyn_cast<DeclStmt>(StmtElem->getStmt())) {
+        if (const auto *Var = dyn_cast<VarDecl>(DS->getSingleDecl())) {
+          if (Var->getInit() && Var->getInit()->IgnoreImplicit() == CE) {
+            SVal LValue = State->getLValue(Var, LCtx);
+            QualType Ty = Var->getType();
+            LValue = makeZeroElementRegion(State, LValue, Ty);
+            return LValue.getAsRegion();
+          }
+        }
+      } else {
+        llvm_unreachable("Unexpected directly initialized element!");
       }
     } else if (Optional<CFGInitializer> InitElem = Elem->getAs<CFGInitializer>()) {
       const CXXCtorInitializer *Init = InitElem->getInitializer();
@@ -166,6 +185,9 @@ static bool canHaveDirectConstructor(CFGElement Elem){
     if (isa<DeclStmt>(StmtElem->getStmt())) {
       return true;
     }
+    if (isa<CXXNewExpr>(StmtElem->getStmt())) {
+      return true;
+    }
   }
 
   if (Elem.getKind() == CFGElement::Initializer) {
@@ -455,15 +477,57 @@ void ExprEngine::VisitCXXNewAllocatorCall(const CXXNewExpr *CNE,
   getCheckerManager().runCheckersForPreCall(DstPreCall, Pred,
                                             *Call, *this);
 
-  ExplodedNodeSet DstInvalidated;
-  StmtNodeBuilder Bldr(DstPreCall, DstInvalidated, *currBldrCtx);
-  for (ExplodedNodeSet::iterator I = DstPreCall.begin(), E = DstPreCall.end();
-       I != E; ++I)
-    defaultEvalCall(Bldr, *I, *Call);
-  getCheckerManager().runCheckersForPostCall(Dst, DstInvalidated,
-                                             *Call, *this);
-}
+  ExplodedNodeSet DstPostCall;
+  StmtNodeBuilder CallBldr(DstPreCall, DstPostCall, *currBldrCtx);
+  for (auto I : DstPreCall) {
+    // FIXME: Provide evalCall for checkers?
+    defaultEvalCall(CallBldr, I, *Call);
+  }
+  // If the call is inlined, DstPostCall will be empty and we bail out now.
+
+  // Store return value of operator new() for future use, until the actual
+  // CXXNewExpr gets processed.
+  ExplodedNodeSet DstPostValue;
+  StmtNodeBuilder ValueBldr(DstPostCall, DstPostValue, *currBldrCtx);
+  for (auto I : DstPostCall) {
+    // FIXME: Because CNE serves as the "call site" for the allocator (due to
+    // lack of a better expression in the AST), the conjured return value symbol
+    // is going to be of the same type (C++ object pointer type). Technically
+    // this is not correct because the operator new's prototype always says that
+    // it returns a 'void *'. So we should change the type of the symbol,
+    // and then evaluate the cast over the symbolic pointer from 'void *' to
+    // the object pointer type. But without changing the symbol's type it
+    // is breaking too much to evaluate the no-op symbolic cast over it, so we
+    // skip it for now.
+    ProgramStateRef State = I->getState();
+    SVal RetVal = State->getSVal(CNE, LCtx);
+
+    // If this allocation function is not declared as non-throwing, failures
+    // /must/ be signalled by exceptions, and thus the return value will never
+    // be NULL. -fno-exceptions does not influence this semantics.
+    // FIXME: GCC has a -fcheck-new option, which forces it to consider the case
+    // where new can return NULL. If we end up supporting that option, we can
+    // consider adding a check for it here.
+    // C++11 [basic.stc.dynamic.allocation]p3.
+    if (const FunctionDecl *FD = CNE->getOperatorNew()) {
+      QualType Ty = FD->getType();
+      if (const auto *ProtoType = Ty->getAs<FunctionProtoType>())
+        if (!ProtoType->isNothrow(getContext()))
+          State = State->assume(RetVal.castAs<DefinedOrUnknownSVal>(), true);
+    }
 
+    ValueBldr.generateNode(CNE, I,
+                           setCXXNewAllocatorValue(State, CNE, LCtx, RetVal));
+  }
+
+  ExplodedNodeSet DstPostPostCallCallback;
+  getCheckerManager().runCheckersForPostCall(DstPostPostCallCallback,
+                                             DstPostValue, *Call, *this);
+  for (auto I : DstPostPostCallCallback) {
+    getCheckerManager().runCheckersForNewAllocator(
+        CNE, getCXXNewAllocatorValue(I->getState(), CNE, LCtx), Dst, I, *this);
+  }
+}
 
 void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred,
                                    ExplodedNodeSet &Dst) {
@@ -474,69 +538,74 @@ void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred,
 
   unsigned blockCount = currBldrCtx->blockCount();
   const LocationContext *LCtx = Pred->getLocationContext();
-  DefinedOrUnknownSVal symVal = UnknownVal();
+  SVal symVal = UnknownVal();
   FunctionDecl *FD = CNE->getOperatorNew();
 
-  bool IsStandardGlobalOpNewFunction = false;
-  if (FD && !isa<CXXMethodDecl>(FD) && !FD->isVariadic()) {
-    if (FD->getNumParams() == 2) {
-      QualType T = FD->getParamDecl(1)->getType();
-      if (const IdentifierInfo *II = T.getBaseTypeIdentifier())
-        // NoThrow placement new behaves as a standard new.
-        IsStandardGlobalOpNewFunction = II->getName().equals("nothrow_t");
-    }
-    else
-      // Placement forms are considered non-standard.
-      IsStandardGlobalOpNewFunction = (FD->getNumParams() == 1);
+  bool IsStandardGlobalOpNewFunction =
+      FD->isReplaceableGlobalAllocationFunction();
+
+  ProgramStateRef State = Pred->getState();
+
+  // Retrieve the stored operator new() return value.
+  if (AMgr.getAnalyzerOptions().mayInlineCXXAllocator()) {
+    symVal = getCXXNewAllocatorValue(State, CNE, LCtx);
+    State = clearCXXNewAllocatorValue(State, CNE, LCtx);
   }
 
   // We assume all standard global 'operator new' functions allocate memory in
   // heap. We realize this is an approximation that might not correctly model
   // a custom global allocator.
-  if (IsStandardGlobalOpNewFunction)
-    symVal = svalBuilder.getConjuredHeapSymbolVal(CNE, LCtx, blockCount);
-  else
-    symVal = svalBuilder.conjureSymbolVal(nullptr, CNE, LCtx, CNE->getType(),
-                                          blockCount);
+  if (symVal.isUnknown()) {
+    if (IsStandardGlobalOpNewFunction)
+      symVal = svalBuilder.getConjuredHeapSymbolVal(CNE, LCtx, blockCount);
+    else
+      symVal = svalBuilder.conjureSymbolVal(nullptr, CNE, LCtx, CNE->getType(),
+                                            blockCount);
+  }
 
-  ProgramStateRef State = Pred->getState();
   CallEventManager &CEMgr = getStateManager().getCallEventManager();
   CallEventRef<CXXAllocatorCall> Call =
     CEMgr.getCXXAllocatorCall(CNE, State, LCtx);
 
-  // Invalidate placement args.
-  // FIXME: Once we figure out how we want allocators to work,
-  // we should be using the usual pre-/(default-)eval-/post-call checks here.
-  State = Call->invalidateRegions(blockCount);
-  if (!State)
-    return;
+  if (!AMgr.getAnalyzerOptions().mayInlineCXXAllocator()) {
+    // Invalidate placement args.
+    // FIXME: Once we figure out how we want allocators to work,
+    // we should be using the usual pre-/(default-)eval-/post-call checks here.
+    State = Call->invalidateRegions(blockCount);
+    if (!State)
+      return;
 
-  // If this allocation function is not declared as non-throwing, failures
-  // /must/ be signalled by exceptions, and thus the return value will never be
-  // NULL. -fno-exceptions does not influence this semantics.
-  // FIXME: GCC has a -fcheck-new option, which forces it to consider the case
-  // where new can return NULL. If we end up supporting that option, we can
-  // consider adding a check for it here.
-  // C++11 [basic.stc.dynamic.allocation]p3.
-  if (FD) {
-    QualType Ty = FD->getType();
-    if (const FunctionProtoType *ProtoType = Ty->getAs<FunctionProtoType>())
-      if (!ProtoType->isNothrow(getContext()))
-        State = State->assume(symVal, true);
+    // If this allocation function is not declared as non-throwing, failures
+    // /must/ be signalled by exceptions, and thus the return value will never
+    // be NULL. -fno-exceptions does not influence this semantics.
+    // FIXME: GCC has a -fcheck-new option, which forces it to consider the case
+    // where new can return NULL. If we end up supporting that option, we can
+    // consider adding a check for it here.
+    // C++11 [basic.stc.dynamic.allocation]p3.
+    if (FD) {
+      QualType Ty = FD->getType();
+      if (const auto *ProtoType = Ty->getAs<FunctionProtoType>())
+        if (!ProtoType->isNothrow(getContext()))
+          if (auto dSymVal = symVal.getAs<DefinedOrUnknownSVal>())
+            State = State->assume(*dSymVal, true);
+    }
   }
 
   StmtNodeBuilder Bldr(Pred, Dst, *currBldrCtx);
 
+  SVal Result = symVal;
+
   if (CNE->isArray()) {
     // FIXME: allocating an array requires simulating the constructors.
     // For now, just return a symbolicated region.
-    const SubRegion *NewReg =
-        symVal.castAs<loc::MemRegionVal>().getRegionAs<SubRegion>();
-    QualType ObjTy = CNE->getType()->getAs<PointerType>()->getPointeeType();
-    const ElementRegion *EleReg =
-      getStoreManager().GetElementZeroRegion(NewReg, ObjTy);
-    State = State->BindExpr(CNE, Pred->getLocationContext(),
-                            loc::MemRegionVal(EleReg));
+    if (const SubRegion *NewReg =
+            dyn_cast_or_null<SubRegion>(symVal.getAsRegion())) {
+      QualType ObjTy = CNE->getType()->getAs<PointerType>()->getPointeeType();
+      const ElementRegion *EleReg =
+          getStoreManager().GetElementZeroRegion(NewReg, ObjTy);
+      Result = loc::MemRegionVal(EleReg);
+    }
+    State = State->BindExpr(CNE, Pred->getLocationContext(), Result);
     Bldr.generateNode(CNE, Pred, State);
     return;
   }
@@ -545,7 +614,6 @@ void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred,
   // CXXNewExpr, we need to make sure that the constructed object is not
   // immediately invalidated here. (The placement call should happen before
   // the constructor call anyway.)
-  SVal Result = symVal;
   if (FD && FD->isReservedGlobalPlacementOperator()) {
     // Non-array placement new should always return the placement location.
     SVal PlacementLoc = State->getSVal(CNE->getPlacementArg(0), LCtx);
diff --git a/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp b/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
index caf86b26b66d..3c6949786ec4 100644
--- a/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
@@ -276,6 +276,21 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
 
       state = state->BindExpr(CCE, callerCtx, ThisV);
     }
+
+    if (const auto *CNE = dyn_cast<CXXNewExpr>(CE)) {
+      // We are currently evaluating a CXXNewAllocator CFGElement. It takes a
+      // while to reach the actual CXXNewExpr element from here, so keep the
+      // region for later use.
+      // Additionally cast the return value of the inlined operator new
+      // (which is of type 'void *') to the correct object type.
+      SVal AllocV = state->getSVal(CNE, callerCtx);
+      AllocV = svalBuilder.evalCast(
+          AllocV, CNE->getType(),
+          getContext().getPointerType(getContext().VoidTy));
+
+      state =
+          setCXXNewAllocatorValue(state, CNE, calleeCtx->getParent(), AllocV);
+    }
   }
 
   // Step 3: BindedRetNode -> CleanedNodes
@@ -315,6 +330,10 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
     CallExitEnd Loc(calleeCtx, callerCtx);
     bool isNew;
     ProgramStateRef CEEState = (*I == CEBNode) ? state : (*I)->getState();
+
+    // See if we have any stale C++ allocator values.
+    assert(areCXXNewAllocatorValuesClear(CEEState, calleeCtx, callerCtx));
+
     ExplodedNode *CEENode = G.getNode(Loc, CEEState, false, &isNew);
     CEENode->addPredecessor(*I, G);
     if (!isNew)
@@ -331,16 +350,31 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
     CallEventRef<> UpdatedCall = Call.cloneWithState(CEEState);
 
     ExplodedNodeSet DstPostCall;
-    getCheckerManager().runCheckersForPostCall(DstPostCall, CEENode,
-                                               *UpdatedCall, *this,
-                                               /*WasInlined=*/true);
-
+    if (const CXXNewExpr *CNE = dyn_cast_or_null<CXXNewExpr>(CE)) {
+      ExplodedNodeSet DstPostPostCallCallback;
+      getCheckerManager().runCheckersForPostCall(DstPostPostCallCallback,
+                                                 CEENode, *UpdatedCall, *this,
+                                                 /*WasInlined=*/true);
+      for (auto I : DstPostPostCallCallback) {
+        getCheckerManager().runCheckersForNewAllocator(
+            CNE, getCXXNewAllocatorValue(I->getState(), CNE,
+                                         calleeCtx->getParent()),
+            DstPostCall, I, *this,
+            /*WasInlined=*/true);
+      }
+    } else {
+      getCheckerManager().runCheckersForPostCall(DstPostCall, CEENode,
+                                                 *UpdatedCall, *this,
+                                                 /*WasInlined=*/true);
+    }
     ExplodedNodeSet Dst;
     if (const ObjCMethodCall *Msg = dyn_cast<ObjCMethodCall>(Call)) {
       getCheckerManager().runCheckersForPostObjCMessage(Dst, DstPostCall, *Msg,
                                                         *this,
                                                         /*WasInlined=*/true);
-    } else if (CE) {
+    } else if (CE &&
+               !(isa<CXXNewExpr>(CE) && // Called when visiting CXXNewExpr.
+                 AMgr.getAnalyzerOptions().mayInlineCXXAllocator())) {
       getCheckerManager().runCheckersForPostStmt(Dst, DstPostCall, CE,
                                                  *this, /*WasInlined=*/true);
     } else {
@@ -554,7 +588,19 @@ ProgramStateRef ExprEngine::bindReturnValue(const CallEvent &Call,
   QualType ResultTy = Call.getResultType();
   SValBuilder &SVB = getSValBuilder();
   unsigned Count = currBldrCtx->blockCount();
-  SVal R = SVB.conjureSymbolVal(nullptr, E, LCtx, ResultTy, Count);
+
+  // See if we need to conjure a heap pointer instead of
+  // a regular unknown pointer.
+  bool IsHeapPointer = false;
+  if (const auto *CNE = dyn_cast<CXXNewExpr>(E))
+    if (CNE->getOperatorNew()->isReplaceableGlobalAllocationFunction()) {
+      // FIXME: Delegate this to evalCall in MallocChecker?
+      IsHeapPointer = true;
+    }
+
+  SVal R = IsHeapPointer
+               ? SVB.getConjuredHeapSymbolVal(E, LCtx, Count)
+               : SVB.conjureSymbolVal(nullptr, E, LCtx, ResultTy, Count);
   return State->BindExpr(E, LCtx, R);
 }
 
@@ -596,21 +642,34 @@ static CallInlinePolicy mayInlineCallKind(const CallEvent &Call,
 
     const CXXConstructorCall &Ctor = cast<CXXConstructorCall>(Call);
 
+    const CXXConstructExpr *CtorExpr = Ctor.getOriginExpr();
+
+    // FIXME: ParentMap is slow and ugly. The callee should provide the
+    // necessary context. Ideally as part of the call event, or maybe as part of
+    // location context.
+    const Stmt *ParentExpr = CurLC->getParentMap().getParent(CtorExpr);
+
+    if (ParentExpr && isa<CXXNewExpr>(ParentExpr) &&
+        !Opts.mayInlineCXXAllocator())
+      return CIP_DisallowedOnce;
+
     // FIXME: We don't handle constructors or destructors for arrays properly.
     // Even once we do, we still need to be careful about implicitly-generated
     // initializers for array fields in default move/copy constructors.
+    // We still allow construction into ElementRegion targets when they don't
+    // represent array elements.
     const MemRegion *Target = Ctor.getCXXThisVal().getAsRegion();
-    if (Target && isa<ElementRegion>(Target))
-      return CIP_DisallowedOnce;
-
-    // FIXME: This is a hack. We don't use the correct region for a new
-    // expression, so if we inline the constructor its result will just be
-    // thrown away. This short-term hack is tracked in <rdar://problem/12180598>
-    // and the longer-term possible fix is discussed in PR12014.
-    const CXXConstructExpr *CtorExpr = Ctor.getOriginExpr();
-    if (const Stmt *Parent = CurLC->getParentMap().getParent(CtorExpr))
-      if (isa<CXXNewExpr>(Parent))
-        return CIP_DisallowedOnce;
+    if (Target && isa<ElementRegion>(Target)) {
+      if (ParentExpr)
+        if (const CXXNewExpr *NewExpr = dyn_cast<CXXNewExpr>(ParentExpr))
+          if (NewExpr->isArray())
+            return CIP_DisallowedOnce;
+
+      if (const TypedValueRegion *TR = dyn_cast<TypedValueRegion>(
+              cast<SubRegion>(Target)->getSuperRegion()))
+        if (TR->getValueType()->isArrayType())
+          return CIP_DisallowedOnce;
+    }
 
     // Inlining constructors requires including initializers in the CFG.
     const AnalysisDeclContext *ADC = CallerSFC->getAnalysisDeclContext();
@@ -629,7 +688,7 @@ static CallInlinePolicy mayInlineCallKind(const CallEvent &Call,
     // FIXME: This is a hack. We don't handle temporary destructors
     // right now, so we shouldn't inline their constructors.
     if (CtorExpr->getConstructionKind() == CXXConstructExpr::CK_Complete)
-      if (!Target || !isa<DeclRegion>(Target))
+      if (!Target || isa<CXXTempObjectRegion>(Target))
         return CIP_DisallowedOnce;
 
     break;
diff --git a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
index ebf1487d4bfc..c6e397fa6aac 100644
--- a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
+++ b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
@@ -28,6 +28,8 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <set>
 #include <sstream>
 
 using namespace clang;
@@ -94,6 +96,11 @@ class HTMLDiagnostics : public PathDiagnosticConsumer {
 
   /// \return Javascript for navigating the HTML report using j/k keys.
   std::string generateKeyboardNavigationJavascript();
+
+private:
+
+  /// \return Javascript for displaying shortcuts help;
+  std::string showHelpJavascript();
 };
 
 } // end anonymous namespace
@@ -323,6 +330,115 @@ std::string HTMLDiagnostics::GenerateHTML(const PathDiagnostic& D, Rewriter &R,
   return os.str();
 }
 
+/// Write executed lines from \p D in JSON format into \p os.
+static void serializeExecutedLines(
+    const PathDiagnostic &D,
+    const PathPieces &path,
+    llvm::raw_string_ostream &os) {
+
+  // Copy executed lines from path diagnostics.
+  std::map<unsigned, std::set<unsigned>> ExecutedLines;
+  for (auto I = D.executedLines_begin(),
+            E = D.executedLines_end(); I != E; ++I) {
+    std::set<unsigned> &LinesInFile = ExecutedLines[I->first];
+    for (unsigned LineNo : I->second) {
+      LinesInFile.insert(LineNo);
+    }
+  }
+
+  // We need to include all lines for which any kind of diagnostics appears.
+  for (const auto &P : path) {
+    FullSourceLoc Loc = P->getLocation().asLocation().getExpansionLoc();
+    FileID FID = Loc.getFileID();
+    unsigned LineNo = Loc.getLineNumber();
+    ExecutedLines[FID.getHashValue()].insert(LineNo);
+  }
+
+  os << "var relevant_lines = {";
+  for (auto I = ExecutedLines.begin(),
+            E = ExecutedLines.end(); I != E; ++I) {
+    if (I != ExecutedLines.begin())
+      os << ", ";
+
+    os << "\"" << I->first << "\": {";
+    for (unsigned LineNo : I->second) {
+      if (LineNo != *(I->second.begin()))
+        os << ", ";
+
+      os << "\"" << LineNo << "\": 1";
+    }
+    os << "}";
+  }
+
+  os << "};";
+}
+
+/// \return JavaScript for an option to only show relevant lines.
+static std::string showRelevantLinesJavascript(
+      const PathDiagnostic &D, const PathPieces &path) {
+  std::string s;
+  llvm::raw_string_ostream os(s);
+  os << "<script type='text/javascript'>\n";
+  serializeExecutedLines(D, path, os);
+  os << R"<<<(
+
+var filterCounterexample = function (hide) {
+  var tables = document.getElementsByClassName("code");
+  for (var t=0; t<tables.length; t++) {
+    var table = tables[t];
+    var file_id = table.getAttribute("data-fileid");
+    var lines_in_fid = relevant_lines[file_id];
+    if (!lines_in_fid) {
+      lines_in_fid = {};
+    }
+    var lines = table.getElementsByClassName("codeline");
+    for (var i=0; i<lines.length; i++) {
+        var el = lines[i];
+        var lineNo = el.getAttribute("data-linenumber");
+        if (!lines_in_fid[lineNo]) {
+          if (hide) {
+            el.setAttribute("hidden", "");
+          } else {
+            el.removeAttribute("hidden");
+          }
+        }
+    }
+  }
+}
+
+window.addEventListener("keydown", function (event) {
+  if (event.defaultPrevented) {
+    return;
+  }
+  if (event.key == "S") {
+    var checked = document.getElementsByName("showCounterexample")[0].checked;
+    filterCounterexample(!checked);
+    document.getElementsByName("showCounterexample")[0].checked = !checked;
+  } else {
+    return;
+  }
+  event.preventDefault();
+}, true);
+
+document.addEventListener("DOMContentLoaded", function() {
+    document.querySelector('input[name="showCounterexample"]').onchange=
+        function (event) {
+      filterCounterexample(this.checked);
+    };
+});
+</script>
+
+<form>
+    <input type="checkbox" name="showCounterexample" id="showCounterexample" />
+    <label for="showCounterexample">
+       Show only relevant lines
+    </label>
+</form>
+)<<<";
+
+  return os.str();
+}
+
 void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
     const SourceManager& SMgr, const PathPieces& path, FileID FID,
     const FileEntry *Entry, const char *declName) {
@@ -340,9 +456,15 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
   int LineNumber = path.back()->getLocation().asLocation().getExpansionLineNumber();
   int ColumnNumber = path.back()->getLocation().asLocation().getExpansionColumnNumber();
 
+  R.InsertTextBefore(SMgr.getLocForStartOfFile(FID), showHelpJavascript());
+
   R.InsertTextBefore(SMgr.getLocForStartOfFile(FID),
                      generateKeyboardNavigationJavascript());
 
+  // Checkbox and javascript for filtering the output to the counterexample.
+  R.InsertTextBefore(SMgr.getLocForStartOfFile(FID),
+                     showRelevantLinesJavascript(D, path));
+
   // Add the name of the file as an <h1> tag.
   {
     std::string s;
@@ -388,11 +510,24 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
 </table>
 <!-- REPORTSUMMARYEXTRA -->
 <h3>Annotated Source Code</h3>
-<p><span class='macro'>[?]
-  <span class='expansion'>Use j/k keys for keyboard navigation</span>
-</span></p>
+<p>Press <a href="#" onclick="toggleHelp(); return false;">'?'</a>
+   to see keyboard shortcuts</p>
+<input type="checkbox" class="spoilerhider" id="showinvocation" />
+<label for="showinvocation" >Show analyzer invocation</label>
+<div class="spoiler">clang -cc1 )<<<";
+    os << html::EscapeText(AnalyzerOpts.FullCompilerInvocation);
+    os << R"<<<(
+</div>
+<div id='tooltiphint' hidden="true">
+  <p>Keyboard shortcuts: </p>
+  <ul>
+    <li>Use 'j/k' keys for keyboard navigation</li>
+    <li>Use 'Shift+S' to show/hide relevant lines</li>
+    <li>Use '?' to toggle this window</li>
+  </ul>
+  <a href="#" onclick="toggleHelp(); return false;">Close</a>
+</div>
 )<<<";
-
     R.InsertTextBefore(SMgr.getLocForStartOfFile(FID), os.str());
   }
 
@@ -450,6 +585,37 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
   html::AddHeaderFooterInternalBuiltinCSS(R, FID, Entry->getName());
 }
 
+std::string HTMLDiagnostics::showHelpJavascript() {
+  return R"<<<(
+<script type='text/javascript'>
+
+var toggleHelp = function() {
+    var hint = document.querySelector("#tooltiphint");
+    var attributeName = "hidden";
+    if (hint.hasAttribute(attributeName)) {
+      hint.removeAttribute(attributeName);
+    } else {
+      hint.setAttribute("hidden", "true");
+    }
+};
+window.addEventListener("keydown", function (event) {
+  if (event.defaultPrevented) {
+    return;
+  }
+  if (event.key == "?") {
+    toggleHelp();
+  } else {
+    return;
+  }
+  event.preventDefault();
+});
+</script>
+)<<<";
+}
+
+
+
+
 void HTMLDiagnostics::RewriteFile(Rewriter &R, const SourceManager& SMgr,
     const PathPieces& path, FileID FID) {
   // Process the path.
@@ -862,7 +1028,7 @@ window.addEventListener("keydown", function (event) {
     navigateTo(/*up=*/true);
   } else {
     return;
-  } 
+  }
   event.preventDefault();
 }, true);
 </script>
diff --git a/lib/StaticAnalyzer/Core/MemRegion.cpp b/lib/StaticAnalyzer/Core/MemRegion.cpp
index cb8ba6de3626..aa54544fa7dd 100644
--- a/lib/StaticAnalyzer/Core/MemRegion.cpp
+++ b/lib/StaticAnalyzer/Core/MemRegion.cpp
@@ -103,15 +103,15 @@ MemRegionManager::~MemRegionManager() {
 //===----------------------------------------------------------------------===//
 
 bool SubRegion::isSubRegionOf(const MemRegion* R) const {
-  const MemRegion* r = getSuperRegion();
-  while (r != nullptr) {
+  const MemRegion* r = this;
+  do {
     if (r == R)
       return true;
     if (const SubRegion* sr = dyn_cast<SubRegion>(r))
       r = sr->getSuperRegion();
     else
       break;
-  }
+  } while (r != nullptr);
   return false;
 }
 
diff --git a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
index 669748c0127a..2b041241623e 100644
--- a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
+++ b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
@@ -98,20 +98,18 @@ void PathPieces::flattenTo(PathPieces &Primary, PathPieces &Current,
 
 PathDiagnostic::~PathDiagnostic() {}
 
-PathDiagnostic::PathDiagnostic(StringRef CheckName, const Decl *declWithIssue,
-                               StringRef bugtype, StringRef verboseDesc,
-                               StringRef shortDesc, StringRef category,
-                               PathDiagnosticLocation LocationToUnique,
-                               const Decl *DeclToUnique)
-  : CheckName(CheckName),
-    DeclWithIssue(declWithIssue),
-    BugType(StripTrailingDots(bugtype)),
-    VerboseDesc(StripTrailingDots(verboseDesc)),
-    ShortDesc(StripTrailingDots(shortDesc)),
-    Category(StripTrailingDots(category)),
-    UniqueingLoc(LocationToUnique),
-    UniqueingDecl(DeclToUnique),
-    path(pathImpl) {}
+PathDiagnostic::PathDiagnostic(
+    StringRef CheckName, const Decl *declWithIssue, StringRef bugtype,
+    StringRef verboseDesc, StringRef shortDesc, StringRef category,
+    PathDiagnosticLocation LocationToUnique, const Decl *DeclToUnique,
+    std::unique_ptr<FilesToLineNumsMap> ExecutedLines)
+    : CheckName(CheckName), DeclWithIssue(declWithIssue),
+      BugType(StripTrailingDots(bugtype)),
+      VerboseDesc(StripTrailingDots(verboseDesc)),
+      ShortDesc(StripTrailingDots(shortDesc)),
+      Category(StripTrailingDots(category)), UniqueingLoc(LocationToUnique),
+      UniqueingDecl(DeclToUnique), ExecutedLines(std::move(ExecutedLines)),
+      path(pathImpl) {}
 
 static PathDiagnosticCallPiece *
 getFirstStackedCallToHeaderFile(PathDiagnosticCallPiece *CP,
@@ -576,8 +574,11 @@ getLocationForCaller(const StackFrameContext *SFC,
       return PathDiagnosticLocation::createEnd(CallerBody, SM, CallerCtx);
     return PathDiagnosticLocation::create(CallerInfo->getDecl(), SM);
   }
+  case CFGElement::NewAllocator: {
+    const CFGNewAllocator &Alloc = Source.castAs<CFGNewAllocator>();
+    return PathDiagnosticLocation(Alloc.getAllocatorExpr(), SM, CallerCtx);
+  }
   case CFGElement::TemporaryDtor:
-  case CFGElement::NewAllocator:
     llvm_unreachable("not yet implemented!");
   case CFGElement::LifetimeEnds:
   case CFGElement::LoopExit:
@@ -1184,6 +1185,9 @@ void PathDiagnostic::FullProfile(llvm::FoldingSetNodeID &ID) const {
 StackHintGenerator::~StackHintGenerator() {}
 
 std::string StackHintGeneratorForSymbol::getMessage(const ExplodedNode *N){
+  if (!N)
+    return getMessageForSymbolNotFound();
+
   ProgramPoint P = N->getLocation();
   CallExitEnd CExit = P.castAs<CallExitEnd>();
 
@@ -1193,16 +1197,11 @@ std::string StackHintGeneratorForSymbol::getMessage(const ExplodedNode *N){
   if (!CE)
     return "";
 
-  if (!N)
-    return getMessageForSymbolNotFound();
-
   // Check if one of the parameters are set to the interesting symbol.
-  ProgramStateRef State = N->getState();
-  const LocationContext *LCtx = N->getLocationContext();
   unsigned ArgIndex = 0;
   for (CallExpr::const_arg_iterator I = CE->arg_begin(),
                                     E = CE->arg_end(); I != E; ++I, ++ArgIndex){
-    SVal SV = State->getSVal(*I, LCtx);
+    SVal SV = N->getSVal(*I);
 
     // Check if the variable corresponding to the symbol is passed by value.
     SymbolRef AS = SV.getAsLocSymbol();
@@ -1212,7 +1211,10 @@ std::string StackHintGeneratorForSymbol::getMessage(const ExplodedNode *N){
 
     // Check if the parameter is a pointer to the symbol.
     if (Optional<loc::MemRegionVal> Reg = SV.getAs<loc::MemRegionVal>()) {
-      SVal PSV = State->getSVal(Reg->getRegion());
+      // Do not attempt to dereference void*.
+      if ((*I)->getType()->isVoidPointerType())
+        continue;
+      SVal PSV = N->getState()->getSVal(Reg->getRegion());
       SymbolRef AS = PSV.getAsLocSymbol();
       if (AS == Sym) {
         return getMessageForArg(*I, ArgIndex);
@@ -1221,7 +1223,7 @@ std::string StackHintGeneratorForSymbol::getMessage(const ExplodedNode *N){
   }
 
   // Check if we are returning the interesting symbol.
-  SVal SV = State->getSVal(CE, LCtx);
+  SVal SV = N->getSVal(CE);
   SymbolRef RetSym = SV.getAsLocSymbol();
   if (RetSym == Sym) {
     return getMessageForReturn(CE);
diff --git a/lib/StaticAnalyzer/Core/ProgramState.cpp b/lib/StaticAnalyzer/Core/ProgramState.cpp
index 5b6b7339697f..34d4970df2d4 100644
--- a/lib/StaticAnalyzer/Core/ProgramState.cpp
+++ b/lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -354,6 +354,17 @@ ProgramStateRef ProgramState::assumeInBound(DefinedOrUnknownSVal Idx,
   return CM.assume(this, inBound.castAs<DefinedSVal>(), Assumption);
 }
 
+ConditionTruthVal ProgramState::isNonNull(SVal V) const {
+  ConditionTruthVal IsNull = isNull(V);
+  if (IsNull.isUnderconstrained())
+    return IsNull;
+  return ConditionTruthVal(!IsNull.getValue());
+}
+
+ConditionTruthVal ProgramState::areEqual(SVal Lhs, SVal Rhs) const {
+  return stateMgr->getSValBuilder().areEqual(this, Lhs, Rhs);
+}
+
 ConditionTruthVal ProgramState::isNull(SVal V) const {
   if (V.isZeroConstant())
     return true;
@@ -781,8 +792,7 @@ bool ProgramState::isTainted(SymbolRef Sym, TaintTagType Kind) const {
           // complete. For example, this would not currently identify
           // overlapping fields in a union as tainted. To identify this we can
           // check for overlapping/nested byte offsets.
-          if (Kind == I.second &&
-              (R == I.first || R->isSubRegionOf(I.first)))
+          if (Kind == I.second && R->isSubRegionOf(I.first))
             return true;
         }
       }
diff --git a/lib/StaticAnalyzer/Core/RegionStore.cpp b/lib/StaticAnalyzer/Core/RegionStore.cpp
index e2e69bb28ec2..604adde9a7f7 100644
--- a/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -871,7 +871,7 @@ collectSubRegionBindings(SmallVectorImpl<BindingPair> &Bindings,
 
     } else if (NextKey.hasSymbolicOffset()) {
       const MemRegion *Base = NextKey.getConcreteOffsetRegion();
-      if (Top->isSubRegionOf(Base)) {
+      if (Top->isSubRegionOf(Base) && Top != Base) {
         // Case 3: The next key is symbolic and we just changed something within
         // its concrete region. We don't know if the binding is still valid, so
         // we'll be conservative and include it.
@@ -881,7 +881,7 @@ collectSubRegionBindings(SmallVectorImpl<BindingPair> &Bindings,
       } else if (const SubRegion *BaseSR = dyn_cast<SubRegion>(Base)) {
         // Case 4: The next key is symbolic, but we changed a known
         // super-region. In this case the binding is certainly included.
-        if (Top == Base || BaseSR->isSubRegionOf(Top))
+        if (BaseSR->isSubRegionOf(Top))
           if (isCompatibleWithFields(NextKey, FieldsInSymbolicSubregions))
             Bindings.push_back(*I);
       }
diff --git a/lib/StaticAnalyzer/Core/SValBuilder.cpp b/lib/StaticAnalyzer/Core/SValBuilder.cpp
index 04452e3e7cc2..f49995d50ccd 100644
--- a/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -413,10 +413,19 @@ SVal SValBuilder::evalBinOp(ProgramStateRef state, BinaryOperator::Opcode op,
                      type);
 }
 
+ConditionTruthVal SValBuilder::areEqual(ProgramStateRef state, SVal lhs,
+                                        SVal rhs) {
+  return state->isNonNull(evalEQ(state, lhs, rhs));
+}
+
+SVal SValBuilder::evalEQ(ProgramStateRef state, SVal lhs, SVal rhs) {
+  return evalBinOp(state, BO_EQ, lhs, rhs, getConditionType());
+}
+
 DefinedOrUnknownSVal SValBuilder::evalEQ(ProgramStateRef state,
                                          DefinedOrUnknownSVal lhs,
                                          DefinedOrUnknownSVal rhs) {
-  return evalBinOp(state, BO_EQ, lhs, rhs, getConditionType())
+  return evalEQ(state, static_cast<SVal>(lhs), static_cast<SVal>(rhs))
       .castAs<DefinedOrUnknownSVal>();
 }
 
diff --git a/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
index 94d29d5a6ba3..0ec4a81af3e6 100644
--- a/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
+++ b/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
@@ -988,6 +988,12 @@ SVal SimpleSValBuilder::evalBinOpLN(ProgramStateRef state,
         elementType = resultTy->getPointeeType();
     }
 
+    // Represent arithmetic on void pointers as arithmetic on char pointers.
+    // It is fine when a TypedValueRegion of char value type represents
+    // a void pointer. Note that arithmetic on void pointers is a GCC extension.
+    if (elementType->isVoidType())
+      elementType = getContext().CharTy;
+
     if (Optional<NonLoc> indexV = index.getAs<NonLoc>()) {
       return loc::MemRegionVal(MemMgr.getElementRegion(elementType, *indexV,
                                                        superR, getContext()));
diff --git a/lib/Tooling/AllTUsExecution.cpp b/lib/Tooling/AllTUsExecution.cpp
new file mode 100644
index 000000000000..0c0854d0ba85
--- /dev/null
+++ b/lib/Tooling/AllTUsExecution.cpp
@@ -0,0 +1,160 @@
+//===- lib/Tooling/AllTUsExecution.cpp - Execute actions on all TUs. ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/AllTUsExecution.h"
+#include "clang/Tooling/ToolExecutorPluginRegistry.h"
+#include "llvm/Support/ThreadPool.h"
+
+namespace clang {
+namespace tooling {
+
+const char *AllTUsToolExecutor::ExecutorName = "AllTUsToolExecutor";
+
+namespace {
+llvm::Error make_string_error(const llvm::Twine &Message) {
+  return llvm::make_error<llvm::StringError>(Message,
+                                             llvm::inconvertibleErrorCode());
+}
+
+ArgumentsAdjuster getDefaultArgumentsAdjusters() {
+  return combineAdjusters(
+      getClangStripOutputAdjuster(),
+      combineAdjusters(getClangSyntaxOnlyAdjuster(),
+                       getClangStripDependencyFileAdjuster()));
+}
+
+class ThreadSafeToolResults : public ToolResults {
+public:
+  void addResult(StringRef Key, StringRef Value) override {
+    std::unique_lock<std::mutex> LockGuard(Mutex);
+    Results.addResult(Key, Value);
+  }
+
+  std::vector<std::pair<std::string, std::string>> AllKVResults() override {
+    return Results.AllKVResults();
+  }
+
+  void forEachResult(llvm::function_ref<void(StringRef Key, StringRef Value)>
+                         Callback) override {
+    Results.forEachResult(Callback);
+  }
+
+private:
+  InMemoryToolResults Results;
+  std::mutex Mutex;
+};
+
+} // namespace
+
+AllTUsToolExecutor::AllTUsToolExecutor(
+    const CompilationDatabase &Compilations, unsigned ThreadCount,
+    std::shared_ptr<PCHContainerOperations> PCHContainerOps)
+    : Compilations(Compilations), Results(new ThreadSafeToolResults),
+      Context(Results.get()), ThreadCount(ThreadCount) {}
+
+AllTUsToolExecutor::AllTUsToolExecutor(
+    CommonOptionsParser Options, unsigned ThreadCount,
+    std::shared_ptr<PCHContainerOperations> PCHContainerOps)
+    : OptionsParser(std::move(Options)),
+      Compilations(OptionsParser->getCompilations()),
+      Results(new ThreadSafeToolResults), Context(Results.get()),
+      ThreadCount(ThreadCount) {}
+
+llvm::Error AllTUsToolExecutor::execute(
+    llvm::ArrayRef<
+        std::pair<std::unique_ptr<FrontendActionFactory>, ArgumentsAdjuster>>
+        Actions) {
+  if (Actions.empty())
+    return make_string_error("No action to execute.");
+
+  if (Actions.size() != 1)
+    return make_string_error(
+        "Only support executing exactly 1 action at this point.");
+
+  std::string ErrorMsg;
+  std::mutex TUMutex;
+  auto AppendError = [&](llvm::Twine Err) {
+    std::unique_lock<std::mutex> LockGuard(TUMutex);
+    ErrorMsg += Err.str();
+  };
+
+  auto Log = [&](llvm::Twine Msg) {
+    std::unique_lock<std::mutex> LockGuard(TUMutex);
+    llvm::errs() << Msg.str() << "\n";
+  };
+
+  auto Files = Compilations.getAllFiles();
+  // Add a counter to track the progress.
+  const std::string TotalNumStr = std::to_string(Files.size());
+  unsigned Counter = 0;
+  auto Count = [&]() {
+    std::unique_lock<std::mutex> LockGuard(TUMutex);
+    return ++Counter;
+  };
+
+  auto &Action = Actions.front();
+
+  {
+    llvm::ThreadPool Pool(ThreadCount == 0 ? llvm::hardware_concurrency()
+                                           : ThreadCount);
+
+    for (std::string File : Files) {
+      Pool.async(
+          [&](std::string Path) {
+            Log("[" + std::to_string(Count()) + "/" + TotalNumStr +
+                "] Processing file " + Path);
+            ClangTool Tool(Compilations, {Path});
+            Tool.appendArgumentsAdjuster(Action.second);
+            Tool.appendArgumentsAdjuster(getDefaultArgumentsAdjusters());
+            for (const auto &FileAndContent : OverlayFiles)
+              Tool.mapVirtualFile(FileAndContent.first(),
+                                  FileAndContent.second);
+            if (Tool.run(Action.first.get()))
+              AppendError(llvm::Twine("Failed to run action on ") + Path +
+                          "\n");
+          },
+          File);
+    }
+  }
+
+  if (!ErrorMsg.empty())
+    return make_string_error(ErrorMsg);
+
+  return llvm::Error::success();
+}
+
+static llvm::cl::opt<unsigned> ExecutorConcurrency(
+    "execute-concurrency",
+    llvm::cl::desc("The number of threads used to process all files in "
+                   "parallel. Set to 0 for hardware concurrency."),
+    llvm::cl::init(0));
+
+class AllTUsToolExecutorPlugin : public ToolExecutorPlugin {
+public:
+  llvm::Expected<std::unique_ptr<ToolExecutor>>
+  create(CommonOptionsParser &OptionsParser) override {
+    if (OptionsParser.getSourcePathList().empty())
+      return make_string_error(
+          "[AllTUsToolExecutorPlugin] Please provide a directory/file path in "
+          "the compilation database.");
+    return llvm::make_unique<AllTUsToolExecutor>(std::move(OptionsParser),
+                                                 ExecutorConcurrency);
+  }
+};
+
+static ToolExecutorPluginRegistry::Add<AllTUsToolExecutorPlugin>
+    X("all-TUs", "Runs FrontendActions on all TUs in the compilation database. "
+                 "Tool results are stored in memory.");
+
+// This anchor is used to force the linker to link in the generated object file
+// and thus register the plugin.
+volatile int AllTUsToolExecutorAnchorSource = 0;
+
+} // end namespace tooling
+} // end namespace clang
diff --git a/lib/Tooling/CMakeLists.txt b/lib/Tooling/CMakeLists.txt
index ee681bbb45ae..1999430aa3df 100644
--- a/lib/Tooling/CMakeLists.txt
+++ b/lib/Tooling/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(Refactoring)
 add_subdirectory(ASTDiff)
 
 add_clang_library(clangTooling
+  AllTUsExecution.cpp
   ArgumentsAdjusters.cpp
   CommonOptionsParser.cpp
   CompilationDatabase.cpp
diff --git a/lib/Tooling/Execution.cpp b/lib/Tooling/Execution.cpp
index 498d683f8924..ff68f85adb24 100644
--- a/lib/Tooling/Execution.cpp
+++ b/lib/Tooling/Execution.cpp
@@ -96,10 +96,13 @@ createExecutorFromCommandLineArgs(int &argc, const char **argv,
 }
 
 // This anchor is used to force the linker to link in the generated object file
-// and thus register the StandaloneToolExecutorPlugin.
+// and thus register the StandaloneToolExecutorPlugin etc.
 extern volatile int StandaloneToolExecutorAnchorSource;
+extern volatile int AllTUsToolExecutorAnchorSource;
 static int LLVM_ATTRIBUTE_UNUSED StandaloneToolExecutorAnchorDest =
     StandaloneToolExecutorAnchorSource;
+static int LLVM_ATTRIBUTE_UNUSED AllTUsToolExecutorAnchorDest =
+    AllTUsToolExecutorAnchorSource;
 
 } // end namespace tooling
 } // end namespace clang
diff --git a/lib/Tooling/Tooling.cpp b/lib/Tooling/Tooling.cpp
index 4fbfa4f00473..b6185086176a 100644
--- a/lib/Tooling/Tooling.cpp
+++ b/lib/Tooling/Tooling.cpp
@@ -328,10 +328,11 @@ bool FrontendActionFactory::runInvocation(
 
 ClangTool::ClangTool(const CompilationDatabase &Compilations,
                      ArrayRef<std::string> SourcePaths,
-                     std::shared_ptr<PCHContainerOperations> PCHContainerOps)
+                     std::shared_ptr<PCHContainerOperations> PCHContainerOps,
+                     IntrusiveRefCntPtr<vfs::FileSystem> BaseFS)
     : Compilations(Compilations), SourcePaths(SourcePaths),
       PCHContainerOps(std::move(PCHContainerOps)),
-      OverlayFileSystem(new vfs::OverlayFileSystem(vfs::getRealFileSystem())),
+      OverlayFileSystem(new vfs::OverlayFileSystem(BaseFS)),
       InMemoryFileSystem(new vfs::InMemoryFileSystem),
       Files(new FileManager(FileSystemOptions(), OverlayFileSystem)),
       DiagConsumer(nullptr) {
diff --git a/test/ARCMT/releases-driver.m b/test/ARCMT/releases-driver.m
index 7b1d2fb8e5b2..3dd546fd6cd5 100644
--- a/test/ARCMT/releases-driver.m
+++ b/test/ARCMT/releases-driver.m
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fblocks -fsyntax-only -fobjc-arc -x objective-c %s.result
-// RUN: cp %s %t
+// RUN: cat %s > %t
 // RUN: %clang_cc1 -arcmt-modify -triple x86_64-apple-macosx10.6 -x objective-c %t
 // RUN: diff %t %s.result
 // RUN: rm %t
diff --git a/test/ARCMT/releases-driver.m.result b/test/ARCMT/releases-driver.m.result
index 4c864bd2a874..e9aa2d5ac44e 100644
--- a/test/ARCMT/releases-driver.m.result
+++ b/test/ARCMT/releases-driver.m.result
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fblocks -fsyntax-only -fobjc-arc -x objective-c %s.result
-// RUN: cp %s %t
+// RUN: cat %s > %t
 // RUN: %clang_cc1 -arcmt-modify -triple x86_64-apple-macosx10.6 -x objective-c %t
 // RUN: diff %t %s.result
 // RUN: rm %t
diff --git a/test/ARCMT/with-arc-mode-modify.m b/test/ARCMT/with-arc-mode-modify.m
index fbbd630700f4..bc4662de23e8 100644
--- a/test/ARCMT/with-arc-mode-modify.m
+++ b/test/ARCMT/with-arc-mode-modify.m
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -fobjc-arc -x objective-c %s.result
-// RUN: cp %s %t
+// RUN: cat %s > %t
 // RUN: %clang_cc1 -arcmt-modify -fsyntax-only -fobjc-arc -x objective-c %t
 // RUN: diff %t %s.result
 // RUN: rm %t
diff --git a/test/ARCMT/with-arc-mode-modify.m.result b/test/ARCMT/with-arc-mode-modify.m.result
index 631f276516f6..b847c13f6b4c 100644
--- a/test/ARCMT/with-arc-mode-modify.m.result
+++ b/test/ARCMT/with-arc-mode-modify.m.result
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -fobjc-arc -x objective-c %s.result
-// RUN: cp %s %t
+// RUN: cat %s > %t
 // RUN: %clang_cc1 -arcmt-modify -fsyntax-only -fobjc-arc -x objective-c %t
 // RUN: diff %t %s.result
 // RUN: rm %t
diff --git a/test/ASTMerge/class-template/Inputs/class-template1.cpp b/test/ASTMerge/class-template/Inputs/class-template1.cpp
index 440b5abfc869..fb5b229e0ac8 100644
--- a/test/ASTMerge/class-template/Inputs/class-template1.cpp
+++ b/test/ASTMerge/class-template/Inputs/class-template1.cpp
@@ -1,5 +1,7 @@
 template<typename T>
-struct X0;
+struct X0 {
+  T getValue(T arg) { return arg; }
+};
 
 template<int I>
 struct X1;
@@ -26,6 +28,7 @@ extern X0<float> *x0r;
 template<>
 struct X0<char> {
   int member;
+  char getValue(char ch) { return static_cast<char>(member); }
 };
 
 template<>
diff --git a/test/ASTMerge/class-template/Inputs/class-template2.cpp b/test/ASTMerge/class-template/Inputs/class-template2.cpp
index 6300301a4ffa..b5d0add13f15 100644
--- a/test/ASTMerge/class-template/Inputs/class-template2.cpp
+++ b/test/ASTMerge/class-template/Inputs/class-template2.cpp
@@ -1,5 +1,7 @@
 template<class T>
-struct X0;
+struct X0 {
+  T getValue(T arg);
+};
 
 template<int I>
 struct X1;
diff --git a/test/ASTMerge/class-template/test.cpp b/test/ASTMerge/class-template/test.cpp
index 0ab5443db7f6..7e25c5d6ccc3 100644
--- a/test/ASTMerge/class-template/test.cpp
+++ b/test/ASTMerge/class-template/test.cpp
@@ -1,24 +1,28 @@
-// RUN: %clang_cc1 -emit-pch -o %t.1.ast %S/Inputs/class-template1.cpp
-// RUN: %clang_cc1 -emit-pch -o %t.2.ast %S/Inputs/class-template2.cpp
-// RUN: not %clang_cc1 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++1z -emit-pch -o %t.1.ast %S/Inputs/class-template1.cpp
+// RUN: %clang_cc1 -std=c++1z -emit-pch -o %t.2.ast %S/Inputs/class-template2.cpp
+// RUN: not %clang_cc1 -std=c++1z  -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
 
-// CHECK: class-template1.cpp:7:14: error: non-type template parameter declared with incompatible types in different translation units ('int' vs. 'long')
-// CHECK: class-template2.cpp:7:15: note: declared here with type 'long'
+static_assert(sizeof(X0<char>().getValue(1)) == sizeof(char));
+static_assert(sizeof(X0<int>().getValue(1)) == sizeof(int));
 
-// CHECK: class-template1.cpp:10:14: error: template parameter has different kinds in different translation units
-// CHECK: class-template2.cpp:10:10: note: template parameter declared here
+// CHECK: class-template1.cpp:9:14: error: non-type template parameter declared with incompatible types in different translation units ('int' vs. 'long')
+// CHECK: class-template2.cpp:9:15: note: declared here with type 'long'
 
-// CHECK: class-template1.cpp:16:23: error: non-type template parameter declared with incompatible types in different translation units ('long' vs. 'int')
-// CHECK: class-template2.cpp:16:23: note: declared here with type 'int'
+// CHECK: class-template1.cpp:12:14: error: template parameter has different kinds in different translation units
+// CHECK: class-template2.cpp:12:10: note: template parameter declared here
 
-// CHECK: class-template1.cpp:19:10: error: template parameter has different kinds in different translation units
-// CHECK: class-template2.cpp:19:10: note: template parameter declared here
+// CHECK: class-template1.cpp:18:23: error: non-type template parameter declared with incompatible types in different translation units ('long' vs. 'int')
+// CHECK: class-template2.cpp:18:23: note: declared here with type 'int'
 
-// CHECK: class-template2.cpp:25:20: error: external variable 'x0r' declared with incompatible types in different translation units ('X0<double> *' vs. 'X0<float> *')
-// CHECK: class-template1.cpp:24:19: note: declared here with type 'X0<float> *'
+// CHECK: class-template1.cpp:21:10: error: template parameter has different kinds in different translation units
+// CHECK: class-template2.cpp:21:10: note: template parameter declared here
 
-// CHECK: class-template1.cpp:32:8: warning: type 'X0<wchar_t>' has incompatible definitions in different translation units
-// CHECK: class-template1.cpp:33:7: note: field 'member' has type 'int' here
-// CHECK: class-template2.cpp:34:9: note: field 'member' has type 'float' here
+// CHECK: class-template2.cpp:27:20: error: external variable 'x0r' declared with incompatible types in different translation units ('X0<double> *' vs. 'X0<float> *')
+// CHECK: class-template1.cpp:26:19: note: declared here with type 'X0<float> *'
+
+// CHECK: class-template1.cpp:35:8: warning: type 'X0<wchar_t>' has incompatible definitions in different translation units
+// CHECK: class-template1.cpp:36:7: note: field 'member' has type 'int' here
+// CHECK: class-template2.cpp:36:9: note: field 'member' has type 'float' here
 
 // CHECK: 1 warning and 5 errors generated.
+// CHECK-NOT: static_assert
diff --git a/test/ASTMerge/exprs-cpp/Inputs/exprs3.cpp b/test/ASTMerge/exprs-cpp/Inputs/exprs3.cpp
index 2a33c35d9ea6..6fdc33fb3919 100644
--- a/test/ASTMerge/exprs-cpp/Inputs/exprs3.cpp
+++ b/test/ASTMerge/exprs-cpp/Inputs/exprs3.cpp
@@ -122,3 +122,20 @@ void useTemplateType() {
 const bool ExpressionTrait = __is_lvalue_expr(1);
 const unsigned ArrayRank = __array_rank(int[10][20]);
 const unsigned ArrayExtent = __array_extent(int[10][20], 1);
+
+constexpr int testLambdaAdd(int toAdd) {
+  const int Captured1 = 1, Captured2 = 2;
+  constexpr auto LambdaAdd = [Captured1, Captured2](int k) -> int {
+    return Captured1 + Captured2 + k;
+  };
+  return LambdaAdd(toAdd);
+}
+
+template<typename T>
+struct TestLambdaTemplate {
+  T i, j;
+  TestLambdaTemplate(T i, const T &j) : i(i), j(j) {}
+  T testLambda(T k) {
+    return [this](T k) -> decltype(auto) { return i + j + k; }(k);
+  }
+};
diff --git a/test/ASTMerge/exprs-cpp/test.cpp b/test/ASTMerge/exprs-cpp/test.cpp
index 0535aa85330f..c0b282ec0288 100644
--- a/test/ASTMerge/exprs-cpp/test.cpp
+++ b/test/ASTMerge/exprs-cpp/test.cpp
@@ -30,6 +30,8 @@ static_assert(ExpressionTrait == false);
 static_assert(ArrayRank == 2);
 static_assert(ArrayExtent == 20);
 
+static_assert(testLambdaAdd(3) == 6);
+
 void testImport(int *x, const S1 &cs1, S1 &s1) {
   testNewThrowDelete();
   testArrayElement(nullptr, 12);
@@ -44,4 +46,5 @@ void testImport(int *x, const S1 &cs1, S1 &s1) {
   testDefaultArg();
   testDefaultArgExpr();
   useTemplateType();
+  TestLambdaTemplate<int>(1, 2).testLambda(3);
 }
diff --git a/test/ASTMerge/function-cpp/Inputs/function-1.cpp b/test/ASTMerge/function-cpp/Inputs/function-1.cpp
new file mode 100644
index 000000000000..ee97a1a8a5da
--- /dev/null
+++ b/test/ASTMerge/function-cpp/Inputs/function-1.cpp
@@ -0,0 +1,8 @@
+
+template<typename T> constexpr T add(T arg1, T arg2) {
+  return arg1 + arg2;
+}
+
+template<> constexpr int add(int arg1, int arg2) {
+  return arg1 + arg2 + 2;
+}
diff --git a/test/ASTMerge/function-cpp/test.cpp b/test/ASTMerge/function-cpp/test.cpp
new file mode 100644
index 000000000000..304ce3c634c2
--- /dev/null
+++ b/test/ASTMerge/function-cpp/test.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -std=c++1z -emit-pch -o %t.1.ast %S/Inputs/function-1.cpp
+// RUN: %clang_cc1 -std=c++1z  -ast-merge %t.1.ast -fsyntax-only %s 2>&1 | FileCheck %s
+// XFAIL: *
+
+static_assert(add(1, 2) == 5);
+
+// FIXME: support of templated function overload is still not implemented.
+static_assert(add('\1', '\2') == 3);
+
+// CHECK-NOT: static_assert
diff --git a/test/ASTMerge/interface/Inputs/interface1.m b/test/ASTMerge/interface/Inputs/interface1.m
index 5865c0eff070..6192150089fe 100644
--- a/test/ASTMerge/interface/Inputs/interface1.m
+++ b/test/ASTMerge/interface/Inputs/interface1.m
@@ -100,4 +100,6 @@ @implementation I14 : I12
 @implementation I15 : I12
 @end
 
-
+@interface ImportSelectorSLoc { }
+-(int)addInt:(int)a toInt:(int)b moduloInt:(int)c; // don't crash here
+@end
diff --git a/test/Analysis/Inputs/system-header-simulator-cxx.h b/test/Analysis/Inputs/system-header-simulator-cxx.h
index 809b768d71e0..7702e5196ebe 100644
--- a/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -584,10 +584,21 @@ namespace std {
 
 }
 
+#ifdef TEST_INLINABLE_ALLOCATORS
+namespace std {
+  void *malloc(size_t);
+  void free(void *);
+}
+void* operator new(std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); }
+void* operator new[](std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); }
+void operator delete(void* ptr, const std::nothrow_t&) throw() { std::free(ptr); }
+void operator delete[](void* ptr, const std::nothrow_t&) throw() { std::free(ptr); }
+#else
 void* operator new(std::size_t, const std::nothrow_t&) throw();
 void* operator new[](std::size_t, const std::nothrow_t&) throw();
 void operator delete(void*, const std::nothrow_t&) throw();
 void operator delete[](void*, const std::nothrow_t&) throw();
+#endif
 
 void* operator new (std::size_t size, void* ptr) throw() { return ptr; };
 void* operator new[] (std::size_t size, void* ptr) throw() { return ptr; };
diff --git a/test/Analysis/Inputs/system-header-simulator.h b/test/Analysis/Inputs/system-header-simulator.h
index 2e6f1e7d4a96..ff69a1502b94 100644
--- a/test/Analysis/Inputs/system-header-simulator.h
+++ b/test/Analysis/Inputs/system-header-simulator.h
@@ -32,6 +32,7 @@ typedef __typeof(sizeof(int)) size_t;
 size_t strlen(const char *);
 
 char *strcpy(char *restrict, const char *restrict);
+char *strncpy(char *dst, const char *src, size_t n);
 void *memcpy(void *dst, const void *src, size_t n);
 
 typedef unsigned long __darwin_pthread_key_t;
diff --git a/test/Analysis/MismatchedDeallocator-checker-test.mm b/test/Analysis/MismatchedDeallocator-checker-test.mm
index b80f7df233dd..013d677e515c 100644
--- a/test/Analysis/MismatchedDeallocator-checker-test.mm
+++ b/test/Analysis/MismatchedDeallocator-checker-test.mm
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.MismatchedDeallocator -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.MismatchedDeallocator -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
 
 #include "Inputs/system-header-simulator-objc.h"
 #include "Inputs/system-header-simulator-cxx.h"
diff --git a/test/Analysis/NewDelete-checker-test.cpp b/test/Analysis/NewDelete-checker-test.cpp
index 66e837572bb0..620237cd6ede 100644
--- a/test/Analysis/NewDelete-checker-test.cpp
+++ b/test/Analysis/NewDelete-checker-test.cpp
@@ -1,5 +1,12 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -verify %s
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -DLEAKS -std=c++11 -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -analyzer-config c++-allocator-inlining=true -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -DLEAKS -std=c++11 -fblocks -analyzer-config c++-allocator-inlining=true -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -DLEAKS -std=c++11 -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -analyzer-config c++-allocator-inlining=true -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -DLEAKS -std=c++11 -fblocks -analyzer-config c++-allocator-inlining=true -DTEST_INLINABLE_ALLOCATORS -verify %s
+
 #include "Inputs/system-header-simulator-cxx.h"
 
 typedef __typeof__(sizeof(int)) size_t;
@@ -45,14 +52,18 @@ void testGlobalNoThrowPlacementOpNewBeforeOverload() {
   void *p = operator new(0, std::nothrow);
 }
 #ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
+#ifndef TEST_INLINABLE_ALLOCATORS
+// expected-warning@-3{{Potential leak of memory pointed to by 'p'}}
+#endif
 #endif
 
 void testGlobalNoThrowPlacementExprNewBeforeOverload() {
   int *p = new(std::nothrow) int;
 }
 #ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
+#ifndef TEST_INLINABLE_ALLOCATORS
+// expected-warning@-3{{Potential leak of memory pointed to by 'p'}}
+#endif
 #endif
 
 //----- Standard pointer placement operators
@@ -186,7 +197,10 @@ void testExprDeleteArrArg() {
 
 void testAllocDeallocNames() {
   int *p = new(std::nothrow) int[1];
-  delete[] (++p); // expected-warning{{Argument to 'delete[]' is offset by 4 bytes from the start of memory allocated by 'new[]'}}
+  delete[] (++p);
+#ifndef TEST_INLINABLE_ALLOCATORS
+  // expected-warning@-2{{Argument to 'delete[]' is offset by 4 bytes from the start of memory allocated by 'new[]'}}
+#endif
 }
 
 //--------------------------------
diff --git a/test/Analysis/NewDelete-custom.cpp b/test/Analysis/NewDelete-custom.cpp
index f06ff4a858e5..f5a2952699ff 100644
--- a/test/Analysis/NewDelete-custom.cpp
+++ b/test/Analysis/NewDelete-custom.cpp
@@ -1,8 +1,10 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,unix.Malloc -std=c++11 -fblocks -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks,unix.Malloc -std=c++11 -DLEAKS -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,unix.Malloc -std=c++11 -analyzer-config c++-allocator-inlining=false -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks,unix.Malloc -std=c++11 -analyzer-config c++-allocator-inlining=false -DLEAKS=1 -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,unix.Malloc -std=c++11 -DALLOCATOR_INLINING=1 -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks,unix.Malloc -std=c++11 -DLEAKS=1 -DALLOCATOR_INLINING=1 -fblocks -verify %s
 #include "Inputs/system-header-simulator-cxx.h"
 
-#ifndef LEAKS
+#if !(LEAKS && !ALLOCATOR_INLINING)
 // expected-no-diagnostics
 #endif
 
@@ -11,7 +13,7 @@ void *allocator(std::size_t size);
 
 void *operator new[](std::size_t size) throw() { return allocator(size); }
 void *operator new(std::size_t size) throw() { return allocator(size); }
-void *operator new(std::size_t size, std::nothrow_t& nothrow) throw() { return allocator(size); }
+void *operator new(std::size_t size, const std::nothrow_t &nothrow) throw() { return allocator(size); }
 void *operator new(std::size_t, double d);
 
 class C {
@@ -26,7 +28,7 @@ void testNewMethod() {
 
   C *c3 = ::new C;
 }
-#ifdef LEAKS
+#if LEAKS && !ALLOCATOR_INLINING
 // expected-warning@-2{{Potential leak of memory pointed to by 'c3'}}
 #endif
 
@@ -37,7 +39,7 @@ void testOpNewArray() {
 void testNewExprArray() {
   int *p = new int[0];
 }
-#ifdef LEAKS
+#if LEAKS && !ALLOCATOR_INLINING
 // expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
 #endif
 
@@ -50,30 +52,27 @@ void testOpNew() {
 void testNewExpr() {
   int *p = new int;
 }
-#ifdef LEAKS
+#if LEAKS && !ALLOCATOR_INLINING
 // expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
 #endif
 
 
 //----- Custom NoThrow placement operators
 void testOpNewNoThrow() {
-  void *p = operator new(0, std::nothrow);
+  void *p = operator new(0, std::nothrow); // call is inlined, no warn
 }
-#ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
-#endif
 
 void testNewExprNoThrow() {
   int *p = new(std::nothrow) int;
 }
-#ifdef LEAKS
+#if LEAKS && !ALLOCATOR_INLINING
 // expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
 #endif
 
 //----- Custom placement operators
 void testOpNewPlacement() {
   void *p = operator new(0, 0.1); // no warn
-} 
+}
 
 void testNewExprPlacement() {
   int *p = new(0.1) int; // no warn
diff --git a/test/Analysis/NewDelete-intersections.mm b/test/Analysis/NewDelete-intersections.mm
index aa52c7902888..b3707858f00c 100644
--- a/test/Analysis/NewDelete-intersections.mm
+++ b/test/Analysis/NewDelete-intersections.mm
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -verify %s
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks -std=c++11 -DLEAKS -fblocks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks -std=c++11 -DLEAKS -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
 #include "Inputs/system-header-simulator-cxx.h"
 #include "Inputs/system-header-simulator-objc.h"
 
diff --git a/test/Analysis/NewDelete-path-notes.cpp b/test/Analysis/NewDelete-path-notes.cpp
index ac760ca60eb4..d57f67e71923 100644
--- a/test/Analysis/NewDelete-path-notes.cpp
+++ b/test/Analysis/NewDelete-path-notes.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus.NewDelete,unix.Malloc -analyzer-output=text -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus.NewDelete,unix.Malloc -analyzer-output=text -analyzer-config c++-allocator-inlining=true -verify %s
 // RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus.NewDelete,unix.Malloc -analyzer-output=plist -analyzer-config path-diagnostics-alternate=false %s -o %t.plist
 // RUN: FileCheck --input-file=%t.plist %s
 
@@ -40,12 +41,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>5</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -53,12 +54,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>12</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>14</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -70,7 +71,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>6</integer>
+// CHECK-NEXT:       <key>line</key><integer>7</integer>
 // CHECK-NEXT:       <key>col</key><integer>12</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -78,12 +79,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>6</integer>
+// CHECK-NEXT:          <key>line</key><integer>7</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>6</integer>
+// CHECK-NEXT:          <key>line</key><integer>7</integer>
 // CHECK-NEXT:          <key>col</key><integer>18</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -103,12 +104,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>12</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>6</integer>
+// CHECK-NEXT:            <key>line</key><integer>7</integer>
 // CHECK-NEXT:            <key>col</key><integer>14</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -116,12 +117,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>8</integer>
+// CHECK-NEXT:            <key>line</key><integer>9</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>8</integer>
+// CHECK-NEXT:            <key>line</key><integer>9</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -137,12 +138,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>8</integer>
+// CHECK-NEXT:            <key>line</key><integer>9</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>8</integer>
+// CHECK-NEXT:            <key>line</key><integer>9</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -150,12 +151,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>10</integer>
+// CHECK-NEXT:            <key>line</key><integer>11</integer>
 // CHECK-NEXT:            <key>col</key><integer>5</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>10</integer>
+// CHECK-NEXT:            <key>line</key><integer>11</integer>
 // CHECK-NEXT:            <key>col</key><integer>10</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -167,7 +168,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>10</integer>
+// CHECK-NEXT:       <key>line</key><integer>11</integer>
 // CHECK-NEXT:       <key>col</key><integer>5</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -175,12 +176,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>10</integer>
+// CHECK-NEXT:          <key>line</key><integer>11</integer>
 // CHECK-NEXT:          <key>col</key><integer>5</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>10</integer>
+// CHECK-NEXT:          <key>line</key><integer>11</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -200,12 +201,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>10</integer>
+// CHECK-NEXT:            <key>line</key><integer>11</integer>
 // CHECK-NEXT:            <key>col</key><integer>5</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>10</integer>
+// CHECK-NEXT:            <key>line</key><integer>11</integer>
 // CHECK-NEXT:            <key>col</key><integer>10</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -213,12 +214,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>13</integer>
+// CHECK-NEXT:            <key>line</key><integer>14</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>13</integer>
+// CHECK-NEXT:            <key>line</key><integer>14</integer>
 // CHECK-NEXT:            <key>col</key><integer>8</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -230,7 +231,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>13</integer>
+// CHECK-NEXT:       <key>line</key><integer>14</integer>
 // CHECK-NEXT:       <key>col</key><integer>3</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -238,12 +239,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>13</integer>
+// CHECK-NEXT:          <key>line</key><integer>14</integer>
 // CHECK-NEXT:          <key>col</key><integer>3</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>13</integer>
+// CHECK-NEXT:          <key>line</key><integer>14</integer>
 // CHECK-NEXT:          <key>col</key><integer>10</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -267,7 +268,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:   <key>issue_hash_function_offset</key><string>8</string>
 // CHECK-NEXT:   <key>location</key>
 // CHECK-NEXT:   <dict>
-// CHECK-NEXT:    <key>line</key><integer>13</integer>
+// CHECK-NEXT:    <key>line</key><integer>14</integer>
 // CHECK-NEXT:    <key>col</key><integer>3</integer>
 // CHECK-NEXT:    <key>file</key><integer>0</integer>
 // CHECK-NEXT:   </dict>
@@ -279,7 +280,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>24</integer>
+// CHECK-NEXT:       <key>line</key><integer>25</integer>
 // CHECK-NEXT:       <key>col</key><integer>2</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -287,12 +288,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>24</integer>
+// CHECK-NEXT:          <key>line</key><integer>25</integer>
 // CHECK-NEXT:          <key>col</key><integer>2</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>24</integer>
+// CHECK-NEXT:          <key>line</key><integer>25</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -308,7 +309,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>18</integer>
+// CHECK-NEXT:       <key>line</key><integer>19</integer>
 // CHECK-NEXT:       <key>col</key><integer>2</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -326,12 +327,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>18</integer>
+// CHECK-NEXT:            <key>line</key><integer>19</integer>
 // CHECK-NEXT:            <key>col</key><integer>2</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>18</integer>
+// CHECK-NEXT:            <key>line</key><integer>19</integer>
 // CHECK-NEXT:            <key>col</key><integer>5</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -339,12 +340,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>19</integer>
+// CHECK-NEXT:            <key>line</key><integer>20</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>19</integer>
+// CHECK-NEXT:            <key>line</key><integer>20</integer>
 // CHECK-NEXT:            <key>col</key><integer>8</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -356,7 +357,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>19</integer>
+// CHECK-NEXT:       <key>line</key><integer>20</integer>
 // CHECK-NEXT:       <key>col</key><integer>3</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -364,12 +365,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>19</integer>
+// CHECK-NEXT:          <key>line</key><integer>20</integer>
 // CHECK-NEXT:          <key>col</key><integer>3</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>19</integer>
+// CHECK-NEXT:          <key>line</key><integer>20</integer>
 // CHECK-NEXT:          <key>col</key><integer>13</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -385,7 +386,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>24</integer>
+// CHECK-NEXT:       <key>line</key><integer>25</integer>
 // CHECK-NEXT:       <key>col</key><integer>2</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -393,12 +394,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>24</integer>
+// CHECK-NEXT:          <key>line</key><integer>25</integer>
 // CHECK-NEXT:          <key>col</key><integer>2</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>24</integer>
+// CHECK-NEXT:          <key>line</key><integer>25</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -418,12 +419,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>24</integer>
+// CHECK-NEXT:            <key>line</key><integer>25</integer>
 // CHECK-NEXT:            <key>col</key><integer>2</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>24</integer>
+// CHECK-NEXT:            <key>line</key><integer>25</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -431,12 +432,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>26</integer>
+// CHECK-NEXT:            <key>line</key><integer>27</integer>
 // CHECK-NEXT:            <key>col</key><integer>2</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>26</integer>
+// CHECK-NEXT:            <key>line</key><integer>27</integer>
 // CHECK-NEXT:            <key>col</key><integer>7</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -448,7 +449,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>26</integer>
+// CHECK-NEXT:       <key>line</key><integer>27</integer>
 // CHECK-NEXT:       <key>col</key><integer>2</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -456,12 +457,12 @@ void test(Odd *odd) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>26</integer>
+// CHECK-NEXT:          <key>line</key><integer>27</integer>
 // CHECK-NEXT:          <key>col</key><integer>2</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>26</integer>
+// CHECK-NEXT:          <key>line</key><integer>27</integer>
 // CHECK-NEXT:          <key>col</key><integer>11</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -485,7 +486,7 @@ void test(Odd *odd) {
 // CHECK-NEXT:   <key>issue_hash_function_offset</key><string>3</string>
 // CHECK-NEXT:   <key>location</key>
 // CHECK-NEXT:   <dict>
-// CHECK-NEXT:    <key>line</key><integer>26</integer>
+// CHECK-NEXT:    <key>line</key><integer>27</integer>
 // CHECK-NEXT:    <key>col</key><integer>2</integer>
 // CHECK-NEXT:    <key>file</key><integer>0</integer>
 // CHECK-NEXT:   </dict>
diff --git a/test/Analysis/NewDeleteLeaks-PR19102.cpp b/test/Analysis/NewDeleteLeaks-PR19102.cpp
index 502db6122f5c..625b2d4b7af4 100644
--- a/test/Analysis/NewDeleteLeaks-PR19102.cpp
+++ b/test/Analysis/NewDeleteLeaks-PR19102.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDeleteLeaks -analyzer-config c++-allocator-inlining=true -verify %s
 
 class A0 {};
 
diff --git a/test/Analysis/bitwise-ops.c b/test/Analysis/bitwise-ops.c
index fe546580be3d..5cdb668867b6 100644
--- a/test/Analysis/bitwise-ops.c
+++ b/test/Analysis/bitwise-ops.c
@@ -51,3 +51,9 @@ int testNegativeLeftShift(int a) {
   }
   return 0;
 }
+
+int testUnrepresentableLeftShift(int a) {
+  if (a == 8)
+    return a << 30; // expected-warning{{The result of the left shift is undefined due to shifting '8' by '30', which is unrepresentable in the unsigned version of the return type 'int'}}
+  return 0;
+}
diff --git a/test/Analysis/call_once.cpp b/test/Analysis/call_once.cpp
index db9e5cc7ca16..445c616b3cbb 100644
--- a/test/Analysis/call_once.cpp
+++ b/test/Analysis/call_once.cpp
@@ -1,10 +1,11 @@
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -verify %s
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBSTDCPP -verify %s
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -verify %s -o %t.report
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBSTDCPP -verify %s -o %t.report
 
 // We do NOT model libcxx03 implementation, but the analyzer should still
 // not crash.
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBCXX03 -verify %s
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBCXX03 -DEMULATE_LIBSTDCPP -verify %s
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBCXX03 -verify %s -o %t.report
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -analyzer-checker=core,debug.ExprInspection -DEMULATE_LIBCXX03 -DEMULATE_LIBSTDCPP -verify %s -o %t.report
+// RUN: rm -rf %t.report
 
 void clang_analyzer_eval(bool);
 
diff --git a/test/Analysis/copypaste/macro-complexity.cpp b/test/Analysis/copypaste/macro-complexity.cpp
index 70d3f0c74820..db90236d0a16 100644
--- a/test/Analysis/copypaste/macro-complexity.cpp
+++ b/test/Analysis/copypaste/macro-complexity.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -analyzer-config alpha.clone.CloneChecker:MinimumCloneComplexity=10 -verify %s
 
 // Tests that the complexity value of a macro expansion is about the same as
-// the complexity value of a normal function call and the the macro body doesn't
+// the complexity value of a normal function call and the macro body doesn't
 // influence the complexity. See the CloneSignature class in CloneDetection.h
 // for more information about complexity values of clones.
 
diff --git a/test/Analysis/ctor.mm b/test/Analysis/ctor.mm
index e903263431fc..5ef2beea4966 100644
--- a/test/Analysis/ctor.mm
+++ b/test/Analysis/ctor.mm
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -fobjc-arc -analyzer-config c++-inlining=constructors -Wno-null-dereference -std=c++11 -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -fobjc-arc -analyzer-config c++-inlining=constructors -Wno-null-dereference -std=c++11 -verify -DTEST_INLINABLE_ALLOCATORS %s
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -572,10 +573,9 @@ void testArray() {
   }
 
   void testNew() {
-    // FIXME: Pending proper implementation of constructors for 'new'.
     raw_pair *pp = new raw_pair();
-    clang_analyzer_eval(pp->p1 == 0); // expected-warning{{UNKNOWN}}
-    clang_analyzer_eval(pp->p2 == 0); // expected-warning{{UNKNOWN}}
+    clang_analyzer_eval(pp->p1 == 0); // expected-warning{{TRUE}}
+    clang_analyzer_eval(pp->p2 == 0); // expected-warning{{TRUE}}
   }
 
   void testArrayNew() {
@@ -679,8 +679,7 @@ void testStatic() {
 
   void testDynamic() {
     List *list = new List{1, 2};
-    // FIXME: When we handle constructors with 'new', this will be TRUE.
-    clang_analyzer_eval(list->usedInitializerList); // expected-warning{{UNKNOWN}}
+    clang_analyzer_eval(list->usedInitializerList); // expected-warning{{TRUE}}
   }
 }
 
diff --git a/test/Analysis/cxxnewexpr-callback-inline.cpp b/test/Analysis/cxxnewexpr-callback-inline.cpp
new file mode 100644
index 000000000000..c823de85821d
--- /dev/null
+++ b/test/Analysis/cxxnewexpr-callback-inline.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.AnalysisOrder -analyzer-config c++-allocator-inlining=true,debug.AnalysisOrder:PreStmtCXXNewExpr=true,debug.AnalysisOrder:PostStmtCXXNewExpr=true,debug.AnalysisOrder:PreCall=true,debug.AnalysisOrder:PostCall=true,debug.AnalysisOrder:NewAllocator=true %s 2>&1 | FileCheck %s
+
+#include "Inputs/system-header-simulator-cxx.h"
+
+namespace std {
+  void *malloc(size_t);
+}
+
+void *operator new(size_t size) { return std::malloc(size); }
+
+struct S {
+  S() {}
+};
+
+void foo();
+
+void test() {
+  S *s = new S();
+  foo();
+}
+
+// CHECK:      PreCall (operator new)
+// CHECK-NEXT: PreCall (std::malloc)
+// CHECK-NEXT: PostCall (std::malloc)
+// CHECK-NEXT: PostCall (operator new)
+// CHECK-NEXT: NewAllocator
+// CHECK-NEXT: PreCall (S::S)
+// CHECK-NEXT: PostCall (S::S)
+// CHECK-NEXT: PreStmt<CXXNewExpr>
+// CHECK-NEXT: PostStmt<CXXNewExpr>
+// CHECK-NEXT: PreCall (foo)
+// CHECK-NEXT: PostCall (foo)
diff --git a/test/Analysis/cxxnewexpr-callback-noinline.cpp b/test/Analysis/cxxnewexpr-callback-noinline.cpp
new file mode 100644
index 000000000000..87edeac3fd58
--- /dev/null
+++ b/test/Analysis/cxxnewexpr-callback-noinline.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.AnalysisOrder -analyzer-config c++-allocator-inlining=false,debug.AnalysisOrder:PreStmtCXXNewExpr=true,debug.AnalysisOrder:PostStmtCXXNewExpr=true,debug.AnalysisOrder:PreCall=true,debug.AnalysisOrder:PostCall=true,debug.AnalysisOrder:NewAllocator=true %s 2>&1 | FileCheck %s
+
+#include "Inputs/system-header-simulator-cxx.h"
+
+namespace std {
+  void *malloc(size_t);
+}
+
+void *operator new(size_t size) { return std::malloc(size); }
+
+struct S {
+  S() {}
+};
+
+void foo();
+
+void test() {
+  S *s = new S();
+  foo();
+}
+
+// CHECK:      PreCall (S::S)
+// CHECK-NEXT: PostCall (S::S)
+// CHECK-NEXT: PreStmt<CXXNewExpr>
+// CHECK-NEXT: PostStmt<CXXNewExpr>
+// CHECK-NEXT: PreCall (foo)
+// CHECK-NEXT: PostCall (foo)
+// CHECK-NEXT: PreCall (std::malloc)
+// CHECK-NEXT: PostCall (std::malloc)
diff --git a/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp b/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp
index 197fad5d0b7d..35f8798c81ae 100644
--- a/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp
+++ b/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=false -std=c++11 -verify %s
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=true -std=c++11 -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=false -std=c++11 -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=true -std=c++11 -DTEST_INLINABLE_ALLOCATORS -verify %s
 
 // expected-no-diagnostics
 
diff --git a/test/Analysis/html_diagnostics/relevant_lines/header.h b/test/Analysis/html_diagnostics/relevant_lines/header.h
new file mode 100644
index 000000000000..579b66a1cb31
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/header.h
@@ -0,0 +1,12 @@
+#define deref(X) (*X)
+
+char helper(
+    char *out,
+    int doDereference) {
+  if (doDereference) {
+    return deref(out);
+  } else {
+    return 'x';
+  }
+  return 'c';
+}
diff --git a/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c b/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c
new file mode 100644
index 000000000000..5b3556526add
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c
@@ -0,0 +1,15 @@
+#define deref(X) (*X)
+
+int f(int coin) {
+  if (coin) {
+    int *x = 0;
+    return deref(x);
+  } else {
+    return 0;
+  }
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/multifile.c b/test/Analysis/html_diagnostics/relevant_lines/multifile.c
new file mode 100644
index 000000000000..3abffd609b5b
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/multifile.c
@@ -0,0 +1,14 @@
+#include "header.h"
+
+int f(int coin) {
+  char *p = 0;
+  if (coin) {
+    return helper(p, coin);
+  }
+  return 0;
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "3": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c b/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c
new file mode 100644
index 000000000000..35158a46868c
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c
@@ -0,0 +1,16 @@
+int f(
+    int coin,
+    int paramA,
+    int paramB) {
+  if (coin) {
+    int *x = 0;
+    return *x;
+  } else {
+    return 0;
+  }
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"1": 1, "2": 1, "3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c b/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c
new file mode 100644
index 000000000000..3c723aa93349
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c
@@ -0,0 +1,12 @@
+int f() {
+  int zzz = 200;
+  zzz += 100;
+  return 0;
+}
+
+// Show line with the warning even if it wasn't executed (e.g. warning given
+// by path-insensitive analysis).
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core,deadcode -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"3": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m b/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m
new file mode 100644
index 000000000000..41a4c1d2e097
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m
@@ -0,0 +1,19 @@
+@interface I
+- (int)func;
+@end
+
+@implementation I
+- (int)func:(int *)param {
+  return *param;
+}
+@end
+
+void foo(I *i) {
+  int *x = 0;
+  [i func:x];
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output -Wno-objc-root-class %s
+// RUN: cat %t.output/* | FileCheck %s
+// CHECK: var relevant_lines = {"1": {"6": 1, "7": 1, "11": 1, "12": 1, "13": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c b/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c
new file mode 100644
index 000000000000..769859dea5ef
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c
@@ -0,0 +1,13 @@
+int f(int coin) {
+  if (coin) {
+    int *x = 0;
+    return *x;
+  } else {
+    return 0;
+  }
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"1": 1, "2": 1, "3": 1, "4": 1}};
diff --git a/test/Analysis/html_diagnostics/relevant_lines/unused_header.c b/test/Analysis/html_diagnostics/relevant_lines/unused_header.c
new file mode 100644
index 000000000000..4b77c651eddd
--- /dev/null
+++ b/test/Analysis/html_diagnostics/relevant_lines/unused_header.c
@@ -0,0 +1,19 @@
+#include "header.h"
+
+int f(int coin) {
+  if (coin) {
+    int *x = 0;
+    return *x;
+  } else {
+    return 0;
+  }
+}
+
+int v(int coin) {
+  return coin;
+}
+
+// RUN: rm -rf %t.output
+// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: cat %t.output/* | FileCheck %s --match-full-lines
+// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}};
diff --git a/test/Analysis/initializer.cpp b/test/Analysis/initializer.cpp
index 55f0a8950287..b73a94f1dbf6 100644
--- a/test/Analysis/initializer.cpp
+++ b/test/Analysis/initializer.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDeleteLeaks,debug.ExprInspection -analyzer-config c++-inlining=constructors -std=c++11 -verify %s
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDeleteLeaks,debug.ExprInspection -analyzer-config c++-inlining=constructors -std=c++17 -DCPLUSPLUS17 -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDeleteLeaks,debug.ExprInspection -analyzer-config c++-inlining=constructors -std=c++11 -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDeleteLeaks,debug.ExprInspection -analyzer-config c++-inlining=constructors -std=c++17 -DCPLUSPLUS17 -DTEST_INLINABLE_ALLOCATORS -verify %s
 
 void clang_analyzer_eval(bool);
 
diff --git a/test/Analysis/inline.cpp b/test/Analysis/inline.cpp
index 76eee5beffac..c362fc93090b 100644
--- a/test/Analysis/inline.cpp
+++ b/test/Analysis/inline.cpp
@@ -315,17 +315,13 @@ namespace OperatorNew {
     int value;
 
     IntWrapper(int input) : value(input) {
-      // We don't want this constructor to be inlined unless we can actually
-      // use the proper region for operator new.
-      // See PR12014 and <rdar://problem/12180598>.
-      clang_analyzer_checkInlined(false); // no-warning
+      clang_analyzer_checkInlined(true); // expected-warning{{TRUE}}
     }
   };
 
   void test() {
     IntWrapper *obj = new IntWrapper(42);
-    // should be TRUE
-    clang_analyzer_eval(obj->value == 42); // expected-warning{{UNKNOWN}}
+    clang_analyzer_eval(obj->value == 42); // expected-warning{{TRUE}}
     delete obj;
   }
 
@@ -335,8 +331,9 @@ namespace OperatorNew {
 
     clang_analyzer_eval(alias == obj); // expected-warning{{TRUE}}
 
-    // should be TRUE
-    clang_analyzer_eval(obj->value == 42); // expected-warning{{UNKNOWN}}
+    clang_analyzer_eval(obj->value == 42); // expected-warning{{TRUE}}
+    // Because malloc() was never free()d:
+    // expected-warning@-2{{Potential leak of memory pointed to by 'alias'}}
   }
 }
 
diff --git a/test/Analysis/inlining/containers.cpp b/test/Analysis/inlining/containers.cpp
index 16e006b345f2..ffa3b511344a 100644
--- a/test/Analysis/inlining/containers.cpp
+++ b/test/Analysis/inlining/containers.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-config c++-inlining=destructors -analyzer-config c++-container-inlining=false -verify %s
 // RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-config c++-inlining=destructors -analyzer-config c++-container-inlining=true -DINLINE=1 -verify %s
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-config c++-inlining=destructors -analyzer-config c++-container-inlining=false -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-config c++-inlining=destructors -analyzer-config c++-container-inlining=true -DTEST_INLINABLE_ALLOCATORS -DINLINE=1 -verify %s
 
 #ifndef HEADER
 
diff --git a/test/Analysis/inlining/false-positive-suppression.c b/test/Analysis/inlining/false-positive-suppression.c
index 4931695ef127..4d472e0861e0 100644
--- a/test/Analysis/inlining/false-positive-suppression.c
+++ b/test/Analysis/inlining/false-positive-suppression.c
@@ -111,14 +111,12 @@ void testInlineCheckInNestedMacro(int *p) {
   *p = 1; // no-warning
 }
 
-// If there is a check in a macro that is not function-like, don't treat
-// it like a function so don't suppress.
 #define NON_FUNCTION_MACRO_WITH_CHECK ( ((p) != 0) ? *p : 17)
 void testNonFunctionMacro(int *p) {
   int i = NON_FUNCTION_MACRO_WITH_CHECK ;
   (void)i;
 
-  *p = 1; // expected-warning {{Dereference of null pointer (loaded from variable 'p')}}
+  *p = 1; // no-warning
 }
 
 
@@ -163,6 +161,7 @@ void testNestedDisjunctiveMacro2(int *p, int *q) {
 }
 
 
+
 // Here the check is entirely in non-macro code even though the code itself
 // is a macro argument.
 #define MACRO_DO_IT(a) (a)
@@ -171,6 +170,15 @@ void testErrorInArgument(int *p) {
   (void)i;
 }
 
+// No warning should be emitted if dereference is performed from a different
+// macro.
+#define MACRO_CHECK(a) if (a) {}
+#define MACRO_DEREF(a) (*a)
+int testDifferentMacro(int *p) {
+  MACRO_CHECK(p);
+  return MACRO_DEREF(p); // no-warning
+}
+
 // --------------------------
 // "Suppression suppression"
 // --------------------------
diff --git a/test/Analysis/malloc-custom.c b/test/Analysis/malloc-custom.c
index f33b150de678..053d0ee31ca8 100644
--- a/test/Analysis/malloc-custom.c
+++ b/test/Analysis/malloc-custom.c
@@ -1,6 +1,6 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -Wno-incompatible-library-redeclaration -verify %s
 
-// Various tests to make the the analyzer is robust against custom
+// Various tests to make the analyzer is robust against custom
 // redeclarations of memory routines.
 //
 // You wouldn't expect to see much of this in normal code, but, for example,
diff --git a/test/Analysis/malloc.c b/test/Analysis/malloc.c
index 4c364ebd9a2f..50be4ef3ba29 100644
--- a/test/Analysis/malloc.c
+++ b/test/Analysis/malloc.c
@@ -1720,13 +1720,6 @@ void *smallocWarn(size_t size) {
   }
 }
 
-char *dupstrWarn(const char *s) {
-  const int len = strlen(s);
-  char *p = (char*) smallocWarn(len + 1);
-  strcpy(p, s); // expected-warning{{String copy function overflows destination buffer}}
-  return p;
-}
-
 int *radar15580979() {
   int *data = (int *)malloc(32);
   int *p = data ?: (int*)malloc(32); // no warning
@@ -1784,6 +1777,27 @@ void freeFunctionPtr() {
   free((void *)fnptr); // expected-warning {{Argument to free() is a function pointer}}
 }
 
+// Enabling the malloc checker enables some of the buffer-checking portions
+// of the C-string checker.
+void cstringchecker_bounds_nocrash() {
+  char *p = malloc(2);
+  strncpy(p, "AAA", sizeof("AAA")); // expected-warning {{Size argument is greater than the length of the destination buffer}}
+
+  free(p);
+}
+
+void allocateSomeMemory(void *offendingParameter, void **ptr) {
+  *ptr = malloc(1);
+}
+
+void testNoCrashOnOffendingParameter() {
+  // "extern" is necessary to avoid unrelated warnings 
+  // on passing uninitialized value.
+  extern void *offendingParameter;
+  void* ptr;
+  allocateSomeMemory(offendingParameter, &ptr);
+} // expected-warning {{Potential leak of memory pointed to by 'ptr'}}
+
 // ----------------------------------------------------------------------------
 // False negatives.
 
diff --git a/test/Analysis/malloc.cpp b/test/Analysis/malloc.cpp
index c323754bb467..b93c73e591cd 100644
--- a/test/Analysis/malloc.cpp
+++ b/test/Analysis/malloc.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -w -analyzer-checker=core,alpha.deadcode.UnreachableCode,alpha.core.CastSize,unix.Malloc,cplusplus.NewDelete -analyzer-store=region -verify %s
 // RUN: %clang_analyze_cc1 -triple i386-unknown-linux-gnu -w -analyzer-checker=core,alpha.deadcode.UnreachableCode,alpha.core.CastSize,unix.Malloc,cplusplus.NewDelete -analyzer-store=region -verify %s
+// RUN: %clang_analyze_cc1 -w -analyzer-checker=core,alpha.deadcode.UnreachableCode,alpha.core.CastSize,unix.Malloc,cplusplus.NewDelete -analyzer-store=region -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -triple i386-unknown-linux-gnu -w -analyzer-checker=core,alpha.deadcode.UnreachableCode,alpha.core.CastSize,unix.Malloc,cplusplus.NewDelete -analyzer-store=region -DTEST_INLINABLE_ALLOCATORS -verify %s
 
 #include "Inputs/system-header-simulator-cxx.h"
 
diff --git a/test/Analysis/new-ctor-conservative.cpp b/test/Analysis/new-ctor-conservative.cpp
new file mode 100644
index 000000000000..b82df9abf1e2
--- /dev/null
+++ b/test/Analysis/new-ctor-conservative.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_analyze_cc1 -w -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+void clang_analyzer_warnIfReached();
+
+struct S {
+  int x;
+  S() : x(1) {}
+  ~S() {}
+};
+
+void checkConstructorInlining() {
+  S *s = new S;
+  clang_analyzer_eval(s->x == 1); // expected-warning{{TRUE}}
+}
+
+void checkNewPOD() {
+  int *i = new int;
+  clang_analyzer_eval(*i == 0); // expected-warning{{UNKNOWN}}
+  int *j = new int();
+  clang_analyzer_eval(*j == 0); // expected-warning{{TRUE}}
+  int *k = new int(5);
+  clang_analyzer_eval(*k == 5); // expected-warning{{TRUE}}
+}
+
+void checkNewArray() {
+  S *s = new S[10];
+  // FIXME: Should be true once we inline array constructors.
+  clang_analyzer_eval(s[0].x == 1); // expected-warning{{UNKNOWN}}
+}
+
+struct NullS {
+  NullS() {
+    if (this) {}
+  }
+  NullS(int x) {
+    if (!this) {
+      clang_analyzer_warnIfReached(); // no-warning
+    }
+  }
+};
+
+void checkNullThis() {
+  NullS *nulls = new NullS(); // no-crash
+  NullS *nulls2 = new NullS(0);
+}
diff --git a/test/Analysis/new-ctor-inlined.cpp b/test/Analysis/new-ctor-inlined.cpp
new file mode 100644
index 000000000000..1506bf27f353
--- /dev/null
+++ b/test/Analysis/new-ctor-inlined.cpp
@@ -0,0 +1,55 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+
+typedef __typeof__(sizeof(int)) size_t;
+
+void *conjure();
+void exit(int);
+
+void *operator new(size_t size) throw() {
+  void *x = conjure();
+  if (x == 0)
+    exit(1);
+  return x;
+}
+
+struct S {
+  int x;
+  S() : x(1) {}
+  ~S() {}
+};
+
+void checkNewAndConstructorInlining() {
+  S *s = new S;
+  // Check that the symbol for 's' is not dying.
+  clang_analyzer_eval(s != 0); // expected-warning{{TRUE}}
+  // Check that bindings are correct (and also not dying).
+  clang_analyzer_eval(s->x == 1); // expected-warning{{TRUE}}
+}
+
+struct Sp {
+  Sp *p;
+  Sp(Sp *p): p(p) {}
+  ~Sp() {}
+};
+
+void checkNestedNew() {
+  Sp *p = new Sp(new Sp(0));
+  clang_analyzer_eval(p->p->p == 0); // expected-warning{{TRUE}}
+}
+
+void checkNewPOD() {
+  int *i = new int;
+  clang_analyzer_eval(*i == 0); // expected-warning{{UNKNOWN}}
+  int *j = new int();
+  clang_analyzer_eval(*j == 0); // expected-warning{{TRUE}}
+  int *k = new int(5);
+  clang_analyzer_eval(*k == 5); // expected-warning{{TRUE}}
+}
+
+void checkTrivialCopy() {
+  S s;
+  S *t = new S(s); // no-crash
+  clang_analyzer_eval(t->x == 1); // expected-warning{{TRUE}}
+}
diff --git a/test/Analysis/new-ctor-malloc.cpp b/test/Analysis/new-ctor-malloc.cpp
new file mode 100644
index 000000000000..74b1e21a5c97
--- /dev/null
+++ b/test/Analysis/new-ctor-malloc.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection,unix.Malloc -analyzer-config c++-allocator-inlining=true -analyzer-output=text -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+
+typedef __typeof__(sizeof(int)) size_t;
+
+void *malloc(size_t size);
+
+void *operator new(size_t size) throw() {
+  void *x = malloc(size); // expected-note {{Memory is allocated}}
+  if (!x) // expected-note    {{Assuming 'x' is non-null}}
+          // expected-note@-1 {{Taking false branch}}
+    return nullptr;
+  return x;
+}
+
+void checkNewAndConstructorInlining() {
+  int *s = new int; // expected-note   {{Calling 'operator new'}}
+                    // expected-note@-1{{Returning from 'operator new'}}
+} // expected-warning {{Potential leak of memory pointed to by 's'}}
+  // expected-note@-1 {{Potential leak of memory pointed to by 's'}}
diff --git a/test/Analysis/new-ctor-null-throw.cpp b/test/Analysis/new-ctor-null-throw.cpp
new file mode 100644
index 000000000000..cdaf01d8de67
--- /dev/null
+++ b/test/Analysis/new-ctor-null-throw.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_analyze_cc1 -w -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+
+typedef __typeof__(sizeof(int)) size_t;
+
+
+// These are ill-formed. One cannot return nullptr from a throwing version of an
+// operator new.
+void *operator new(size_t size) {
+  return nullptr;
+}
+void *operator new[](size_t size) {
+  return nullptr;
+}
+
+struct S {
+  int x;
+  S() : x(1) {}
+  ~S() {}
+};
+
+void testArrays() {
+  S *s = new S[10]; // no-crash
+  s[0].x = 2; // expected-warning{{Dereference of null pointer}}
+}
diff --git a/test/Analysis/new-ctor-null.cpp b/test/Analysis/new-ctor-null.cpp
new file mode 100644
index 000000000000..ac2a39a02840
--- /dev/null
+++ b/test/Analysis/new-ctor-null.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+void clang_analyzer_warnIfReached();
+
+typedef __typeof__(sizeof(int)) size_t;
+
+void *operator new(size_t size) throw() {
+  return nullptr;
+}
+void *operator new[](size_t size) throw() {
+  return nullptr;
+}
+
+struct S {
+  int x;
+  S() : x(1) {
+    // FIXME: Constructor should not be called with null this, even if it was
+    // returned by operator new().
+    clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+  }
+  ~S() {}
+};
+
+void testArrays() {
+  S *s = new S[10]; // no-crash
+  s[0].x = 2; // expected-warning{{Dereference of null pointer}}
+}
+
+int global;
+void testInvalidationOnConstructionIntoNull() {
+  global = 0;
+  S *s = new S();
+  // FIXME: Should be FALSE - we should not invalidate globals.
+  clang_analyzer_eval(global); // expected-warning{{UNKNOWN}}
+}
diff --git a/test/Analysis/new-ctor-recursive.cpp b/test/Analysis/new-ctor-recursive.cpp
new file mode 100644
index 000000000000..e0c654bc74ab
--- /dev/null
+++ b/test/Analysis/new-ctor-recursive.cpp
@@ -0,0 +1,118 @@
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+void clang_analyzer_dump(int);
+
+typedef __typeof__(sizeof(int)) size_t;
+
+void *conjure();
+void exit(int);
+
+struct S;
+
+S *global_s;
+
+// Recursive operator kinda placement new.
+void *operator new(size_t size, S *place);
+
+enum class ConstructionKind : char {
+  Garbage,
+  Recursive
+};
+
+struct S {
+public:
+  int x;
+  S(): x(1) {}
+  S(int y): x(y) {}
+
+  S(ConstructionKind k) {
+    switch (k) {
+    case ConstructionKind::Recursive: { // Call one more operator new 'r'ecursively.
+      S *s = new (nullptr) S(5);
+      x = s->x + 1;
+      global_s = s;
+      return;
+    }
+    case ConstructionKind::Garbage: {
+      // Leaves garbage in 'x'.
+    }
+    }
+  }
+  ~S() {}
+};
+
+// Do not try this at home!
+void *operator new(size_t size, S *place) {
+  if (!place)
+    return new S();
+  return place;
+}
+
+void testThatCharConstructorIndeedYieldsGarbage() {
+  S *s = new S(ConstructionKind::Garbage);
+  clang_analyzer_eval(s->x == 0); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval(s->x == 1); // expected-warning{{UNKNOWN}}
+  // FIXME: This should warn, but MallocChecker doesn't default-bind regions
+  // returned by standard operator new to garbage.
+  s->x += 1; // no-warning
+  delete s;
+}
+
+
+void testChainedOperatorNew() {
+  S *s;
+  // * Evaluate standard new.
+  // * Evaluate constructor S(3).
+  // * Bind value for standard new.
+  // * Evaluate our custom new.
+  // * Evaluate constructor S(Garbage).
+  // * Bind value for our custom new.
+  s = new (new S(3)) S(ConstructionKind::Garbage);
+  clang_analyzer_eval(s->x == 3); // expected-warning{{TRUE}}
+  // expected-warning@+9{{Potential leak of memory pointed to by 's'}}
+
+  // * Evaluate standard new.
+  // * Evaluate constructor S(Garbage).
+  // * Bind value for standard new.
+  // * Evaluate our custom new.
+  // * Evaluate constructor S(4).
+  // * Bind value for our custom new.
+  s = new (new S(ConstructionKind::Garbage)) S(4);
+  clang_analyzer_eval(s->x == 4); // expected-warning{{TRUE}}
+  delete s;
+
+  // -> Enter our custom new (nullptr).
+  //   * Evaluate standard new.
+  //   * Inline constructor S().
+  //   * Bind value for standard new.
+  // <- Exit our custom new (nullptr).
+  // * Evaluate constructor S(Garbage).
+  // * Bind value for our custom new.
+  s = new (nullptr) S(ConstructionKind::Garbage);
+  clang_analyzer_eval(s->x == 1); // expected-warning{{TRUE}}
+  delete s;
+
+  // -> Enter our custom new (nullptr).
+  //   * Evaluate standard new.
+  //   * Inline constructor S().
+  //   * Bind value for standard new.
+  // <- Exit our custom new (nullptr).
+  // -> Enter constructor S(Recursive).
+  //   -> Enter our custom new (nullptr).
+  //     * Evaluate standard new.
+  //     * Inline constructor S().
+  //     * Bind value for standard new.
+  //   <- Exit our custom new (nullptr).
+  //   * Evaluate constructor S(5).
+  //   * Bind value for our custom new (nullptr).
+  //   * Assign that value to global_s.
+  // <- Exit constructor S(Recursive).
+  // * Bind value for our custom new (nullptr).
+  global_s = nullptr;
+  s = new (nullptr) S(ConstructionKind::Recursive);
+  clang_analyzer_eval(global_s); // expected-warning{{TRUE}}
+  clang_analyzer_eval(global_s->x == 5); // expected-warning{{TRUE}}
+  clang_analyzer_eval(s->x == 6); // expected-warning{{TRUE}}
+  delete s;
+}
diff --git a/test/Analysis/new-ctor-symbolic.cpp b/test/Analysis/new-ctor-symbolic.cpp
new file mode 100644
index 000000000000..8bef2ed2402c
--- /dev/null
+++ b/test/Analysis/new-ctor-symbolic.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-allocator-inlining=true -std=c++11 -verify %s
+
+void clang_analyzer_eval(bool);
+void clang_analyzer_warnOnDeadSymbol(int);
+
+typedef __typeof__(sizeof(int)) size_t;
+
+int conjure();
+void exit(int);
+
+struct S {
+  S() {}
+  ~S() {}
+
+  static S buffer[1000];
+
+  // This operator allocates stuff within the buffer. Additionally, it never
+  // places anything at the beginning of the buffer.
+  void *operator new(size_t size) {
+    int i = conjure();
+    if (i == 0)
+      exit(1);
+    // Let's see if the symbol dies before new-expression is evaluated.
+    // It shouldn't.
+    clang_analyzer_warnOnDeadSymbol(i);
+    return buffer + i;
+  }
+};
+
+void testIndexLiveness() {
+  S *s = new S();
+  clang_analyzer_eval(s == S::buffer); // expected-warning{{FALSE}}
+} // expected-warning{{SYMBOL DEAD}}
diff --git a/test/Analysis/new.cpp b/test/Analysis/new.cpp
index 6cfcb1d92719..4ba8fc14f158 100644
--- a/test/Analysis/new.cpp
+++ b/test/Analysis/new.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-store region -std=c++11 -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-store region -std=c++11 -DTEST_INLINABLE_ALLOCATORS -verify %s
 #include "Inputs/system-header-simulator-cxx.h"
 
 void clang_analyzer_eval(bool);
@@ -34,7 +35,7 @@ void *testPlacementNew() {
 
   void *y = new (x) int;
   clang_analyzer_eval(x == y); // expected-warning{{TRUE}};
-  clang_analyzer_eval(*x == 1); // expected-warning{{UNKNOWN}};
+  clang_analyzer_eval(*x == 1); // expected-warning{{TRUE}};
 
   return y;
 }
@@ -200,8 +201,7 @@ int testNoInitializationPlacement() {
   int n;
   new (&n) int;
 
-  // Should warn that n is uninitialized.
-  if (n) { // no-warning
+  if (n) { // expected-warning{{Branch condition evaluates to a garbage value}}
     return 0;
   }
   return 1;
diff --git a/test/Analysis/plist-macros.cpp b/test/Analysis/plist-macros.cpp
index 18d3ce11e67e..821110e1853b 100644
--- a/test/Analysis/plist-macros.cpp
+++ b/test/Analysis/plist-macros.cpp
@@ -65,6 +65,7 @@ if (y) \
   ;
 int useMultiNote(int *p, int y) {;
   y++;
+  if (p) {}
   multiNoteMacro
 
   return *p; // expected-warning {{Dereference of null pointer}}
@@ -1216,7 +1217,41 @@ void test2(int *p) {
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
 // CHECK-NEXT:            <key>line</key><integer>68</integer>
-// CHECK-NEXT:            <key>col</key><integer>16</integer>
+// CHECK-NEXT:            <key>col</key><integer>4</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:          </array>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>kind</key><string>control</string>
+// CHECK-NEXT:      <key>edges</key>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>start</key>
+// CHECK-NEXT:          <array>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>3</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>4</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:          </array>
+// CHECK-NEXT:         <key>end</key>
+// CHECK-NEXT:          <array>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>7</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>7</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:          </array>
@@ -1228,7 +1263,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
 // CHECK-NEXT:       <key>line</key><integer>68</integer>
-// CHECK-NEXT:       <key>col</key><integer>3</integer>
+// CHECK-NEXT:       <key>col</key><integer>7</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
 // CHECK-NEXT:      <key>ranges</key>
@@ -1236,12 +1271,12 @@ void test2(int *p) {
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
 // CHECK-NEXT:          <key>line</key><integer>68</integer>
-// CHECK-NEXT:          <key>col</key><integer>3</integer>
+// CHECK-NEXT:          <key>col</key><integer>7</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
 // CHECK-NEXT:          <key>line</key><integer>68</integer>
-// CHECK-NEXT:          <key>col</key><integer>16</integer>
+// CHECK-NEXT:          <key>col</key><integer>7</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:        </array>
@@ -1253,10 +1288,44 @@ void test2(int *p) {
 // CHECK-NEXT:      <string>Assuming &apos;p&apos; is null</string>
 // CHECK-NEXT:     </dict>
 // CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>kind</key><string>control</string>
+// CHECK-NEXT:      <key>edges</key>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>start</key>
+// CHECK-NEXT:          <array>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>7</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>col</key><integer>7</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:          </array>
+// CHECK-NEXT:         <key>end</key>
+// CHECK-NEXT:          <array>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>69</integer>
+// CHECK-NEXT:            <key>col</key><integer>3</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:           <dict>
+// CHECK-NEXT:            <key>line</key><integer>69</integer>
+// CHECK-NEXT:            <key>col</key><integer>16</integer>
+// CHECK-NEXT:            <key>file</key><integer>0</integer>
+// CHECK-NEXT:           </dict>
+// CHECK-NEXT:          </array>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <dict>
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>68</integer>
+// CHECK-NEXT:       <key>line</key><integer>69</integer>
 // CHECK-NEXT:       <key>col</key><integer>3</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1264,12 +1333,12 @@ void test2(int *p) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>68</integer>
+// CHECK-NEXT:          <key>line</key><integer>69</integer>
 // CHECK-NEXT:          <key>col</key><integer>3</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>68</integer>
+// CHECK-NEXT:          <key>line</key><integer>69</integer>
 // CHECK-NEXT:          <key>col</key><integer>16</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -1289,12 +1358,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>line</key><integer>69</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>68</integer>
+// CHECK-NEXT:            <key>line</key><integer>69</integer>
 // CHECK-NEXT:            <key>col</key><integer>16</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1302,12 +1371,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>8</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1323,12 +1392,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>8</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1336,12 +1405,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>10</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>70</integer>
+// CHECK-NEXT:            <key>line</key><integer>71</integer>
 // CHECK-NEXT:            <key>col</key><integer>10</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1353,7 +1422,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>70</integer>
+// CHECK-NEXT:       <key>line</key><integer>71</integer>
 // CHECK-NEXT:       <key>col</key><integer>10</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1361,12 +1430,12 @@ void test2(int *p) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>70</integer>
+// CHECK-NEXT:          <key>line</key><integer>71</integer>
 // CHECK-NEXT:          <key>col</key><integer>11</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>70</integer>
+// CHECK-NEXT:          <key>line</key><integer>71</integer>
 // CHECK-NEXT:          <key>col</key><integer>11</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -1387,10 +1456,10 @@ void test2(int *p) {
 // CHECK-NEXT:    <key>issue_hash_content_of_line_in_context</key><string>41f58f9549aa1867e461a7996a8d335c</string>
 // CHECK-NEXT:   <key>issue_context_kind</key><string>function</string>
 // CHECK-NEXT:   <key>issue_context</key><string>useMultiNote</string>
-// CHECK-NEXT:   <key>issue_hash_function_offset</key><string>4</string>
+// CHECK-NEXT:   <key>issue_hash_function_offset</key><string>5</string>
 // CHECK-NEXT:   <key>location</key>
 // CHECK-NEXT:   <dict>
-// CHECK-NEXT:    <key>line</key><integer>70</integer>
+// CHECK-NEXT:    <key>line</key><integer>71</integer>
 // CHECK-NEXT:    <key>col</key><integer>10</integer>
 // CHECK-NEXT:    <key>file</key><integer>0</integer>
 // CHECK-NEXT:   </dict>
@@ -1402,7 +1471,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>82</integer>
+// CHECK-NEXT:       <key>line</key><integer>83</integer>
 // CHECK-NEXT:       <key>col</key><integer>3</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1410,12 +1479,12 @@ void test2(int *p) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>82</integer>
+// CHECK-NEXT:          <key>line</key><integer>83</integer>
 // CHECK-NEXT:          <key>col</key><integer>3</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>82</integer>
+// CHECK-NEXT:          <key>line</key><integer>83</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -1431,7 +1500,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>82</integer>
+// CHECK-NEXT:       <key>line</key><integer>83</integer>
 // CHECK-NEXT:       <key>col</key><integer>3</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1439,12 +1508,12 @@ void test2(int *p) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>82</integer>
+// CHECK-NEXT:          <key>line</key><integer>83</integer>
 // CHECK-NEXT:          <key>col</key><integer>3</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>82</integer>
+// CHECK-NEXT:          <key>line</key><integer>83</integer>
 // CHECK-NEXT:          <key>col</key><integer>12</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -1460,7 +1529,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>75</integer>
+// CHECK-NEXT:       <key>line</key><integer>76</integer>
 // CHECK-NEXT:       <key>col</key><integer>1</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1478,12 +1547,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>75</integer>
+// CHECK-NEXT:            <key>line</key><integer>76</integer>
 // CHECK-NEXT:            <key>col</key><integer>1</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>75</integer>
+// CHECK-NEXT:            <key>line</key><integer>76</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1491,12 +1560,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>76</integer>
+// CHECK-NEXT:            <key>line</key><integer>77</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>76</integer>
+// CHECK-NEXT:            <key>line</key><integer>77</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1512,12 +1581,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>76</integer>
+// CHECK-NEXT:            <key>line</key><integer>77</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>76</integer>
+// CHECK-NEXT:            <key>line</key><integer>77</integer>
 // CHECK-NEXT:            <key>col</key><integer>4</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1525,12 +1594,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1546,12 +1615,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>start</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>3</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1559,12 +1628,12 @@ void test2(int *p) {
 // CHECK-NEXT:         <key>end</key>
 // CHECK-NEXT:          <array>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>6</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
 // CHECK-NEXT:           <dict>
-// CHECK-NEXT:            <key>line</key><integer>78</integer>
+// CHECK-NEXT:            <key>line</key><integer>79</integer>
 // CHECK-NEXT:            <key>col</key><integer>6</integer>
 // CHECK-NEXT:            <key>file</key><integer>0</integer>
 // CHECK-NEXT:           </dict>
@@ -1576,7 +1645,7 @@ void test2(int *p) {
 // CHECK-NEXT:      <key>kind</key><string>event</string>
 // CHECK-NEXT:      <key>location</key>
 // CHECK-NEXT:      <dict>
-// CHECK-NEXT:       <key>line</key><integer>78</integer>
+// CHECK-NEXT:       <key>line</key><integer>79</integer>
 // CHECK-NEXT:       <key>col</key><integer>6</integer>
 // CHECK-NEXT:       <key>file</key><integer>0</integer>
 // CHECK-NEXT:      </dict>
@@ -1584,12 +1653,12 @@ void test2(int *p) {
 // CHECK-NEXT:      <array>
 // CHECK-NEXT:        <array>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>78</integer>
+// CHECK-NEXT:          <key>line</key><integer>79</integer>
 // CHECK-NEXT:          <key>col</key><integer>4</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
 // CHECK-NEXT:         <dict>
-// CHECK-NEXT:          <key>line</key><integer>78</integer>
+// CHECK-NEXT:          <key>line</key><integer>79</integer>
 // CHECK-NEXT:          <key>col</key><integer>4</integer>
 // CHECK-NEXT:          <key>file</key><integer>0</integer>
 // CHECK-NEXT:         </dict>
@@ -1613,7 +1682,7 @@ void test2(int *p) {
 // CHECK-NEXT:   <key>issue_hash_function_offset</key><string>3</string>
 // CHECK-NEXT:   <key>location</key>
 // CHECK-NEXT:   <dict>
-// CHECK-NEXT:    <key>line</key><integer>78</integer>
+// CHECK-NEXT:    <key>line</key><integer>79</integer>
 // CHECK-NEXT:    <key>col</key><integer>6</integer>
 // CHECK-NEXT:    <key>file</key><integer>0</integer>
 // CHECK-NEXT:   </dict>
diff --git a/test/Analysis/security-syntax-checks.m b/test/Analysis/security-syntax-checks.m
index 04a4c7d86655..8560ba29e702 100644
--- a/test/Analysis/security-syntax-checks.m
+++ b/test/Analysis/security-syntax-checks.m
@@ -146,6 +146,16 @@ void test_strcpy() {
   strcpy(x, y); //expected-warning{{Call to function 'strcpy' is insecure as it does not provide bounding of the memory buffer. Replace unbounded copy functions with analogous functions that support length arguments such as 'strlcpy'. CWE-119}}
 }
 
+void test_strcpy_2() {
+  char x[4];
+  strcpy(x, "abcd"); //expected-warning{{Call to function 'strcpy' is insecure as it does not provide bounding of the memory buffer. Replace unbounded copy functions with analogous functions that support length arguments such as 'strlcpy'. CWE-119}}
+}
+
+void test_strcpy_safe() {
+  char x[5];
+  strcpy(x, "abcd");
+}
+
 //===----------------------------------------------------------------------===
 // strcat()
 //===----------------------------------------------------------------------===
diff --git a/test/Analysis/temporaries.cpp b/test/Analysis/temporaries.cpp
index 99851dd68539..9e2d45f693af 100644
--- a/test/Analysis/temporaries.cpp
+++ b/test/Analysis/temporaries.cpp
@@ -434,7 +434,7 @@ namespace destructors {
     const CtorWithNoReturnDtor &c = CtorWithNoReturnDtor();
 
     // This represents an (expected) loss of coverage, since the destructor
-    // of the lifetime-exended temporary is executed at at the end of
+    // of the lifetime-exended temporary is executed at the end of
     // scope.
     clang_analyzer_warnIfReached();  // no-warning
   }
diff --git a/test/Analysis/uninit-const.cpp b/test/Analysis/uninit-const.cpp
index 2ffc71bcbd66..f5166e6b27b2 100644
--- a/test/Analysis/uninit-const.cpp
+++ b/test/Analysis/uninit-const.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus.NewDelete,core,alpha.core.CallAndMessageUnInitRefArg -analyzer-output=text -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=cplusplus.NewDelete,core,alpha.core.CallAndMessageUnInitRefArg -analyzer-output=text -DTEST_INLINABLE_ALLOCATORS -verify %s
 // Passing uninitialized const data to unknown function
 
 #include "Inputs/system-header-simulator-cxx.h"
diff --git a/test/Analysis/virtualcall.cpp b/test/Analysis/virtualcall.cpp
index c22a8463c04a..1929abf6f0aa 100644
--- a/test/Analysis/virtualcall.cpp
+++ b/test/Analysis/virtualcall.cpp
@@ -262,6 +262,9 @@ int main() {
 	//expected-note-re@-2 {{{{^}}Calling default constructor for 'M'}}
 #endif
   Y *y = new Y;
+#if !PUREONLY
+  //expected-note-re@-2 {{{{^}}Calling default constructor for 'Y'}}
+#endif
   delete y;
   header::Z z;
 #if !PUREONLY
diff --git a/test/CXX/dcl.decl/dcl.meaning/p1-0x.cpp b/test/CXX/dcl.decl/dcl.meaning/p1-0x.cpp
index 1d04d7d2a700..fbe9c0895aea 100644
--- a/test/CXX/dcl.decl/dcl.meaning/p1-0x.cpp
+++ b/test/CXX/dcl.decl/dcl.meaning/p1-0x.cpp
@@ -101,4 +101,25 @@ namespace inline_namespaces {
   template<> struct N::V<int> {};
   template struct N::V<int*>;
   template struct N::V<char>; // expected-error {{undefined}}
+
+  struct Q {};
+
+  // Perversely, inline anonymous namespaces can cause an ostensibly
+  // external-linkage declaration to acquire internal linkage when
+  // redeclared with a qualified name.
+  inline namespace {
+    struct Q {} q;
+    int f_in_inline();
+    extern int v_in_inline;
+    typedef int t_in_inline;
+  }
+  // FIXME: These "extra qualification" warnings are bogus: the qualification
+  // changes the meaning of the program.
+  int inline_namespaces::f_in_inline() { // expected-warning {{extra qualification}}
+    // Finds <anon>::Q, not inline_namespaces::Q
+    Q x = q;
+    return 0;
+  }
+  int inline_namespaces::v_in_inline = // expected-warning {{extra qualification}}
+    (Q(q), 0);
 }
diff --git a/test/CXX/drs/dr1xx.cpp b/test/CXX/drs/dr1xx.cpp
index d62ed9f0d36d..ee9f6a1feb81 100644
--- a/test/CXX/drs/dr1xx.cpp
+++ b/test/CXX/drs/dr1xx.cpp
@@ -238,7 +238,20 @@ namespace dr125 {
   };
 }
 
-namespace dr126 { // dr126: no
+namespace dr126 { // dr126: partial
+  // FIXME: We do not yet generate correct code for this change:
+  // eg:
+  //   catch (void*&) should catch void* but not int*
+  //   catch (void*) and catch (void*const&) should catch both
+  // Likewise:
+  //   catch (Base *&) should catch Base* but not Derived*
+  //   catch (Base *) should catch both
+  // In each case, we emit the same code for both catches.
+  //
+  // The ABI does not let us represent the language rule in the unwind tables.
+  // So, when catching by non-const (or volatile) reference to pointer, we
+  // should compare the exception type to the caught type and only accept an
+  // exact match.
 #if __cplusplus <= 201402L
   struct C {};
   struct D : C {};
@@ -262,12 +275,13 @@ namespace dr126 { // dr126: no
     virtual void gr() throw(C&);
     virtual void hr() throw(C&); // expected-note {{overridden}}
 
-    virtual void pv() throw(void*); // expected-note {{overridden}}
+    virtual void pv() throw(void*);
 
 #if __cplusplus >= 201103L
-    virtual void np() throw(C*); // expected-note {{overridden}}
-    virtual void npm() throw(int C::*); // expected-note {{overridden}}
-    virtual void nr() throw(C&); // expected-note {{overridden}}
+    virtual void np() throw(C*);
+    virtual void npm() throw(int C::*);
+    virtual void nr() throw(C*&); // expected-note {{overridden}}
+    virtual void ncr() throw(C*const&);
 #endif
 
     virtual void ref1() throw(C *const&);
@@ -275,7 +289,7 @@ namespace dr126 { // dr126: no
 
     virtual void v() throw(int);
     virtual void w() throw(const int);
-    virtual void x() throw(int*);
+    virtual void x() throw(int*); // expected-note {{overridden}}
     virtual void y() throw(const int*);
     virtual void z() throw(int); // expected-note {{overridden}}
   };
@@ -294,13 +308,14 @@ namespace dr126 { // dr126: no
     virtual void gr() throw(G&);
     virtual void hr() throw(H&); // expected-error {{more lax}}
 
-    virtual void pv() throw(C*); // expected-error {{more lax}} FIXME: This is valid.
+    virtual void pv() throw(C*);
 
 #if __cplusplus >= 201103L
     using nullptr_t = decltype(nullptr);
-    virtual void np() throw(nullptr_t*); // expected-error {{more lax}} FIXME: This is valid.
-    virtual void npm() throw(nullptr_t*); // expected-error {{more lax}} FIXME: This is valid.
-    virtual void nr() throw(nullptr_t&); // expected-error {{more lax}} This is not.
+    virtual void np() throw(nullptr_t);
+    virtual void npm() throw(nullptr_t&);
+    virtual void nr() throw(nullptr_t); // expected-error {{more lax}}
+    virtual void ncr() throw(nullptr_t);
 #endif
 
     virtual void ref1() throw(D *const &);
@@ -308,7 +323,7 @@ namespace dr126 { // dr126: no
 
     virtual void v() throw(const int);
     virtual void w() throw(int);
-    virtual void x() throw(const int*); // FIXME: 'const int*' is not allowed by A::h.
+    virtual void x() throw(const int*); // expected-error {{more lax}}
     virtual void y() throw(int*); // ok
     virtual void z() throw(long); // expected-error {{more lax}}
   };
diff --git a/test/CXX/over/over.match/over.match.funcs/p4-0x.cpp b/test/CXX/over/over.match/over.match.funcs/p4-0x.cpp
index 68c79900b951..df7762703c0f 100644
--- a/test/CXX/over/over.match/over.match.funcs/p4-0x.cpp
+++ b/test/CXX/over/over.match/over.match.funcs/p4-0x.cpp
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
-// expected-no-diagnostics
 
 template<typename T> T &lvalue();
 template<typename T> T &&xvalue();
@@ -20,6 +19,18 @@ struct X0 {
 
   void g();
 
+  void c() const; // expected-note {{'c' declared here}}
+  void v() volatile; // expected-note {{'v' declared here}}
+  void r() __restrict__; // expected-note {{'r' declared here}}
+  void cr() const __restrict__; // expected-note {{'cr' declared here}}
+  void cv() const volatile;
+  void vr() volatile __restrict__; // expected-note {{'vr' declared here}}
+  void cvr() const volatile __restrict__;
+
+  void lvalue() &; // expected-note 2 {{'lvalue' declared here}}
+  void const_lvalue() const&;
+  void rvalue() &&; // expected-note {{'rvalue' declared here}}
+
   int &operator+(const X0&) &;
   float &operator+(const X0&) &&;
 
@@ -32,7 +43,7 @@ struct X0 {
   float &h2() const&&;
 };
 
-void X0::g() {
+void X0::g() { // expected-note {{'g' declared here}}
   int &ir1 = f();
   int &ir2 = X0::f();
 }
@@ -69,3 +80,26 @@ void test_ref_qualifier_overloading() {
   float &fr3 = xvalue<X0>().h2();
   float &fr4 = prvalue<X0>().h2();
 }
+
+void test_diagnostics(const volatile X0 &__restrict__ cvr) {
+  cvr.g(); // expected-error {{'this' argument to member function 'g' has type 'const volatile X0', but function is not marked const or volatile}}
+  cvr.c(); // expected-error {{not marked volatile}}
+  cvr.v(); // expected-error {{not marked const}}
+  cvr.r(); // expected-error {{not marked const or volatile}}
+  cvr.cr(); // expected-error {{not marked volatile}}
+  cvr.cv();
+  cvr.vr(); // expected-error {{not marked const}}
+  cvr.cvr();
+
+  lvalue<X0>().lvalue();
+  lvalue<X0>().const_lvalue();
+  lvalue<X0>().rvalue(); // expected-error {{'this' argument to member function 'rvalue' is an lvalue, but function has rvalue ref-qualifier}}
+
+  xvalue<X0>().lvalue(); // expected-error {{'this' argument to member function 'lvalue' is an rvalue, but function has non-const lvalue ref-qualifier}}
+  xvalue<X0>().const_lvalue();
+  xvalue<X0>().rvalue();
+
+  prvalue<X0>().lvalue(); // expected-error {{'this' argument to member function 'lvalue' is an rvalue, but function has non-const lvalue ref-qualifier}}
+  prvalue<X0>().const_lvalue();
+  prvalue<X0>().rvalue();
+}
diff --git a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
index ad6086835876..34202494c0d8 100644
--- a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
+++ b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
@@ -215,7 +215,7 @@ template void h<A(&)[13], int>(A(&)[13]); // expected-note {{requested here}}
 template<typename T>
 void i(T t) {
   for (auto u : t) { // expected-error {{invalid range expression of type 'X::A *'; no viable 'begin' function available}} \
-                        expected-error {{member function 'begin' not viable}} \
+                        expected-error {{'this' argument to member function 'begin' has type 'const X::A', but function is not marked const}} \
                         expected-note {{when looking up 'begin' function}}
 
   }
diff --git a/test/CodeCompletion/Inputs/comments.h b/test/CodeCompletion/Inputs/comments.h
new file mode 100644
index 000000000000..7b4b5daa0bd5
--- /dev/null
+++ b/test/CodeCompletion/Inputs/comments.h
@@ -0,0 +1,4 @@
+// PR32732
+struct B {
+  // <- code completion
+};
diff --git a/test/CodeCompletion/comments.cpp b/test/CodeCompletion/comments.cpp
new file mode 100644
index 000000000000..21f1465ebc04
--- /dev/null
+++ b/test/CodeCompletion/comments.cpp
@@ -0,0 +1,13 @@
+// Note: the run lines follow their respective tests, since line/column
+// matter in this test.
+
+#include "comments.h"
+
+struct A {
+  // <- code completion
+  /* <- code completion */
+};
+
+// RUN: %clang_cc1 -I %S/Inputs -fsyntax-only -code-completion-at=%s:7:6 %s
+// RUN: %clang_cc1 -I %S/Inputs -fsyntax-only -code-completion-at=%s:8:6 %s
+// RUN: %clang_cc1 -I %S/Inputs -fsyntax-only -code-completion-at=%S/Inputs/comments.h:3:6 %s
diff --git a/test/CodeCompletion/inside-macros.cpp b/test/CodeCompletion/inside-macros.cpp
new file mode 100644
index 000000000000..dc40c6ad318c
--- /dev/null
+++ b/test/CodeCompletion/inside-macros.cpp
@@ -0,0 +1,13 @@
+#define ID(X) X
+
+void test(bool input_var) {
+  ID(input_var) = true;
+  // Check that input_var shows up when completing at the start, in the middle
+  // and at the end of the identifier.
+  //
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:4:6 %s -o - | FileCheck %s
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:4:8 %s -o - | FileCheck %s
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:4:15 %s -o - | FileCheck %s
+
+  // CHECK: input_var
+}
diff --git a/test/CodeCompletion/member-access.cpp b/test/CodeCompletion/member-access.cpp
index a0dc7b485679..c21265ec1130 100644
--- a/test/CodeCompletion/member-access.cpp
+++ b/test/CodeCompletion/member-access.cpp
@@ -16,6 +16,8 @@ struct Base3 : Base1, Base2 {
 };
 
 struct Derived : Base3 {
+  template <typename T> Derived(T);
+  Derived(int);
   int member4;
   int memfun3(int);
 };
@@ -48,7 +50,7 @@ struct Bar {
   }
 };
 
-  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:29:6 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:31:6 %s -o - | FileCheck -check-prefix=CHECK-CC1 --implicit-check-not="Derived : Derived(" %s
   // CHECK-CC1: Base1 : Base1::
   // CHECK-CC1: member1 : [#int#][#Base1::#]member1
   // CHECK-CC1: member1 : [#int#][#Base2::#]member1
@@ -62,10 +64,10 @@ struct Bar {
   // CHECK-CC1: memfun3 : [#int#]memfun3(<#int#>)
 
 // Make sure this doesn't crash
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:36:7 %s -verify
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:38:7 %s -verify
 
 // Make sure this also doesn't crash
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:47:14 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:49:14 %s
 
 
 template<typename T>
@@ -100,8 +102,8 @@ void completeDependentMembers(TemplateClass<T, S> &object,
 // CHECK-CC2: overload1 : [#void#]overload1(<#const T &#>)
 // CHECK-CC2: overload1 : [#void#]overload1(<#const S &#>)
 
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:92:10 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:93:12 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:94:10 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:95:12 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
 }
 
 
@@ -118,8 +120,8 @@ void completeDependentSpecializedMembers(TemplateClass<int, double> &object,
 // CHECK-CC3: overload1 : [#void#]overload1(<#const int &#>)
 // CHECK-CC3: overload1 : [#void#]overload1(<#const double &#>)
 
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:110:10 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:111:12 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:112:10 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:113:12 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
 }
 
 template <typename T>
@@ -133,17 +135,17 @@ class Template {
 // CHECK-CC4: BaseTemplate : BaseTemplate::
 // CHECK-CC4: baseTemplateField : [#int#]baseTemplateField
 // CHECK-CC4: baseTemplateFunction : [#int#]baseTemplateFunction()
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:132:8 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:134:8 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s
     o2.baseTemplateField;
 // CHECK-CC5: BaseTemplate : BaseTemplate::
 // CHECK-CC5: baseTemplateField : [#T#]baseTemplateField
 // CHECK-CC5: baseTemplateFunction : [#T#]baseTemplateFunction()
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:137:8 %s -o - | FileCheck -check-prefix=CHECK-CC5 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:139:8 %s -o - | FileCheck -check-prefix=CHECK-CC5 %s
     this->o1;
 // CHECK-CC6: [#void#]function()
 // CHECK-CC6: o1 : [#BaseTemplate<int>#]o1
 // CHECK-CC6: o2 : [#BaseTemplate<T>#]o2
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:142:11 %s -o - | FileCheck -check-prefix=CHECK-CC6 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:144:11 %s -o - | FileCheck -check-prefix=CHECK-CC6 %s
   }
 
   static void staticFn(T &obj);
@@ -160,7 +162,7 @@ void dependentColonColonCompletion() {
 // CHECK-CC7: o2 : [#BaseTemplate<T>#]o2
 // CHECK-CC7: staticFn : [#void#]staticFn(<#T &obj#>)
 // CHECK-CC7: Template : Template
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:156:16 %s -o - | FileCheck -check-prefix=CHECK-CC7 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:158:16 %s -o - | FileCheck -check-prefix=CHECK-CC7 %s
   typename Template<T>::Nested m;
-// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:164:25 %s -o - | FileCheck -check-prefix=CHECK-CC7 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:166:25 %s -o - | FileCheck -check-prefix=CHECK-CC7 %s
 }
diff --git a/test/CodeGen/2007-11-07-CopyAggregateAlign.c b/test/CodeGen/2007-11-07-CopyAggregateAlign.c
index 08d97702e69a..769a38a7695b 100644
--- a/test/CodeGen/2007-11-07-CopyAggregateAlign.c
+++ b/test/CodeGen/2007-11-07-CopyAggregateAlign.c
@@ -2,6 +2,6 @@
 struct A { char s, t, u, v; short a; };
 // CHECK: %a = alloca %struct.A, align 2
 // CHECK: %b = alloca %struct.A, align 2
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}}, i32 2, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}} align 2 {{.*}} align 2 {{.*}}, i1 false)
 
 void q() { struct A a, b; a = b; }
diff --git a/test/CodeGen/2007-11-07-ZeroAggregateAlign.c b/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
index b059607ba586..2b8cfe1f8f77 100644
--- a/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
+++ b/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
 struct A { short s; short t; int i; };
 // CHECK: %a = alloca %struct.A, align 4
-// CHECK: call void @llvm.memset.p0i8.{{.*}}i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.{{.*}} align 4 {{.*}}, i1 false)
 void q() { struct A a = {0}; }
diff --git a/test/CodeGen/64bit-swiftcall.c b/test/CodeGen/64bit-swiftcall.c
index 92ba37cd7fe6..c5944b99b596 100644
--- a/test/CodeGen/64bit-swiftcall.c
+++ b/test/CodeGen/64bit-swiftcall.c
@@ -216,10 +216,10 @@ TEST(struct_misaligned_1)
 // CHECK:  [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
 // CHECK:  [[RES:%.*]] = alloca [[STRUCT]], align 1
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
+// CHECK:  call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
 // CHECK:  [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
 // CHECK:  [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align 1 [[CASTRET]], i8* align 1 [[CASTRES]], i64 5
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 1
@@ -269,10 +269,10 @@ TEST(union_het_fp)
 // CHECK:  [[RET:%.*]] = alloca [[UNION:%.*]], align 8
 // CHECK:  [[RES:%.*]] = alloca [[UNION]], align 8
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CAST]]
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align 8 [[CAST]]
 // CHECK:  [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
 // CHECK:  [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align 8 [[CASTRET]], i8* align 8 [[CASTRES]]
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 8
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index cbc2e72fcbac..fb9f776c4ad2 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -9203,7 +9203,7 @@ poly16x4_t test_vld1_p16(poly16_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x2_t [[TMP5]]
 uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
@@ -9221,7 +9221,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
@@ -9239,7 +9239,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
@@ -9257,7 +9257,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
@@ -9274,7 +9274,7 @@ uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x2_t [[TMP5]]
 int8x16x2_t test_vld2q_s8(int8_t const *a) {
@@ -9292,7 +9292,7 @@ int8x16x2_t test_vld2q_s8(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_s16(int16_t const *a) {
@@ -9310,7 +9310,7 @@ int16x8x2_t test_vld2q_s16(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_s32(int32_t const *a) {
@@ -9328,7 +9328,7 @@ int32x4x2_t test_vld2q_s32(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_s64(int64_t const *a) {
@@ -9346,7 +9346,7 @@ int64x2x2_t test_vld2q_s64(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_f16(float16_t const *a) {
@@ -9364,7 +9364,7 @@ float16x8x2_t test_vld2q_f16(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_f32(float32_t const *a) {
@@ -9382,7 +9382,7 @@ float32x4x2_t test_vld2q_f32(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_f64(float64_t const *a) {
@@ -9399,7 +9399,7 @@ float64x2x2_t test_vld2q_f64(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x2_t [[TMP5]]
 poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
@@ -9417,7 +9417,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
@@ -9434,7 +9434,7 @@ poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x2_t [[TMP5]]
 uint8x8x2_t test_vld2_u8(uint8_t const *a) {
@@ -9452,7 +9452,7 @@ uint8x8x2_t test_vld2_u8(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_u16(uint16_t const *a) {
@@ -9470,7 +9470,7 @@ uint16x4x2_t test_vld2_u16(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_u32(uint32_t const *a) {
@@ -9488,7 +9488,7 @@ uint32x2x2_t test_vld2_u32(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_u64(uint64_t const *a) {
@@ -9505,7 +9505,7 @@ uint64x1x2_t test_vld2_u64(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x2_t [[TMP5]]
 int8x8x2_t test_vld2_s8(int8_t const *a) {
@@ -9523,7 +9523,7 @@ int8x8x2_t test_vld2_s8(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_s16(int16_t const *a) {
@@ -9541,7 +9541,7 @@ int16x4x2_t test_vld2_s16(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_s32(int32_t const *a) {
@@ -9559,7 +9559,7 @@ int32x2x2_t test_vld2_s32(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_s64(int64_t const *a) {
@@ -9577,7 +9577,7 @@ int64x1x2_t test_vld2_s64(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_f16(float16_t const *a) {
@@ -9595,7 +9595,7 @@ float16x4x2_t test_vld2_f16(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_f32(float32_t const *a) {
@@ -9613,7 +9613,7 @@ float32x2x2_t test_vld2_f32(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_f64(float64_t const *a) {
@@ -9630,7 +9630,7 @@ float64x1x2_t test_vld2_f64(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x2_t [[TMP5]]
 poly8x8x2_t test_vld2_p8(poly8_t const *a) {
@@ -9648,7 +9648,7 @@ poly8x8x2_t test_vld2_p8(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_p16(poly16_t const *a) {
@@ -9665,7 +9665,7 @@ poly16x4x2_t test_vld2_p16(poly16_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x3_t [[TMP5]]
 uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
@@ -9683,7 +9683,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
@@ -9701,7 +9701,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
@@ -9719,7 +9719,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
@@ -9736,7 +9736,7 @@ uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x3_t [[TMP5]]
 int8x16x3_t test_vld3q_s8(int8_t const *a) {
@@ -9754,7 +9754,7 @@ int8x16x3_t test_vld3q_s8(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_s16(int16_t const *a) {
@@ -9772,7 +9772,7 @@ int16x8x3_t test_vld3q_s16(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_s32(int32_t const *a) {
@@ -9790,7 +9790,7 @@ int32x4x3_t test_vld3q_s32(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_s64(int64_t const *a) {
@@ -9808,7 +9808,7 @@ int64x2x3_t test_vld3q_s64(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_f16(float16_t const *a) {
@@ -9826,7 +9826,7 @@ float16x8x3_t test_vld3q_f16(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_f32(float32_t const *a) {
@@ -9844,7 +9844,7 @@ float32x4x3_t test_vld3q_f32(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_f64(float64_t const *a) {
@@ -9861,7 +9861,7 @@ float64x2x3_t test_vld3q_f64(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x3_t [[TMP5]]
 poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
@@ -9879,7 +9879,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
@@ -9896,7 +9896,7 @@ poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x3_t [[TMP5]]
 uint8x8x3_t test_vld3_u8(uint8_t const *a) {
@@ -9914,7 +9914,7 @@ uint8x8x3_t test_vld3_u8(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_u16(uint16_t const *a) {
@@ -9932,7 +9932,7 @@ uint16x4x3_t test_vld3_u16(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_u32(uint32_t const *a) {
@@ -9950,7 +9950,7 @@ uint32x2x3_t test_vld3_u32(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_u64(uint64_t const *a) {
@@ -9967,7 +9967,7 @@ uint64x1x3_t test_vld3_u64(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x3_t [[TMP5]]
 int8x8x3_t test_vld3_s8(int8_t const *a) {
@@ -9985,7 +9985,7 @@ int8x8x3_t test_vld3_s8(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_s16(int16_t const *a) {
@@ -10003,7 +10003,7 @@ int16x4x3_t test_vld3_s16(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_s32(int32_t const *a) {
@@ -10021,7 +10021,7 @@ int32x2x3_t test_vld3_s32(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_s64(int64_t const *a) {
@@ -10039,7 +10039,7 @@ int64x1x3_t test_vld3_s64(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_f16(float16_t const *a) {
@@ -10057,7 +10057,7 @@ float16x4x3_t test_vld3_f16(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_f32(float32_t const *a) {
@@ -10075,7 +10075,7 @@ float32x2x3_t test_vld3_f32(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_f64(float64_t const *a) {
@@ -10092,7 +10092,7 @@ float64x1x3_t test_vld3_f64(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x3_t [[TMP5]]
 poly8x8x3_t test_vld3_p8(poly8_t const *a) {
@@ -10110,7 +10110,7 @@ poly8x8x3_t test_vld3_p8(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_p16(poly16_t const *a) {
@@ -10127,7 +10127,7 @@ poly16x4x3_t test_vld3_p16(poly16_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x4_t [[TMP5]]
 uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
@@ -10145,7 +10145,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
@@ -10163,7 +10163,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
@@ -10181,7 +10181,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
@@ -10198,7 +10198,7 @@ uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x4_t [[TMP5]]
 int8x16x4_t test_vld4q_s8(int8_t const *a) {
@@ -10216,7 +10216,7 @@ int8x16x4_t test_vld4q_s8(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_s16(int16_t const *a) {
@@ -10234,7 +10234,7 @@ int16x8x4_t test_vld4q_s16(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_s32(int32_t const *a) {
@@ -10252,7 +10252,7 @@ int32x4x4_t test_vld4q_s32(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_s64(int64_t const *a) {
@@ -10270,7 +10270,7 @@ int64x2x4_t test_vld4q_s64(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_f16(float16_t const *a) {
@@ -10288,7 +10288,7 @@ float16x8x4_t test_vld4q_f16(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_f32(float32_t const *a) {
@@ -10306,7 +10306,7 @@ float32x4x4_t test_vld4q_f32(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_f64(float64_t const *a) {
@@ -10323,7 +10323,7 @@ float64x2x4_t test_vld4q_f64(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x4_t [[TMP5]]
 poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
@@ -10341,7 +10341,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
@@ -10358,7 +10358,7 @@ poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x4_t [[TMP5]]
 uint8x8x4_t test_vld4_u8(uint8_t const *a) {
@@ -10376,7 +10376,7 @@ uint8x8x4_t test_vld4_u8(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_u16(uint16_t const *a) {
@@ -10394,7 +10394,7 @@ uint16x4x4_t test_vld4_u16(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_u32(uint32_t const *a) {
@@ -10412,7 +10412,7 @@ uint32x2x4_t test_vld4_u32(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_u64(uint64_t const *a) {
@@ -10429,7 +10429,7 @@ uint64x1x4_t test_vld4_u64(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x4_t [[TMP5]]
 int8x8x4_t test_vld4_s8(int8_t const *a) {
@@ -10447,7 +10447,7 @@ int8x8x4_t test_vld4_s8(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_s16(int16_t const *a) {
@@ -10465,7 +10465,7 @@ int16x4x4_t test_vld4_s16(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_s32(int32_t const *a) {
@@ -10483,7 +10483,7 @@ int32x2x4_t test_vld4_s32(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_s64(int64_t const *a) {
@@ -10501,7 +10501,7 @@ int64x1x4_t test_vld4_s64(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_f16(float16_t const *a) {
@@ -10519,7 +10519,7 @@ float16x4x4_t test_vld4_f16(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_f32(float32_t const *a) {
@@ -10537,7 +10537,7 @@ float32x2x4_t test_vld4_f32(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_f64(float64_t const *a) {
@@ -10554,7 +10554,7 @@ float64x1x4_t test_vld4_f64(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x4_t [[TMP5]]
 poly8x8x4_t test_vld4_p8(poly8_t const *a) {
@@ -10572,7 +10572,7 @@ poly8x8x4_t test_vld4_p8(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_p16(poly16_t const *a) {
@@ -10854,7 +10854,7 @@ void test_vst1_p16(poly16_t *a, poly16x4_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -10874,7 +10874,7 @@ void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -10899,7 +10899,7 @@ void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -10924,7 +10924,7 @@ void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -10949,7 +10949,7 @@ void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -10969,7 +10969,7 @@ void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -10994,7 +10994,7 @@ void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11019,7 +11019,7 @@ void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11044,7 +11044,7 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
 // CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -11069,7 +11069,7 @@ void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
 // CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -11094,7 +11094,7 @@ void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
 // CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -11119,7 +11119,7 @@ void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11139,7 +11139,7 @@ void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11164,7 +11164,7 @@ void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11184,7 +11184,7 @@ void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11209,7 +11209,7 @@ void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11234,7 +11234,7 @@ void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11259,7 +11259,7 @@ void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11279,7 +11279,7 @@ void test_vst2_s8(int8_t *a, int8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11304,7 +11304,7 @@ void test_vst2_s16(int16_t *a, int16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11329,7 +11329,7 @@ void test_vst2_s32(int32_t *a, int32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11354,7 +11354,7 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) {
 // CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -11379,7 +11379,7 @@ void test_vst2_f16(float16_t *a, float16x4x2_t b) {
 // CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -11404,7 +11404,7 @@ void test_vst2_f32(float32_t *a, float32x2x2_t b) {
 // CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -11429,7 +11429,7 @@ void test_vst2_f64(float64_t *a, float64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11449,7 +11449,7 @@ void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11474,7 +11474,7 @@ void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11497,7 +11497,7 @@ void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11527,7 +11527,7 @@ void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11557,7 +11557,7 @@ void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11587,7 +11587,7 @@ void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11610,7 +11610,7 @@ void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11640,7 +11640,7 @@ void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11670,7 +11670,7 @@ void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11700,7 +11700,7 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
 // CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -11730,7 +11730,7 @@ void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
 // CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -11760,7 +11760,7 @@ void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
 // CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -11790,7 +11790,7 @@ void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11813,7 +11813,7 @@ void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11843,7 +11843,7 @@ void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11866,7 +11866,7 @@ void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11896,7 +11896,7 @@ void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11926,7 +11926,7 @@ void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11956,7 +11956,7 @@ void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11979,7 +11979,7 @@ void test_vst3_s8(int8_t *a, int8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12009,7 +12009,7 @@ void test_vst3_s16(int16_t *a, int16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12039,7 +12039,7 @@ void test_vst3_s32(int32_t *a, int32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12069,7 +12069,7 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) {
 // CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -12099,7 +12099,7 @@ void test_vst3_f16(float16_t *a, float16x4x3_t b) {
 // CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -12129,7 +12129,7 @@ void test_vst3_f32(float32_t *a, float32x2x3_t b) {
 // CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -12159,7 +12159,7 @@ void test_vst3_f64(float64_t *a, float64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12182,7 +12182,7 @@ void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12212,7 +12212,7 @@ void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12238,7 +12238,7 @@ void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12273,7 +12273,7 @@ void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -12308,7 +12308,7 @@ void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -12343,7 +12343,7 @@ void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12369,7 +12369,7 @@ void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12404,7 +12404,7 @@ void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -12439,7 +12439,7 @@ void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -12474,7 +12474,7 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
 // CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -12509,7 +12509,7 @@ void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
 // CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -12544,7 +12544,7 @@ void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
 // CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -12579,7 +12579,7 @@ void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12605,7 +12605,7 @@ void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12640,7 +12640,7 @@ void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12666,7 +12666,7 @@ void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12701,7 +12701,7 @@ void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12736,7 +12736,7 @@ void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12771,7 +12771,7 @@ void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12797,7 +12797,7 @@ void test_vst4_s8(int8_t *a, int8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12832,7 +12832,7 @@ void test_vst4_s16(int16_t *a, int16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12867,7 +12867,7 @@ void test_vst4_s32(int32_t *a, int32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12902,7 +12902,7 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) {
 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -12937,7 +12937,7 @@ void test_vst4_f16(float16_t *a, float16x4x4_t b) {
 // CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -12972,7 +12972,7 @@ void test_vst4_f32(float32_t *a, float32x2x4_t b) {
 // CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -13007,7 +13007,7 @@ void test_vst4_f64(float64_t *a, float64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -13033,7 +13033,7 @@ void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -13070,7 +13070,7 @@ void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
@@ -13088,7 +13088,7 @@ uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
@@ -13106,7 +13106,7 @@ uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
@@ -13124,7 +13124,7 @@ uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
@@ -13140,7 +13140,7 @@ uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
@@ -13158,7 +13158,7 @@ int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
@@ -13176,7 +13176,7 @@ int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
@@ -13194,7 +13194,7 @@ int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
@@ -13212,7 +13212,7 @@ int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
@@ -13230,7 +13230,7 @@ float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
@@ -13248,7 +13248,7 @@ float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
@@ -13264,7 +13264,7 @@ float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
@@ -13282,7 +13282,7 @@ poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
@@ -13300,7 +13300,7 @@ poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
@@ -13316,7 +13316,7 @@ poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
@@ -13334,7 +13334,7 @@ uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
@@ -13352,7 +13352,7 @@ uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
@@ -13370,7 +13370,7 @@ uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
@@ -13386,7 +13386,7 @@ uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
@@ -13404,7 +13404,7 @@ int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
@@ -13422,7 +13422,7 @@ int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
@@ -13440,7 +13440,7 @@ int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
@@ -13458,7 +13458,7 @@ int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
@@ -13476,7 +13476,7 @@ float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
@@ -13494,7 +13494,7 @@ float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
@@ -13510,7 +13510,7 @@ float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
@@ -13528,7 +13528,7 @@ poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
@@ -13546,7 +13546,7 @@ poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
@@ -13562,7 +13562,7 @@ poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
@@ -13580,7 +13580,7 @@ uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
@@ -13598,7 +13598,7 @@ uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
@@ -13616,7 +13616,7 @@ uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
@@ -13632,7 +13632,7 @@ uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
@@ -13650,7 +13650,7 @@ int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
@@ -13668,7 +13668,7 @@ int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
@@ -13686,7 +13686,7 @@ int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
@@ -13704,7 +13704,7 @@ int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
@@ -13722,7 +13722,7 @@ float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
@@ -13740,7 +13740,7 @@ float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
@@ -13756,7 +13756,7 @@ float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
@@ -13774,7 +13774,7 @@ poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
@@ -13792,7 +13792,7 @@ poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
@@ -13808,7 +13808,7 @@ poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
@@ -13826,7 +13826,7 @@ uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
@@ -13844,7 +13844,7 @@ uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
@@ -13862,7 +13862,7 @@ uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
@@ -13878,7 +13878,7 @@ uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
@@ -13896,7 +13896,7 @@ int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
@@ -13914,7 +13914,7 @@ int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
@@ -13932,7 +13932,7 @@ int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
@@ -13950,7 +13950,7 @@ int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
@@ -13968,7 +13968,7 @@ float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
@@ -13986,7 +13986,7 @@ float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
@@ -14002,7 +14002,7 @@ float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
@@ -14020,7 +14020,7 @@ poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
@@ -14038,7 +14038,7 @@ poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
@@ -14054,7 +14054,7 @@ poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
@@ -14072,7 +14072,7 @@ uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
@@ -14090,7 +14090,7 @@ uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
@@ -14108,7 +14108,7 @@ uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
@@ -14124,7 +14124,7 @@ uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
@@ -14142,7 +14142,7 @@ int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
@@ -14160,7 +14160,7 @@ int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
@@ -14178,7 +14178,7 @@ int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
@@ -14196,7 +14196,7 @@ int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
@@ -14214,7 +14214,7 @@ float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
@@ -14232,7 +14232,7 @@ float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
@@ -14248,7 +14248,7 @@ float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
@@ -14266,7 +14266,7 @@ poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
@@ -14284,7 +14284,7 @@ poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
@@ -14300,7 +14300,7 @@ poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
@@ -14318,7 +14318,7 @@ uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
@@ -14336,7 +14336,7 @@ uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
@@ -14354,7 +14354,7 @@ uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
@@ -14370,7 +14370,7 @@ uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
@@ -14388,7 +14388,7 @@ int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
@@ -14406,7 +14406,7 @@ int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
@@ -14424,7 +14424,7 @@ int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
@@ -14442,7 +14442,7 @@ int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
@@ -14460,7 +14460,7 @@ float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
@@ -14478,7 +14478,7 @@ float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
@@ -14494,7 +14494,7 @@ float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
@@ -14512,7 +14512,7 @@ poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
@@ -14530,7 +14530,7 @@ poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
@@ -14544,7 +14544,7 @@ poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -14564,7 +14564,7 @@ void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -14590,7 +14590,7 @@ void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -14616,7 +14616,7 @@ void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -14642,7 +14642,7 @@ void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -14662,7 +14662,7 @@ void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -14688,7 +14688,7 @@ void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -14714,7 +14714,7 @@ void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -14740,7 +14740,7 @@ void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
 // CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -14766,7 +14766,7 @@ void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
 // CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -14792,7 +14792,7 @@ void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
 // CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -14818,7 +14818,7 @@ void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -14838,7 +14838,7 @@ void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -14864,7 +14864,7 @@ void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -14890,7 +14890,7 @@ void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -14910,7 +14910,7 @@ void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -14936,7 +14936,7 @@ void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -14962,7 +14962,7 @@ void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -14988,7 +14988,7 @@ void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -15008,7 +15008,7 @@ void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -15034,7 +15034,7 @@ void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -15060,7 +15060,7 @@ void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -15086,7 +15086,7 @@ void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
 // CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -15112,7 +15112,7 @@ void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
 // CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -15138,7 +15138,7 @@ void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
 // CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -15164,7 +15164,7 @@ void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -15184,7 +15184,7 @@ void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -15210,7 +15210,7 @@ void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -15236,7 +15236,7 @@ void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -15259,7 +15259,7 @@ void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -15290,7 +15290,7 @@ void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -15321,7 +15321,7 @@ void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -15352,7 +15352,7 @@ void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -15375,7 +15375,7 @@ void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -15406,7 +15406,7 @@ void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -15437,7 +15437,7 @@ void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -15468,7 +15468,7 @@ void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
 // CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -15499,7 +15499,7 @@ void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
 // CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -15530,7 +15530,7 @@ void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
 // CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -15561,7 +15561,7 @@ void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -15584,7 +15584,7 @@ void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -15615,7 +15615,7 @@ void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -15646,7 +15646,7 @@ void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -15669,7 +15669,7 @@ void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -15700,7 +15700,7 @@ void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -15731,7 +15731,7 @@ void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -15762,7 +15762,7 @@ void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -15785,7 +15785,7 @@ void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -15816,7 +15816,7 @@ void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -15847,7 +15847,7 @@ void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -15878,7 +15878,7 @@ void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
 // CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -15909,7 +15909,7 @@ void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
 // CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -15940,7 +15940,7 @@ void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
 // CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -15971,7 +15971,7 @@ void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -15994,7 +15994,7 @@ void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -16025,7 +16025,7 @@ void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -16056,7 +16056,7 @@ void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16082,7 +16082,7 @@ void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -16118,7 +16118,7 @@ void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -16154,7 +16154,7 @@ void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -16190,7 +16190,7 @@ void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16216,7 +16216,7 @@ void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -16252,7 +16252,7 @@ void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -16288,7 +16288,7 @@ void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -16324,7 +16324,7 @@ void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
 // CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -16360,7 +16360,7 @@ void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
 // CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -16396,7 +16396,7 @@ void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
 // CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -16432,7 +16432,7 @@ void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16458,7 +16458,7 @@ void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -16494,7 +16494,7 @@ void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -16530,7 +16530,7 @@ void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16556,7 +16556,7 @@ void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -16592,7 +16592,7 @@ void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -16628,7 +16628,7 @@ void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -16664,7 +16664,7 @@ void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16690,7 +16690,7 @@ void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -16726,7 +16726,7 @@ void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -16762,7 +16762,7 @@ void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -16798,7 +16798,7 @@ void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -16834,7 +16834,7 @@ void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
 // CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -16870,7 +16870,7 @@ void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
 // CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -16906,7 +16906,7 @@ void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16932,7 +16932,7 @@ void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -16968,7 +16968,7 @@ void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
diff --git a/test/CodeGen/aarch64-neon-ldst-one.c b/test/CodeGen/aarch64-neon-ldst-one.c
index a3c5b140a0d2..5d775ce6ddf9 100644
--- a/test/CodeGen/aarch64-neon-ldst-one.c
+++ b/test/CodeGen/aarch64-neon-ldst-one.c
@@ -309,7 +309,7 @@ poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t  *a) {
@@ -327,7 +327,7 @@ uint8x16x2_t test_vld2q_dup_u8(uint8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t  *a) {
@@ -345,7 +345,7 @@ uint16x8x2_t test_vld2q_dup_u16(uint16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t  *a) {
@@ -363,7 +363,7 @@ uint32x4x2_t test_vld2q_dup_u32(uint32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
@@ -379,7 +379,7 @@ uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t  *a) {
@@ -397,7 +397,7 @@ int8x16x2_t test_vld2q_dup_s8(int8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t  *a) {
@@ -415,7 +415,7 @@ int16x8x2_t test_vld2q_dup_s16(int16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t  *a) {
@@ -433,7 +433,7 @@ int32x4x2_t test_vld2q_dup_s32(int32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
@@ -451,7 +451,7 @@ int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
 // CHECK:   store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t  *a) {
@@ -469,7 +469,7 @@ float16x8x2_t test_vld2q_dup_f16(float16_t  *a) {
 // CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t  *a) {
@@ -487,7 +487,7 @@ float32x4x2_t test_vld2q_dup_f32(float32_t  *a) {
 // CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
@@ -503,7 +503,7 @@ float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t  *a) {
@@ -521,7 +521,7 @@ poly8x16x2_t test_vld2q_dup_p8(poly8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t  *a) {
@@ -539,7 +539,7 @@ poly16x8x2_t test_vld2q_dup_p16(poly16_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
@@ -555,7 +555,7 @@ poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t  *a) {
@@ -573,7 +573,7 @@ uint8x8x2_t test_vld2_dup_u8(uint8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t  *a) {
@@ -591,7 +591,7 @@ uint16x4x2_t test_vld2_dup_u16(uint16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t  *a) {
@@ -609,7 +609,7 @@ uint32x2x2_t test_vld2_dup_u32(uint32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t  *a) {
@@ -625,7 +625,7 @@ uint64x1x2_t test_vld2_dup_u64(uint64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t  *a) {
@@ -643,7 +643,7 @@ int8x8x2_t test_vld2_dup_s8(int8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t  *a) {
@@ -661,7 +661,7 @@ int16x4x2_t test_vld2_dup_s16(int16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t  *a) {
@@ -679,7 +679,7 @@ int32x2x2_t test_vld2_dup_s32(int32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t  *a) {
@@ -697,7 +697,7 @@ int64x1x2_t test_vld2_dup_s64(int64_t  *a) {
 // CHECK:   store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t  *a) {
@@ -715,7 +715,7 @@ float16x4x2_t test_vld2_dup_f16(float16_t  *a) {
 // CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t  *a) {
@@ -733,7 +733,7 @@ float32x2x2_t test_vld2_dup_f32(float32_t  *a) {
 // CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
@@ -749,7 +749,7 @@ float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t  *a) {
@@ -767,7 +767,7 @@ poly8x8x2_t test_vld2_dup_p8(poly8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t  *a) {
@@ -785,7 +785,7 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
@@ -801,7 +801,7 @@ poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t  *a) {
@@ -820,7 +820,7 @@ uint8x16x3_t test_vld3q_dup_u8(uint8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t  *a) {
@@ -839,7 +839,7 @@ uint16x8x3_t test_vld3q_dup_u16(uint16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t  *a) {
@@ -858,7 +858,7 @@ uint32x4x3_t test_vld3q_dup_u32(uint32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
@@ -875,7 +875,7 @@ uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t  *a) {
@@ -894,7 +894,7 @@ int8x16x3_t test_vld3q_dup_s8(int8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t  *a) {
@@ -913,7 +913,7 @@ int16x8x3_t test_vld3q_dup_s16(int16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t  *a) {
@@ -932,7 +932,7 @@ int32x4x3_t test_vld3q_dup_s32(int32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
@@ -951,7 +951,7 @@ int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t  *a) {
@@ -970,7 +970,7 @@ float16x8x3_t test_vld3q_dup_f16(float16_t  *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_dup_f32(float32_t  *a) {
@@ -989,7 +989,7 @@ float32x4x3_t test_vld3q_dup_f32(float32_t  *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
@@ -1006,7 +1006,7 @@ float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld3q_dup_p8(poly8_t  *a) {
@@ -1025,7 +1025,7 @@ poly8x16x3_t test_vld3q_dup_p8(poly8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_dup_p16(poly16_t  *a) {
@@ -1044,7 +1044,7 @@ poly16x8x3_t test_vld3q_dup_p16(poly16_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
@@ -1061,7 +1061,7 @@ poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld3_dup_u8(uint8_t  *a) {
@@ -1080,7 +1080,7 @@ uint8x8x3_t test_vld3_dup_u8(uint8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_dup_u16(uint16_t  *a) {
@@ -1099,7 +1099,7 @@ uint16x4x3_t test_vld3_dup_u16(uint16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_dup_u32(uint32_t  *a) {
@@ -1118,7 +1118,7 @@ uint32x2x3_t test_vld3_dup_u32(uint32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_dup_u64(uint64_t  *a) {
@@ -1135,7 +1135,7 @@ uint64x1x3_t test_vld3_dup_u64(uint64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld3_dup_s8(int8_t  *a) {
@@ -1154,7 +1154,7 @@ int8x8x3_t test_vld3_dup_s8(int8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_dup_s16(int16_t  *a) {
@@ -1173,7 +1173,7 @@ int16x4x3_t test_vld3_dup_s16(int16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_dup_s32(int32_t  *a) {
@@ -1192,7 +1192,7 @@ int32x2x3_t test_vld3_dup_s32(int32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_dup_s64(int64_t  *a) {
@@ -1211,7 +1211,7 @@ int64x1x3_t test_vld3_dup_s64(int64_t  *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_dup_f16(float16_t  *a) {
@@ -1230,7 +1230,7 @@ float16x4x3_t test_vld3_dup_f16(float16_t  *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_dup_f32(float32_t  *a) {
@@ -1249,7 +1249,7 @@ float32x2x3_t test_vld3_dup_f32(float32_t  *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
@@ -1266,7 +1266,7 @@ float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld3_dup_p8(poly8_t  *a) {
@@ -1285,7 +1285,7 @@ poly8x8x3_t test_vld3_dup_p8(poly8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_dup_p16(poly16_t  *a) {
@@ -1304,7 +1304,7 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
@@ -1321,7 +1321,7 @@ poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld4q_dup_u8(uint8_t  *a) {
@@ -1339,7 +1339,7 @@ uint8x16x4_t test_vld4q_dup_u8(uint8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_dup_u16(uint16_t  *a) {
@@ -1357,7 +1357,7 @@ uint16x8x4_t test_vld4q_dup_u16(uint16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_dup_u32(uint32_t  *a) {
@@ -1375,7 +1375,7 @@ uint32x4x4_t test_vld4q_dup_u32(uint32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
@@ -1391,7 +1391,7 @@ uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld4q_dup_s8(int8_t  *a) {
@@ -1409,7 +1409,7 @@ int8x16x4_t test_vld4q_dup_s8(int8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_dup_s16(int16_t  *a) {
@@ -1427,7 +1427,7 @@ int16x8x4_t test_vld4q_dup_s16(int16_t  *a) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_dup_s32(int32_t  *a) {
@@ -1445,7 +1445,7 @@ int32x4x4_t test_vld4q_dup_s32(int32_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
@@ -1463,7 +1463,7 @@ int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_dup_f16(float16_t  *a) {
@@ -1481,7 +1481,7 @@ float16x8x4_t test_vld4q_dup_f16(float16_t  *a) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_dup_f32(float32_t  *a) {
@@ -1499,7 +1499,7 @@ float32x4x4_t test_vld4q_dup_f32(float32_t  *a) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
@@ -1515,7 +1515,7 @@ float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld4q_dup_p8(poly8_t  *a) {
@@ -1533,7 +1533,7 @@ poly8x16x4_t test_vld4q_dup_p8(poly8_t  *a) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_dup_p16(poly16_t  *a) {
@@ -1551,7 +1551,7 @@ poly16x8x4_t test_vld4q_dup_p16(poly16_t  *a) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
@@ -1567,7 +1567,7 @@ poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld4_dup_u8(uint8_t  *a) {
@@ -1585,7 +1585,7 @@ uint8x8x4_t test_vld4_dup_u8(uint8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_dup_u16(uint16_t  *a) {
@@ -1603,7 +1603,7 @@ uint16x4x4_t test_vld4_dup_u16(uint16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_dup_u32(uint32_t  *a) {
@@ -1621,7 +1621,7 @@ uint32x2x4_t test_vld4_dup_u32(uint32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_dup_u64(uint64_t  *a) {
@@ -1637,7 +1637,7 @@ uint64x1x4_t test_vld4_dup_u64(uint64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld4_dup_s8(int8_t  *a) {
@@ -1655,7 +1655,7 @@ int8x8x4_t test_vld4_dup_s8(int8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_dup_s16(int16_t  *a) {
@@ -1673,7 +1673,7 @@ int16x4x4_t test_vld4_dup_s16(int16_t  *a) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_dup_s32(int32_t  *a) {
@@ -1691,7 +1691,7 @@ int32x2x4_t test_vld4_dup_s32(int32_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_dup_s64(int64_t  *a) {
@@ -1709,7 +1709,7 @@ int64x1x4_t test_vld4_dup_s64(int64_t  *a) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_dup_f16(float16_t  *a) {
@@ -1727,7 +1727,7 @@ float16x4x4_t test_vld4_dup_f16(float16_t  *a) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_dup_f32(float32_t  *a) {
@@ -1745,7 +1745,7 @@ float32x2x4_t test_vld4_dup_f32(float32_t  *a) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
@@ -1761,7 +1761,7 @@ float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld4_dup_p8(poly8_t  *a) {
@@ -1779,7 +1779,7 @@ poly8x8x4_t test_vld4_dup_p8(poly8_t  *a) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_dup_p16(poly16_t  *a) {
@@ -1797,7 +1797,7 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t  *a) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
@@ -2125,7 +2125,7 @@ poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
 // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2138,7 +2138,7 @@ poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
@@ -2154,7 +2154,7 @@ int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
 // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2167,7 +2167,7 @@ int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
@@ -2183,7 +2183,7 @@ uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
 // CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2196,7 +2196,7 @@ uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
@@ -2212,7 +2212,7 @@ poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
 // CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2228,7 +2228,7 @@ poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x3_t [[TMP9]]
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
@@ -2244,7 +2244,7 @@ int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
 // CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2260,7 +2260,7 @@ int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
@@ -2276,7 +2276,7 @@ uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2294,7 +2294,7 @@ uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
 uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
@@ -2310,7 +2310,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2328,7 +2328,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
 uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
@@ -2344,7 +2344,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2362,7 +2362,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
 uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
@@ -2378,7 +2378,7 @@ uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2396,7 +2396,7 @@ uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x2_t [[TMP13]]
 int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
@@ -2412,7 +2412,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2430,7 +2430,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x2_t [[TMP13]]
 int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
@@ -2446,7 +2446,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2464,7 +2464,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x2_t [[TMP13]]
 int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
@@ -2480,7 +2480,7 @@ int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
 // CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2498,7 +2498,7 @@ int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
 // CHECK:   store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x2_t [[TMP13]]
 float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
@@ -2514,7 +2514,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
 // CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2532,7 +2532,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
 // CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x2_t [[TMP13]]
 float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
@@ -2548,7 +2548,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
 // CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2566,7 +2566,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
 // CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x2_t [[TMP13]]
 float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
@@ -2582,7 +2582,7 @@ float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2600,7 +2600,7 @@ float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
 poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
@@ -2616,7 +2616,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2634,7 +2634,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
 poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
@@ -2650,7 +2650,7 @@ poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -2663,7 +2663,7 @@ poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
@@ -2679,7 +2679,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2697,7 +2697,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
 uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
@@ -2713,7 +2713,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2731,7 +2731,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
 uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
@@ -2747,7 +2747,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2765,7 +2765,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
@@ -2781,7 +2781,7 @@ uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -2794,7 +2794,7 @@ uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
@@ -2810,7 +2810,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2828,7 +2828,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x2_t [[TMP13]]
 int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
@@ -2844,7 +2844,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2862,7 +2862,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x2_t [[TMP13]]
 int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
@@ -2878,7 +2878,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2896,7 +2896,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x2_t [[TMP13]]
 int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
@@ -2912,7 +2912,7 @@ int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
 // CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2930,7 +2930,7 @@ int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
 // CHECK:   store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x2_t [[TMP13]]
 float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
@@ -2946,7 +2946,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
 // CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2964,7 +2964,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
 // CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
@@ -2980,7 +2980,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
 // CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2998,7 +2998,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
 // CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x2_t [[TMP13]]
 float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
@@ -3014,7 +3014,7 @@ float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3027,7 +3027,7 @@ float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
 // CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
@@ -3043,7 +3043,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
@@ -3061,7 +3061,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
 poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
@@ -3077,7 +3077,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
@@ -3095,7 +3095,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
 // CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
 // CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
 poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
@@ -3111,7 +3111,7 @@ poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3134,7 +3134,7 @@ poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
@@ -3150,7 +3150,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3173,7 +3173,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
 uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
@@ -3189,7 +3189,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3212,7 +3212,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
 uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
@@ -3228,7 +3228,7 @@ uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3251,7 +3251,7 @@ uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x3_t [[TMP16]]
 int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
@@ -3267,7 +3267,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3290,7 +3290,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x3_t [[TMP16]]
 int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
@@ -3306,7 +3306,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3329,7 +3329,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x3_t [[TMP16]]
 int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
@@ -3345,7 +3345,7 @@ int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
 // CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3368,7 +3368,7 @@ int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x3_t [[TMP16]]
 float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
@@ -3384,7 +3384,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
 // CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3407,7 +3407,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x3_t [[TMP16]]
 float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
@@ -3423,7 +3423,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
 // CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3446,7 +3446,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x3_t [[TMP16]]
 float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
@@ -3462,7 +3462,7 @@ float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -3478,7 +3478,7 @@ float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
 poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
@@ -3494,7 +3494,7 @@ poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3517,7 +3517,7 @@ poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
 poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
@@ -3533,7 +3533,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3556,7 +3556,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
 poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
@@ -3572,7 +3572,7 @@ poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3588,7 +3588,7 @@ poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
 uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
@@ -3604,7 +3604,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3627,7 +3627,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
 uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
@@ -3643,7 +3643,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3666,7 +3666,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
 uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
@@ -3682,7 +3682,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3705,7 +3705,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
 uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
@@ -3721,7 +3721,7 @@ uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3737,7 +3737,7 @@ uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x3_t [[TMP9]]
 int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
@@ -3753,7 +3753,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3776,7 +3776,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x3_t [[TMP16]]
 int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
@@ -3792,7 +3792,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3815,7 +3815,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x3_t [[TMP16]]
 int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
@@ -3831,7 +3831,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3854,7 +3854,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x3_t [[TMP16]]
 int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
@@ -3870,7 +3870,7 @@ int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
 // CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3893,7 +3893,7 @@ int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x3_t [[TMP16]]
 float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
@@ -3909,7 +3909,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
 // CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3932,7 +3932,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x3_t [[TMP16]]
 float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
@@ -3948,7 +3948,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
 // CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3971,7 +3971,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x3_t [[TMP16]]
 float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
@@ -3987,7 +3987,7 @@ float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4003,7 +4003,7 @@ float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
 // CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
 // CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
 poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
@@ -4019,7 +4019,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
@@ -4042,7 +4042,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
 poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
@@ -4058,7 +4058,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
@@ -4081,7 +4081,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
 // CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
 // CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
 poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
@@ -4097,7 +4097,7 @@ poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4116,7 +4116,7 @@ poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
 uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
@@ -4132,7 +4132,7 @@ uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4160,7 +4160,7 @@ uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
 uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
@@ -4176,7 +4176,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4204,7 +4204,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
 uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
@@ -4220,7 +4220,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4248,7 +4248,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
 uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
@@ -4264,7 +4264,7 @@ uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4283,7 +4283,7 @@ uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int8x16x4_t [[TMP10]]
 int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
@@ -4299,7 +4299,7 @@ int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4327,7 +4327,7 @@ int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int16x8x4_t [[TMP19]]
 int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
@@ -4343,7 +4343,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4371,7 +4371,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int32x4x4_t [[TMP19]]
 int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
@@ -4387,7 +4387,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4415,7 +4415,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.int64x2x4_t [[TMP19]]
 int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
@@ -4431,7 +4431,7 @@ int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
 // CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4459,7 +4459,7 @@ int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
 // CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float16x8x4_t [[TMP19]]
 float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
@@ -4475,7 +4475,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
 // CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4503,7 +4503,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
 // CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float32x4x4_t [[TMP19]]
 float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
@@ -4519,7 +4519,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
 // CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4547,7 +4547,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
 // CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.float64x2x4_t [[TMP19]]
 float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
@@ -4563,7 +4563,7 @@ float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4582,7 +4582,7 @@ float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
 poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
@@ -4598,7 +4598,7 @@ poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4626,7 +4626,7 @@ poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
 poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
@@ -4642,7 +4642,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4670,7 +4670,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
 poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
@@ -4686,7 +4686,7 @@ poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4705,7 +4705,7 @@ poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
 uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
@@ -4721,7 +4721,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
@@ -4749,7 +4749,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
 uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
@@ -4765,7 +4765,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
@@ -4793,7 +4793,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
 uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
@@ -4809,7 +4809,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
@@ -4837,7 +4837,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
 uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
@@ -4853,7 +4853,7 @@ uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4872,7 +4872,7 @@ uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int8x8x4_t [[TMP10]]
 int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
@@ -4888,7 +4888,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
@@ -4916,7 +4916,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int16x4x4_t [[TMP19]]
 int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
@@ -4932,7 +4932,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
@@ -4960,7 +4960,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int32x2x4_t [[TMP19]]
 int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
@@ -4976,7 +4976,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5004,7 +5004,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.int64x1x4_t [[TMP19]]
 int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
@@ -5020,7 +5020,7 @@ int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
@@ -5048,7 +5048,7 @@ int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
 // CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float16x4x4_t [[TMP19]]
 float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
@@ -5064,7 +5064,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
 // CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
@@ -5092,7 +5092,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float32x2x4_t [[TMP19]]
 float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
@@ -5108,7 +5108,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
 // CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5136,7 +5136,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
 // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.float64x1x4_t [[TMP19]]
 float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
@@ -5152,7 +5152,7 @@ float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -5171,7 +5171,7 @@ float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
 // CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
 // CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
 poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
@@ -5187,7 +5187,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
@@ -5215,7 +5215,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
 poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
@@ -5231,7 +5231,7 @@ poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5259,7 +5259,7 @@ poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
 // CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
 // CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
 poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
@@ -5585,7 +5585,7 @@ void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5605,7 +5605,7 @@ void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5630,7 +5630,7 @@ void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -5655,7 +5655,7 @@ void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5680,7 +5680,7 @@ void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5700,7 +5700,7 @@ void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5725,7 +5725,7 @@ void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -5750,7 +5750,7 @@ void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5775,7 +5775,7 @@ void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
 // CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -5800,7 +5800,7 @@ void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
 // CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -5825,7 +5825,7 @@ void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
 // CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -5850,7 +5850,7 @@ void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5870,7 +5870,7 @@ void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5895,7 +5895,7 @@ void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5920,7 +5920,7 @@ void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -5940,7 +5940,7 @@ void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -5965,7 +5965,7 @@ void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -5990,7 +5990,7 @@ void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6015,7 +6015,7 @@ void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6035,7 +6035,7 @@ void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6060,7 +6060,7 @@ void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6085,7 +6085,7 @@ void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6110,7 +6110,7 @@ void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
 // CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -6135,7 +6135,7 @@ void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
 // CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -6160,7 +6160,7 @@ void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
 // CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -6185,7 +6185,7 @@ void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6205,7 +6205,7 @@ void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6230,7 +6230,7 @@ void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6255,7 +6255,7 @@ void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6278,7 +6278,7 @@ void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6308,7 +6308,7 @@ void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -6338,7 +6338,7 @@ void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6368,7 +6368,7 @@ void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6391,7 +6391,7 @@ void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6421,7 +6421,7 @@ void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) {
 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -6451,7 +6451,7 @@ void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6481,7 +6481,7 @@ void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
 // CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -6511,7 +6511,7 @@ void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) {
 // CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -6541,7 +6541,7 @@ void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) {
 // CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -6571,7 +6571,7 @@ void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) {
 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6594,7 +6594,7 @@ void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6624,7 +6624,7 @@ void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6654,7 +6654,7 @@ void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6677,7 +6677,7 @@ void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6707,7 +6707,7 @@ void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6737,7 +6737,7 @@ void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6767,7 +6767,7 @@ void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6790,7 +6790,7 @@ void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6820,7 +6820,7 @@ void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
 // CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6850,7 +6850,7 @@ void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6880,7 +6880,7 @@ void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
 // CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -6910,7 +6910,7 @@ void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
 // CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -6940,7 +6940,7 @@ void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
 // CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -6970,7 +6970,7 @@ void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6993,7 +6993,7 @@ void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7023,7 +7023,7 @@ void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
 // CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7053,7 +7053,7 @@ void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7079,7 +7079,7 @@ void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7114,7 +7114,7 @@ void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -7149,7 +7149,7 @@ void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7184,7 +7184,7 @@ void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7210,7 +7210,7 @@ void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7245,7 +7245,7 @@ void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -7280,7 +7280,7 @@ void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7315,7 +7315,7 @@ void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
 // CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -7350,7 +7350,7 @@ void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
 // CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -7385,7 +7385,7 @@ void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
 // CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -7420,7 +7420,7 @@ void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7446,7 +7446,7 @@ void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7481,7 +7481,7 @@ void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7516,7 +7516,7 @@ void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7542,7 +7542,7 @@ void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7577,7 +7577,7 @@ void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -7612,7 +7612,7 @@ void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7647,7 +7647,7 @@ void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7673,7 +7673,7 @@ void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7708,7 +7708,7 @@ void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -7743,7 +7743,7 @@ void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7778,7 +7778,7 @@ void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -7813,7 +7813,7 @@ void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
 // CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -7848,7 +7848,7 @@ void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
 // CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -7883,7 +7883,7 @@ void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7909,7 +7909,7 @@ void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7944,7 +7944,7 @@ void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
diff --git a/test/CodeGen/aarch64-neon-perm.c b/test/CodeGen/aarch64-neon-perm.c
index 471017a99bbd..c24447bf5143 100644
--- a/test/CodeGen/aarch64-neon-perm.c
+++ b/test/CodeGen/aarch64-neon-perm.c
@@ -899,7 +899,7 @@ poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -925,7 +925,7 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -951,7 +951,7 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -975,7 +975,7 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1001,7 +1001,7 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1027,7 +1027,7 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1053,7 +1053,7 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1077,7 +1077,7 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1103,7 +1103,7 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1127,7 +1127,7 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1153,7 +1153,7 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1179,7 +1179,7 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1203,7 +1203,7 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1229,7 +1229,7 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1255,7 +1255,7 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1281,7 +1281,7 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1305,7 +1305,7 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1331,7 +1331,7 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1355,7 +1355,7 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1381,7 +1381,7 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1407,7 +1407,7 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1431,7 +1431,7 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1457,7 +1457,7 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1483,7 +1483,7 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1509,7 +1509,7 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1533,7 +1533,7 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1559,7 +1559,7 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1583,7 +1583,7 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1609,7 +1609,7 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1635,7 +1635,7 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1659,7 +1659,7 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1685,7 +1685,7 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1711,7 +1711,7 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1737,7 +1737,7 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1761,7 +1761,7 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1787,7 +1787,7 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1811,7 +1811,7 @@ poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1837,7 +1837,7 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1863,7 +1863,7 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1887,7 +1887,7 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1913,7 +1913,7 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1939,7 +1939,7 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1965,7 +1965,7 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1989,7 +1989,7 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -2015,7 +2015,7 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -2039,7 +2039,7 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -2065,7 +2065,7 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -2091,7 +2091,7 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -2115,7 +2115,7 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -2141,7 +2141,7 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -2167,7 +2167,7 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -2193,7 +2193,7 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -2217,7 +2217,7 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -2243,7 +2243,7 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
 // CHECK:   [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
 // CHECK:   [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
 // CHECK:   [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
diff --git a/test/CodeGen/aarch64-poly64.c b/test/CodeGen/aarch64-poly64.c
index 3fb8048fb39e..b70e5f0765e2 100644
--- a/test/CodeGen/aarch64-poly64.c
+++ b/test/CodeGen/aarch64-poly64.c
@@ -248,7 +248,7 @@ void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
 // CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
@@ -266,7 +266,7 @@ poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
 // CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
@@ -284,7 +284,7 @@ poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
@@ -302,7 +302,7 @@ poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
@@ -320,7 +320,7 @@ poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
 // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
@@ -338,7 +338,7 @@ poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
 // CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
@@ -352,7 +352,7 @@ poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
 // CHECK:   store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -377,7 +377,7 @@ void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
 // CHECK:   store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
@@ -402,7 +402,7 @@ void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
 // CHECK:   store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -432,7 +432,7 @@ void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
 // CHECK:   store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
@@ -462,7 +462,7 @@ void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
 // CHECK:   store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -497,7 +497,7 @@ void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
 // CHECK:   store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
 // CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
diff --git a/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c b/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
new file mode 100644
index 000000000000..aa07e3beba9d
--- /dev/null
+++ b/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -0,0 +1,643 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16\
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_fp16.h>
+
+// CHECK-LABEL: test_vabsh_f16
+// CHECK:  [[ABS:%.*]] =  call half @llvm.aarch64.neon.abs.f16(half %a)
+// CHECK:  ret half [[ABS]]
+float16_t test_vabsh_f16(float16_t a) {
+  return vabsh_f16(a);
+}
+
+// CHECK-LABEL: test_vceqzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oeq half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vceqzh_f16(float16_t a) {
+  return vceqzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgezh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oge half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgezh_f16(float16_t a) {
+  return vcgezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ogt half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgtzh_f16(float16_t a) {
+  return vcgtzh_f16(a);
+}
+
+// CHECK-LABEL: test_vclezh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ole half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vclezh_f16(float16_t a) {
+  return vclezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcltzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp olt half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcltzh_f16(float16_t a) {
+  return vcltzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s16
+// CHECK:  [[VCVT:%.*]] = sitofp i16 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s16 (int16_t a) {
+  return vcvth_f16_s16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s32
+// CHECK:  [[VCVT:%.*]] = sitofp i32 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s32 (int32_t a) {
+  return vcvth_f16_s32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s64
+// CHECK:  [[VCVT:%.*]] = sitofp i64 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s64 (int64_t a) {
+  return vcvth_f16_s64(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u16
+// CHECK:  [[VCVT:%.*]] = uitofp i16 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u16 (uint16_t a) {
+  return vcvth_f16_u16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u32
+// CHECK:  [[VCVT:%.*]] = uitofp i32 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u32 (uint32_t a) {
+  return vcvth_f16_u32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u64
+// CHECK:  [[VCVT:%.*]] = uitofp i64 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u64 (uint64_t a) {
+  return vcvth_f16_u64(a);
+}
+
+// CHECK-LABEL: test_vcvth_s16_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i16
+// CHECK:  ret i16 [[VCVT]]
+int16_t test_vcvth_s16_f16 (float16_t a) {
+  return vcvth_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s32_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i32
+// CHECK:  ret i32 [[VCVT]]
+int32_t test_vcvth_s32_f16 (float16_t a) {
+  return vcvth_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s64_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i64
+// CHECK:  ret i64 [[VCVT]]
+int64_t test_vcvth_s64_f16 (float16_t a) {
+  return vcvth_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u16_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i16
+// CHECK:  ret i16 [[VCVT]]
+uint16_t test_vcvth_u16_f16 (float16_t a) {
+  return vcvth_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u32_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i32
+// CHECK:  ret i32 [[VCVT]]
+uint32_t test_vcvth_u32_f16 (float16_t a) {
+  return vcvth_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u64_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i64
+// CHECK:  ret i64 [[VCVT]]
+uint64_t test_vcvth_u64_f16 (float16_t a) {
+  return vcvth_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtah_s16_f16 (float16_t a) {
+  return vcvtah_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtah_s32_f16 (float16_t a) {
+  return vcvtah_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtah_s64_f16 (float16_t a) {
+  return vcvtah_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtah_u16_f16 (float16_t a) {
+  return vcvtah_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtah_u32_f16 (float16_t a) {
+  return vcvtah_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtah_u64_f16 (float16_t a) {
+  return vcvtah_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtmh_s16_f16 (float16_t a) {
+  return vcvtmh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtmh_s32_f16 (float16_t a) {
+  return vcvtmh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtmh_s64_f16 (float16_t a) {
+  return vcvtmh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtmh_u16_f16 (float16_t a) {
+  return vcvtmh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtmh_u32_f16 (float16_t a) {
+  return vcvtmh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtmh_u64_f16 (float16_t a) {
+  return vcvtmh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtnh_s16_f16 (float16_t a) {
+  return vcvtnh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtnh_s32_f16 (float16_t a) {
+  return vcvtnh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtnh_s64_f16 (float16_t a) {
+  return vcvtnh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtnh_u16_f16 (float16_t a) {
+  return vcvtnh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtnh_u32_f16 (float16_t a) {
+  return vcvtnh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtnh_u64_f16 (float16_t a) {
+  return vcvtnh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtph_s16_f16 (float16_t a) {
+  return vcvtph_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtph_s32_f16 (float16_t a) {
+  return vcvtph_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtph_s64_f16 (float16_t a) {
+  return vcvtph_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtph_u16_f16 (float16_t a) {
+  return vcvtph_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtph_u32_f16 (float16_t a) {
+  return vcvtph_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtph_u64_f16 (float16_t a) {
+  return vcvtph_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vnegh_f16
+// CHECK: [[NEG:%.*]] = fsub half 0xH8000, %a
+// CHECK: ret half [[NEG]]
+float16_t test_vnegh_f16(float16_t a) {
+  return vnegh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpeh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpe.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpeh_f16(float16_t a) {
+  return vrecpeh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpxh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpx.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpxh_f16(float16_t a) {
+  return vrecpxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.trunc.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndh_f16(float16_t a) {
+  return vrndh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndah_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.round.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndah_f16(float16_t a) {
+  return vrndah_f16(a);
+}
+
+// CHECK-LABEL: test_vrndih_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.nearbyint.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndih_f16(float16_t a) {
+  return vrndih_f16(a);
+}
+
+// CHECK-LABEL: test_vrndmh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.floor.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndmh_f16(float16_t a) {
+  return vrndmh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndnh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.aarch64.neon.frintn.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndnh_f16(float16_t a) {
+  return vrndnh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndph_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.ceil.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndph_f16(float16_t a) {
+  return vrndph_f16(a);
+}
+
+// CHECK-LABEL: test_vrndxh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.rint.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndxh_f16(float16_t a) {
+  return vrndxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrsqrteh_f16
+// CHECK:  [[RND:%.*]] = call half @llvm.aarch64.neon.frsqrte.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrsqrteh_f16(float16_t a) {
+  return vrsqrteh_f16(a);
+}
+
+// CHECK-LABEL: test_vsqrth_f16
+// CHECK:  [[SQR:%.*]] = call half @llvm.sqrt.f16(half %a)
+// CHECK:  ret half [[SQR]]
+float16_t test_vsqrth_f16(float16_t a) {
+  return vsqrth_f16(a);
+}
+
+// CHECK-LABEL: test_vaddh_f16
+// CHECK:  [[ADD:%.*]] = fadd half %a, %b
+// CHECK:  ret half [[ADD]]
+float16_t test_vaddh_f16(float16_t a, float16_t b) {
+  return vaddh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vabdh_f16
+// CHECK:  [[ABD:%.*]] = call half @llvm.aarch64.sisd.fabd.f16(half %a, half %b)
+// CHECK:  ret half [[ABD]]
+float16_t test_vabdh_f16(float16_t a, float16_t b) {
+  return vabdh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcageh_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcageh_f16(float16_t a, float16_t b) {
+  return vcageh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcagth_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcagth_f16(float16_t a, float16_t b) {
+  return vcagth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcaleh_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcaleh_f16(float16_t a, float16_t b) {
+  return vcaleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcalth_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcalth_f16(float16_t a, float16_t b) {
+  return vcalth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vceqh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oeq half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vceqh_f16(float16_t a, float16_t b) {
+  return vceqh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgeh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oge half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgeh_f16(float16_t a, float16_t b) {
+  return vcgeh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgth_f16
+//CHECK:  [[TMP1:%.*]] = fcmp ogt half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgth_f16(float16_t a, float16_t b) {
+  return vcgth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcleh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ole half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcleh_f16(float16_t a, float16_t b) {
+  return vcleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vclth_f16
+// CHECK:  [[TMP1:%.*]] = fcmp olt half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vclth_f16(float16_t a, float16_t b) {
+  return vclth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s16
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i16(i16 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s16(int16_t a) {
+  return vcvth_n_f16_s16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s32
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s32(int32_t a) {
+  return vcvth_n_f16_s32(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s64
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s64(int64_t a) {
+  return vcvth_n_f16_s64(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s16_f16
+// CHECK:  [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxs.i16.f16(half %a, i32 0)
+// CHECK:  ret i16 [[CVT]]
+int16_t test_vcvth_n_s16_f16(float16_t a) {
+  return vcvth_n_s16_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s32_f16
+// CHECK:  [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 0)
+// CHECK:  ret i32 [[CVT]]
+int32_t test_vcvth_n_s32_f16(float16_t a) {
+  return vcvth_n_s32_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s64_f16
+// CHECK:  [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 0)
+// CHECK:  ret i64 [[CVT]]
+int64_t test_vcvth_n_s64_f16(float16_t a) {
+  return vcvth_n_s64_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u16
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i16(i16 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u16(int16_t a) {
+  return vcvth_n_f16_u16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u32
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u32(int32_t a) {
+  return vcvth_n_f16_u32(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u64
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i64(i64 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u64(int64_t a) {
+  return vcvth_n_f16_u64(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u16_f16
+// CHECK:  [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxu.i16.f16(half %a, i32 0)
+// CHECK:  ret i16 [[CVT]]
+int16_t test_vcvth_n_u16_f16(float16_t a) {
+  return vcvth_n_u16_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u32_f16
+// CHECK:  [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 0)
+// CHECK:  ret i32 [[CVT]]
+int32_t test_vcvth_n_u32_f16(float16_t a) {
+  return vcvth_n_u32_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u64_f16
+// CHECK:  [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f16(half %a, i32 0)
+// CHECK:  ret i64 [[CVT]]
+int64_t test_vcvth_n_u64_f16(float16_t a) {
+  return vcvth_n_u64_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vdivh_f16
+// CHECK:  [[DIV:%.*]] = fdiv half %a, %b
+// CHECK:  ret half [[DIV]]
+float16_t test_vdivh_f16(float16_t a, float16_t b) {
+  return vdivh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxh_f16
+// CHECK:  [[MAX:%.*]] = call half @llvm.aarch64.neon.fmax.f16(half %a, half %b)
+// CHECK:  ret half [[MAX]]
+float16_t test_vmaxh_f16(float16_t a, float16_t b) {
+  return vmaxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxnmh_f16
+// CHECK:  [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnm.f16(half %a, half %b)
+// CHECK:  ret half [[MAX]]
+float16_t test_vmaxnmh_f16(float16_t a, float16_t b) {
+  return vmaxnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminh_f16
+// CHECK:  [[MIN:%.*]] = call half @llvm.aarch64.neon.fmin.f16(half %a, half %b)
+// CHECK:  ret half [[MIN]]
+float16_t test_vminh_f16(float16_t a, float16_t b) {
+  return vminh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminnmh_f16
+// CHECK:  [[MIN:%.*]] = call half @llvm.aarch64.neon.fminnm.f16(half %a, half %b)
+// CHECK:  ret half [[MIN]]
+float16_t test_vminnmh_f16(float16_t a, float16_t b) {
+  return vminnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulh_f16
+// CHECK:  [[MUL:%.*]] = fmul half %a, %b
+// CHECK:  ret half [[MUL]]
+float16_t test_vmulh_f16(float16_t a, float16_t b) {
+  return vmulh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulxh_f16
+// CHECK:  [[MUL:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+// CHECK:  ret half [[MUL]]
+float16_t test_vmulxh_f16(float16_t a, float16_t b) {
+  return vmulxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrecpsh_f16
+// CHECK: [[RECPS:%.*]] = call half @llvm.aarch64.neon.frecps.f16(half %a, half %b)
+// CHECK: ret half [[RECPS]]
+float16_t test_vrecpsh_f16(float16_t a, float16_t b) {
+  return vrecpsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrsqrtsh_f16
+// CHECK:  [[RSQRTS:%.*]] = call half @llvm.aarch64.neon.frsqrts.f16(half %a, half %b)
+// CHECK:  ret half [[RSQRTS]]
+float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) {
+  return vrsqrtsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vsubh_f16
+// CHECK:  [[SUB:%.*]] = fsub half %a, %b
+// CHECK:  ret half [[SUB]]
+float16_t test_vsubh_f16(float16_t a, float16_t b) {
+  return vsubh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vfmah_f16
+// CHECK:  [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a)
+// CHECK:  ret half [[FMA]]
+float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
+  return vfmah_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfmsh_f16
+// CHECK:  [[SUB:%.*]] = fsub half 0xH8000, %b
+// CHECK:  [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a)
+// CHECK:  ret half [[ADD]]
+float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
+  return vfmsh_f16(a, b, c);
+}
+
diff --git a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 3f61238b64fb..1a5c3a280f85 100644
--- a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1359,7 +1359,7 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
   return vzip_f16(a, b);
 }
@@ -1376,7 +1376,7 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
   return vzipq_f16(a, b);
 }
@@ -1393,7 +1393,7 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
   return vuzp_f16(a, b);
 }
@@ -1410,7 +1410,7 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
   return vuzpq_f16(a, b);
 }
@@ -1427,7 +1427,7 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
 // CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
 float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
   return vtrn_f16(a, b);
 }
@@ -1444,7 +1444,7 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
 // CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
 float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
   return vtrnq_f16(a, b);
 }
diff --git a/test/CodeGen/arm-arguments.c b/test/CodeGen/arm-arguments.c
index ec3e1734b0c3..ef4e76054ff8 100644
--- a/test/CodeGen/arm-arguments.c
+++ b/test/CodeGen/arm-arguments.c
@@ -208,13 +208,13 @@ float32x4_t f35(int i, s35_with_align s1, s35_with_align s2) {
 // APCS-GNU: %[[a:.*]] = alloca %struct.s35, align 16
 // APCS-GNU: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
 // APCS-GNU: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align {{[0-9]+}} %[[b]], i8* align {{[0-9]+}} %[[c]]
 // APCS-GNU: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
 // APCS-GNU: load <4 x float>, <4 x float>* %[[d]], align 16
 // AAPCS-LABEL: define arm_aapcscc <4 x float> @f35(i32 %i, %struct.s35* byval align 8, %struct.s35* byval align 8)
 // AAPCS: %[[a:.*]] = alloca %struct.s35, align 16
 // AAPCS: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
 // AAPCS: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %[[b]], i8* align 8 %[[c]]
 // AAPCS: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
 // AAPCS: load <4 x float>, <4 x float>* %[[d]], align 16
diff --git a/test/CodeGen/arm-fp16-arguments.c b/test/CodeGen/arm-fp16-arguments.c
index 65f076ac3ca8..d739f4b9c66a 100644
--- a/test/CodeGen/arm-fp16-arguments.c
+++ b/test/CodeGen/arm-fp16-arguments.c
@@ -25,3 +25,27 @@ __fp16 t2() { return g; }
 // HARD: ret float [[BITCAST]]
 // NATIVE: [[LOAD:%.*]] = load half, half* @g
 // NATIVE: ret half [[LOAD]]
+
+_Float16 h;
+
+void t3(_Float16 a) { h = a; }
+// SOFT: define void @t3(i32 [[PARAM:%.*]])
+// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
+// HARD: define arm_aapcs_vfpcc void @t3(float [[PARAM:%.*]])
+// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
+// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
+// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @h to i16*)
+// NATIVE: define void @t3(half [[PARAM:%.*]])
+// NATIVE: store half [[PARAM]], half* @h
+
+_Float16 t4() { return h; }
+// SOFT: define i32 @t4()
+// HARD: define arm_aapcs_vfpcc float @t4()
+// NATIVE: define half @t4()
+// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @h to i16*)
+// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
+// SOFT: ret i32 [[ZEXT]]
+// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
+// HARD: ret float [[BITCAST]]
+// NATIVE: [[LOAD:%.*]] = load half, half* @h
+// NATIVE: ret half [[LOAD]]
diff --git a/test/CodeGen/arm64-be-bitfield.c b/test/CodeGen/arm64-be-bitfield.c
index 081eab81e905..cee59b87099f 100644
--- a/test/CodeGen/arm64-be-bitfield.c
+++ b/test/CodeGen/arm64-be-bitfield.c
@@ -7,6 +7,6 @@ signed callee_b0f(struct bt3 bp11) {
 // IR: callee_b0f(i64 [[ARG:%.*]])
 // IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
 // IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
-// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
+// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 8 [[BITCAST]], i64 4
   return bp11.b2;
 }
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index 62888dd73339..28b4f116c263 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -4846,7 +4846,7 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4873,7 +4873,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
@@ -4900,7 +4900,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4927,7 +4927,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
@@ -4954,7 +4954,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4981,7 +4981,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
@@ -5008,7 +5008,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
@@ -5035,7 +5035,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5057,7 +5057,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5084,7 +5084,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5111,7 +5111,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5133,7 +5133,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5160,7 +5160,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5187,7 +5187,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5214,7 +5214,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5241,7 +5241,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5263,7 +5263,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5587,7 +5587,7 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5619,7 +5619,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5651,7 +5651,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5683,7 +5683,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5715,7 +5715,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5747,7 +5747,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5779,7 +5779,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5811,7 +5811,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5836,7 +5836,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
@@ -5868,7 +5868,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
@@ -5900,7 +5900,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5925,7 +5925,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
@@ -5957,7 +5957,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
@@ -5989,7 +5989,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
@@ -6021,7 +6021,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
@@ -6053,7 +6053,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6078,7 +6078,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
@@ -6407,7 +6407,7 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6444,7 +6444,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6481,7 +6481,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6518,7 +6518,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6555,7 +6555,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6592,7 +6592,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6629,7 +6629,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6666,7 +6666,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6694,7 +6694,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6731,7 +6731,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6768,7 +6768,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6796,7 +6796,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6833,7 +6833,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6870,7 +6870,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6907,7 +6907,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6944,7 +6944,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6972,7 +6972,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
@@ -16199,7 +16199,7 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16220,7 +16220,7 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16246,7 +16246,7 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16272,7 +16272,7 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16293,7 +16293,7 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16319,7 +16319,7 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16345,7 +16345,7 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -16371,7 +16371,7 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -16397,7 +16397,7 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16418,7 +16418,7 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16444,7 +16444,7 @@ void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16465,7 +16465,7 @@ void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16491,7 +16491,7 @@ void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16517,7 +16517,7 @@ void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -16543,7 +16543,7 @@ void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16564,7 +16564,7 @@ void test_vst2_s8(int8_t * a, int8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16590,7 +16590,7 @@ void test_vst2_s16(int16_t * a, int16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16616,7 +16616,7 @@ void test_vst2_s32(int32_t * a, int32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -16642,7 +16642,7 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -16668,7 +16668,7 @@ void test_vst2_f16(float16_t * a, float16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -16694,7 +16694,7 @@ void test_vst2_f32(float32_t * a, float32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16715,7 +16715,7 @@ void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16741,7 +16741,7 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16767,7 +16767,7 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16793,7 +16793,7 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16819,7 +16819,7 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16845,7 +16845,7 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -16871,7 +16871,7 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -16897,7 +16897,7 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16923,7 +16923,7 @@ void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16944,7 +16944,7 @@ void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16970,7 +16970,7 @@ void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16996,7 +16996,7 @@ void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17017,7 +17017,7 @@ void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17043,7 +17043,7 @@ void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17069,7 +17069,7 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -17095,7 +17095,7 @@ void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -17121,7 +17121,7 @@ void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17142,7 +17142,7 @@ void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
 // CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17168,7 +17168,7 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17192,7 +17192,7 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17223,7 +17223,7 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17254,7 +17254,7 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17278,7 +17278,7 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17309,7 +17309,7 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17340,7 +17340,7 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -17371,7 +17371,7 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -17402,7 +17402,7 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17426,7 +17426,7 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17457,7 +17457,7 @@ void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17481,7 +17481,7 @@ void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17512,7 +17512,7 @@ void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17543,7 +17543,7 @@ void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -17574,7 +17574,7 @@ void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17598,7 +17598,7 @@ void test_vst3_s8(int8_t * a, int8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17629,7 +17629,7 @@ void test_vst3_s16(int16_t * a, int16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17660,7 +17660,7 @@ void test_vst3_s32(int32_t * a, int32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -17691,7 +17691,7 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -17722,7 +17722,7 @@ void test_vst3_f16(float16_t * a, float16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -17753,7 +17753,7 @@ void test_vst3_f32(float32_t * a, float32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17777,7 +17777,7 @@ void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17808,7 +17808,7 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17839,7 +17839,7 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17870,7 +17870,7 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17901,7 +17901,7 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17932,7 +17932,7 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -17963,7 +17963,7 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -17994,7 +17994,7 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
 // CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18025,7 +18025,7 @@ void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18049,7 +18049,7 @@ void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18080,7 +18080,7 @@ void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18111,7 +18111,7 @@ void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18135,7 +18135,7 @@ void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18166,7 +18166,7 @@ void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18197,7 +18197,7 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -18228,7 +18228,7 @@ void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -18259,7 +18259,7 @@ void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18283,7 +18283,7 @@ void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
 // CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18314,7 +18314,7 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18341,7 +18341,7 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18377,7 +18377,7 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -18413,7 +18413,7 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18440,7 +18440,7 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18476,7 +18476,7 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -18512,7 +18512,7 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -18548,7 +18548,7 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -18584,7 +18584,7 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18611,7 +18611,7 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18647,7 +18647,7 @@ void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18674,7 +18674,7 @@ void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18710,7 +18710,7 @@ void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18746,7 +18746,7 @@ void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -18782,7 +18782,7 @@ void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18809,7 +18809,7 @@ void test_vst4_s8(int8_t * a, int8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18845,7 +18845,7 @@ void test_vst4_s16(int16_t * a, int16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18881,7 +18881,7 @@ void test_vst4_s32(int32_t * a, int32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -18917,7 +18917,7 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -18953,7 +18953,7 @@ void test_vst4_f16(float16_t * a, float16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -18989,7 +18989,7 @@ void test_vst4_f32(float32_t * a, float32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19016,7 +19016,7 @@ void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19052,7 +19052,7 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19088,7 +19088,7 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -19124,7 +19124,7 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19160,7 +19160,7 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -19196,7 +19196,7 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -19232,7 +19232,7 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -19268,7 +19268,7 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
 // CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19304,7 +19304,7 @@ void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19331,7 +19331,7 @@ void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19367,7 +19367,7 @@ void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -19403,7 +19403,7 @@ void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19430,7 +19430,7 @@ void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19466,7 +19466,7 @@ void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -19502,7 +19502,7 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -19538,7 +19538,7 @@ void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -19574,7 +19574,7 @@ void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19601,7 +19601,7 @@ void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
 // CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -20474,7 +20474,7 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
   return vtrn_s8(a, b);
@@ -20493,7 +20493,7 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
   return vtrn_s16(a, b);
@@ -20512,7 +20512,7 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
   return vtrn_s32(a, b);
@@ -20529,7 +20529,7 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
   return vtrn_u8(a, b);
@@ -20548,7 +20548,7 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
   return vtrn_u16(a, b);
@@ -20567,7 +20567,7 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
   return vtrn_u32(a, b);
@@ -20586,7 +20586,7 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
   return vtrn_f32(a, b);
@@ -20603,7 +20603,7 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
   return vtrn_p8(a, b);
@@ -20622,7 +20622,7 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
   return vtrn_p16(a, b);
@@ -20639,7 +20639,7 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
@@ -20658,7 +20658,7 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
@@ -20677,7 +20677,7 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
@@ -20694,7 +20694,7 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
@@ -20713,7 +20713,7 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
@@ -20732,7 +20732,7 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
@@ -20751,7 +20751,7 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
@@ -20768,7 +20768,7 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
@@ -20787,7 +20787,7 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
@@ -20968,7 +20968,7 @@ uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
   return vuzp_s8(a, b);
@@ -20987,7 +20987,7 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
   return vuzp_s16(a, b);
@@ -21006,7 +21006,7 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
   return vuzp_s32(a, b);
@@ -21023,7 +21023,7 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp_u8(a, b);
@@ -21042,7 +21042,7 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp_u16(a, b);
@@ -21061,7 +21061,7 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp_u32(a, b);
@@ -21080,7 +21080,7 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
   return vuzp_f32(a, b);
@@ -21097,7 +21097,7 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp_p8(a, b);
@@ -21116,7 +21116,7 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp_p16(a, b);
@@ -21133,7 +21133,7 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
@@ -21152,7 +21152,7 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
@@ -21171,7 +21171,7 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
@@ -21188,7 +21188,7 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
@@ -21207,7 +21207,7 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
@@ -21226,7 +21226,7 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
@@ -21245,7 +21245,7 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
@@ -21262,7 +21262,7 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
@@ -21281,7 +21281,7 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
@@ -21298,7 +21298,7 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
   return vzip_s8(a, b);
@@ -21317,7 +21317,7 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
   return vzip_s16(a, b);
@@ -21336,7 +21336,7 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
   return vzip_s32(a, b);
@@ -21353,7 +21353,7 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
   return vzip_u8(a, b);
@@ -21372,7 +21372,7 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
   return vzip_u16(a, b);
@@ -21391,7 +21391,7 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
 // CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
   return vzip_u32(a, b);
@@ -21410,7 +21410,7 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
   return vzip_f32(a, b);
@@ -21427,7 +21427,7 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
 // CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
 // CHECK:   ret void
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
   return vzip_p8(a, b);
@@ -21446,7 +21446,7 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
 // CHECK:   ret void
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
   return vzip_p16(a, b);
@@ -21463,7 +21463,7 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
@@ -21482,7 +21482,7 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
@@ -21501,7 +21501,7 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
@@ -21518,7 +21518,7 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
@@ -21537,7 +21537,7 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
@@ -21556,7 +21556,7 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
@@ -21575,7 +21575,7 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
@@ -21592,7 +21592,7 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
@@ -21611,7 +21611,7 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
diff --git a/test/CodeGen/atomic-arm64.c b/test/CodeGen/atomic-arm64.c
index 5cae3d134984..0e79846085cf 100644
--- a/test/CodeGen/atomic-arm64.c
+++ b/test/CodeGen/atomic-arm64.c
@@ -65,7 +65,7 @@ void test3(pointer_pair_t pair) {
 // CHECK:      [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i8*
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[QUAD_T]]* {{%.*}} to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 32, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 32, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i256*
 // CHECK-NEXT: [[T1:%.*]] = bitcast i256* [[T0]] to i8*
 // CHECK-NEXT: call void @__atomic_store(i64 32, i8* bitcast ([[QUAD_T]]* @a_pointer_quad to i8*), i8* [[T1]], i32 5)
diff --git a/test/CodeGen/attr-target-mv-func-ptrs.c b/test/CodeGen/attr-target-mv-func-ptrs.c
new file mode 100644
index 000000000000..5df9a927cf8d
--- /dev/null
+++ b/test/CodeGen/attr-target-mv-func-ptrs.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(int i) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int i) {return 1;}
+int __attribute__((target("default"))) foo(int i) { return 2; }
+
+typedef int (*FuncPtr)(int);
+void func(FuncPtr);
+
+int bar() {
+  func(foo);
+  FuncPtr Free = &foo;
+  FuncPtr Free2 = foo;
+
+  return 0;
+  return Free(1) + Free(2);
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (i32), i32 (i32)* ()* @foo.resolver
+// CHECK: define i32 @foo.sse4.2(
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge(
+// CHECK: ret i32 1
+// CHECK: define i32 @foo(
+// CHECK: ret i32 2
+
+// CHECK: define i32 @bar()
+// CHECK: call void @func(i32 (i32)* @foo.ifunc)
+// CHECK: store i32 (i32)* @foo.ifunc
+// CHECK: store i32 (i32)* @foo.ifunc
+
+// CHECK: declare i32 @foo.arch_sandybridge(
diff --git a/test/CodeGen/attr-target-mv-va-args.c b/test/CodeGen/attr-target-mv-va-args.c
new file mode 100644
index 000000000000..b33f841dba5e
--- /dev/null
+++ b/test/CodeGen/attr-target-mv-va-args.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(int i, ...) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int i, ...);
+int __attribute__((target("arch=ivybridge"))) foo(int i, ...) {return 1;}
+int __attribute__((target("default"))) foo(int i, ...) { return 2; }
+
+int bar() {
+  return foo(1, 'a', 1.1) + foo(2, 2.2, "asdf");
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (i32, ...), i32 (i32, ...)* ()* @foo.resolver
+// CHECK: define i32 @foo.sse4.2(i32 %i, ...)
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge(i32 %i, ...)
+// CHECK: ret i32 1
+// CHECK: define i32 @foo(i32 %i, ...)
+// CHECK: ret i32 2
+// CHECK: define i32 @bar()
+// CHECK: call i32 (i32, ...) @foo.ifunc(i32 1, i32 97, double
+// CHECK: call i32 (i32, ...) @foo.ifunc(i32 2, double 2.2{{[0-9Ee+]+}}, i8* getelementptr inbounds 
+// CHECK: define i32 (i32, ...)* @foo.resolver() comdat
+// CHECK: ret i32 (i32, ...)* @foo.arch_sandybridge
+// CHECK: ret i32 (i32, ...)* @foo.arch_ivybridge
+// CHECK: ret i32 (i32, ...)* @foo.sse4.2
+// CHECK: ret i32 (i32, ...)* @foo
+// CHECK: declare i32 @foo.arch_sandybridge(i32, ...)
diff --git a/test/CodeGen/attr-target-mv.c b/test/CodeGen/attr-target-mv.c
new file mode 100644
index 000000000000..0085a154ced1
--- /dev/null
+++ b/test/CodeGen/attr-target-mv.c
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(void) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(void);
+int __attribute__((target("arch=ivybridge"))) foo(void) {return 1;}
+int __attribute__((target("default"))) foo(void) { return 2; }
+
+int bar() {
+  return foo();
+}
+
+inline int __attribute__((target("sse4.2"))) foo_inline(void) { return 0; }
+inline int __attribute__((target("arch=sandybridge"))) foo_inline(void);
+inline int __attribute__((target("arch=ivybridge"))) foo_inline(void) {return 1;}
+inline int __attribute__((target("default"))) foo_inline(void) { return 2; }
+
+int bar2() {
+  return foo_inline();
+}
+
+inline __attribute__((target("default"))) void foo_decls(void);
+inline __attribute__((target("sse4.2"))) void foo_decls(void);
+void bar3() {
+  foo_decls();
+}
+inline __attribute__((target("default"))) void foo_decls(void) {}
+inline __attribute__((target("sse4.2"))) void foo_decls(void) {}
+
+inline __attribute__((target("default"))) void foo_multi(void) {}
+inline __attribute__((target("avx,sse4.2"))) void foo_multi(void) {}
+inline __attribute__((target("sse4.2,fma4"))) void foo_multi(void) {}
+inline __attribute__((target("arch=ivybridge,fma4,sse4.2"))) void foo_multi(void) {}
+void bar4() {
+  foo_multi();
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (), i32 ()* ()* @foo.resolver
+// CHECK: @foo_inline.ifunc = ifunc i32 (), i32 ()* ()* @foo_inline.resolver
+// CHECK: @foo_decls.ifunc = ifunc void (), void ()* ()* @foo_decls.resolver
+
+// CHECK: define i32 @foo.sse4.2()
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge()
+// CHECK: ret i32 1
+// CHECK: define i32 @foo()
+// CHECK: ret i32 2
+// CHECK: define i32 @bar()
+// CHECK: call i32 @foo.ifunc()
+
+// CHECK: define i32 ()* @foo.resolver() comdat
+// CHECK: call void @__cpu_indicator_init()
+// CHECK: ret i32 ()* @foo.arch_sandybridge
+// CHECK: ret i32 ()* @foo.arch_ivybridge
+// CHECK: ret i32 ()* @foo.sse4.2
+// CHECK: ret i32 ()* @foo
+
+// CHECK: define i32 @bar2()
+// CHECK: call i32 @foo_inline.ifunc()
+
+// CHECK: define i32 ()* @foo_inline.resolver() comdat
+// CHECK: call void @__cpu_indicator_init()
+// CHECK: ret i32 ()* @foo_inline.arch_sandybridge
+// CHECK: ret i32 ()* @foo_inline.arch_ivybridge
+// CHECK: ret i32 ()* @foo_inline.sse4.2
+// CHECK: ret i32 ()* @foo_inline
+
+// CHECK: define void @bar3()
+// CHECK: call void @foo_decls.ifunc()
+
+// CHECK: define void ()* @foo_decls.resolver() comdat
+// CHECK: ret void ()* @foo_decls.sse4.2
+// CHECK: ret void ()* @foo_decls
+
+// CHECK: declare i32 @foo.arch_sandybridge()
+
+// CHECK: define available_externally i32 @foo_inline.sse4.2()
+// CHECK: ret i32 0
+
+// CHECK: declare i32 @foo_inline.arch_sandybridge()
+//
+// CHECK: define available_externally i32 @foo_inline.arch_ivybridge()
+// CHECK: ret i32 1
+// CHECK: define available_externally i32 @foo_inline()
+// CHECK: ret i32 2
+
+// CHECK: define available_externally void @foo_decls()
+// CHECK: define available_externally void @foo_decls.sse4.2()
+
+// CHECK: define available_externally void @foo_multi.avx_sse4.2()
+// CHECK: define available_externally void @foo_multi.fma4_sse4.2()
+// CHECK: define available_externally void @foo_multi.arch_ivybridge_fma4_sse4.2()
+
diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c
index 3160a6667c00..bfdf46d261fa 100644
--- a/test/CodeGen/avx512bw-builtins.c
+++ b/test/CodeGen/avx512bw-builtins.c
@@ -1628,23 +1628,22 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
 
 __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackd
-  // CHECK: bitcast <64 x i1> %{{.*}} to i64
-  // CHECK: bitcast <64 x i1> %{{.*}} to i64
-  // CHECK: and i64 %{{.*}}, 4294967295
-  // CHECK: shl i64 %{{.*}}, 32
-  // CHECK: or i64 %{{.*}}, %{{.*}}
-  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // CHECK: [[LHS2:%.*]] = shufflevector <64 x i1> [[LHS]], <64 x i1> [[LHS]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: [[RHS2:%.*]] = shufflevector <64 x i1> [[RHS]], <64 x i1> [[RHS]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: [[CONCAT:%.*]] = shufflevector <32 x i1> [[LHS2]], <32 x i1> [[RHS2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  // CHECK: bitcast <64 x i1> [[CONCAT]] to i64
   return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F); 
 }
 
 __mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackw
-  // CHECK: bitcast <32 x i1> %{{.*}} to i32
-  // CHECK: bitcast <32 x i1> %{{.*}} to i32
-  // CHECK: and i32 %{{.*}}, 65535
-  // CHECK: shl i32 %{{.*}}, 16
-  // CHECK: or i32 %{{.*}}, %{{.*}}
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // CHECK: [[LHS2:%.*]] = shufflevector <32 x i1> [[LHS]], <32 x i1> [[LHS]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: [[RHS2:%.*]] = shufflevector <32 x i1> [[RHS]], <32 x i1> [[RHS]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: [[CONCAT:%.*]] = shufflevector <16 x i1> [[LHS2]], <16 x i1> [[RHS2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F); 
 }
 
@@ -1743,7 +1742,8 @@ __mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i _
 
 __mmask64 test_mm512_movepi8_mask(__m512i __A) {
   // CHECK-LABEL: @test_mm512_movepi8_mask
-  // CHECK: @llvm.x86.avx512.cvtb2mask.512
+  // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <64 x i1> [[CMP]] to i64
   return _mm512_movepi8_mask(__A); 
 }
 
@@ -1941,7 +1941,8 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
 
 __mmask32 test_mm512_movepi16_mask(__m512i __A) {
   // CHECK-LABEL: @test_mm512_movepi16_mask
-  // CHECK: @llvm.x86.avx512.cvtw2mask.512
+  // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <32 x i1> [[CMP]] to i32
   return _mm512_movepi16_mask(__A); 
 }
 
diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c
index 1b21ca3c4302..47943c5740b2 100644
--- a/test/CodeGen/avx512dq-builtins.c
+++ b/test/CodeGen/avx512dq-builtins.c
@@ -923,7 +923,8 @@ __m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
 
 __mmask16 test_mm512_movepi32_mask(__m512i __A) {
   // CHECK-LABEL: @test_mm512_movepi32_mask
-  // CHECK: @llvm.x86.avx512.cvtd2mask.512
+  // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <16 x i1> [[CMP]] to i16
   return _mm512_movepi32_mask(__A); 
 }
 
@@ -943,7 +944,8 @@ __m512i test_mm512_movm_epi64(__mmask8 __A) {
 
 __mmask8 test_mm512_movepi64_mask(__m512i __A) {
   // CHECK-LABEL: @test_mm512_movepi64_mask
-  // CHECK: @llvm.x86.avx512.cvtq2mask.512
+  // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <8 x i1> [[CMP]] to i8
   return _mm512_movepi64_mask(__A); 
 }
 
diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c
index ce831d690ee7..35a97912c71e 100644
--- a/test/CodeGen/avx512f-builtins.c
+++ b/test/CodeGen/avx512f-builtins.c
@@ -6261,12 +6261,12 @@ int test_mm512_kortestz(__mmask16 __A, __mmask16 __B) {
 
 __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: bitcast <16 x i1> %{{.*}} to i16
-  // CHECK: bitcast <16 x i1> %{{.*}} to i16
-  // CHECK: and i32 %{{.*}}, 255
-  // CHECK: shl i32 %{{.*}}, 8
-  // CHECK: or i32 %{{.*}}, %{{.*}}
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: [[LHS2:%.*]] = shufflevector <16 x i1> [[LHS]], <16 x i1> [[LHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: [[RHS2:%.*]] = shufflevector <16 x i1> [[RHS]], <16 x i1> [[RHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: [[CONCAT:%.*]] = shufflevector <8 x i1> [[LHS2]], <8 x i1> [[RHS2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: bitcast <16 x i1> [[CONCAT]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B),
                                                        _mm512_cmpneq_epu32_mask(__C, __D)),
                                                        __E, __F);
diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c
index 23fbd4026aaa..7adc50c23179 100644
--- a/test/CodeGen/avx512vlbw-builtins.c
+++ b/test/CodeGen/avx512vlbw-builtins.c
@@ -2601,13 +2601,15 @@ __mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i _
 
 __mmask16 test_mm_movepi8_mask(__m128i __A) {
   // CHECK-LABEL: @test_mm_movepi8_mask
-  // CHECK: @llvm.x86.avx512.cvtb2mask.128
+  // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <16 x i1> [[CMP]] to i16
   return _mm_movepi8_mask(__A); 
 }
 
 __mmask32 test_mm256_movepi8_mask(__m256i __A) {
   // CHECK-LABEL: @test_mm256_movepi8_mask
-  // CHECK: @llvm.x86.avx512.cvtb2mask.256
+  // CHECK: [[CMP:%.*]] = icmp slt <32 x i8> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <32 x i1> [[CMP]] to i32
   return _mm256_movepi8_mask(__A); 
 }
 
@@ -2985,13 +2987,15 @@ __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
 }
 __mmask8 test_mm_movepi16_mask(__m128i __A) {
   // CHECK-LABEL: @test_mm_movepi16_mask
-  // CHECK: @llvm.x86.avx512.cvtw2mask.128
+  // CHECK: [[CMP:%.*]] = icmp slt <8 x i16> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <8 x i1> [[CMP]] to i8
   return _mm_movepi16_mask(__A); 
 }
 
 __mmask16 test_mm256_movepi16_mask(__m256i __A) {
   // CHECK-LABEL: @test_mm256_movepi16_mask
-  // CHECK: @llvm.x86.avx512.cvtw2mask.256
+  // CHECK: [[CMP:%.*]] = icmp slt <16 x i16> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <16 x i1> [[CMP]] to i16
   return _mm256_movepi16_mask(__A); 
 }
 
diff --git a/test/CodeGen/avx512vldq-builtins.c b/test/CodeGen/avx512vldq-builtins.c
index 3ca4b2135ea7..0812d682dc5d 100644
--- a/test/CodeGen/avx512vldq-builtins.c
+++ b/test/CodeGen/avx512vldq-builtins.c
@@ -853,13 +853,16 @@ __m256 test_mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A) {
 
 __mmask8 test_mm_movepi32_mask(__m128i __A) {
   // CHECK-LABEL: @test_mm_movepi32_mask
-  // CHECK: @llvm.x86.avx512.cvtd2mask.128
+  // CHECK: [[CMP:%.*]] = icmp slt <4 x i32> %{{.*}}, zeroinitializer
+  // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: bitcast <8 x i1> [[SHUF]] to i8
   return _mm_movepi32_mask(__A); 
 }
 
 __mmask8 test_mm256_movepi32_mask(__m256i __A) {
   // CHECK-LABEL: @test_mm256_movepi32_mask
-  // CHECK: @llvm.x86.avx512.cvtd2mask.256
+  // CHECK: [[CMP:%.*]] = icmp slt <8 x i32> %{{.*}}, zeroinitializer
+  // CHECK: bitcast <8 x i1> [[CMP]] to i8
   return _mm256_movepi32_mask(__A); 
 }
 
@@ -896,13 +899,17 @@ __m256i test_mm256_movm_epi64(__mmask8 __A) {
 
 __mmask8 test_mm_movepi64_mask(__m128i __A) {
   // CHECK-LABEL: @test_mm_movepi64_mask
-  // CHECK: @llvm.x86.avx512.cvtq2mask.128
+  // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer
+  // CHECK: [[SHUF:%.*]] = shufflevector <2 x i1> [[CMP]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  // CHECK: bitcast <8 x i1> [[SHUF]] to i8
   return _mm_movepi64_mask(__A); 
 }
 
 __mmask8 test_mm256_movepi64_mask(__m256i __A) {
   // CHECK-LABEL: @test_mm256_movepi64_mask
-  // CHECK: @llvm.x86.avx512.cvtq2mask.256
+  // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> %{{.*}}, zeroinitializer
+  // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: bitcast <8 x i1> [[SHUF]] to i8
   return _mm256_movepi64_mask(__A); 
 }
 
diff --git a/test/CodeGen/block-byref-aggr.c b/test/CodeGen/block-byref-aggr.c
index 7d146a2d4777..962f100638f0 100644
--- a/test/CodeGen/block-byref-aggr.c
+++ b/test/CodeGen/block-byref-aggr.c
@@ -24,7 +24,7 @@ void test0() {
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[BYREF]], [[BYREF]]* [[T0]], i32 0, i32 4
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
 //   Verify that there's nothing else significant in the function.
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[BYREF]]* [[A]] to i8*
 // CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
@@ -50,14 +50,14 @@ void test1() {
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[B_BYREF]], [[B_BYREF]]* [[T0]], i32 0, i32 4
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
 //   Then for 'a':
 // CHECK-NEXT: [[A_FORWARDING:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[A]], i32 0, i32 1
 // CHECK-NEXT: [[T0:%.*]] = load [[A_BYREF]]*, [[A_BYREF]]** [[A_FORWARDING]]
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[T0]], i32 0, i32 4
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
 //   Verify that there's nothing else significant in the function.
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[B_BYREF]]* [[B]] to i8*
 // CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
diff --git a/test/CodeGen/builtin-memfns.c b/test/CodeGen/builtin-memfns.c
index 4a06160ccbc6..d93a5aadae27 100644
--- a/test/CodeGen/builtin-memfns.c
+++ b/test/CodeGen/builtin-memfns.c
@@ -54,14 +54,14 @@ int test6(char *X) {
 int test7(int *p) {
   struct snd_pcm_hw_params_t* hwparams;  // incomplete type.
   
-  // CHECK: call void @llvm.memset{{.*}}256, i32 4, i1 false)
+  // CHECK: call void @llvm.memset{{.*}} align 4 {{.*}}256, i1 false)
   __builtin_memset(p, 0, 256);  // Should be alignment = 4
 
-  // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+  // CHECK: call void @llvm.memset{{.*}} align 1 {{.*}}256, i1 false)
   __builtin_memset((char*)p, 0, 256);  // Should be alignment = 1
 
   __builtin_memset(hwparams, 0, 256);  // No crash alignment = 1
-  // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+  // CHECK: call void @llvm.memset{{.*}} align 1{{.*}}256, i1 false)
 }
 
 // <rdar://problem/11314941>
@@ -73,13 +73,13 @@ struct PS {
 struct PS ps;
 void test8(int *arg) {
   // CHECK: @test8
-  // CHECK: call void @llvm.memcpy{{.*}} 16, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}} align 4 {{.*}} align 1 {{.*}} 16, i1 false)
   __builtin_memcpy(arg, ps.modes, sizeof(struct PS));
 }
 
 __attribute((aligned(16))) int x[4], y[4];
 void test9() {
   // CHECK: @test9
-  // CHECK: call void @llvm.memcpy{{.*}} 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}} align 16 {{.*}} align 16 {{.*}} 16, i1 false)
   __builtin_memcpy(x, y, sizeof(y));
 }
diff --git a/test/CodeGen/builtins-overflow.c b/test/CodeGen/builtins-overflow.c
index 7a30cfbd46ee..f83bbfb9672d 100644
--- a/test/CodeGen/builtins-overflow.c
+++ b/test/CodeGen/builtins-overflow.c
@@ -373,7 +373,9 @@ int test_mixed_sign_mull_overflow_unsigned(int x, unsigned y) {
 // CHECK-NEXT:  [[NotNull:%.*]] = icmp ne i32 [[UnsignedResult]], 0
 // CHECK-NEXT:  [[Underflow:%.*]] = and i1 [[IsNeg]], [[NotNull]]
 // CHECK-NEXT:  [[OFlow:%.*]] = or i1 [[UnsignedOFlow]], [[Underflow]]
-// CHECK-NEXT:  store i32 [[UnsignedResult]], i32* %{{.*}}, align 4
+// CHECK-NEXT:  [[NegatedResult:%.*]] = sub i32 0, [[UnsignedResult]]
+// CHECK-NEXT:  [[Result:%.*]] = select i1 [[IsNeg]], i32 [[NegatedResult]], i32 [[UnsignedResult]]
+// CHECK-NEXT:  store i32 [[Result]], i32* %{{.*}}, align 4
 // CHECK:       br i1 [[OFlow]]
 
   unsigned result;
@@ -432,7 +434,9 @@ long long test_mixed_sign_mulll_overflow_trunc_unsigned(long long x, unsigned lo
 // CHECK-NEXT:  [[OVERFLOW_PRE_TRUNC:%.*]] = or i1 {{.*}}, [[UNDERFLOW]]
 // CHECK-NEXT:  [[TRUNC_OVERFLOW:%.*]] = icmp ugt i64 [[UNSIGNED_RESULT]], 4294967295
 // CHECK-NEXT:  [[OVERFLOW:%.*]] = or i1 [[OVERFLOW_PRE_TRUNC]], [[TRUNC_OVERFLOW]]
-// CHECK-NEXT:  trunc i64 [[UNSIGNED_RESULT]] to i32
+// CHECK-NEXT:  [[NEGATED:%.*]] = sub i64 0, [[UNSIGNED_RESULT]]
+// CHECK-NEXT:  [[RESULT:%.*]] = select i1 {{.*}}, i64 [[NEGATED]], i64 [[UNSIGNED_RESULT]]
+// CHECK-NEXT:  trunc i64 [[RESULT]] to i32
 // CHECK-NEXT:  store
   unsigned result;
   if (__builtin_mul_overflow(y, x, &result))
diff --git a/test/CodeGen/builtins-wasm.c b/test/CodeGen/builtins-wasm.c
index e0f72d2e5034..39252e752ac9 100644
--- a/test/CodeGen/builtins-wasm.c
+++ b/test/CodeGen/builtins-wasm.c
@@ -3,25 +3,37 @@
 // RUN: %clang_cc1 -triple wasm64-unknown-unknown -O3 -emit-llvm -o - %s \
 // RUN:   | FileCheck %s -check-prefix=WEBASSEMBLY64
 
-__SIZE_TYPE__ f1(void) {
+__SIZE_TYPE__ f0(void) {
+  return __builtin_wasm_mem_size(0);
+// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.mem.size.i32(i32 0)
+// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.mem.size.i64(i32 0)
+}
+
+__SIZE_TYPE__ f1(__SIZE_TYPE__ delta) {
+  return __builtin_wasm_mem_grow(0, delta);
+// WEBASSEMBLY32: call i32 @llvm.wasm.mem.grow.i32(i32 0, i32 %{{.*}})
+// WEBASSEMBLY64: call i64 @llvm.wasm.mem.grow.i64(i32 0, i64 %{{.*}})
+}
+
+__SIZE_TYPE__ f2(void) {
   return __builtin_wasm_current_memory();
 // WEBASSEMBLY32: call {{i.*}} @llvm.wasm.current.memory.i32()
 // WEBASSEMBLY64: call {{i.*}} @llvm.wasm.current.memory.i64()
 }
 
-__SIZE_TYPE__ f2(__SIZE_TYPE__ delta) {
+__SIZE_TYPE__ f3(__SIZE_TYPE__ delta) {
   return __builtin_wasm_grow_memory(delta);
 // WEBASSEMBLY32: call i32 @llvm.wasm.grow.memory.i32(i32 %{{.*}})
 // WEBASSEMBLY64: call i64 @llvm.wasm.grow.memory.i64(i64 %{{.*}})
 }
 
-void f3(unsigned int tag, void *obj) {
+void f4(unsigned int tag, void *obj) {
   return __builtin_wasm_throw(tag, obj);
 // WEBASSEMBLY32: call void @llvm.wasm.throw(i32 %{{.*}}, i8* %{{.*}})
 // WEBASSEMBLY64: call void @llvm.wasm.throw(i32 %{{.*}}, i8* %{{.*}})
 }
 
-void f4() {
+void f5(void) {
   return __builtin_wasm_rethrow();
 // WEBASSEMBLY32: call void @llvm.wasm.rethrow()
 // WEBASSEMBLY64: call void @llvm.wasm.rethrow()
diff --git a/test/CodeGen/c11atomics-ios.c b/test/CodeGen/c11atomics-ios.c
index fb731dfd7a1f..0e24ed228cac 100644
--- a/test/CodeGen/c11atomics-ios.c
+++ b/test/CodeGen/c11atomics-ios.c
@@ -132,7 +132,7 @@ void testStruct(_Atomic(S) *fp) {
 // CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[TMP0]] to i64*
 // CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8
 // CHECK-NEXT: [[T5:%.*]] = bitcast [[S]]* [[T0]] to i64*
@@ -154,7 +154,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
 
 // CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
 // CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -165,7 +165,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
   __c11_atomic_init(fp, (PS){1,2,3});
 
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
 // CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -183,16 +183,16 @@ void testPromotedStruct(_Atomic(PS) *fp) {
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
   PS f = *fp;
 
 // CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
 // CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
 // CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[TMP1]] to i64*
 // CHECK-NEXT: [[T5:%.*]] = load i64, i64* [[T4]], align 8
 // CHECK-NEXT: [[T6:%.*]] = bitcast [[APS]]* [[T0]] to i64*
@@ -215,7 +215,7 @@ PS test_promoted_load(_Atomic(PS) *addr) {
   // CHECK:   [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
   // CHECK:   [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
   // CHECK:   [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
 
   return __c11_atomic_load(addr, 5);
 }
@@ -232,11 +232,11 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
   // CHECK:   [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
   // CHECK:   store atomic i64 [[VAL64]], i64* [[ADDR64]] seq_cst, align 8
@@ -257,11 +257,11 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
   // CHECK:   [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
   // CHECK:   [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
@@ -270,7 +270,7 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
   // CHECK:   [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
   // CHECK:   [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
   return __c11_atomic_exchange(addr, *val, 5);
 }
 
@@ -291,15 +291,15 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
   // CHECK:   [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i8*
   // CHECK:   [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i64*
   // CHECK:   [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
   // CHECK:   [[ATOMIC_DESIRED_VAL64:%.*]] = load i64, i64* [[ATOMIC_DESIRED64]], align 8
   // CHECK:   [[ATOMIC_NEW_VAL64:%.*]] = load i64, i64* [[ATOMIC_NEW64]], align 8
diff --git a/test/CodeGen/c11atomics.c b/test/CodeGen/c11atomics.c
index ccb642174c94..3774ded7c426 100644
--- a/test/CodeGen/c11atomics.c
+++ b/test/CodeGen/c11atomics.c
@@ -282,7 +282,7 @@ void testStruct(_Atomic(S) *fp) {
 // CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[T0]] to i8*
 // CHECK-NEXT: [[T4:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
 // CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T3]], i8* [[T4]], i32 5)
@@ -307,7 +307,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
 
 // CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
 // CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -318,7 +318,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
   __c11_atomic_init(fp, (PS){1,2,3});
 
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
 // CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -335,16 +335,16 @@ void testPromotedStruct(_Atomic(PS) *fp) {
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
   PS f = *fp;
 
 // CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
 // CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
 // CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
 // CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
 // CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[T0]] to i8*
 // CHECK-NEXT: [[T5:%.*]] = bitcast [[APS]]* [[TMP1]] to i8*
 // CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T4]], i8* [[T5]], i32 5)
@@ -357,7 +357,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP3]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = bitcast %struct.PS* [[TMP2]] to i8*
 // CHECK-NEXT: [[T2:%.*]] = bitcast %struct.PS* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds %struct.PS, %struct.PS* [[TMP2]], i32 0, i32 0
 // CHECK-NEXT: [[T1:%.*]] = load i16, i16* [[T0]], align 2
 // CHECK-NEXT: [[T2:%.*]] = sext i16 [[T1]] to i32
@@ -381,7 +381,7 @@ PS test_promoted_load(_Atomic(PS) *addr) {
   // CHECK:   [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
   // CHECK:   [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
   // CHECK:   [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
 
   return __c11_atomic_load(addr, 5);
 }
@@ -398,11 +398,11 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
   // CHECK:   [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
   // CHECK:   [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 2
@@ -423,11 +423,11 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
   // CHECK:   [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
   // CHECK:   [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
@@ -437,7 +437,7 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
   // CHECK:   [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
   // CHECK:   [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
   return __c11_atomic_exchange(addr, *val, 5);
 }
 
@@ -457,15 +457,15 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
   // CHECK:   [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
   // CHECK:   [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
   // CHECK:   [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
   // CHECK:   [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i8*
   // CHECK:   [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]]to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i64*
   // CHECK:   [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
   // CHECK:   [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
   // CHECK:   [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
   // CHECK:   [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
   // CHECK:   [[ATOMIC_DESIRED8:%.*]] = bitcast i64* [[ATOMIC_DESIRED64]] to i8*
diff --git a/test/CodeGen/code-coverage.c b/test/CodeGen/code-coverage.c
index 80307f49ef34..4fbfd5b8ce2a 100644
--- a/test/CodeGen/code-coverage.c
+++ b/test/CodeGen/code-coverage.c
@@ -2,7 +2,16 @@
 // RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-no-function-names-in-data %s -o - | FileCheck %s --check-prefix WITHOUTNAMES
 // RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO
 
-// <rdar://problem/12843084>
+// RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fexperimental-new-pass-manager -fdebug-pass-manager -femit-coverage-data %s 2>&1 | FileCheck --check-prefix=NEWPM %s
+// RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fexperimental-new-pass-manager -fdebug-pass-manager -femit-coverage-data -O3 %s 2>&1 | FileCheck --check-prefix=NEWPM-O3 %s
+
+// NEWPM-NOT: Running pass
+// NEWPM: Running pass: GCOVProfilerPass
+
+// NEWPM-O3-NOT: Running pass
+// NEWPM-O3: Running pass: ForceFunctionAttrsPass
+// NEWPM-O3: Running pass: GCOVProfilerPass
+
 
 int test1(int a) {
   switch (a % 2) {
diff --git a/test/CodeGen/compound-literal.c b/test/CodeGen/compound-literal.c
index 6507341ce2c1..38675a7dda22 100644
--- a/test/CodeGen/compound-literal.c
+++ b/test/CodeGen/compound-literal.c
@@ -33,7 +33,7 @@ void f() {
   // CHECK-NEXT: store i32 [[TMP]], i32* [[CY]]
   // CHECK-NEXT: [[SI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[S]] to i8*
   // CHECK-NEXT: [[COMPOUNDLITI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[COMPOUNDLIT]] to i8*
-  // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* [[SI8]], i8* [[COMPOUNDLITI8]]
+  // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[SI8]], i8* align {{[0-9]+}} [[COMPOUNDLITI8]]
   s = (S){s.y,s.x};
   // CHECK-NEXT: ret void
 }
@@ -67,7 +67,7 @@ struct G g(int x, int y, int z) {
 
   // CHECK-NEXT: [[T0:%.*]] = bitcast i48* [[COERCE_TEMP]] to i8*
   // CHECK-NEXT: [[T1:%.*]] = bitcast [[G]]* [[RESULT]] to i8*
-  // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 6
+  // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} [[T0]], i8* align {{[0-9]+}} [[T1]], i64 6
   // CHECK-NEXT: [[T0:%.*]] = load i48, i48* [[COERCE_TEMP]]
   // CHECK-NEXT: ret i48 [[T0]]
 }
diff --git a/test/CodeGen/debug-info-file-checksum.c b/test/CodeGen/debug-info-file-checksum.c
index 2750800d41ea..d644aac0614f 100644
--- a/test/CodeGen/debug-info-file-checksum.c
+++ b/test/CodeGen/debug-info-file-checksum.c
@@ -1,4 +1,5 @@
 // RUN: %clang -emit-llvm -S -g -gcodeview -x c %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s
+// RUN: %clang -emit-llvm -S -gdwarf-5 -x c %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s
 
 // Check that "checksum" is created correctly for the compiled file.
 
diff --git a/test/CodeGen/kr-func-promote.c b/test/CodeGen/kr-func-promote.c
index 8e55dc91edf4..122e6435d0b5 100644
--- a/test/CodeGen/kr-func-promote.c
+++ b/test/CodeGen/kr-func-promote.c
@@ -1,6 +1,12 @@
 // RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
-// CHECK: i32 @a(i32)
 
+// CHECK: i32 @a(i32
 int a();
 int a(x) short x; {return x;}
 
+// CHECK: void @b(double
+// CHECK: %[[ADDR:.*]] = alloca float, align 4
+// CHECK: %[[TRUNC:.*]] = fptrunc double %0 to float
+// CHECK: store float %[[TRUNC]], float* %[[ADDR]], align 4
+void b();
+void b(f) float f; {}
diff --git a/test/CodeGen/le32-vaarg.c b/test/CodeGen/le32-vaarg.c
index c02af27691f2..7e1dd8a27ae6 100644
--- a/test/CodeGen/le32-vaarg.c
+++ b/test/CodeGen/le32-vaarg.c
@@ -23,7 +23,7 @@ void get_struct(va_list *args) {
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo{{$}}
 // CHECK: store %struct.Foo [[RESULT]], %struct.Foo* [[LOC:%[a-z_0-9]+]]
 // CHECK: [[LOC2:%[a-z_0-9]+]] = bitcast {{.*}} [[LOC]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[LOC2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[LOC2]]
 
 void skip_struct(va_list *args) {
   va_arg(*args, struct Foo);
diff --git a/test/CodeGen/math-builtins.c b/test/CodeGen/math-builtins.c
index 799d91b4ec00..89be56dc186f 100644
--- a/test/CodeGen/math-builtins.c
+++ b/test/CodeGen/math-builtins.c
@@ -24,23 +24,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 // HAS_ERRNO: declare float @atan2f(float, float) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @atan2l(x86_fp80, x86_fp80) [[NOT_READNONE]]
 
-  __builtin_copysign(f,f); __builtin_copysignf(f,f);__builtin_copysignl(f,f);
+  __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f);
 
 // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
 // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
 // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
 
-  __builtin_fabs(f);       __builtin_fabsf(f);      __builtin_fabsl(f);
+  __builtin_fabs(f);       __builtin_fabsf(f);      __builtin_fabsl(f); __builtin_fabsf128(f);
 
 // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
 // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
 
   __builtin_frexp(f,i);    __builtin_frexpf(f,i);   __builtin_frexpl(f,i);
 
@@ -51,14 +55,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 // HAS_ERRNO: declare float @frexpf(float, i32*) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @frexpl(x86_fp80, i32*) [[NOT_READNONE]]
 
-  __builtin_huge_val();    __builtin_huge_valf();   __builtin_huge_vall();
+  __builtin_huge_val();    __builtin_huge_valf();   __builtin_huge_vall(); __builtin_huge_valf128();
 
 // NO__ERRNO-NOT: .huge
 // NO__ERRNO-NOT: @huge
 // HAS_ERRNO-NOT: .huge
 // HAS_ERRNO-NOT: @huge
 
-  __builtin_inf();    __builtin_inff();   __builtin_infl();
+  __builtin_inf();    __builtin_inff();   __builtin_infl(); __builtin_inff128();
 
 // NO__ERRNO-NOT: .inf
 // NO__ERRNO-NOT: @inf
@@ -83,23 +87,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 // HAS_ERRNO: declare float @modff(float, float*) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80, x86_fp80*) [[NOT_READNONE]]
 
-  __builtin_nan(c);        __builtin_nanf(c);       __builtin_nanl(c);  
+  __builtin_nan(c);        __builtin_nanf(c);       __builtin_nanl(c); __builtin_nanf128(c);
 
 // NO__ERRNO: declare double @nan(i8*) [[READNONE]]
 // NO__ERRNO: declare float @nanf(i8*) [[READNONE]]
 // NO__ERRNO: declare x86_fp80 @nanl(i8*) [[READNONE]]
+// NO__ERRNO: declare fp128 @nanf128(i8*) [[READNONE]]
 // HAS_ERRNO: declare double @nan(i8*) [[READNONE:#[0-9]+]]
 // HAS_ERRNO: declare float @nanf(i8*) [[READNONE]]
 // HAS_ERRNO: declare x86_fp80 @nanl(i8*) [[READNONE]]
+// HAS_ERRNO: declare fp128 @nanf128(i8*) [[READNONE]]
 
-  __builtin_nans(c);        __builtin_nansf(c);       __builtin_nansl(c);  
+  __builtin_nans(c);        __builtin_nansf(c);       __builtin_nansl(c); __builtin_nansf128(c);
 
 // NO__ERRNO: declare double @nans(i8*) [[READNONE]]
 // NO__ERRNO: declare float @nansf(i8*) [[READNONE]]
 // NO__ERRNO: declare x86_fp80 @nansl(i8*) [[READNONE]]
+// NO__ERRNO: declare fp128 @nansf128(i8*) [[READNONE]]
 // HAS_ERRNO: declare double @nans(i8*) [[READNONE]]
 // HAS_ERRNO: declare float @nansf(i8*) [[READNONE]]
 // HAS_ERRNO: declare x86_fp80 @nansl(i8*) [[READNONE]]
+// HAS_ERRNO: declare fp128 @nansf128(i8*) [[READNONE]]
 
   __builtin_pow(f,f);        __builtin_powf(f,f);       __builtin_powl(f,f);
 
diff --git a/test/CodeGen/mms-bitfields.c b/test/CodeGen/mms-bitfields.c
index 1617e8ac40d9..d4604fbaa44e 100644
--- a/test/CodeGen/mms-bitfields.c
+++ b/test/CodeGen/mms-bitfields.c
@@ -20,3 +20,46 @@ struct s3 {
 } s3;
 
 // CHECK: %struct.s3 = type { i32, [4 x i8], %struct.s1 }
+
+// PR32482:
+
+#pragma pack (push,1)
+
+typedef unsigned int UINT32;
+
+struct Inner {
+  UINT32    A    :  1;
+  UINT32    B    :  1;
+  UINT32    C    :  1;
+  UINT32    D    : 30;
+} Inner;
+
+#pragma pack (pop)
+
+// CHECK: %struct.Inner = type { i32, i32 }
+
+// CHECK: %struct.A = type { i32, i32, i32 }
+
+#pragma pack(push, 1)
+
+union HEADER {
+  struct A {
+    int                                         :  3;  // Bits 2:0
+    int a                                       :  9;  // Bits 11:3
+    int                                         :  12;  // Bits 23:12
+    int b                                       :  17;  // Bits 40:24
+    int                                         :  7;  // Bits 47:41
+    int c                                       :  4;  // Bits 51:48
+    int                                         :  4;  // Bits 55:52
+    int d                                       :  3;  // Bits 58:56
+    int                                         :  5;  // Bits 63:59
+  } Bits;
+} HEADER;
+
+#pragma pack(pop)
+
+struct Inner variable = { 1,0,1, 21 };
+union HEADER hdr = {{1,2,3,4}};
+
+// CHECK: @variable = global { i8, [3 x i8], i8, i8, i8, i8 } { i8 5, [3 x i8] undef, i8 21, i8 0, i8 0, i8 0 }, align 1
+// CHECK: @hdr = global { { i8, i8, [2 x i8], i8, i8, i8, i8, i8, [3 x i8] } } { { i8, i8, [2 x i8], i8, i8, i8, i8, i8, [3 x i8] } { i8 8, i8 0, [2 x i8] undef, i8 2, i8 0, i8 0, i8 3, i8 4, [3 x i8] undef } }, align 1
diff --git a/test/CodeGen/ms-intrinsics.c b/test/CodeGen/ms-intrinsics.c
index 38cda9785029..744cb479c870 100644
--- a/test/CodeGen/ms-intrinsics.c
+++ b/test/CodeGen/ms-intrinsics.c
@@ -20,12 +20,12 @@ void test__stosb(unsigned char *Dest, unsigned char Data, size_t Count) {
 }
 
 // CHECK-I386: define{{.*}}void @test__stosb
-// CHECK-I386:   tail call void @llvm.memset.p0i8.i32(i8* %Dest, i8 %Data, i32 %Count, i32 1, i1 true)
+// CHECK-I386:   tail call void @llvm.memset.p0i8.i32(i8* align 1 %Dest, i8 %Data, i32 %Count, i1 true)
 // CHECK-I386:   ret void
 // CHECK-I386: }
 
 // CHECK-X64: define{{.*}}void @test__stosb
-// CHECK-X64:   tail call void @llvm.memset.p0i8.i64(i8* %Dest, i8 %Data, i64 %Count, i32 1, i1 true)
+// CHECK-X64:   tail call void @llvm.memset.p0i8.i64(i8* align 1 %Dest, i8 %Data, i64 %Count, i1 true)
 // CHECK-X64:   ret void
 // CHECK-X64: }
 
diff --git a/test/CodeGen/no-opt-volatile-memcpy.c b/test/CodeGen/no-opt-volatile-memcpy.c
index bf98df39af16..2781475b6302 100644
--- a/test/CodeGen/no-opt-volatile-memcpy.c
+++ b/test/CodeGen/no-opt-volatile-memcpy.c
@@ -18,10 +18,10 @@ void foo (void) {
 // CHECK: %[[LS:.*]] = alloca %struct.s, align 4
 // CHECK-NEXT: %[[ZERO:.*]] = bitcast %struct.s* %[[LS]] to i8*
 // CHECK-NEXT: %[[ONE:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[ZERO]], i8* %[[ONE]], i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[ZERO]], i8* align 4 %[[ONE]], i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 // CHECK-NEXT: %[[TWO:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[TWO]], i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[TWO]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 
 
 struct s1 {
@@ -35,6 +35,6 @@ void fee (void) {
   s.y = gs;
 }
 // CHECK-LABEL: define void @fee()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 
diff --git a/test/CodeGen/packed-nest-unpacked.c b/test/CodeGen/packed-nest-unpacked.c
index e2bbd41a9daf..4124a8ade86a 100644
--- a/test/CodeGen/packed-nest-unpacked.c
+++ b/test/CodeGen/packed-nest-unpacked.c
@@ -9,26 +9,26 @@ struct X foo(void);
 // <rdar://problem/10463337>
 struct X test1() {
   // CHECK: @test1
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
   return g.y;
 }
 struct X test2() {
   // CHECK: @test2
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
   struct X a = g.y;
   return a;
 }
 
 void test3(struct X a) {
   // CHECK: @test3
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i1 false)
   g.y = a;
 }
 
 // <rdar://problem/10530444>
 void test4() {
   // CHECK: @test4
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
   f(g.y);
 }
 
@@ -42,7 +42,7 @@ int test5() {
 // <rdar://problem/11220251>
 void test6() {
   // CHECK: @test6
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* %{{.*}}, i64 24, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* align 4 %{{.*}}, i64 24, i1 false)
   g.y = foo();
 }
 
diff --git a/test/CodeGen/packed-structure.c b/test/CodeGen/packed-structure.c
index 7d1183dc5ca6..91cacb400536 100644
--- a/test/CodeGen/packed-structure.c
+++ b/test/CodeGen/packed-structure.c
@@ -29,7 +29,7 @@ int s0_load_x(struct s0 *a) { return a->x; }
 // CHECK-FUNCTIONS: ret i32 [[s0_load_y]]
 int s0_load_y(struct s0 *a) { return a->y; }
 // CHECK-FUNCTIONS-LABEL: define void @s0_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 4, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 {{.*}}, i8* align 4 {{.*}}, i64 8, i1 false)
 void s0_copy(struct s0 *a, struct s0 *b) { *b = *a; }
 
 //
@@ -55,7 +55,7 @@ int s1_load_x(struct s1 *a) { return a->x; }
 // CHECK-FUNCTIONS: ret i32 [[s1_load_y]]
 int s1_load_y(struct s1 *a) { return a->y; }
 // CHECK-FUNCTIONS-LABEL: define void @s1_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 1, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
 void s1_copy(struct s1 *a, struct s1 *b) { *b = *a; }
 
 //
@@ -83,7 +83,7 @@ int s2_load_x(struct s2 *a) { return a->x; }
 // CHECK-FUNCTIONS: ret i32 [[s2_load_y]]
 int s2_load_y(struct s2 *a) { return a->y; }
 // CHECK-FUNCTIONS-LABEL: define void @s2_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 2, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 {{.*}}, i8* align 2 {{.*}}, i64 8, i1 false)
 void s2_copy(struct s2 *a, struct s2 *b) { *b = *a; }
 
 struct __attribute__((packed, aligned)) s3 {
diff --git a/test/CodeGen/partial-reinitialization2.c b/test/CodeGen/partial-reinitialization2.c
index c4f6567b36ce..9c3ce99b98dc 100644
--- a/test/CodeGen/partial-reinitialization2.c
+++ b/test/CodeGen/partial-reinitialization2.c
@@ -15,7 +15,7 @@ union ULP3 { struct LP3 l3; };
 // CHECK-LABEL: test1
 void test1(void)
 {
-  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
   // CHECK: store i8 120, i8* %
 
   struct LP1 l = { .p1 = g1, .p1.x[2] = 'x' };
@@ -24,7 +24,7 @@ void test1(void)
 // CHECK-LABEL: test2
 void test2(void)
 {
-  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
   // CHECK: store i8 114, i8* %
 
   struct LP1 l = { .p1 = g1, .p1.x[1] = 'r' };
@@ -33,7 +33,7 @@ void test2(void)
 // CHECK-LABEL: test3
 void test3(void)
 {
-  // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i1 false)
   // CHECK: store i32 10, i32* %
 
   struct LP2 l = { .p2 = g2, .p2.b = 10 };
@@ -66,7 +66,7 @@ void test4(void)
   // CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get123()
   // CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
   // CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
-  // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
   // CHECK: store i32 100, i32* %
 
   struct LUP2 { union UP2 up; } var = { get123(), .up.p2.a = 100 };
@@ -76,12 +76,12 @@ void test4(void)
 void test5(void)
 {
   // .l3 = g3
-  // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i1 false)
 
   // .l3.p1 = { [0] = g1 } implicitly sets [1] to zero
-  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
   // CHECK: getelementptr{{.*}}%struct.P1, %struct.P1*{{.*}}i64 1
-  // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i1 false)
 
   // .l3.p1[1].x[1] = 'x'
   // CHECK: store i8 120, i8* %
@@ -98,7 +98,7 @@ void test6(void)
   // CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get235()
   // CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
   // CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
-  // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+  // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
 
   // CHECK: store i32 10, i32* %
 
diff --git a/test/CodeGen/ppc-varargs-struct.c b/test/CodeGen/ppc-varargs-struct.c
index d7936a126960..c201074e1fd9 100644
--- a/test/CodeGen/ppc-varargs-struct.c
+++ b/test/CodeGen/ppc-varargs-struct.c
@@ -54,7 +54,7 @@ void testva (int n, ...)
 // CHECK-PPC-NEXT:  [[AGGR:%[a-z0-9]+]] = load %struct.x*, %struct.x** [[VAARG_ADDR]]
 // CHECK-PPC-NEXT:  [[DEST:%[0-9]+]] = bitcast %struct.x* %t to i8*
 // CHECK-PPC-NEXT:  [[SRC:%.+]] = bitcast %struct.x* [[AGGR]] to i8*
-// CHECK-PPC-NEXT:  call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST]], i8* [[SRC]], i32 16, i32 8, i1 false)
+// CHECK-PPC-NEXT:  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i1 false)
 
   int v = va_arg (ap, int);
   
diff --git a/test/CodeGen/ppc64-align-struct.c b/test/CodeGen/ppc64-align-struct.c
index 5894a6aeb379..3cf3e0475b3a 100644
--- a/test/CodeGen/ppc64-align-struct.c
+++ b/test/CodeGen/ppc64-align-struct.c
@@ -56,7 +56,7 @@ void test7 (int x, struct test7 y)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test1*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test1* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test1* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 8, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 8, i1 false)
 struct test1 test1va (int x, ...)
 {
   struct test1 y;
@@ -79,7 +79,7 @@ struct test1 test1va (int x, ...)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test2*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test2* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test2* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
 struct test2 test2va (int x, ...)
 {
   struct test2 y;
@@ -102,7 +102,7 @@ struct test2 test2va (int x, ...)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test3*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test3* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test3* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 [[DEST]], i8* align 16 [[SRC]], i64 32, i1 false)
 struct test3 test3va (int x, ...)
 {
   struct test3 y;
@@ -121,7 +121,7 @@ struct test3 test3va (int x, ...)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test4*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test4* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test4* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 12, i1 false)
 struct test4 test4va (int x, ...)
 {
   struct test4 y;
@@ -140,7 +140,7 @@ struct test4 test4va (int x, ...)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test_longdouble*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test_longdouble* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 8 [[SRC]], i64 16, i1 false)
 struct test_longdouble { long double x; };
 struct test_longdouble testva_longdouble (int x, ...)
 {
@@ -164,7 +164,7 @@ struct test_longdouble testva_longdouble (int x, ...)
 // CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test_vector*
 // CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %y to i8*
 // CHECK: [[SRC:%.*]] = bitcast %struct.test_vector* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
 struct test_vector { vector int x; };
 struct test_vector testva_vector (int x, ...)
 {
diff --git a/test/CodeGen/ppc64-soft-float.c b/test/CodeGen/ppc64-soft-float.c
index 95b18292319d..84ac2d55b636 100644
--- a/test/CodeGen/ppc64-soft-float.c
+++ b/test/CodeGen/ppc64-soft-float.c
@@ -92,7 +92,7 @@ void call_f2(void) { global_f2 = func_f2(global_f2); }
 // CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f3, align 4
 // CHECK: %[[TMP1:[^ ]+]] = alloca [2 x i64]
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f3* @global_f3 to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f3* @global_f3 to i8*), i64 12, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP1]]
 // CHECK-LE: call { i64, i64 } @func_f3([2 x i64] %[[TMP3]])
 // CHECK-BE: call void @func_f3(%struct.f3* sret %[[TMP0]], [2 x i64] %[[TMP3]])
@@ -111,7 +111,7 @@ void call_f4(void) { global_f4 = func_f4(global_f4); }
 // CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f5, align 4
 // CHECK: %[[TMP1:[^ ]+]] = alloca [3 x i64]
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [3 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f5* @global_f5 to i8*), i64 20, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f5* @global_f5 to i8*), i64 20, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [3 x i64], [3 x i64]* %[[TMP1]]
 // CHECK: call void @func_f5(%struct.f5* sret %[[TMP0]], [3 x i64] %[[TMP3]])
 struct f5 global_f5;
@@ -128,7 +128,7 @@ void call_f6(void) { global_f6 = func_f6(global_f6); }
 // CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f7, align 4
 // CHECK: %[[TMP1:[^ ]+]] = alloca [4 x i64], align 8
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [4 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f7* @global_f7 to i8*), i64 28, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f7* @global_f7 to i8*), i64 28, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [4 x i64], [4 x i64]* %[[TMP1]], align 8
 // CHECK: call void @func_f7(%struct.f7* sret %[[TMP0]], [4 x i64] %[[TMP3]])
 struct f7 global_f7;
@@ -144,7 +144,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); }
 // CHECK-LABEL: @call_f9
 // CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
 // CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
 struct f9 global_f9;
@@ -162,7 +162,7 @@ void call_fab(void) { global_fab = func_fab(global_fab); }
 // CHECK-BE: %[[TMPX:[^ ]+]] = alloca %struct.fabc, align 4
 // CHECK: %[[TMP0:[^ ]+]] = alloca [2 x i64], align 8
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP0]], align 8
 // CHECK-LE: %call = call { i64, i64 } @func_fabc([2 x i64] %[[TMP3]])
 // CHECK-BE: call void @func_fabc(%struct.fabc* sret %[[TMPX]], [2 x i64] %[[TMP3]])
diff --git a/test/CodeGen/ppc64le-aggregates.c b/test/CodeGen/ppc64le-aggregates.c
index f78f26a59285..a7780beec5f6 100644
--- a/test/CodeGen/ppc64le-aggregates.c
+++ b/test/CodeGen/ppc64le-aggregates.c
@@ -104,7 +104,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); }
 // CHECK-LABEL: @call_f9
 // CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
 // CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
 // CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
 // CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
 struct f9 global_f9;
diff --git a/test/CodeGen/rdpid-builtins.c b/test/CodeGen/rdpid-builtins.c
new file mode 100644
index 000000000000..35516b05bd46
--- /dev/null
+++ b/test/CodeGen/rdpid-builtins.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +rdpid -emit-llvm -o - %s | FileCheck %s
+
+
+#include <x86intrin.h>
+
+unsigned int test_rdpid_u32(void) {
+// CHECK-LABEL: @test_rdpid_u32
+// CHECK: call i32 @llvm.x86.rdpid
+  return _rdpid_u32();
+}
diff --git a/test/CodeGen/riscv32-abi.c b/test/CodeGen/riscv32-abi.c
new file mode 100644
index 000000000000..ec98e3d8db04
--- /dev/null
+++ b/test/CodeGen/riscv32-abi.c
@@ -0,0 +1,423 @@
+// RUN: %clang_cc1 -triple riscv32 -emit-llvm %s -o - | FileCheck %s
+
+#include <stddef.h>
+#include <stdint.h>
+
+// CHECK-LABEL: define void @f_void()
+void f_void(void) {}
+
+// Scalar arguments and return values smaller than the word size are extended
+// according to the sign of their type, up to 32 bits
+
+// CHECK-LABEL: define zeroext i1 @f_scalar_0(i1 zeroext %x)
+_Bool f_scalar_0(_Bool x) { return x; }
+
+// CHECK-LABEL: define signext i8 @f_scalar_1(i8 signext %x)
+int8_t f_scalar_1(int8_t x) { return x; }
+
+// CHECK-LABEL: define zeroext i8 @f_scalar_2(i8 zeroext %x)
+uint8_t f_scalar_2(uint8_t x) { return x; }
+
+// CHECK-LABEL: define i32 @f_scalar_3(i32 %x)
+int32_t f_scalar_3(int32_t x) { return x; }
+
+// CHECK-LABEL: define i64 @f_scalar_4(i64 %x)
+int64_t f_scalar_4(int64_t x) { return x; }
+
+// CHECK-LABEL: define float @f_fp_scalar_1(float %x)
+float f_fp_scalar_1(float x) { return x; }
+
+// CHECK-LABEL: define double @f_fp_scalar_2(double %x)
+double f_fp_scalar_2(double x) { return x; }
+
+// Scalars larger than 2*xlen are passed/returned indirect. However, the
+// RISC-V LLVM backend can handle this fine, so the function doesn't need to
+// be modified.
+
+// CHECK-LABEL: define fp128 @f_fp_scalar_3(fp128 %x)
+long double f_fp_scalar_3(long double x) { return x; }
+
+// Empty structs or unions are ignored.
+
+struct empty_s {};
+
+// CHECK-LABEL: define void @f_agg_empty_struct()
+struct empty_s f_agg_empty_struct(struct empty_s x) {
+  return x;
+}
+
+union empty_u {};
+
+// CHECK-LABEL: define void @f_agg_empty_union()
+union empty_u f_agg_empty_union(union empty_u x) {
+  return x;
+}
+
+// Aggregates <= 2*xlen may be passed in registers, so will be coerced to
+// integer arguments. The rules for return are the same.
+
+struct tiny {
+  uint8_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_tiny(i32 %x.coerce)
+void f_agg_tiny(struct tiny x) {
+  x.a += x.b;
+  x.c += x.d;
+}
+
+// CHECK-LABEL: define i32 @f_agg_tiny_ret()
+struct tiny f_agg_tiny_ret() {
+  return (struct tiny){1, 2, 3, 4};
+}
+
+typedef uint8_t v4i8 __attribute__((vector_size(4)));
+typedef int32_t v1i32 __attribute__((vector_size(4)));
+
+// CHECK-LABEL: define void @f_vec_tiny_v4i8(i32 %x.coerce)
+void f_vec_tiny_v4i8(v4i8 x) {
+  x[0] = x[1];
+  x[2] = x[3];
+}
+
+// CHECK-LABEL: define i32 @f_vec_tiny_v4i8_ret()
+v4i8 f_vec_tiny_v4i8_ret() {
+  return (v4i8){1, 2, 3, 4};
+}
+
+// CHECK-LABEL: define void @f_vec_tiny_v1i32(i32 %x.coerce)
+void f_vec_tiny_v1i32(v1i32 x) {
+  x[0] = 114;
+}
+
+// CHECK-LABEL: define i32 @f_vec_tiny_v1i32_ret()
+v1i32 f_vec_tiny_v1i32_ret() {
+  return (v1i32){1};
+}
+
+struct small {
+  int32_t a, *b;
+};
+
+// CHECK-LABEL: define void @f_agg_small([2 x i32] %x.coerce)
+void f_agg_small(struct small x) {
+  x.a += *x.b;
+  x.b = &x.a;
+}
+
+// CHECK-LABEL: define [2 x i32] @f_agg_small_ret()
+struct small f_agg_small_ret() {
+  return (struct small){1, 0};
+}
+
+typedef uint8_t v8i8 __attribute__((vector_size(8)));
+typedef int64_t v1i64 __attribute__((vector_size(8)));
+
+// CHECK-LABEL: define void @f_vec_small_v8i8(i64 %x.coerce)
+void f_vec_small_v8i8(v8i8 x) {
+  x[0] = x[7];
+}
+
+// CHECK-LABEL: define i64 @f_vec_small_v8i8_ret()
+v8i8 f_vec_small_v8i8_ret() {
+  return (v8i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// CHECK-LABEL: define void @f_vec_small_v1i64(i64 %x.coerce)
+void f_vec_small_v1i64(v1i64 x) {
+  x[0] = 114;
+}
+
+// CHECK-LABEL: define i64 @f_vec_small_v1i64_ret()
+v1i64 f_vec_small_v1i64_ret() {
+  return (v1i64){1};
+}
+
+// Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
+// single 2*xlen-sized argument, to ensure that alignment can be maintained if
+// passed on the stack.
+
+struct small_aligned {
+  int64_t a;
+};
+
+// CHECK-LABEL: define void @f_agg_small_aligned(i64 %x.coerce)
+void f_agg_small_aligned(struct small_aligned x) {
+  x.a += x.a;
+}
+
+// CHECK-LABEL: define i64 @f_agg_small_aligned_ret(i64 %x.coerce)
+struct small_aligned f_agg_small_aligned_ret(struct small_aligned x) {
+  return (struct small_aligned){10};
+}
+
+// Aggregates greater > 2*xlen will be passed and returned indirectly
+struct large {
+  int32_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_large(%struct.large* %x)
+void f_agg_large(struct large x) {
+  x.a = x.b + x.c + x.d;
+}
+
+// The address where the struct should be written to will be the first
+// argument
+// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 %i, i8 signext %j)
+struct large f_agg_large_ret(int32_t i, int8_t j) {
+  return (struct large){1, 2, 3, 4};
+}
+
+typedef unsigned char v16i8 __attribute__((vector_size(16)));
+
+// CHECK-LABEL: define void @f_vec_large_v16i8(<16 x i8>*)
+void f_vec_large_v16i8(v16i8 x) {
+  x[0] = x[7];
+}
+
+// CHECK-LABEL: define void @f_vec_large_v16i8_ret(<16 x i8>* noalias sret %agg.result)
+v16i8 f_vec_large_v16i8_ret() {
+  return (v16i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// Scalars passed on the stack should have signext/zeroext attributes (they
+// are anyext).
+
+// CHECK-LABEL: define i32 @f_scalar_stack_1(i32 %a.coerce, [2 x i32] %b.coerce, i64 %c.coerce, %struct.large* %d, i8 zeroext %e, i8 signext %f, i8 %g, i8 %h)
+int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c,
+                     struct large d, uint8_t e, int8_t f, uint8_t g, int8_t h) {
+  return g + h;
+}
+
+// CHECK-LABEL: define i32 @f_scalar_stack_2(i32 %a, i64 %b, float %c, double %d, fp128 %e, i8 zeroext %f, i8 %g, i8 %h)
+int f_scalar_stack_2(int32_t a, int64_t b, float c, double d, long double e,
+                     uint8_t f, int8_t g, uint8_t h) {
+  return g + h;
+}
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret %agg.result, i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g)
+struct large f_scalar_stack_3(int32_t a, int64_t b, double c, long double d,
+                              uint8_t e, int8_t f, uint8_t g) {
+  return (struct large){a, e, f, g};
+}
+
+// CHECK-LABEL: define fp128 @f_scalar_stack_4(i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g)
+long double f_scalar_stack_4(int32_t a, int64_t b, double c, long double d,
+                             uint8_t e, int8_t f, uint8_t g) {
+  return d;
+}
+
+// Aggregates and >=XLen scalars passed on the stack should be lowered just as
+// they would be if passed via registers.
+
+// CHECK-LABEL: define void @f_scalar_stack_5(double %a, i64 %b, double %c, i64 %d, i32 %e, i64 %f, float %g, double %h, fp128 %i)
+void f_scalar_stack_5(double a, int64_t b, double c, int64_t d, int e,
+                      int64_t f, float g, double h, long double i) {}
+
+// CHECK-LABEL: define void @f_agg_stack(double %a, i64 %b, double %c, i64 %d, i32 %e.coerce, [2 x i32] %f.coerce, i64 %g.coerce, %struct.large* %h)
+void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e,
+                 struct small f, struct small_aligned g, struct large h) {}
+
+// Ensure that ABI lowering happens as expected for vararg calls. For RV32
+// with the base integer calling convention there will be no observable
+// differences in the lowered IR for a call with varargs vs without.
+
+int f_va_callee(int, ...);
+
+// CHECK-LABEL: define void @f_va_caller()
+// CHECK: call i32 (i32, ...) @f_va_callee(i32 1, i32 2, i64 3, double 4.000000e+00, double 5.000000e+00, i32 {{%.*}}, [2 x i32] {{%.*}}, i64 {{%.*}}, %struct.large* {{%.*}})
+void f_va_caller() {
+  f_va_callee(1, 2, 3LL, 4.0f, 5.0, (struct tiny){6, 7, 8, 9},
+              (struct small){10, NULL}, (struct small_aligned){11},
+              (struct large){12, 13, 14, 15});
+}
+
+// CHECK-LABEL: define i32 @f_va_1(i8* %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK:   [[VA:%.*]] = alloca i8*, align 4
+// CHECK:   [[V:%.*]] = alloca i32, align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK:   [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK:   store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK:   [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[V]], align 4
+// CHECK:   ret i32 [[TMP2]]
+// CHECK: }
+int f_va_1(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// An "aligned" register pair (where the first register is even-numbered) is
+// used to pass varargs with 2x xlen alignment and 2x xlen size. Ensure the
+// correct offsets are used.
+
+// CHECK-LABEL: @f_va_2(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca double, align 8
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 7
+// CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -8
+// CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i32 [[TMP2]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT:    store double [[TMP4]], double* [[V]], align 8
+// CHECK-NEXT:    [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA2]])
+// CHECK-NEXT:    [[TMP5:%.*]] = load double, double* [[V]], align 8
+// CHECK-NEXT:    ret double [[TMP5]]
+double f_va_2(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  double v = __builtin_va_arg(va, double);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// Two "aligned" register pairs.
+
+// CHECK-LABEL: @f_va_3(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca double, align 8
+// CHECK-NEXT:    [[W:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[X:%.*]] = alloca double, align 8
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 7
+// CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -8
+// CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i32 [[TMP2]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT:    store double [[TMP4]], double* [[V]], align 8
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR2]] to i32*
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT:    store i32 [[TMP6]], i32* [[W]], align 4
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint i8* [[ARGP_CUR4]] to i32
+// CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 7
+// CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], -8
+// CHECK-NEXT:    [[ARGP_CUR4_ALIGNED:%.*]] = inttoptr i32 [[TMP9]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4_ALIGNED]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARGP_CUR4_ALIGNED]] to double*
+// CHECK-NEXT:    [[TMP11:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK-NEXT:    store double [[TMP11]], double* [[X]], align 8
+// CHECK-NEXT:    [[VA6:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA6]])
+// CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[V]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = load double, double* [[X]], align 8
+// CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK-NEXT:    ret double [[ADD]]
+double f_va_3(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  double v = __builtin_va_arg(va, double);
+  int w = __builtin_va_arg(va, int);
+  double x = __builtin_va_arg(va, double);
+  __builtin_va_end(va);
+
+  return v + x;
+}
+
+// CHECK-LABEL: define i32 @f_va_4(i8* %fmt, ...) {{.*}} {
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LD:%.*]] = alloca fp128, align 16
+// CHECK-NEXT:    [[TS:%.*]] = alloca [[STRUCT_TINY:%.*]], align 1
+// CHECK-NEXT:    [[SS:%.*]] = alloca [[STRUCT_SMALL:%.*]], align 4
+// CHECK-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
+// CHECK-NEXT:    [[RET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to fp128**
+// CHECK-NEXT:    [[TMP3:%.*]] = load fp128*, fp128** [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT:    store fp128 [[TMP4]], fp128* [[LD]], align 16
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.tiny*
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.tiny* [[TMP5]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[TMP6]], i8* align 4 [[TMP7]], i32 4, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT7]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.small*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.small* [[SS]] to i8*
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast %struct.small* [[TMP8]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 [[TMP10]], i32 8, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR8:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR8]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT9]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[ARGP_CUR8]] to %struct.large**
+// CHECK-NEXT:    [[TMP12:%.*]] = load %struct.large*, %struct.large** [[TMP11]], align 4
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast %struct.large* [[LS]] to i8*
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.large* [[TMP12]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 16, i1 false)
+// CHECK-NEXT:    [[VA10:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA10]])
+int f_va_4(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  long double ld = __builtin_va_arg(va, long double);
+  struct tiny ts = __builtin_va_arg(va, struct tiny);
+  struct small ss = __builtin_va_arg(va, struct small);
+  struct large ls = __builtin_va_arg(va, struct large);
+  __builtin_va_end(va);
+
+  int ret = (int)((long double)v + ld);
+  ret = ret + ts.a + ts.b + ts.c + ts.d;
+  ret = ret + ss.a + (int)ss.b;
+  ret = ret + ls.a + ls.b + ls.c + ls.d;
+
+  return ret;
+}
diff --git a/test/CodeGen/riscv64-abi.c b/test/CodeGen/riscv64-abi.c
new file mode 100644
index 000000000000..7a0f065fb4cf
--- /dev/null
+++ b/test/CodeGen/riscv64-abi.c
@@ -0,0 +1,422 @@
+// RUN: %clang_cc1 -triple riscv64 -emit-llvm %s -o - | FileCheck %s
+
+#include <stddef.h>
+#include <stdint.h>
+
+// CHECK-LABEL: define void @f_void()
+void f_void(void) {}
+
+// Scalar arguments and return values smaller than the word size are extended
+// according to the sign of their type, up to 32 bits
+
+// CHECK-LABEL: define zeroext i1 @f_scalar_0(i1 zeroext %x)
+_Bool f_scalar_0(_Bool x) { return x; }
+
+// CHECK-LABEL: define signext i8 @f_scalar_1(i8 signext %x)
+int8_t f_scalar_1(int8_t x) { return x; }
+
+// CHECK-LABEL: define zeroext i8 @f_scalar_2(i8 zeroext %x)
+uint8_t f_scalar_2(uint8_t x) { return x; }
+
+// CHECK-LABEL: define signext i32 @f_scalar_3(i32 signext %x)
+uint32_t f_scalar_3(int32_t x) { return x; }
+
+// CHECK-LABEL: define i64 @f_scalar_4(i64 %x)
+int64_t f_scalar_4(int64_t x) { return x; }
+
+// CHECK-LABEL: define float @f_fp_scalar_1(float %x)
+float f_fp_scalar_1(float x) { return x; }
+
+// CHECK-LABEL: define double @f_fp_scalar_2(double %x)
+double f_fp_scalar_2(double x) { return x; }
+
+// CHECK-LABEL: define fp128 @f_fp_scalar_3(fp128 %x)
+long double f_fp_scalar_3(long double x) { return x; }
+
+// Empty structs or unions are ignored.
+
+struct empty_s {};
+
+// CHECK-LABEL: define void @f_agg_empty_struct()
+struct empty_s f_agg_empty_struct(struct empty_s x) {
+  return x;
+}
+
+union empty_u {};
+
+// CHECK-LABEL: define void @f_agg_empty_union()
+union empty_u f_agg_empty_union(union empty_u x) {
+  return x;
+}
+
+// Aggregates <= 2*xlen may be passed in registers, so will be coerced to
+// integer arguments. The rules for return are the same.
+
+struct tiny {
+  uint16_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_tiny(i64 %x.coerce)
+void f_agg_tiny(struct tiny x) {
+  x.a += x.b;
+  x.c += x.d;
+}
+
+// CHECK-LABEL: define i64 @f_agg_tiny_ret()
+struct tiny f_agg_tiny_ret() {
+  return (struct tiny){1, 2, 3, 4};
+}
+
+typedef uint16_t v4i16 __attribute__((vector_size(8)));
+typedef int64_t v1i64 __attribute__((vector_size(8)));
+
+// CHECK-LABEL: define void @f_vec_tiny_v4i16(i64 %x.coerce)
+void f_vec_tiny_v4i16(v4i16 x) {
+  x[0] = x[1];
+  x[2] = x[3];
+}
+
+// CHECK-LABEL: define i64 @f_vec_tiny_v4i16_ret()
+v4i16 f_vec_tiny_v4i16_ret() {
+  return (v4i16){1, 2, 3, 4};
+}
+
+// CHECK-LABEL: define void @f_vec_tiny_v1i64(i64 %x.coerce)
+void f_vec_tiny_v1i64(v1i64 x) {
+  x[0] = 114;
+}
+
+// CHECK-LABEL: define i64 @f_vec_tiny_v1i64_ret()
+v1i64 f_vec_tiny_v1i64_ret() {
+  return (v1i64){1};
+}
+
+struct small {
+  int64_t a, *b;
+};
+
+// CHECK-LABEL: define void @f_agg_small([2 x i64] %x.coerce)
+void f_agg_small(struct small x) {
+  x.a += *x.b;
+  x.b = &x.a;
+}
+
+// CHECK-LABEL: define [2 x i64] @f_agg_small_ret()
+struct small f_agg_small_ret() {
+  return (struct small){1, 0};
+}
+
+typedef uint16_t v8i16 __attribute__((vector_size(16)));
+typedef __int128_t v1i128 __attribute__((vector_size(16)));
+
+// CHECK-LABEL: define void @f_vec_small_v8i16(i128 %x.coerce)
+void f_vec_small_v8i16(v8i16 x) {
+  x[0] = x[7];
+}
+
+// CHECK-LABEL: define i128 @f_vec_small_v8i16_ret()
+v8i16 f_vec_small_v8i16_ret() {
+  return (v8i16){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// CHECK-LABEL: define void @f_vec_small_v1i128(i128 %x.coerce)
+void f_vec_small_v1i128(v1i128 x) {
+  x[0] = 114;
+}
+
+// CHECK-LABEL: define i128 @f_vec_small_v1i128_ret()
+v1i128 f_vec_small_v1i128_ret() {
+  return (v1i128){1};
+}
+
+// Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
+// single 2*xlen-sized argument, to ensure that alignment can be maintained if
+// passed on the stack.
+
+struct small_aligned {
+  __int128_t a;
+};
+
+// CHECK-LABEL: define void @f_agg_small_aligned(i128 %x.coerce)
+void f_agg_small_aligned(struct small_aligned x) {
+  x.a += x.a;
+}
+
+// CHECK-LABEL: define i128 @f_agg_small_aligned_ret(i128 %x.coerce)
+struct small_aligned f_agg_small_aligned_ret(struct small_aligned x) {
+  return (struct small_aligned){10};
+}
+
+// Aggregates greater > 2*xlen will be passed and returned indirectly
+struct large {
+  int64_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_large(%struct.large* %x)
+void f_agg_large(struct large x) {
+  x.a = x.b + x.c + x.d;
+}
+
+// The address where the struct should be written to will be the first
+// argument
+// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 signext %i, i8 signext %j)
+struct large f_agg_large_ret(int32_t i, int8_t j) {
+  return (struct large){1, 2, 3, 4};
+}
+
+typedef unsigned char v32i8 __attribute__((vector_size(32)));
+
+// CHECK-LABEL: define void @f_vec_large_v32i8(<32 x i8>*)
+void f_vec_large_v32i8(v32i8 x) {
+  x[0] = x[7];
+}
+
+// CHECK-LABEL: define void @f_vec_large_v32i8_ret(<32 x i8>* noalias sret %agg.result)
+v32i8 f_vec_large_v32i8_ret() {
+  return (v32i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// Scalars passed on the stack should have signext/zeroext attributes (they
+// are anyext).
+
+// CHECK-LABEL: define signext i32 @f_scalar_stack_1(i64 %a.coerce, [2 x i64] %b.coerce, i128 %c.coerce, %struct.large* %d, i8 zeroext %e, i8 signext %f, i8 %g, i8 %h)
+int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c,
+                     struct large d, uint8_t e, int8_t f, uint8_t g, int8_t h) {
+  return g + h;
+}
+
+// CHECK-LABEL: define signext i32 @f_scalar_stack_2(i32 signext %a, i128 %b, float %c, fp128 %d, <32 x i8>*, i8 zeroext %f, i8 %g, i8 %h)
+int f_scalar_stack_2(int32_t a, __int128_t b, float c, long double d, v32i8 e,
+                     uint8_t f, int8_t g, uint8_t h) {
+  return g + h;
+}
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret %agg.result, i32 signext %a, i128 %b, fp128 %c, <32 x i8>*, i8 zeroext %e, i8 %f, i8 %g)
+struct large f_scalar_stack_3(uint32_t a, __int128_t b, long double c, v32i8 d,
+                              uint8_t e, int8_t f, uint8_t g) {
+  return (struct large){a, e, f, g};
+}
+
+// Ensure that ABI lowering happens as expected for vararg calls.
+// Specifically, ensure that signext is emitted for varargs that will be
+// passed in registers but not on the stack. Ensure this takes into account
+// the use of "aligned" register pairs for varargs with 2*xlen alignment.
+
+int f_va_callee(int, ...);
+
+// CHECK-LABEL: define void @f_va_caller()
+void f_va_caller() {
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i64 3, double 4.000000e+00, double 5.000000e+00, i64 {{%.*}}, [2 x i64] {{%.*}}, i128 {{%.*}}, %struct.large* {{%.*}})
+  f_va_callee(1, 2, 3LL, 4.0f, 5.0, (struct tiny){6, 7, 8, 9},
+              (struct small){10, NULL}, (struct small_aligned){11},
+              (struct large){12, 13, 14, 15});
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, fp128 0xL00000000000000004001400000000000, i32 signext 6, i32 signext 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5.0L, 6, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i128 {{%.*}}, i32 signext 6, i32 signext 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, (struct small_aligned){5}, 6, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, [2 x i64] {{%.*}}, i32 signext 6, i32 signext 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, (struct small){5, NULL}, 6, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, fp128 0xL00000000000000004001800000000000, i32 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, 6.0L, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i128 {{%.*}}, i32 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, (struct small_aligned){6}, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, [2 x i64] {{%.*}}, i32 signext 7, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, (struct small){6, NULL}, 7, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, fp128 0xL00000000000000004001C00000000000, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, 6, 7.0L, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, i128 {{%.*}}, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, 6, (struct small_aligned){7}, 8, 9);
+  // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, [2 x i64] {{.*}}, i32 8, i32 9)
+  f_va_callee(1, 2, 3, 4, 5, 6, (struct small){7, NULL}, 8, 9);
+}
+
+// CHECK-LABEL: define signext i32 @f_va_1(i8* %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK:   [[VA:%.*]] = alloca i8*, align 8
+// CHECK:   [[V:%.*]] = alloca i32, align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 8
+// CHECK:   [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK:   [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i64 8
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK:   [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 8
+// CHECK:   store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK:   [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[V]], align 4
+// CHECK:   ret i32 [[TMP2]]
+// CHECK: }
+int f_va_1(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// An "aligned" register pair (where the first register is even-numbered) is
+// used to pass varargs with 2x xlen alignment and 2x xlen size. Ensure the
+// correct offsets are used.
+
+// CHECK-LABEL: @f_va_2(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[V:%.*]] = alloca fp128, align 16
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], 15
+// CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -16
+// CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i64 [[TMP2]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i64 16
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to fp128*
+// CHECK-NEXT:    [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT:    store fp128 [[TMP4]], fp128* [[V]], align 16
+// CHECK-NEXT:    [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA2]])
+// CHECK-NEXT:    [[TMP5:%.*]] = load fp128, fp128* [[V]], align 16
+// CHECK-NEXT:    ret fp128 [[TMP5]]
+long double f_va_2(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  long double v = __builtin_va_arg(va, long double);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// Two "aligned" register pairs.
+
+// CHECK-LABEL: @f_va_3(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[V:%.*]] = alloca fp128, align 16
+// CHECK-NEXT:    [[W:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[X:%.*]] = alloca fp128, align 16
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i64
+// CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], 15
+// CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -16
+// CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i64 [[TMP2]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i64 16
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to fp128*
+// CHECK-NEXT:    [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT:    store fp128 [[TMP4]], fp128* [[V]], align 16
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i64 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR2]] to i32*
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 8
+// CHECK-NEXT:    store i32 [[TMP6]], i32* [[W]], align 4
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint i8* [[ARGP_CUR4]] to i64
+// CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], 15
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], -16
+// CHECK-NEXT:    [[ARGP_CUR4_ALIGNED:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4_ALIGNED]], i64 16
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARGP_CUR4_ALIGNED]] to fp128*
+// CHECK-NEXT:    [[TMP11:%.*]] = load fp128, fp128* [[TMP10]], align 16
+// CHECK-NEXT:    store fp128 [[TMP11]], fp128* [[X]], align 16
+// CHECK-NEXT:    [[VA6:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA6]])
+// CHECK-NEXT:    [[TMP12:%.*]] = load fp128, fp128* [[V]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = load fp128, fp128* [[X]], align 16
+// CHECK-NEXT:    [[ADD:%.*]] = fadd fp128 [[TMP12]], [[TMP13]]
+// CHECK-NEXT:    ret fp128 [[ADD]]
+long double f_va_3(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  long double v = __builtin_va_arg(va, long double);
+  int w = __builtin_va_arg(va, int);
+  long double x = __builtin_va_arg(va, long double);
+  __builtin_va_end(va);
+
+  return v + x;
+}
+
+// CHECK-LABEL: @f_va_4(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TS:%.*]] = alloca [[STRUCT_TINY:%.*]], align 2
+// CHECK-NEXT:    [[SS:%.*]] = alloca [[STRUCT_SMALL:%.*]], align 8
+// CHECK-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 8
+// CHECK-NEXT:    [[RET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i64 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 8
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i64 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to %struct.tiny*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.tiny* [[TMP2]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i64 16
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.small*
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.small* [[SS]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.small* [[TMP5]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT:    [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i64 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT7]], i8** [[VA]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.large**
+// CHECK-NEXT:    [[TMP9:%.*]] = load %struct.large*, %struct.large** [[TMP8]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast %struct.large* [[LS]] to i8*
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.large* [[TMP9]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 32, i1 false)
+// CHECK-NEXT:    [[VA8:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA8]])
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_TINY]], %struct.tiny* [[TS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[A]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP12]] to i64
+// CHECK-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_SMALL]], %struct.small* [[SS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[A9]], align 8
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[CONV]], [[TMP13]]
+// CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_LARGE]], %struct.large* [[LS]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[C]], align 8
+// CHECK-NEXT:    [[ADD10:%.*]] = add nsw i64 [[ADD]], [[TMP14]]
+// CHECK-NEXT:    [[CONV11:%.*]] = trunc i64 [[ADD10]] to i32
+// CHECK-NEXT:    store i32 [[CONV11]], i32* [[RET]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[RET]], align 4
+// CHECK-NEXT:    ret i32 [[TMP15]]
+int f_va_4(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  struct tiny ts = __builtin_va_arg(va, struct tiny);
+  struct small ss = __builtin_va_arg(va, struct small);
+  struct large ls = __builtin_va_arg(va, struct large);
+  __builtin_va_end(va);
+
+  int ret = ts.a + ss.a + ls.c;
+
+  return ret;
+}
diff --git a/test/CodeGen/sparc-vaarg.c b/test/CodeGen/sparc-vaarg.c
index 3e4dd7c2c3f2..920b9a18b065 100644
--- a/test/CodeGen/sparc-vaarg.c
+++ b/test/CodeGen/sparc-vaarg.c
@@ -19,7 +19,7 @@ struct Foo dest;
 // CHECK-LABEL: define void @get_struct
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo*{{$}}
 // CHECK: [[RESULT2:%[a-z_0-9]+]] = bitcast {{.*}} [[RESULT]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[RESULT2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[RESULT2]]
 void get_struct(va_list *args) {
  dest = va_arg(*args, struct Foo);
 }
diff --git a/test/CodeGen/stack-size-section.c b/test/CodeGen/stack-size-section.c
new file mode 100644
index 000000000000..504a42de1341
--- /dev/null
+++ b/test/CodeGen/stack-size-section.c
@@ -0,0 +1,9 @@
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -S -o - | FileCheck %s --check-prefix=CHECK-ABSENT
+// CHECK-ABSENT-NOT: section .stack_sizes
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fstack-size-section %s -S -o - | FileCheck %s --check-prefix=CHECK-PRESENT
+// CHECK-PRESENT: section .stack_sizes
+
+int foo() { return 42; }
diff --git a/test/CodeGen/tbaa-base.cpp b/test/CodeGen/tbaa-base.cpp
new file mode 100644
index 000000000000..175c55f08a18
--- /dev/null
+++ b/test/CodeGen/tbaa-base.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -O1 %s -emit-llvm -o - | FileCheck %s
+//
+// Test generating of TBAA metadata for accesses to members of base classes.
+
+struct A {
+  int x, y, z;
+};
+
+struct B : A {
+  int i;
+};
+
+struct C {
+  int i;
+  B b;
+  int j;
+};
+
+int f1(B *b) {
+// CHECK-LABEL: _Z2f1P1B
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y:!.*]]
+  return b->y;
+}
+
+int f2(C *c) {
+// CHECK-LABEL: _Z2f2P1C
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+  return (&(c->b))->y;
+}
+
+struct D : virtual A
+{};
+
+struct E {
+  D d;
+};
+
+int f3(D *d) {
+// CHECK-LABEL: _Z2f3P1D
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+  return d->y;
+}
+
+int f4(E *e) {
+// CHECK-LABEL: _Z2f4P1E
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+  return (&(e->d))->y;
+}
+
+// CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+// CHECK-DAG: [[TYPE_int:!.*]] = !{!"int", [[TYPE_char]], i64 0}
+// CHECK-DAG: [[TYPE_A:!.*]] = !{!"_ZTS1A", [[TYPE_int]], i64 0, [[TYPE_int]], i64 4, [[TYPE_int]], i64 8}
+// CHECK-DAG: [[TAG_A_y]] = !{[[TYPE_A]], [[TYPE_int]], i64 4}
diff --git a/test/CodeGen/tbaa-struct.cpp b/test/CodeGen/tbaa-struct.cpp
index 2623c42c9a3f..9e5429b2e360 100644
--- a/test/CodeGen/tbaa-struct.cpp
+++ b/test/CodeGen/tbaa-struct.cpp
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - -O1 %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - -O1 %s | \
+// RUN:     FileCheck -check-prefixes=CHECK,CHECK-OLD %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -new-struct-path-tbaa \
+// RUN:     -emit-llvm -o - -O1 %s | \
+// RUN:     FileCheck -check-prefixes=CHECK,CHECK-NEW %s
 //
-// Check that we generate !tbaa.struct metadata for struct copies.
+// Check that we generate TBAA metadata for struct copies correctly.
+
 struct A {
   short s;
   int i;
@@ -8,68 +13,117 @@ struct A {
   int j;
 };
 
-void copy(struct A *a, struct A *b) {
-  *a = *b;
-}
+typedef A __attribute__((may_alias)) AA;
 
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 16, i32 4, i1 false), !tbaa.struct [[TS:!.*]]
+void copy(A *a1, A *a2) {
+// CHECK-LABEL: _Z4copyP1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_A:![0-9]*]]
+  *a1 = *a2;
+}
 
 struct B {
-  char c1;
-  struct A a;
-  int ii;
+  char c;
+  A a;
+  int i;
 };
 
-void copy2(struct B *a, struct B *b) {
-  *a = *b;
+void copy2(B *b1, B *b2) {
+// CHECK-LABEL: _Z5copy2P1BS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 24, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS2:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_B:![0-9]*]]
+  *b1 = *b2;
 }
 
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 24, i32 4, i1 false), !tbaa.struct [[TS2:!.*]]
+struct S {
+  _Complex char cc;
+  _Complex int ci;
+};
 
-typedef _Complex int T2;
-typedef _Complex char T5;
-typedef _Complex int T7;
-typedef struct T4 { T5 field0; T7 field1; } T4;
-typedef union T1 { T2 field0; T4 field1; } T1;
+union U {
+  _Complex int ci;
+  S s;
+};
 
-void copy3 (T1 *a, T1 *b) {
-  *a = *b;
+void copy3(U *u1, U *u2) {
+// CHECK-LABEL: _Z5copy3P1US0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 12, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS3:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_U:![0-9]*]]
+  *u1 = *u2;
 }
 
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 12, i32 4, i1 false), !tbaa.struct [[TS3:!.*]]
-
 // Make sure that zero-length bitfield works.
-#define ATTR __attribute__ ((ms_struct))
-struct five {
+struct C {
   char a;
-  int :0;        /* ignored; prior field is not a bitfield. */
+  int : 0;  // Shall not be ignored; see r185018.
   char b;
   char c;
-} ATTR;
-void copy4(struct five *a, struct five *b) {
-  *a = *b;
+} __attribute__((ms_struct));
+
+void copy4(C *c1, C *c2) {
+// CHECK-LABEL: _Z5copy4P1CS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 3, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS4:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_C:![0-9]*]]
+  *c1 = *c2;
 }
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 3, i32 1, i1 false), !tbaa.struct [[TS4:!.*]]
 
-struct six {
+struct D {
   char a;
-  int :0;
+  int : 0;
   char b;
   char c;
 };
-void copy5(struct six *a, struct six *b) {
-  *a = *b;
+
+void copy5(D *d1, D *d2) {
+// CHECK-LABEL: _Z5copy5P1DS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 6, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS5:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_D:![0-9]*]]
+  *d1 = *d2;
+}
+
+void copy6(AA *a1, A *a2) {
+// CHECK-LABEL: _Z5copy6P1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS]]
+// CHECK-NEW-SAME: !tbaa [[TAG_char:![0-9]*]]
+  *a1 = *a2;
 }
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 6, i32 1, i1 false), !tbaa.struct [[TS5:!.*]]
 
-// CHECK: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
-// CHECK: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
-// CHECK: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
-// CHECK: [[INT]] = !{!"int", [[CHAR]]
-// CHECK: [[TAG_CHAR:!.*]] = !{[[CHAR]], [[CHAR]], i64 0}
+void copy7(A *a1, AA *a2) {
+// CHECK-LABEL: _Z5copy7P1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS]]
+// CHECK-NEW-SAME: !tbaa [[TAG_char]]
+  *a1 = *a2;
+}
+
+// CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
+// CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
+// CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
+// CHECK-OLD: [[INT]] = !{!"int", [[CHAR]]
+// CHECK-OLD: [[TAG_CHAR:!.*]] = !{[[CHAR]], [[CHAR]], i64 0}
 // (offset, size) = (0,1) char; (4,2) short; (8,4) int; (12,1) char; (16,4) int; (20,4) int
-// CHECK: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
+// CHECK-OLD: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
 // (offset, size) = (0,8) char; (0,2) char; (4,8) char
-// CHECK: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
-// CHECK: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 4, [[TAG_INT]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
-// CHECK: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
+// CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
+// CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 4, [[TAG_INT]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
+// CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
+
+// CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
+// CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
+// CHECK-NEW-DAG: [[TYPE_short:!.*]] = !{[[TYPE_char]], i64 2, !"short"}
+// CHECK-NEW-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
+// CHECK-NEW-DAG: [[TYPE_A:!.*]] = !{[[TYPE_char]], i64 16, !"_ZTS1A", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4, [[TYPE_char]], i64 8, i64 1, [[TYPE_int]], i64 12, i64 4}
+// CHECK-NEW-DAG: [[TAG_A]] = !{[[TYPE_A]], [[TYPE_A]], i64 0, i64 16}
+// CHECK-NEW-DAG: [[TYPE_B:!.*]] = !{[[TYPE_char]], i64 24, !"_ZTS1B", [[TYPE_char]], i64 0, i64 1, [[TYPE_A]], i64 4, i64 16, [[TYPE_int]], i64 20, i64 4}
+// CHECK-NEW-DAG: [[TAG_B]] = !{[[TYPE_B]], [[TYPE_B]], i64 0, i64 24}
+// CHECK-NEW-DAG: [[TAG_U]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 12}
+// CHECK-NEW-DAG: [[TYPE_C:!.*]] = !{[[TYPE_char]], i64 3, !"_ZTS1C", [[TYPE_char]], i64 0, i64 1, [[TYPE_int]], i64 1, i64 4, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1}
+// CHECK-NEW-DAG: [[TAG_C]] = !{[[TYPE_C]], [[TYPE_C]], i64 0, i64 3}
+// CHECK-NEW-DAG: [[TYPE_D:!.*]] = !{[[TYPE_char]], i64 6, !"_ZTS1D", [[TYPE_char]], i64 0, i64 1, [[TYPE_int]], i64 4, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_char]], i64 5, i64 1}
+// CHECK-NEW-DAG: [[TAG_D]] = !{[[TYPE_D]], [[TYPE_D]], i64 0, i64 6}
diff --git a/test/CodeGen/transparent-union-redecl.c b/test/CodeGen/transparent-union-redecl.c
new file mode 100644
index 000000000000..31192089827d
--- /dev/null
+++ b/test/CodeGen/transparent-union-redecl.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -Werror -triple i386-linux -emit-llvm -o - %s | FileCheck %s
+
+// Test that different order of declarations is acceptable and that
+// implementing different redeclarations is acceptable.
+// rdar://problem/34949329
+
+typedef union {
+  int i;
+  float f;
+} TU __attribute__((transparent_union));
+
+// CHECK-LABEL: define void @f0(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void f0(TU tu) {}
+void f0(int i);
+
+// CHECK-LABEL: define void @f1(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void f1(int i);
+void f1(TU tu) {}
+
+// CHECK-LABEL: define void @f2(i32 %i)
+// CHECK: %i.addr = alloca i32, align 4
+// CHECK: store i32 %i, i32* %i.addr, align 4
+void f2(TU tu);
+void f2(int i) {}
+
+// CHECK-LABEL: define void @f3(i32 %i)
+// CHECK: %i.addr = alloca i32, align 4
+// CHECK: store i32 %i, i32* %i.addr, align 4
+void f3(int i) {}
+void f3(TU tu);
+
+// Also test functions with parameters specified K&R style.
+// CHECK-LABEL: define void @knrStyle(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void knrStyle(int i);
+void knrStyle(tu) TU tu; {}
diff --git a/test/CodeGen/volatile.c b/test/CodeGen/volatile.c
index 52915f6f4d5b..0f58bb62a248 100644
--- a/test/CodeGen/volatile.c
+++ b/test/CodeGen/volatile.c
@@ -199,7 +199,7 @@ int main() {
 // CHECK: store volatile i32 {{.*}}, i32* @vtS
   (void)vF2;
   // From vF2 to a temporary
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
   vF2 = vF2;
   // vF2 to itself
 // CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
@@ -209,6 +209,6 @@ int main() {
 // CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
   vF2 = (vF2, vF2);
   // vF2 to a temporary, then vF2 to itself
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
 // CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
 }
diff --git a/test/CodeGen/wasm-arguments.c b/test/CodeGen/wasm-arguments.c
index 723632b62f6b..9283dd5d15f9 100644
--- a/test/CodeGen/wasm-arguments.c
+++ b/test/CodeGen/wasm-arguments.c
@@ -24,7 +24,7 @@ typedef struct {
 // Single-element structs should be returned as the one element.
 // WEBASSEMBLY32: define i32 @f2()
 // WEBASSEMBLY64: define i32 @f2()
-s2 f2() {
+s2 f2(void) {
   s2 foo;
   return foo;
 }
@@ -36,7 +36,7 @@ typedef struct {
 // Structs should be returned sret and not simplified by the frontend.
 // WEBASSEMBLY32: define void @f3(%struct.s3* noalias sret %agg.result)
 // WEBASSEMBLY64: define void @f3(%struct.s3* noalias sret %agg.result)
-s3 f3() {
+s3 f3(void) {
   s3 foo;
   return foo;
 }
diff --git a/test/CodeGen/wasm-varargs.c b/test/CodeGen/wasm-varargs.c
index b8e488ec1440..3883549a0b65 100644
--- a/test/CodeGen/wasm-varargs.c
+++ b/test/CodeGen/wasm-varargs.c
@@ -93,11 +93,11 @@ struct S test_struct(char *fmt, ...) {
 // CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
 // CHECK:   [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
 // CHECK:   [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R4]], i8* [[R5]], i32 12, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R4]], i8* align 4 [[R5]], i32 12, i1 false)
 // CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
 // CHECK:   call void @llvm.va_end(i8* [[VA2]])
 // CHECK:   [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
 // CHECK:   [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R6]], i8* [[R7]], i32 12, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R6]], i8* align 4 [[R7]], i32 12, i1 false)
 // CHECK:   ret void
 // CHECK: }
diff --git a/test/CodeGen/windows-swiftcall.c b/test/CodeGen/windows-swiftcall.c
index f76b2fd3a40f..21cf2f7fddc6 100644
--- a/test/CodeGen/windows-swiftcall.c
+++ b/test/CodeGen/windows-swiftcall.c
@@ -207,10 +207,10 @@ TEST(struct_misaligned_1)
 // CHECK:  [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
 // CHECK:  [[RES:%.*]] = alloca [[STRUCT]], align 1
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
+// CHECK:  call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
 // CHECK:  [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
 // CHECK:  [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]], i64 5
 // CHECK:  [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 1
@@ -260,10 +260,10 @@ TEST(union_het_fp)
 // CHECK:  [[RET:%.*]] = alloca [[UNION:%.*]], align 8
 // CHECK:  [[RES:%.*]] = alloca [[UNION]], align 8
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CAST]]
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CAST]]
 // CHECK:  [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
 // CHECK:  [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK:  call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK:  call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]]
 // CHECK:  [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
 // CHECK:  [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
 // CHECK:  [[R0:%.*]] = load i64, i64* [[GEP]], align 8
diff --git a/test/CodeGen/wmemcmp.c b/test/CodeGen/wmemcmp.c
new file mode 100644
index 000000000000..ad0886192bcf
--- /dev/null
+++ b/test/CodeGen/wmemcmp.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -emit-llvm -o - | FileCheck %s
+
+typedef __SIZE_TYPE__ size_t;
+typedef __WCHAR_TYPE__ wchar_t;
+
+int wmemcmp_test(const wchar_t *s1, const wchar_t *s2, size_t n) {
+  // CHECK: [[S1:%.*]] = load
+  // CHECK: [[S2:%.*]] = load
+  // CHECK: [[N:%.*]] = load
+  // CHECK: [[N0:%.*]] = icmp eq i64 [[N]], 0
+  // CHECK: br i1 [[N0]], label %[[EXIT:.*]], label %[[GT:.*]]
+
+  // CHECK: [[GT]]:
+  // CHECK: [[S1P:%.*]] = phi i16* [ [[S1]], %[[ENTRY:.*]] ], [ [[S1N:.*]], %[[NEXT:.*]] ]
+  // CHECK: [[S2P:%.*]] = phi i16* [ [[S2]], %[[ENTRY]] ], [ [[S2N:.*]], %[[NEXT]] ]
+  // CHECK: [[NP:%.*]] = phi i64 [ [[N]], %[[ENTRY]] ], [ [[NN:.*]], %[[NEXT]] ]
+  // CHECK: [[S1L:%.*]] = load i16, i16* [[S1P]], align 2
+  // CHECK: [[S2L:%.*]] = load i16, i16* [[S2P]], align 2
+  // CHECK: [[CMPGT:%.*]] = icmp ugt i16 [[S1L]], [[S2L]]
+  // CHECK: br i1 [[CMPGT]], label %[[EXIT]], label %[[LT:.*]]
+
+  // CHECK: [[LT]]:
+  // CHECK: [[CMPLT:%.*]] = icmp ult i16 [[S1L]], [[S2L]]
+  // CHECK: br i1 [[CMPLT]], label %[[EXIT]], label %[[NEXT:.*]]
+
+  // CHECK: [[NEXT]]:
+  // CHECK: [[S1N]] = getelementptr inbounds i16, i16* [[S1P]], i32 1
+  // CHECK: [[S2N]] = getelementptr inbounds i16, i16* [[S2P]], i32 1
+  // CHECK: [[NN]] = sub i64 [[NP]], 1
+  // CHECK: [[NN0:%.*]] = icmp eq i64 [[NN]], 0
+  // CHECK: br i1 [[NN0]], label %[[EXIT]], label %[[GT]]
+  //
+  // CHECK: [[EXIT]]:
+  // CHECK: [[RV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[GT]] ], [ -1, %[[LT]] ], [ 0, %[[NEXT]] ]
+  // CHECK: ret i32 [[RV]]
+  return __builtin_wmemcmp(s1, s2, n);
+}
diff --git a/test/CodeGen/x86-atomic-long_double.c b/test/CodeGen/x86-atomic-long_double.c
index 9857c67592f2..130ff45c0346 100644
--- a/test/CodeGen/x86-atomic-long_double.c
+++ b/test/CodeGen/x86-atomic-long_double.c
@@ -15,12 +15,12 @@ long double testinc(_Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -46,10 +46,10 @@ long double testinc(_Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -77,12 +77,12 @@ long double testdec(_Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -108,10 +108,10 @@ long double testdec(_Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -140,12 +140,12 @@ long double testcompassign(_Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -177,10 +177,10 @@ long double testcompassign(_Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -203,7 +203,7 @@ long double testassign(_Atomic long double *addr) {
   // CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
   // CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
   // CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
   // CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
   // CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
@@ -213,7 +213,7 @@ long double testassign(_Atomic long double *addr) {
   // CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
   // CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
   // CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
   // CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
@@ -250,12 +250,12 @@ long double test_volatile_inc(volatile _Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -281,10 +281,10 @@ long double test_volatile_inc(volatile _Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -311,12 +311,12 @@ long double test_volatile_dec(volatile _Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -342,10 +342,10 @@ long double test_volatile_dec(volatile _Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -373,12 +373,12 @@ long double test_volatile_compassign(volatile _Atomic long double *addr) {
   // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
   // CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
   // CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
   // CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
   // CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
   // CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
   // CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -409,10 +409,10 @@ long double test_volatile_compassign(volatile _Atomic long double *addr) {
   // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
   // CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
   // CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
   // CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
   // CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -435,7 +435,7 @@ long double test_volatile_assign(volatile _Atomic long double *addr) {
   // CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
   // CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
   // CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
   // CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
   // CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
   // CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
@@ -445,7 +445,7 @@ long double test_volatile_assign(volatile _Atomic long double *addr) {
   // CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
   // CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
   // CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
-  // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+  // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
   // CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
   // CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
   // CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
diff --git a/test/CodeGen/x86-cf-protection.c b/test/CodeGen/x86-cf-protection.c
new file mode 100644
index 000000000000..c853791d2ddd
--- /dev/null
+++ b/test/CodeGen/x86-cf-protection.c
@@ -0,0 +1,6 @@
+// RUN: not %clang_cc1 -fsyntax-only -S -emit-llvm -o %t -triple i386-unknown-unknown -fcf-protection=return %s 2>&1 | FileCheck %s --check-prefix=RETURN
+// RUN: not %clang_cc1 -fsyntax-only -S -emit-llvm -o %t -triple i386-unknown-unknown -fcf-protection=branch %s 2>&1 | FileCheck %s --check-prefix=BRANCH
+
+// RETURN: error: option 'cf-protection=return' cannot be specified without '-mshstk'
+// BRANCH: error: option 'cf-protection=branch' cannot be specified without '-mibt'
+void foo() {}
diff --git a/test/CodeGen/x86_32-arguments-realign.c b/test/CodeGen/x86_32-arguments-realign.c
index 768e1cc4690f..b99523b7eee6 100644
--- a/test/CodeGen/x86_32-arguments-realign.c
+++ b/test/CodeGen/x86_32-arguments-realign.c
@@ -2,7 +2,7 @@
 // RUN: FileCheck < %t %s
 
 // CHECK-LABEL: define void @f0(%struct.s0* byval align 4)
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 16, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %{{.*}}, i8* align 4 %{{.*}}, i32 16, i1 false)
 // CHECK: }
 struct s0 { long double a; };
 void f0(struct s0 a0) {
diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c
index d24ea4dbab3d..548980b32ae4 100644
--- a/test/CodeGen/x86_64-arguments.c
+++ b/test/CodeGen/x86_64-arguments.c
@@ -434,7 +434,7 @@ void test51(struct test51_s *s, __builtin_va_list argList) {
 // CHECK-NEXT: [[CASTED_VALUE_ADDR:%.*]] = bitcast i8* [[VALUE_ADDR]] to [[STRUCT_TEST51]]
 // CHECK-NEXT: [[CASTED_TMP_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[TMP_ADDR]] to i8*
 // CHECK-NEXT: [[RECASTED_VALUE_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[CASTED_VALUE_ADDR]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[CASTED_TMP_ADDR]], i8* [[RECASTED_VALUE_ADDR]], i64 16, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[CASTED_TMP_ADDR]], i8* align 8 [[RECASTED_VALUE_ADDR]], i64 16, i1 false)
 // CHECK-NEXT: add i32 {{.*}}, 16
 // CHECK-NEXT: store i32 {{.*}}, i32* {{.*}}
 // CHECK-NEXT: br label
diff --git a/test/CodeGen/xcore-abi.c b/test/CodeGen/xcore-abi.c
index 2bac78d92edd..90306ce31a53 100644
--- a/test/CodeGen/xcore-abi.c
+++ b/test/CodeGen/xcore-abi.c
@@ -80,7 +80,7 @@ void testva (int n, ...) {
   // CHECK: store i8* [[IN]], i8** [[AP]]
   // CHECK: [[V1:%[a-z0-9]+]] = bitcast %struct.x* [[V:%[a-z0-9]+]] to i8*
   // CHECK: [[P1:%[a-z0-9]+]] = bitcast %struct.x* [[P]] to i8*
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 20, i32 4, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 20, i1 false)
   // CHECK: [[V2:%[a-z0-9]+]] = bitcast %struct.x* [[V]] to i8*
   // CHECK: call void @f(i8* [[V2]])
 
@@ -93,7 +93,7 @@ void testva (int n, ...) {
   // CHECK: store i8* [[IN]], i8** [[AP]]
   // CHECK: [[V1:%[a-z0-9]+]] = bitcast [4 x i32]* [[V0:%[a-z0-9]+]] to i8*
   // CHECK: [[P1:%[a-z0-9]+]] = bitcast [4 x i32]* [[P]] to i8*
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 16, i32 4, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 16, i1 false)
   // CHECK: [[V2:%[a-z0-9]+]] = getelementptr inbounds [4 x i32], [4 x i32]* [[V0]], i32 0, i32 0
   // CHECK: store i32* [[V2]], i32** [[V:%[a-z0-9]+]], align 4
   // CHECK: [[V3:%[a-z0-9]+]] = load i32*, i32** [[V]], align 4
diff --git a/test/CodeGenCUDA/library-builtin.cu b/test/CodeGenCUDA/library-builtin.cu
new file mode 100644
index 000000000000..4804c75cde38
--- /dev/null
+++ b/test/CodeGenCUDA/library-builtin.cu
@@ -0,0 +1,22 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
+// RUN:  FileCheck --check-prefixes=HOST,BOTH %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda \
+// RUN:   -emit-llvm -o - %s | FileCheck %s --check-prefixes=DEVICE,BOTH
+
+// BOTH-LABEL: define float @logf(float
+
+// logf() should be calling itself recursively as we don't have any standard
+// library on device side.
+// DEVICE: call float @logf(float
+extern "C" __attribute__((device)) float logf(float __x) { return logf(__x); }
+
+// NOTE: this case is to illustrate the expected differences in behavior between
+// the host and device. In general we do not mess with host-side standard
+// library.
+//
+// Host is assumed to have standard library, so logf() calls LLVM intrinsic.
+// HOST: call float @llvm.log.f32(float
+extern "C" float logf(float __x) { return logf(__x); }
diff --git a/test/CodeGenCXX/alignment.cpp b/test/CodeGenCXX/alignment.cpp
index 4c44badd21cd..49cb34402c5d 100644
--- a/test/CodeGenCXX/alignment.cpp
+++ b/test/CodeGenCXX/alignment.cpp
@@ -204,7 +204,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
     AlignedArray result = a.aArray;
   }
 
@@ -223,7 +223,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
     AlignedArray result = b.aArray;
   }
 
@@ -234,7 +234,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
     AlignedArray result = b.bArray;
   }
 
@@ -245,7 +245,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
     AlignedArray result = b->bArray;
   }
 
@@ -256,7 +256,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
     B b;
     AlignedArray result = b.bArray;
   }
@@ -277,7 +277,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
     D d;
     AlignedArray result = d.aArray;
   }
@@ -292,7 +292,7 @@ namespace test1 {
     // CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
     // CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
     // CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 64 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
     D d;
     AlignedArray result = d.bArray;
   }
diff --git a/test/CodeGenCXX/assign-construct-memcpy.cpp b/test/CodeGenCXX/assign-construct-memcpy.cpp
index 3d4205132409..5e52b1625b61 100644
--- a/test/CodeGenCXX/assign-construct-memcpy.cpp
+++ b/test/CodeGenCXX/assign-construct-memcpy.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s -DPOD | FileCheck %s -check-prefix=CHECK-POD
+/// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s -DPOD | FileCheck %s -check-prefix=CHECK-POD
 // RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s | FileCheck %s -check-prefix=CHECK-NONPOD
 
 // Declare the reserved placement operators.
@@ -23,47 +23,47 @@ struct foo {
 foo *test1(void *f, const foo &x) {
   return new (f) foo(x);
 // CHECK-POD: test1
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test1
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 }
 
 foo *test2(const foo &x) {
   return new foo(x);
 // CHECK-POD: test2
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 16 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test2
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 16 {{.*}} align 8 {{.*}}i64 24
 }
 
 foo test3(const foo &x) {
   foo f = x;
   return f;
 // CHECK-POD: test3
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test3
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 }
 
 foo *test4(foo &&x) {
   return new foo(x);
 // CHECK-POD: test4
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 16 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test4
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 16 {{.*}} align 8 {{.*}}i64 24
 }
 
 void test5(foo &f, const foo &x) {
   f = x;
 // CHECK-POD: test5
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test5
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 17, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 17
 }
 
 extern foo globtest;
@@ -71,10 +71,10 @@ extern foo globtest;
 void test6(foo &&x) {
   globtest = move(x);
 // CHECK-POD: test6
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test6
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 17, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 17
 }
 
 void byval(foo f);
@@ -82,8 +82,8 @@ void byval(foo f);
 void test7(const foo &x) {
   byval(x);
 // CHECK-POD: test7
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 
 // CHECK-NONPOD: test7
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
 }
diff --git a/test/CodeGenCXX/atomic-inline.cpp b/test/CodeGenCXX/atomic-inline.cpp
index fe727589d2e2..327f85d5667f 100644
--- a/test/CodeGenCXX/atomic-inline.cpp
+++ b/test/CodeGenCXX/atomic-inline.cpp
@@ -1,6 +1,52 @@
 // RUN: %clang_cc1 %s -std=c++11 -emit-llvm -o - -triple=x86_64-linux-gnu | FileCheck %s
 // RUN: %clang_cc1 %s -std=c++11 -emit-llvm -o - -triple=x86_64-linux-gnu -target-cpu core2 | FileCheck %s --check-prefix=CORE2
-// Check the atomic code generation for cpu targets w/wo cx16 support.
+// RUN: %clang_cc1 %s -std=c++11 -emit-llvm -o - -triple=i386-linux-gnu -target-cpu i386 | FileCheck %s --check-prefix=I386
+// RUN: %clang_cc1 %s -std=c++11 -emit-llvm -o - -triple=i386-linux-gnu -target-cpu i486 | FileCheck %s --check-prefix=I486
+// Check the atomic code generation for cpu targets w/wo cx, cx8 and cx16 support.
+
+struct alignas(4) AM4 {
+  short f1, f2;
+};
+AM4 m4;
+AM4 load4() {
+  AM4 am;
+  // CHECK-LABEL: @_Z5load4v
+  // CHECK: load atomic i32, {{.*}} monotonic
+  // CORE2-LABEL: @_Z5load4v
+  // CORE2: load atomic i32, {{.*}} monotonic
+  // I386-LABEL: @_Z5load4v
+  // I386: call i32 @__atomic_load_4
+  // I486-LABEL: @_Z5load4v
+  // I486: load atomic i32, {{.*}} monotonic
+  __atomic_load(&m4, &am, 0);
+  return am;
+}
+
+AM4 s4;
+void store4() {
+  // CHECK-LABEL: @_Z6store4v
+  // CHECK: store atomic i32 {{.*}} monotonic
+  // CORE2-LABEL: @_Z6store4v
+  // CORE2: store atomic i32 {{.*}} monotonic
+  // I386-LABEL: @_Z6store4v
+  // I386: call void @__atomic_store_4
+  // I486-LABEL: @_Z6store4v
+  // I486: store atomic i32 {{.*}} monotonic
+  __atomic_store(&m4, &s4, 0);
+}
+
+bool cmpxchg4() {
+  AM4 am;
+  // CHECK-LABEL: @_Z8cmpxchg4v
+  // CHECK: cmpxchg i32* {{.*}} monotonic
+  // CORE2-LABEL: @_Z8cmpxchg4v
+  // CORE2: cmpxchg i32* {{.*}} monotonic
+  // I386-LABEL: @_Z8cmpxchg4v
+  // I386: call zeroext i1 @__atomic_compare_exchange_4
+  // I486-LABEL: @_Z8cmpxchg4v
+  // I486: cmpxchg i32* {{.*}} monotonic
+  return __atomic_compare_exchange(&m4, &s4, &am, 0, 0, 0);
+}
 
 struct alignas(8) AM8 {
   int f1, f2;
@@ -12,6 +58,10 @@ AM8 load8() {
   // CHECK: load atomic i64, {{.*}} monotonic
   // CORE2-LABEL: @_Z5load8v
   // CORE2: load atomic i64, {{.*}} monotonic
+  // I386-LABEL: @_Z5load8v
+  // I386: call i64 @__atomic_load_8
+  // I486-LABEL: @_Z5load8v
+  // I486: call i64 @__atomic_load_8
   __atomic_load(&m8, &am, 0);
   return am;
 }
@@ -22,6 +72,10 @@ void store8() {
   // CHECK: store atomic i64 {{.*}} monotonic
   // CORE2-LABEL: @_Z6store8v
   // CORE2: store atomic i64 {{.*}} monotonic
+  // I386-LABEL: @_Z6store8v
+  // I386: call void @__atomic_store_8
+  // I486-LABEL: @_Z6store8v
+  // I486: call void @__atomic_store_8
   __atomic_store(&m8, &s8, 0);
 }
 
@@ -31,6 +85,10 @@ bool cmpxchg8() {
   // CHECK: cmpxchg i64* {{.*}} monotonic
   // CORE2-LABEL: @_Z8cmpxchg8v
   // CORE2: cmpxchg i64* {{.*}} monotonic
+  // I386-LABEL: @_Z8cmpxchg8v
+  // I386: call zeroext i1 @__atomic_compare_exchange_8
+  // I486-LABEL: @_Z8cmpxchg8v
+  // I486: call zeroext i1 @__atomic_compare_exchange_8
   return __atomic_compare_exchange(&m8, &s8, &am, 0, 0, 0);
 }
 
@@ -66,4 +124,3 @@ bool cmpxchg16() {
   // CORE2: cmpxchg i128* {{.*}} monotonic
   return __atomic_compare_exchange(&m16, &s16, &am, 0, 0, 0);
 }
-
diff --git a/test/CodeGenCXX/attr-target-mv-diff-ns.cpp b/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
new file mode 100644
index 000000000000..4dc2b67b4644
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// Test ensures that this properly differentiates between types in different 
+// namespaces.
+int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int) {return 1;}
+int __attribute__((target("default"))) foo(int) { return 2; }
+
+namespace ns {
+int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int) {return 1;}
+int __attribute__((target("default"))) foo(int) { return 2; }
+}
+
+int bar() {
+  return foo(1) + ns::foo(2);
+}
+
+// CHECK: @_Z3fooi.ifunc = ifunc i32 (i32), i32 (i32)* ()* @_Z3fooi.resolver
+// CHECK: @_ZN2ns3fooEi.ifunc = ifunc i32 (i32), i32 (i32)* ()* @_ZN2ns3fooEi.resolver
+
+// CHECK: define i32 @_Z3fooi.sse4.2(i32)
+// CHECK: ret i32 0
+// CHECK: define i32 @_Z3fooi.arch_ivybridge(i32)
+// CHECK: ret i32 1
+// CHECK: define i32 @_Z3fooi(i32)
+// CHECK: ret i32 2
+
+// CHECK: define i32 @_ZN2ns3fooEi.sse4.2(i32)
+// CHECK: ret i32 0
+// CHECK: define i32 @_ZN2ns3fooEi.arch_ivybridge(i32)
+// CHECK: ret i32 1
+// CHECK: define i32 @_ZN2ns3fooEi(i32)
+// CHECK: ret i32 2
+
+// CHECK: define i32 @_Z3barv()
+// CHECK: call i32 @_Z3fooi.ifunc(i32 1)
+// CHECK: call i32 @_ZN2ns3fooEi.ifunc(i32 2)
+
+// CHECK: define i32 (i32)* @_Z3fooi.resolver() comdat
+// CHECK: ret i32 (i32)* @_Z3fooi.arch_sandybridge
+// CHECK: ret i32 (i32)* @_Z3fooi.arch_ivybridge
+// CHECK: ret i32 (i32)* @_Z3fooi.sse4.2
+// CHECK: ret i32 (i32)* @_Z3fooi
+//
+// CHECK: define i32 (i32)* @_ZN2ns3fooEi.resolver() comdat
+// CHECK: ret i32 (i32)* @_ZN2ns3fooEi.arch_sandybridge
+// CHECK: ret i32 (i32)* @_ZN2ns3fooEi.arch_ivybridge
+// CHECK: ret i32 (i32)* @_ZN2ns3fooEi.sse4.2
+// CHECK: ret i32 (i32)* @_ZN2ns3fooEi
+
+// CHECK: declare i32 @_Z3fooi.arch_sandybridge(i32)
+// CHECK: declare i32 @_ZN2ns3fooEi.arch_sandybridge(i32)
diff --git a/test/CodeGenCXX/attr-target-mv-func-ptrs.cpp b/test/CodeGenCXX/attr-target-mv-func-ptrs.cpp
new file mode 100644
index 000000000000..290d6b5c6489
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-func-ptrs.cpp
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+void temp();
+void temp(int);
+using FP = void(*)(int);
+void b() {
+  FP f = temp; 
+}
+
+int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int) {return 1;}
+int __attribute__((target("default"))) foo(int) { return 2; }
+
+struct S {
+int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int) {return 1;}
+int __attribute__((target("default"))) foo(int) { return 2; }
+};
+
+using FuncPtr = int (*)(int);
+using MemFuncPtr = int (S::*)(int);
+
+void f(FuncPtr, MemFuncPtr);
+
+int bar() {
+  FuncPtr Free = &foo;
+  MemFuncPtr Member = &S::foo;
+  S s;
+  f(foo, &S::foo);
+  return Free(1) + (s.*Member)(2);
+}
+
+
+// CHECK: @_Z3fooi.ifunc 
+// CHECK: @_ZN1S3fooEi.ifunc
+
+// CHECK: define i32 @_Z3barv()
+// Store to Free of ifunc
+// CHECK: store i32 (i32)* @_Z3fooi.ifunc
+// Store to Member of ifunc
+// CHECK: store { i64, i64 } { i64 ptrtoint (i32 (%struct.S*, i32)* @_ZN1S3fooEi.ifunc to i64), i64 0 }, { i64, i64 }* [[MEMBER:%[a-z]+]]
+
+// Call to 'f' with the ifunc
+// CHECK: call void @_Z1fPFiiEM1SFiiE(i32 (i32)* @_Z3fooi.ifunc
diff --git a/test/CodeGenCXX/attr-target-mv-member-funcs.cpp b/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
new file mode 100644
index 000000000000..2e5db3b705e4
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
@@ -0,0 +1,137 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+struct S {
+  int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+  int __attribute__((target("arch=sandybridge"))) foo(int);
+  int __attribute__((target("arch=ivybridge"))) foo(int) { return 1; }
+  int __attribute__((target("default"))) foo(int) { return 2; }
+
+  S &__attribute__((target("arch=ivybridge"))) operator=(const S &) {
+    return *this;
+  }
+  S &__attribute__((target("default"))) operator=(const S &) {
+    return *this;
+  }
+};
+
+struct ConvertTo {
+  __attribute__((target("arch=ivybridge"))) operator S() const {
+    return S{};
+  }
+  __attribute__((target("default"))) operator S() const {
+    return S{};
+  }
+};
+
+int bar() {
+  S s;
+  S s2;
+  s2 = s;
+
+  ConvertTo C;
+  s2 = static_cast<S>(C);
+
+  return s.foo(0);
+}
+
+struct S2 {
+  int __attribute__((target("sse4.2"))) foo(int);
+  int __attribute__((target("arch=sandybridge"))) foo(int);
+  int __attribute__((target("arch=ivybridge"))) foo(int);
+  int __attribute__((target("default"))) foo(int);
+};
+
+int bar2() {
+  S2 s;
+  return s.foo(0);
+}
+
+int __attribute__((target("sse4.2"))) S2::foo(int) { return 0; }
+int __attribute__((target("arch=ivybridge"))) S2::foo(int) { return 1; }
+int __attribute__((target("default"))) S2::foo(int) { return 2; }
+
+template<typename T>
+struct templ {
+  int __attribute__((target("sse4.2"))) foo(int) { return 0; }
+  int __attribute__((target("arch=sandybridge"))) foo(int);
+  int __attribute__((target("arch=ivybridge"))) foo(int) { return 1; }
+  int __attribute__((target("default"))) foo(int) { return 2; }
+};
+
+int templ_use() {
+  templ<int> a;
+  templ<double> b;
+  return a.foo(1) + b.foo(2);
+}
+
+// CHECK: @_ZN1SaSERKS_.ifunc = ifunc %struct.S* (%struct.S*, %struct.S*), %struct.S* (%struct.S*, %struct.S*)* ()* @_ZN1SaSERKS_.resolver
+// CHECK: @_ZNK9ConvertTocv1SEv.ifunc = ifunc void (%struct.ConvertTo*), void (%struct.ConvertTo*)* ()* @_ZNK9ConvertTocv1SEv.resolver
+// CHECK: @_ZN1S3fooEi.ifunc = ifunc i32 (%struct.S*, i32), i32 (%struct.S*, i32)* ()* @_ZN1S3fooEi.resolver
+// CHECK: @_ZN2S23fooEi.ifunc = ifunc i32 (%struct.S2*, i32), i32 (%struct.S2*, i32)* ()* @_ZN2S23fooEi.resolver
+// Templates:
+// CHECK: @_ZN5templIiE3fooEi.ifunc = ifunc i32 (%struct.templ*, i32), i32 (%struct.templ*, i32)* ()* @_ZN5templIiE3fooEi.resolver
+// CHECK: @_ZN5templIdE3fooEi.ifunc = ifunc i32 (%struct.templ.0*, i32), i32 (%struct.templ.0*, i32)* ()* @_ZN5templIdE3fooEi.resolver
+
+// CHECK: define i32 @_Z3barv()
+// CHECK: %s = alloca %struct.S, align 1
+// CHECK: %s2 = alloca %struct.S, align 1
+// CHECK: %C = alloca %struct.ConvertTo, align 1
+// CHECK: call dereferenceable(1) %struct.S* @_ZN1SaSERKS_.ifunc(%struct.S* %s2
+// CHECK: call void @_ZNK9ConvertTocv1SEv.ifunc(%struct.ConvertTo* %C)
+// CHECK: call dereferenceable(1) %struct.S* @_ZN1SaSERKS_.ifunc(%struct.S* %s2
+// CHECK: call i32 @_ZN1S3fooEi.ifunc(%struct.S* %s, i32 0)
+
+// CHECK: define %struct.S* (%struct.S*, %struct.S*)* @_ZN1SaSERKS_.resolver() comdat
+// CHECK: ret %struct.S* (%struct.S*, %struct.S*)* @_ZN1SaSERKS_.arch_ivybridge
+// CHECK: ret %struct.S* (%struct.S*, %struct.S*)* @_ZN1SaSERKS_
+
+// CHECK: define void (%struct.ConvertTo*)* @_ZNK9ConvertTocv1SEv.resolver() comdat
+// CHECK: ret void (%struct.ConvertTo*)* @_ZNK9ConvertTocv1SEv.arch_ivybridge
+// CHECK: ret void (%struct.ConvertTo*)* @_ZNK9ConvertTocv1SEv
+
+// CHECK: define i32 (%struct.S*, i32)* @_ZN1S3fooEi.resolver() comdat 
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.arch_sandybridge
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.arch_ivybridge
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.sse4.2
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi
+
+// CHECK: define i32 @_Z4bar2v()
+// CHECK:call i32 @_ZN2S23fooEi.ifunc
+// define i32 (%struct.S2*, i32)* @_ZN2S23fooEi.resolver() comdat
+// CHECK: ret i32 (%struct.S2*, i32)* @_ZN2S23fooEi.arch_sandybridge
+// CHECK: ret i32 (%struct.S2*, i32)* @_ZN2S23fooEi.arch_ivybridge
+// CHECK: ret i32 (%struct.S2*, i32)* @_ZN2S23fooEi.sse4.2
+// CHECK: ret i32 (%struct.S2*, i32)* @_ZN2S23fooEi
+
+// CHECK: define i32 @_ZN2S23fooEi.sse4.2(%struct.S2* %this, i32)
+// CHECK: define i32 @_ZN2S23fooEi.arch_ivybridge(%struct.S2* %this, i32)
+// CHECK: define i32 @_ZN2S23fooEi(%struct.S2* %this, i32)
+
+// CHECK: define i32 @_Z9templ_usev()
+// CHECK:  call i32 @_ZN5templIiE3fooEi.ifunc
+// CHECK:  call i32 @_ZN5templIdE3fooEi.ifunc
+
+
+// CHECK: define i32 (%struct.templ*, i32)* @_ZN5templIiE3fooEi.resolver() comdat
+// CHECK: ret i32 (%struct.templ*, i32)* @_ZN5templIiE3fooEi.arch_sandybridge
+// CHECK: ret i32 (%struct.templ*, i32)* @_ZN5templIiE3fooEi.arch_ivybridge
+// CHECK: ret i32 (%struct.templ*, i32)* @_ZN5templIiE3fooEi.sse4.2
+// CHECK: ret i32 (%struct.templ*, i32)* @_ZN5templIiE3fooEi
+//
+// CHECK: define i32 (%struct.templ.0*, i32)* @_ZN5templIdE3fooEi.resolver() comdat
+// CHECK: ret i32 (%struct.templ.0*, i32)* @_ZN5templIdE3fooEi.arch_sandybridge
+// CHECK: ret i32 (%struct.templ.0*, i32)* @_ZN5templIdE3fooEi.arch_ivybridge
+// CHECK: ret i32 (%struct.templ.0*, i32)* @_ZN5templIdE3fooEi.sse4.2
+// CHECK: ret i32 (%struct.templ.0*, i32)* @_ZN5templIdE3fooEi
+
+// CHECK: define linkonce_odr i32 @_ZN1S3fooEi.sse4.2(%struct.S* %this, i32)
+// CHECK: ret i32 0
+
+// CHECK: declare i32 @_ZN1S3fooEi.arch_sandybridge(%struct.S*, i32)
+
+// CHECK: define linkonce_odr i32 @_ZN1S3fooEi.arch_ivybridge(%struct.S* %this, i32)
+// CHECK: ret i32 1
+
+// CHECK: define linkonce_odr i32 @_ZN1S3fooEi(%struct.S* %this, i32)
+// CHECK: ret i32 2
+
diff --git a/test/CodeGenCXX/attr-target-mv-modules.cpp b/test/CodeGenCXX/attr-target-mv-modules.cpp
new file mode 100644
index 000000000000..6ff2046831e6
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-modules.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -fmodules -emit-llvm %s -o - | FileCheck %s
+#pragma clang module build A
+module A {}
+#pragma clang module contents
+#pragma clang module begin A
+__attribute__((target("default"))) void f();
+__attribute__((target("sse4.2"))) void f();
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module build B
+module B {}
+#pragma clang module contents
+#pragma clang module begin B
+__attribute__((target("default"))) void f();
+__attribute__((target("sse4.2"))) void f();
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module import A
+#pragma clang module import B
+void g() { f(); }
+
+// Negative tests to validate that the resolver only calls each 1x.
+// CHECK: define void ()* @_Z1fv.resolver
+// CHECK: ret void ()* @_Z1fv.sse4.2
+// CHECK-NOT: ret void ()* @_Z1fv.sse4.2
+// CHECK: ret void ()* @_Z1fv
+// CHECK-NOT: ret void ()* @_Z1fv
diff --git a/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp b/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
new file mode 100644
index 000000000000..63353c12d29a
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+struct S {
+  int __attribute__((target("sse4.2"))) foo(int);
+  int __attribute__((target("arch=sandybridge"))) foo(int);
+  int __attribute__((target("arch=ivybridge"))) foo(int);
+  int __attribute__((target("default"))) foo(int);
+};
+
+int __attribute__((target("default"))) S::foo(int) { return 2; }
+int __attribute__((target("sse4.2"))) S::foo(int) { return 0; }
+int __attribute__((target("arch=ivybridge"))) S::foo(int) { return 1; }
+
+int bar() {
+  S s;
+  return s.foo(0);
+}
+
+// CHECK: @_ZN1S3fooEi.ifunc = ifunc i32 (%struct.S*, i32), i32 (%struct.S*, i32)* ()* @_ZN1S3fooEi.resolver
+
+// CHECK: define i32 @_ZN1S3fooEi(%struct.S* %this, i32)
+// CHECK: ret i32 2
+
+// CHECK: define i32 @_ZN1S3fooEi.sse4.2(%struct.S* %this, i32)
+// CHECK: ret i32 0
+
+// CHECK: define i32 @_ZN1S3fooEi.arch_ivybridge(%struct.S* %this, i32)
+// CHECK: ret i32 1
+
+// CHECK: define i32 @_Z3barv()
+// CHECK: %s = alloca %struct.S, align 1
+// CHECK: %call = call i32 @_ZN1S3fooEi.ifunc(%struct.S* %s, i32 0)
+
+// CHECK: define i32 (%struct.S*, i32)* @_ZN1S3fooEi.resolver() comdat
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.arch_sandybridge
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.arch_ivybridge
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi.sse4.2
+// CHECK: ret i32 (%struct.S*, i32)* @_ZN1S3fooEi
+
+// CHECK: declare i32 @_ZN1S3fooEi.arch_sandybridge(%struct.S*, i32)
diff --git a/test/CodeGenCXX/attr-target-mv-overloads.cpp b/test/CodeGenCXX/attr-target-mv-overloads.cpp
new file mode 100644
index 000000000000..c72ea77fa46f
--- /dev/null
+++ b/test/CodeGenCXX/attr-target-mv-overloads.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+int __attribute__((target("sse4.2"))) foo_overload(int) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo_overload(int);
+int __attribute__((target("arch=ivybridge"))) foo_overload(int) {return 1;}
+int __attribute__((target("default"))) foo_overload(int) { return 2; }
+int __attribute__((target("sse4.2"))) foo_overload(void) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo_overload(void);
+int __attribute__((target("arch=ivybridge"))) foo_overload(void) {return 1;}
+int __attribute__((target("default"))) foo_overload(void) { return 2; }
+
+int bar2() {
+  return foo_overload() + foo_overload(1);
+}
+
+// CHECK: @_Z12foo_overloadv.ifunc = ifunc i32 (), i32 ()* ()* @_Z12foo_overloadv.resolver
+// CHECK: @_Z12foo_overloadi.ifunc = ifunc i32 (i32), i32 (i32)* ()* @_Z12foo_overloadi.resolver
+
+
+// CHECK: define i32 @_Z12foo_overloadi.sse4.2(i32)
+// CHECK: ret i32 0
+// CHECK: define i32 @_Z12foo_overloadi.arch_ivybridge(i32)
+// CHECK: ret i32 1
+// CHECK: define i32 @_Z12foo_overloadi(i32)
+// CHECK: ret i32 2
+// CHECK: define i32 @_Z12foo_overloadv.sse4.2()
+// CHECK: ret i32 0
+// CHECK: define i32 @_Z12foo_overloadv.arch_ivybridge()
+// CHECK: ret i32 1
+// CHECK: define i32 @_Z12foo_overloadv()
+// CHECK: ret i32 2
+
+// CHECK: define i32 @_Z4bar2v()
+// CHECK: call i32 @_Z12foo_overloadv.ifunc()
+// CHECK: call i32 @_Z12foo_overloadi.ifunc(i32 1)
+
+// CHECK: define i32 ()* @_Z12foo_overloadv.resolver() comdat
+// CHECK: ret i32 ()* @_Z12foo_overloadv.arch_sandybridge
+// CHECK: ret i32 ()* @_Z12foo_overloadv.arch_ivybridge
+// CHECK: ret i32 ()* @_Z12foo_overloadv.sse4.2
+// CHECK: ret i32 ()* @_Z12foo_overloadv
+
+// CHECK: define i32 (i32)* @_Z12foo_overloadi.resolver() comdat
+// CHECK: ret i32 (i32)* @_Z12foo_overloadi.arch_sandybridge
+// CHECK: ret i32 (i32)* @_Z12foo_overloadi.arch_ivybridge
+// CHECK: ret i32 (i32)* @_Z12foo_overloadi.sse4.2
+// CHECK: ret i32 (i32)* @_Z12foo_overloadi
+
+// CHECK: declare i32 @_Z12foo_overloadv.arch_sandybridge()
+// CHECK: declare i32 @_Z12foo_overloadi.arch_sandybridge(i32)
diff --git a/test/CodeGenCXX/catch-undef-behavior.cpp b/test/CodeGenCXX/catch-undef-behavior.cpp
index e8287538982b..786c6da21a18 100644
--- a/test/CodeGenCXX/catch-undef-behavior.cpp
+++ b/test/CodeGenCXX/catch-undef-behavior.cpp
@@ -371,7 +371,7 @@ class C : public A, public B // align=16
 void downcast_pointer(B *b) {
   (void) static_cast<C*>(b);
   // Alignment check from EmitTypeCheck(TCK_DowncastPointer, ...)
-  // CHECK: [[SUB:%[.a-z0-9]*]] = getelementptr i8, i8* {{.*}}, i64 -16
+  // CHECK: [[SUB:%[.a-z0-9]*]] = getelementptr inbounds i8, i8* {{.*}}, i64 -16
   // CHECK-NEXT: [[C:%.+]] = bitcast i8* [[SUB]] to %class.C*
   // null check goes here
   // CHECK: [[FROM_PHI:%.+]] = phi %class.C* [ [[C]], {{.*}} ], {{.*}}
@@ -388,7 +388,7 @@ void downcast_pointer(B *b) {
 void downcast_reference(B &b) {
   (void) static_cast<C&>(b);
   // Alignment check from EmitTypeCheck(TCK_DowncastReference, ...)
-  // CHECK:      [[SUB:%[.a-z0-9]*]] = getelementptr i8, i8* {{.*}}, i64 -16
+  // CHECK:      [[SUB:%[.a-z0-9]*]] = getelementptr inbounds i8, i8* {{.*}}, i64 -16
   // CHECK-NEXT: [[C:%.+]] = bitcast i8* [[SUB]] to %class.C*
   // Objectsize check goes here
   // CHECK:      [[C_INT:%.+]] = ptrtoint %class.C* [[C]] to i64
diff --git a/test/CodeGenCXX/constructor-direct-call.cpp b/test/CodeGenCXX/constructor-direct-call.cpp
index 9567d0968dae..bcddc0fa7a49 100644
--- a/test/CodeGenCXX/constructor-direct-call.cpp
+++ b/test/CodeGenCXX/constructor-direct-call.cpp
@@ -9,7 +9,7 @@ void f1() {
   Test1 var;
   var.Test1::Test1();
 
-  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 4, i32 4, i1 false)
+  // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i32 4, i1 false)
   var.Test1::Test1(var);
 }
 
@@ -28,7 +28,7 @@ void f2() {
   // CHECK-NEXT:  call x86_thiscallcc void @_ZN5Test2C1Ev(%class.Test2* %var)
   var.Test2::Test2();
 
-  // CHECK:  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 8, i32 4, i1 false)
+  // CHECK:  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i32 8, i1 false)
   var.Test2::Test2(var);
 }
 
diff --git a/test/CodeGenCXX/copy-constructor-elim.cpp b/test/CodeGenCXX/copy-constructor-elim.cpp
index 4abe456e4b29..7ba231071e76 100644
--- a/test/CodeGenCXX/copy-constructor-elim.cpp
+++ b/test/CodeGenCXX/copy-constructor-elim.cpp
@@ -56,4 +56,4 @@ extern "C" V f() { return gv1; }
 // Make sure that we obey the destination's alignment requirements when emitting
 // the copy.
 // CHECK-LABEL: define {{.*}} @f(
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}({{.*}}, i8* bitcast (%struct.V* @gv1 to i8*), {{i64|i32}} 4, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}({{.*}}align 4{{.*}}, i8* align 8 bitcast (%struct.V* @gv1 to i8*), {{i64|i32}} 4, i1 false)
diff --git a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
index 9d7e18e80f67..7940101c0cfb 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
@@ -10,15 +10,15 @@ extern PR23373 pr23373_a;
 
 PR23373 pr23373_b(pr23373_a);
 // CHECK-LABEL: define {{.*}} @__cxx_global_var_init(
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i32|i64}}({{.*}} @pr23373_b{{.*}}, {{.*}} @pr23373_a{{.*}}, [[W:i32|i64]] 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i32|i64}}({{.*}}align 4{{.*}}@pr23373_b{{.*}}, {{.*}}align 4{{.*}} @pr23373_a{{.*}}, [[W:i32|i64]] 4, i1 false)
 
 PR23373 pr23373_f() { return pr23373_a; }
 // CHECK-LABEL: define {{.*}} @_Z9pr23373_fv(
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}, [[W]] 4, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}align 4{{.*}}align 4{{.*}}, [[W]] 4, i1 false)
 
 void pr23373_g(PR23373 &a, PR23373 &b) { a = b; }
 // CHECK-LABEL: define {{.*}} @_Z9pr23373_g
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}, [[W]] 4, i32 4, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}align 4{{.*}}align 4{{.*}}, [[W]] 4, i1 false)
 
 struct A { virtual void a(); };
 A x(A& y) { return y; }
diff --git a/test/CodeGenCXX/copy-constructor-synthesis.cpp b/test/CodeGenCXX/copy-constructor-synthesis.cpp
index 5a2cc1440f74..4fdd8a35c4de 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis.cpp
@@ -143,7 +143,7 @@ void f(B b1) {
 // CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
 // CHECK-NEXT: [[T4:%.*]] = bitcast i16* [[T0]] to i8*
 // CHECK-NEXT: [[T5:%.*]] = bitcast i16* [[T2]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T4]], i8* [[T5]], i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T4]], i8* align 8 [[T5]], i64 8, i1 false)
 // CHECK-NEXT: ret [[A]]* [[THIS]]
 
 // CHECK-LABEL: define linkonce_odr void @_ZN6PR66281BC2ERKS0_(%"struct.PR6628::B"* %this, %"struct.PR6628::B"* dereferenceable({{[0-9]+}})) unnamed_addr
@@ -172,7 +172,7 @@ void f(B b1) {
 // CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
 // CHECK-NEXT: [[T4:%.*]] = bitcast i16* [[T0]] to i8*
 // CHECK-NEXT: [[T5:%.*]] = bitcast i16* [[T2]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T4]], i8* [[T5]], i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T4]], i8* align 8 [[T5]], i64 8, i1 false)
 // CHECK-NEXT: ret void
 }
 
diff --git a/test/CodeGenCXX/cxx0x-delegating-ctors.cpp b/test/CodeGenCXX/cxx0x-delegating-ctors.cpp
index dcc055696e4d..614983d4fc59 100644
--- a/test/CodeGenCXX/cxx0x-delegating-ctors.cpp
+++ b/test/CodeGenCXX/cxx0x-delegating-ctors.cpp
@@ -66,7 +66,7 @@ namespace PR12890 {
   X::X(int) : X() {}
 }
 // CHECK: define {{.*}} @_ZN7PR128901XC1Ei(%"class.PR12890::X"* %this, i32)
-// CHECK: call void @llvm.memset.p0i8.{{i32|i64}}(i8* {{.*}}, i8 0, {{i32|i64}} 4, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.{{i32|i64}}(i8* align 4 {{.*}}, i8 0, {{i32|i64}} 4, i1 false)
 
 namespace PR14588 {
   void other();
diff --git a/test/CodeGenCXX/cxx0x-initializer-array.cpp b/test/CodeGenCXX/cxx0x-initializer-array.cpp
index de10aee67f13..4a7e452304cc 100644
--- a/test/CodeGenCXX/cxx0x-initializer-array.cpp
+++ b/test/CodeGenCXX/cxx0x-initializer-array.cpp
@@ -39,7 +39,7 @@ namespace ValueInitArrayOfMemPtr {
     // CHECK: store i32 -1,
 
     Agg2 b = { n };
-    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* bitcast ([3 x i32]* @[[THREE_NULL_MEMPTRS]] to i8*), i32 12, i32 4, i1 false)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 bitcast ([3 x i32]* @[[THREE_NULL_MEMPTRS]] to i8*), i32 12, i1 false)
   }
 
   // Test dynamic initialization.
diff --git a/test/CodeGenCXX/cxx11-initializer-array-new.cpp b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
index 59f96031fc40..6e242124fea0 100644
--- a/test/CodeGenCXX/cxx11-initializer-array-new.cpp
+++ b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
@@ -154,7 +154,7 @@ void *r = new T[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
 //
 // CHECK: %[[SIZE:.*]] = sub i64 %{{.*}}, 24
 // CHECK: %[[REST:.*]] = bitcast %[[T]]* %[[T_2_AS_T]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* %[[REST]], i8 0, i64 %[[SIZE]], i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %[[REST]], i8 0, i64 %[[SIZE]], i1 false)
 //
 // CHECK: }
 
diff --git a/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
index 9110e49f93a1..f59ec51136fe 100644
--- a/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
+++ b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
@@ -112,3 +112,21 @@ namespace Dynamic {
   //   A_CLEANUP:
   // CHECK: call void @_ZN7Dynamic1AD1Ev({{.*}} @_ZN7Dynamic2d3E
 }
+
+namespace Instantiated1 {
+  struct A { A(); };
+  struct B : A { using A::A; };
+  template<int> B v({});
+  template B v<0>;
+  // CHECK-LABEL: define {{.*}}global_var_init{{.*}} comdat($_ZN13Instantiated11vILi0EEE) {
+  // CHECK: call void @_ZN13Instantiated11BC1Ev(%{{.*}}* @_ZN13Instantiated11vILi0EEE)
+}
+
+namespace Instantiated2 {
+  struct A { A(); };
+  struct B : A {};
+  template<int> B v({});
+  template B v<0>;
+  // CHECK-LABEL: define {{.*}}global_var_init{{.*}} comdat($_ZN13Instantiated21vILi0EEE) {
+  // CHECK: call void @_ZN13Instantiated21AC2Ev(
+}
diff --git a/test/CodeGenCXX/cxx1z-inline-variables.cpp b/test/CodeGenCXX/cxx1z-inline-variables.cpp
index 2d16acd8a8c2..50eab3b70611 100644
--- a/test/CodeGenCXX/cxx1z-inline-variables.cpp
+++ b/test/CodeGenCXX/cxx1z-inline-variables.cpp
@@ -58,14 +58,22 @@ template<typename T> struct X {
   static int a;
   static inline int b;
   static int c;
+  static const int d;
+  static int e;
 };
 // CHECK: @_ZN1XIiE1aE = linkonce_odr global i32 10
 // CHECK: @_ZN1XIiE1bE = global i32 20
 // CHECK-NOT: @_ZN1XIiE1cE
+// CHECK: @_ZN1XIiE1dE = linkonce_odr constant i32 40
+// CHECK: @_ZN1XIiE1eE = linkonce_odr global i32 50
 template<> inline int X<int>::a = 10;
 int &use3 = X<int>::a;
 template<> int X<int>::b = 20;
 template<> inline int X<int>::c = 30;
+template<typename T> constexpr int X<T>::d = 40;
+template<typename T> inline int X<T>::e = 50;
+const int *use_x_int_d = &X<int>::d;
+const int *use_x_int_e = &X<int>::e;
 
 template<typename T> struct Y;
 template<> struct Y<int> {
diff --git a/test/CodeGenCXX/cxx1z-lambda-star-this.cpp b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
index a7e4aadbd573..379dbfd68148 100644
--- a/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
+++ b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
@@ -16,7 +16,7 @@ int X = A{}.foo()();
 //CHECK: %[[I1:.+]] = bitcast %struct.A* %[[I0]] to i8*
 //CHECK: %[[I2:.+]] = bitcast %struct.A* %this1 to i8*
 // copy the contents ...
-//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[I1]], i8* %[[I2]], i32 8, i32 8, i1 false)
+//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %[[I1]], i8* align 8 %[[I2]], i32 8, i1 false)
 
 struct B {
   double b = 222;
diff --git a/test/CodeGenCXX/debug-info-composite-cc.cpp b/test/CodeGenCXX/debug-info-composite-cc.cpp
new file mode 100644
index 000000000000..2f0911612196
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-composite-cc.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -triple %itanium_abi_triple %s -o - | FileCheck %s
+
+// Not trivially copyable because of the explicit destructor.
+// CHECK-DAG: !DICompositeType({{.*}}, name: "RefDtor",{{.*}}flags: DIFlagTypePassByReference
+struct RefDtor {
+  int i;
+  ~RefDtor() {}
+} refDtor;
+
+// Not trivially copyable because of the explicit copy constructor.
+// CHECK-DAG: !DICompositeType({{.*}}, name: "RefCopy",{{.*}}flags: DIFlagTypePassByReference
+struct RefCopy {
+  int i;
+  RefCopy() = default;
+  RefCopy(RefCopy &Copy) {}
+} refCopy;
+
+// POD-like type even though it defines a destructor.
+// CHECK-DAG: !DICompositeType({{.*}}, name: "Podlike", {{.*}}flags: DIFlagTypePassByValue
+struct Podlike {
+  int i;
+  Podlike() = default;
+  Podlike(Podlike &&Move) = default;
+  ~Podlike() = default;
+} podlike;
+
+
+// This is a POD type.
+// CHECK-DAG: !DICompositeType({{.*}}, name: "Pod",{{.*}}flags: DIFlagTypePassByValue
+struct Pod {
+  int i;
+} pod;
+
+// This is definitely not a POD type.
+// CHECK-DAG: !DICompositeType({{.*}}, name: "Complex",{{.*}}flags: DIFlagTypePassByReference
+struct Complex {
+  Complex() {}
+  Complex(Complex &Copy) : i(Copy.i) {};
+  int i;
+} complex;
diff --git a/test/CodeGenCXX/derived-cast.cpp b/test/CodeGenCXX/derived-cast.cpp
new file mode 100644
index 000000000000..bf2b258c5e9f
--- /dev/null
+++ b/test/CodeGenCXX/derived-cast.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+class A {
+    int a;
+};
+
+class B {
+    int b;
+public:
+    A *getAsA();
+};
+
+class X : public A, public B {
+    int x;
+};
+
+// PR35909 - https://bugs.llvm.org/show_bug.cgi?id=35909
+
+A *B::getAsA() {
+  return static_cast<X*>(this);
+
+  // CHECK-LABEL: define %class.A* @_ZN1B6getAsAEv
+  // CHECK: %[[THIS:.*]] = load %class.B*, %class.B**
+  // CHECK-NEXT: %[[BC:.*]] = bitcast %class.B* %[[THIS]] to i8*
+  // CHECK-NEXT: getelementptr inbounds i8, i8* %[[BC]], i64 -4
+}
+
diff --git a/test/CodeGenCXX/eh.cpp b/test/CodeGenCXX/eh.cpp
index dfbe48c37239..d393ee13b1ca 100644
--- a/test/CodeGenCXX/eh.cpp
+++ b/test/CodeGenCXX/eh.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin -std=c++11 -emit-llvm %s -o %t.ll
-// RUN: FileCheck --input-file=%t.ll %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin -std=c++11 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
 
 struct test1_D {
   double d;
@@ -13,7 +13,7 @@ void test1() {
 // CHECK:       [[EXNOBJ:%.*]] = call i8* @__cxa_allocate_exception(i64 8)
 // CHECK-NEXT:  [[EXN:%.*]] = bitcast i8* [[EXNOBJ]] to [[DSTAR:%[^*]*\*]]
 // CHECK-NEXT:  [[EXN2:%.*]] = bitcast [[DSTAR]] [[EXN]] to i8*
-// CHECK-NEXT:  call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[EXN2]], i8* bitcast ([[DSTAR]] @d1 to i8*), i64 8, i32 8, i1 false)
+// CHECK-NEXT:  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[EXN2]], i8* align 8 bitcast ([[DSTAR]] @d1 to i8*), i64 8, i1 false)
 // CHECK-NEXT:  call void @__cxa_throw(i8* [[EXNOBJ]], i8* bitcast ({ i8*, i8* }* @_ZTI7test1_D to i8*), i8* null) [[NR:#[0-9]+]]
 // CHECK-NEXT:  unreachable
 
@@ -466,7 +466,7 @@ int foo() {
   // CHECK: [[T0:%.*]] = call i8* @__cxa_allocate_exception(i64 16)
   // CHECK-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to %"class.test17::DerivedException"*
   // CHECK-NEXT: [[T2:%.*]] = bitcast %"class.test17::DerivedException"* [[T1]] to i8*
-  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T2]], i8 0, i64 16, i32 16, i1 false)
+  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T2]], i8 0, i64 16, i1 false)
 }
 }
 
diff --git a/test/CodeGenCXX/float16-declarations.cpp b/test/CodeGenCXX/float16-declarations.cpp
index 87ef139f8634..e0d62b71c4cd 100644
--- a/test/CodeGenCXX/float16-declarations.cpp
+++ b/test/CodeGenCXX/float16-declarations.cpp
@@ -108,7 +108,7 @@ int main(void) {
   S1<_Float16> s1 = { 132.f16 };
 // CHECK-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { half 0xH5820 }, align 2
 // CHECK-DAG:  [[S1:%[0-9]+]] = bitcast %struct.S1* %{{.*}} to i8*
-// CHECK-DAG: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[S1]], i8* bitcast (%struct.S1* @_ZZ4mainE2s1 to i8*), i64 2, i32 2, i1 false)
+// CHECK-DAG: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[S1]], i8* align 2 bitcast (%struct.S1* @_ZZ4mainE2s1 to i8*), i64 2, i1 false)
 
   _Float16 f4l = func1n(f1l)  + func1f(f2l) + c1.func1c(f3l) + c1.func2c(f1l) +
     func1t(f1l) + s1.mem2 - f1n + f2n;
diff --git a/test/CodeGenCXX/hidden-dllimport.cpp b/test/CodeGenCXX/hidden-dllimport.cpp
new file mode 100644
index 000000000000..9de0d71084ff
--- /dev/null
+++ b/test/CodeGenCXX/hidden-dllimport.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm -fvisibility-inlines-hidden -o - %s | FileCheck %s
+
+// We used to declare this hidden dllimport, which is contradictory.
+
+// CHECK: declare dllimport void @"\01?bar@foo@@QEAAXXZ"(%struct.foo*)
+
+struct __attribute__((dllimport)) foo {
+  void bar() {}
+};
+void zed(foo *p) { p->bar(); }
diff --git a/test/CodeGenCXX/microsoft-abi-multiple-nonvirtual-inheritance.cpp b/test/CodeGenCXX/microsoft-abi-multiple-nonvirtual-inheritance.cpp
index 5bed69ff117f..d51c4bea990a 100644
--- a/test/CodeGenCXX/microsoft-abi-multiple-nonvirtual-inheritance.cpp
+++ b/test/CodeGenCXX/microsoft-abi-multiple-nonvirtual-inheritance.cpp
@@ -88,8 +88,11 @@ void ChildOverride::right() {
 // ChildOverride::right gets 'this' cast to Right* in ECX (i.e. this+4) so we
 // need to adjust 'this' before use.
 //
+// CHECK: %[[THIS_STORE:.*]] = alloca %struct.ChildOverride*, align 4
 // CHECK: %[[THIS_ADDR:.*]] = alloca %struct.ChildOverride*, align 4
-// CHECK: %[[THIS_INIT:.*]] = bitcast i8* %[[ECX:.*]] to %struct.ChildOverride*
+// CHECK: %[[COERCE_VAL:.*]] = bitcast i8* %[[ECX:.*]] to %struct.ChildOverride*
+// CHECK: store %struct.ChildOverride* %[[COERCE_VAL]], %struct.ChildOverride** %[[THIS_STORE]], align 4
+// CHECK: %[[THIS_INIT:.*]] = load %struct.ChildOverride*, %struct.ChildOverride** %[[THIS_STORE]], align 4
 // CHECK: store %struct.ChildOverride* %[[THIS_INIT]], %struct.ChildOverride** %[[THIS_ADDR]], align 4
 // CHECK: %[[THIS_RELOAD:.*]] = load %struct.ChildOverride*, %struct.ChildOverride** %[[THIS_ADDR]]
 // CHECK: %[[THIS_i8:.*]] = bitcast %struct.ChildOverride* %[[THIS_RELOAD]] to i8*
@@ -132,8 +135,11 @@ struct GrandchildOverride : ChildOverride {
 void GrandchildOverride::right() {
 // CHECK-LABEL: define x86_thiscallcc void @"\01?right@GrandchildOverride@@UAEXXZ"(i8*
 //
+// CHECK: %[[THIS_STORE:.*]] = alloca %struct.GrandchildOverride*, align 4
 // CHECK: %[[THIS_ADDR:.*]] = alloca %struct.GrandchildOverride*, align 4
-// CHECK: %[[THIS_INIT:.*]] = bitcast i8* %[[ECX:.*]] to %struct.GrandchildOverride*
+// CHECK: %[[COERCE_VAL:.*]] = bitcast i8* %[[ECX:.*]] to %struct.GrandchildOverride*
+// CHECK: store %struct.GrandchildOverride* %[[COERCE_VAL]], %struct.GrandchildOverride** %[[THIS_STORE]], align 4
+// CHECK: %[[THIS_INIT:.*]] = load %struct.GrandchildOverride*, %struct.GrandchildOverride** %[[THIS_STORE]], align 4
 // CHECK: store %struct.GrandchildOverride* %[[THIS_INIT]], %struct.GrandchildOverride** %[[THIS_ADDR]], align 4
 // CHECK: %[[THIS_RELOAD:.*]] = load %struct.GrandchildOverride*, %struct.GrandchildOverride** %[[THIS_ADDR]]
 // CHECK: %[[THIS_i8:.*]] = bitcast %struct.GrandchildOverride* %[[THIS_RELOAD]] to i8*
diff --git a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
index eaae3493ccdc..9e430457e56b 100644
--- a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
+++ b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
@@ -405,7 +405,7 @@ void fn2(FnPtr1 a, SmallWithDtor b) { fn1(a, b); };
 // WIN32:   %[[argmem:[^ ]*]] = alloca inalloca [[argmem_ty]]
 // WIN32:   %[[gep1:[^ ]*]] = getelementptr inbounds [[argmem_ty]], [[argmem_ty]]* %[[argmem]], i32 0, i32 1
 // WIN32:   %[[bc1:[^ ]*]] = bitcast %struct.SmallWithDtor* %[[gep1]] to i8*
-// WIN32:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[bc1]], i8* {{.*}}, i32 4, i32 4, i1 false)
+// WIN32:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %[[bc1]], i8* align 4 {{.*}}, i32 4, i1 false)
 // WIN32:   %[[a2:[^ ]*]] = load void [[dst_ty]], void [[dst_ty]]* %[[a1]], align 4
 // WIN32:   %[[gep2:[^ ]*]] = getelementptr inbounds [[argmem_ty]], [[argmem_ty]]* %[[argmem]], i32 0, i32 0
 // WIN32:   %[[addr:[^ ]*]] = bitcast {}** %[[gep2]] to void [[dst_ty]]*
diff --git a/test/CodeGenCXX/microsoft-abi-static-initializers.cpp b/test/CodeGenCXX/microsoft-abi-static-initializers.cpp
index 0b84f07e1154..f834c5593d1f 100644
--- a/test/CodeGenCXX/microsoft-abi-static-initializers.cpp
+++ b/test/CodeGenCXX/microsoft-abi-static-initializers.cpp
@@ -28,11 +28,11 @@ S s;
 // See @llvm.global_ctors above.
 __declspec(selectany) S selectany1;
 __declspec(selectany) S selectany2;
-// CHECK: define linkonce_odr void @"\01??__Eselectany1@@YAXXZ"() {{.*}} comdat
+// CHECK: define linkonce_odr dso_local void @"\01??__Eselectany1@@YAXXZ"() {{.*}} comdat
 // CHECK-NOT: @"\01??_Bselectany1
 // CHECK: call x86_thiscallcc %struct.S* @"\01??0S@@QAE@XZ"
 // CHECK: ret void
-// CHECK: define linkonce_odr void @"\01??__Eselectany2@@YAXXZ"() {{.*}} comdat
+// CHECK: define linkonce_odr dso_local void @"\01??__Eselectany2@@YAXXZ"() {{.*}} comdat
 // CHECK-NOT: @"\01??_Bselectany2
 // CHECK: call x86_thiscallcc %struct.S* @"\01??0S@@QAE@XZ"
 // CHECK: ret void
@@ -231,7 +231,7 @@ void force_usage() {
   DynamicDLLImportInitVSMangling::switch_test3();
 }
 
-// CHECK: define linkonce_odr void @"\01??__Efoo@?$B@H@@2VA@@A@YAXXZ"() {{.*}} comdat
+// CHECK: define linkonce_odr dso_local void @"\01??__Efoo@?$B@H@@2VA@@A@YAXXZ"() {{.*}} comdat
 // CHECK-NOT: and
 // CHECK-NOT: ?_Bfoo@
 // CHECK: call x86_thiscallcc %class.A* @"\01??0A@@QAE@XZ"
diff --git a/test/CodeGenCXX/microsoft-abi-virtual-inheritance-vtordisps.cpp b/test/CodeGenCXX/microsoft-abi-virtual-inheritance-vtordisps.cpp
index 2f141b2a66ec..377a6701f300 100644
--- a/test/CodeGenCXX/microsoft-abi-virtual-inheritance-vtordisps.cpp
+++ b/test/CodeGenCXX/microsoft-abi-virtual-inheritance-vtordisps.cpp
@@ -25,6 +25,7 @@ D::D() {}  // Forces vftable emission.
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @"\01?f@D@@$4PPPPPPPM@A@AEXXZ"
 // Note that the vtordisp is applied before really adjusting to D*.
+// CHECK: %[[COERCE_LOAD:.*]] = load %struct.D*, %struct.D** %{{.*}}
 // CHECK: %[[ECX:.*]] = load %struct.D*, %struct.D** %{{.*}}
 // CHECK: %[[ECX_i8:.*]] = bitcast %struct.D* %[[ECX]] to i8*
 // CHECK: %[[VTORDISP_PTR_i8:.*]] = getelementptr inbounds i8, i8* %[[ECX_i8]], i32 -4
@@ -36,6 +37,7 @@ D::D() {}  // Forces vftable emission.
 // CHECK: ret void
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @"\01?f@D@@$4PPPPPPPI@3AEXXZ"
+// CHECK: %[[COERCE_LOAD:.*]] = load %struct.D*, %struct.D** %{{.*}}
 // CHECK: %[[ECX:.*]] = load %struct.D*, %struct.D** %{{.*}}
 // CHECK: %[[ECX_i8:.*]] = bitcast %struct.D* %[[ECX]] to i8*
 // CHECK: %[[VTORDISP_PTR_i8:.*]] = getelementptr inbounds i8, i8* %[[ECX_i8]], i32 -8
@@ -64,7 +66,8 @@ struct G : virtual F, virtual E {
 
 G::G() {}  // Forces vftable emission.
 
-// CHECK-LABEL: define linkonce_odr x86_thiscallcc void @"\01?f@E@@$R4BA@M@PPPPPPPM@7AEXXZ"(i8*)
+// CHECK-LABEL: define linkonce_odr x86_thiscallcc void @"\01?f@E@@$R4BA@M@PPPPPPPM@7AEXXZ"(i8*
+// CHECK: %[[COERCE_LOAD:.*]] = load %struct.E*, %struct.E** %{{.*}}
 // CHECK: %[[ECX:.*]] = load %struct.E*, %struct.E** %{{.*}}
 // CHECK: %[[ECX_i8:.*]] = bitcast %struct.E* %[[ECX]] to i8*
 // CHECK: %[[VTORDISP_PTR_i8:.*]] = getelementptr inbounds i8, i8* %[[ECX_i8]], i32 -4
diff --git a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
index c9dd1dd7d8e2..2dc30b758c02 100644
--- a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
+++ b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
@@ -115,9 +115,15 @@ void B::foo() {
 // B::foo gets 'this' cast to VBase* in ECX (i.e. this+8) so we
 // need to adjust 'this' before use.
 //
-// Store initial this:
+// Coerce this to correct type:
+// CHECK:   %[[THIS_STORE:.*]] = alloca %struct.B*
 // CHECK:   %[[THIS_ADDR:.*]] = alloca %struct.B*
-// CHECK:   store %struct.B* %{{.*}}, %struct.B** %[[THIS_ADDR]], align 4
+// CHECK:   %[[COERCE_VAL:.*]] = bitcast i8* %{{.*}} to %struct.B*
+// CHECK:   store %struct.B* %[[COERCE_VAL]], %struct.B** %[[THIS_STORE]], align 4
+//
+// Store initial this:
+// CHECK:   %[[THIS_INIT:.*]] = load %struct.B*, %struct.B** %[[THIS_STORE]]
+// CHECK:   store %struct.B* %[[THIS_INIT]], %struct.B** %[[THIS_ADDR]], align 4
 //
 // Reload and adjust the this parameter:
 // CHECK:   %[[THIS_RELOAD:.*]] = load %struct.B*, %struct.B** %[[THIS_ADDR]]
@@ -490,7 +496,7 @@ C::C() : B() {}
 // CHECK:   %[[B:.*]] = bitcast %"struct.test5::C"* %[[THIS]] to %"struct.test5::B"*
 // CHECK:   %[[B_i8:.*]] = bitcast %"struct.test5::B"* %[[B]] to i8*
 // CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[B_i8]], i32 4
-// CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+// CHECK:   call void @llvm.memset.p0i8.i32(i8* align 4 %[[FIELD]], i8 0, i32 4, i1 false)
 }
 
 namespace pr27621 {
@@ -530,5 +536,5 @@ D::D() : C() {}
 // CHECK:   %[[C:.*]] = bitcast %"class.test6::D"* %[[THIS]] to %"class.test6::C"*
 // CHECK:   %[[C_i8:.*]] = bitcast %"class.test6::C"* %[[C]] to i8*
 // CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[C_i8]], i32 8
-// CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+// CHECK:   call void @llvm.memset.p0i8.i32(i8* align 4 %[[FIELD]], i8 0, i32 4, i1 false)
 }
diff --git a/test/CodeGenCXX/microsoft-uuidof.cpp b/test/CodeGenCXX/microsoft-uuidof.cpp
index 9b4ff68d9aa8..f8d2da737910 100644
--- a/test/CodeGenCXX/microsoft-uuidof.cpp
+++ b/test/CodeGenCXX/microsoft-uuidof.cpp
@@ -63,12 +63,12 @@ const GUID& zeroiid = __uuidof(0);
 // CHECK: @_GUID_87654321_4321_4321_4321_ba0987654321 = linkonce_odr constant { i32, i16, i16, [8 x i8] } { i32 -2023406815, i16 17185, i16 17185, [8 x i8] c"C!\BA\09\87eC!" }, comdat
 
 // The static initializer for thing.
-// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @thing to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 16, i32 4, i1 false)
-// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @thing to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 4, i32 4, i1 false)
+// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @thing to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 16, i1 false)
+// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @thing to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 4, i1 false)
 
 // The static initializer for g.
-// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @g to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
-// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @g to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @g to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @g to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
 
 #ifdef DEFINE_GUID
 void fun() {
@@ -81,20 +81,20 @@ void fun() {
 
   // CHECK-DEFINE-GUID: [[U1:%.+]] = bitcast %struct._GUID* %s1_1 to i8*
   // CHECK-DEFINE-WRONG-GUID: [[U1:%.+]] = bitcast %struct._GUID* %s1_1 to i8*
-  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U1]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
-  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U1]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U1]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U1]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
   GUID s1_1 = __uuidof(S1);
 
   // CHECK-DEFINE-GUID: [[U2:%.+]] = bitcast %struct._GUID* %s1_2 to i8*
   // CHECK-DEFINE-WRONG-GUID: [[U2:%.+]] = bitcast %struct._GUID* %s1_2 to i8*
-  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U2]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
-  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U2]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U2]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U2]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
   GUID s1_2 = __uuidof(S1);
 
   // CHECK-DEFINE-GUID: [[U3:%.+]] = bitcast %struct._GUID* %s1_3 to i8*
   // CHECK-DEFINE-WRONG-GUID: [[U3:%.+]] = bitcast %struct._GUID* %s1_3 to i8*
-  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U3]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
-  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U3]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+  // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U3]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+  // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U3]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
   GUID s1_3 = __uuidof(s1);
 }
 #endif
diff --git a/test/CodeGenCXX/new-array-init.cpp b/test/CodeGenCXX/new-array-init.cpp
index ccc218e2b2dc..90cd600229b5 100644
--- a/test/CodeGenCXX/new-array-init.cpp
+++ b/test/CodeGenCXX/new-array-init.cpp
@@ -50,10 +50,10 @@ void string_nonconst(int n) {
   // FIXME: Conditionally throw an exception rather than passing -1 to alloc function
   // CHECK: select
   // CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}}
-  // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
+  // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
   // CHECK: %[[REST:.*]] = getelementptr inbounds i8, i8* %[[PTR]], i32 4
   // CHECK: %[[RESTSIZE:.*]] = sub {{.*}}, 4
-  // CHECK: call void @llvm.memset{{.*}}(i8* %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]],
+  // CHECK: call void @llvm.memset{{.*}}(i8* align {{[0-9]+}} %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]],
   new char[n] { "abc" };
 }
 
@@ -61,7 +61,7 @@ void string_nonconst(int n) {
 void string_exact() {
   // CHECK-NOT: icmp
   // CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}} 4)
-  // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
+  // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
   // CHECK-NOT: memset
   new char[4] { "abc" };
 }
@@ -71,7 +71,7 @@ void string_sufficient() {
   // CHECK-NOT: icmp
   // CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}} 15)
   // FIXME: For very large arrays, it would be preferable to emit a small copy and a memset.
-  // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([15 x i8], [15 x i8]* @[[ABC15]], i32 0, i32 0), i32 15,
+  // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([15 x i8], [15 x i8]* @[[ABC15]], i32 0, i32 0), i32 15,
   // CHECK-NOT: memset
   new char[15] { "abc" };
 }
@@ -113,7 +113,7 @@ void aggr_sufficient(int n) {
   // CHECK: %[[PTR2:.*]] = getelementptr inbounds %[[AGGR]], %[[AGGR]]* %[[PTR1]], i32 1{{$}}
   // CHECK: %[[REMAIN:.*]] = sub i32 {{.*}}, 16
   // CHECK: %[[MEM:.*]] = bitcast %[[AGGR]]* %[[PTR2]] to i8*
-  // CHECK: call void @llvm.memset{{.*}}(i8* %[[MEM]], i8 0, i32 %[[REMAIN]],
+  // CHECK: call void @llvm.memset{{.*}}(i8* align {{[0-9]+}} %[[MEM]], i8 0, i32 %[[REMAIN]],
   struct Aggr { int a, b; };
   new Aggr[n] { 1, 2, 3 };
 }
diff --git a/test/CodeGenCXX/no-opt-volatile-memcpy.cpp b/test/CodeGenCXX/no-opt-volatile-memcpy.cpp
index a061daaf2636..a51421cdb3df 100644
--- a/test/CodeGenCXX/no-opt-volatile-memcpy.cpp
+++ b/test/CodeGenCXX/no-opt-volatile-memcpy.cpp
@@ -18,10 +18,10 @@ void foo (void) {
 // CHECK: %[[LS:.*]] = alloca %struct.s, align 4
 // CHECK-NEXT: %[[ZERO:.*]] = bitcast %struct.s* %[[LS]] to i8*
 // CHECK-NEXT:  %[[ONE:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* %[[ZERO]], i8* %[[ONE]], i64 132, i32 4, i1 true)
-// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* align 4 %[[ZERO]], i8* align 4 %[[ONE]], i64 132, i1 true)
+// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 // CHECK-NEXT:  %[[TWO:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* %[[TWO]], i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT:  call void @llvm.memcpy.{{.*}}(i8* align 4 %[[TWO]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 
 
 struct s1 {
@@ -35,8 +35,8 @@ void fee (void) {
   s.y = gs;
 }
 // CHECK-LABEL: define void @_Z3feev()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
 
 struct d : s1 {
 };
@@ -47,4 +47,4 @@ void gorf(void) {
   gd = gd;
 }
 // CHECK-LABEL: define void @_Z4gorfv()
-// CHECK:   call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK:   call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
diff --git a/test/CodeGenCXX/pod-member-memcpys.cpp b/test/CodeGenCXX/pod-member-memcpys.cpp
index 97d203fde2e0..9facb8ad507c 100644
--- a/test/CodeGenCXX/pod-member-memcpys.cpp
+++ b/test/CodeGenCXX/pod-member-memcpys.cpp
@@ -116,59 +116,59 @@ CALL_AO(PackedMembers)
 
 // Basic copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.Basic* @_ZN5BasicaSERKS_(%struct.Basic* %this, %struct.Basic* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.Basic*
 
 // PODMember copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PODMember* @_ZN9PODMemberaSERKS_(%struct.PODMember* %this, %struct.PODMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.PODMember*
 
 // PODLikeMember copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PODLikeMember* @_ZN13PODLikeMemberaSERKS_(%struct.PODLikeMember* %this, %struct.PODLikeMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.PODLikeMember*
 
 // ArrayMember copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.ArrayMember* @_ZN11ArrayMemberaSERKS_(%struct.ArrayMember* %this, %struct.ArrayMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
 // CHECK: ret %struct.ArrayMember*
 
 // VolatileMember copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.VolatileMember* @_ZN14VolatileMemberaSERKS_(%struct.VolatileMember* %this, %struct.VolatileMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: load volatile i32, i32* {{.*}}, align 4
 // CHECK: store volatile i32 {{.*}}, align 4
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.VolatileMember*
 
 // BitfieldMember copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.BitfieldMember* @_ZN14BitfieldMemberaSERKS_(%struct.BitfieldMember* %this, %struct.BitfieldMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 3, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 3, i1 {{.*}})
 // CHECK: ret %struct.BitfieldMember*
 
 // InnerClass copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.InnerClassMember* @_ZN16InnerClassMemberaSERKS_(%struct.InnerClassMember* %this, %struct.InnerClassMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.InnerClassMember*
 
 // PackedMembers copy-assignment:
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PackedMembers* @_ZN13PackedMembersaSERKS_(%struct.PackedMembers* %this, %struct.PackedMembers* dereferenceable({{[0-9]+}}))
 // CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret %struct.PackedMembers*
 
 // COPY-CONSTRUCTORS:
@@ -183,70 +183,70 @@ CALL_CC(PackedMembers)
 // PackedMembers copy-assignment:
 // CHECK-LABEL: define linkonce_odr void @_ZN13PackedMembersC2ERKS_(%struct.PackedMembers* %this, %struct.PackedMembers* dereferenceable({{[0-9]+}}))
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(BitfieldMember2)
 // BitfieldMember2 copy-constructor:
 // CHECK-2-LABEL: define linkonce_odr void @_ZN15BitfieldMember2C2ERKS_(%struct.BitfieldMember2* %this, %struct.BitfieldMember2* dereferenceable({{[0-9]+}}))
-// CHECK-2: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4, i1 false)
+// CHECK-2: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 false)
 // CHECK-2: call void @_ZN6NonPODC1ERKS_
 // CHECK-2: ret void
 
 CALL_CC(BitfieldMember3)
 // BitfieldMember3 copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN15BitfieldMember3C2ERKS_(%struct.BitfieldMember3* %this, %struct.BitfieldMember3* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 8, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 8, i1 false)
 // CHECK: ret void
 
 CALL_CC(ReferenceMember)
 // ReferenceMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN15ReferenceMemberC2ERKS_(%struct.ReferenceMember* %this, %struct.ReferenceMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 8{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 16, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 8{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(InnerClassMember)
 // InnerClass copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN16InnerClassMemberC2ERKS_(%struct.InnerClassMember* %this, %struct.InnerClassMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(BitfieldMember)
 // BitfieldMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN14BitfieldMemberC2ERKS_(%struct.BitfieldMember* %this, %struct.BitfieldMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 3, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 3, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(VolatileMember)
 // VolatileMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN14VolatileMemberC2ERKS_(%struct.VolatileMember* %this, %struct.VolatileMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: load volatile i32, i32* {{.*}}, align 4
 // CHECK: store volatile i32 {{.*}}, align 4
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(ArrayMember)
 // ArrayMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN11ArrayMemberC2ERKS_(%struct.ArrayMember* %this, %struct.ArrayMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(PODLikeMember)
 // PODLikeMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN13PODLikeMemberC2ERKS_(%struct.PODLikeMember* %this, %struct.PODLikeMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: invoke void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 // CHECK: landingpad
 // CHECK: invoke void @_ZN7PODLikeD1Ev
@@ -254,15 +254,15 @@ CALL_CC(PODLikeMember)
 CALL_CC(PODMember)
 // PODMember copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN9PODMemberC2ERKS_(%struct.PODMember* %this, %struct.PODMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
 
 CALL_CC(Basic)
 // Basic copy-constructor:
 // CHECK-LABEL: define linkonce_odr void @_ZN5BasicC2ERKS_(%struct.Basic* %this, %struct.Basic* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
 // CHECK: ret void
diff --git a/test/CodeGenCXX/pr20897.cpp b/test/CodeGenCXX/pr20897.cpp
index 9a7fdb999ba4..8c86149b0745 100644
--- a/test/CodeGenCXX/pr20897.cpp
+++ b/test/CodeGenCXX/pr20897.cpp
@@ -9,7 +9,7 @@ struct __declspec(dllexport) Derived : virtual Base {
 // CHECK:      %[[dest_a_gep:.*]] = getelementptr inbounds %struct.Derived, %struct.Derived* %[[this]], i32 0, i32 1
 // CHECK-NEXT: %[[src_load:.*]]   = load %struct.Derived*, %struct.Derived** {{.*}}
 // CHECK-NEXT: %[[src_a_gep:.*]]  = getelementptr inbounds %struct.Derived, %struct.Derived* %[[src_load:.*]], i32 0, i32 1
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[dest_a_gep]], i8* %[[src_a_gep]], i64 1, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[dest_a_gep]], i8* align 4 %[[src_a_gep]], i64 1, i1 false)
 // CHECK-NEXT: %[[dest_this:.*]] = load %struct.Derived*, %struct.Derived** %[[retval]]
 // CHECK-NEXT: ret %struct.Derived* %[[dest_this]]
   bool a : 1;
@@ -26,7 +26,7 @@ struct __declspec(dllexport) Derived2 : virtual Base {
 // CHECK-NEXT: %[[src_a_gep:.*]]  = getelementptr inbounds %struct.Derived2, %struct.Derived2* %[[src_load:.*]], i32 0, i32 1
 // CHECK-NEXT: %[[dest_a_bitcast:.*]]  = bitcast [1 x i32]* %[[dest_a_gep]] to i8*
 // CHECK-NEXT: %[[src_a_bitcast:.*]] = bitcast [1 x i32]* %[[src_a_gep]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[dest_a_bitcast]], i8* %[[src_a_bitcast]], i32 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %[[dest_a_bitcast]], i8* align 4 %[[src_a_bitcast]], i32 4, i1 false)
 // CHECK-NEXT: %[[dest_this:.*]] = load %struct.Derived2*, %struct.Derived2** %[[retval]]
 // CHECK-NEXT: ret %struct.Derived2* %[[dest_this]]
   int Array[1];
diff --git a/test/CodeGenCXX/sanitize-no-dtor-callback.cpp b/test/CodeGenCXX/sanitize-no-dtor-callback.cpp
index 2c355766216d..afc5382eb464 100644
--- a/test/CodeGenCXX/sanitize-no-dtor-callback.cpp
+++ b/test/CodeGenCXX/sanitize-no-dtor-callback.cpp
@@ -1,8 +1,9 @@
-// Test without the flag -fsanitize-memory-use-after-dtor, to ensure that
+// Test with the flag -fno-sanitize-memory-use-after-dtor, to ensure that
 // instrumentation is not erroneously inserted
-// RUN: %clang_cc1 -fsanitize=memory -triple=x86_64-pc-linux -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fsanitize=memory -fno-sanitize-memory-use-after-dtor -triple=x86_64-pc-linux -emit-llvm -o - %s | FileCheck %s
 
 struct Simple {
+  int x;
   ~Simple() {}
 };
 Simple s;
@@ -10,6 +11,7 @@ Simple s;
 // CHECK-NOT: call void @__sanitizer_dtor_callback
 
 struct Inlined {
+  int x;
   inline ~Inlined() {}
 };
 Inlined i;
diff --git a/test/CodeGenCXX/ubsan-function-noexcept.cpp b/test/CodeGenCXX/ubsan-function-noexcept.cpp
new file mode 100644
index 000000000000..45c2764add7d
--- /dev/null
+++ b/test/CodeGenCXX/ubsan-function-noexcept.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -std=c++17 -fsanitize=function -emit-llvm -triple x86_64-linux-gnu %s -o - | FileCheck %s
+
+// Check that typeinfo recorded in function prolog doesn't have "Do" noexcept
+// qualifier in its mangled name.
+// CHECK: @[[RTTI:[0-9]+]] = private constant i8* bitcast ({ i8*, i8* }* @_ZTIFvvE to i8*)
+// CHECK: define void @_Z1fv() #{{.*}} prologue <{ i32, i32 }> <{ i32 {{.*}}, i32 trunc (i64 sub (i64 ptrtoint (i8** @[[RTTI]] to i64), i64 ptrtoint (void ()* @_Z1fv to i64)) to i32) }>
+void f() noexcept {}
+
+// CHECK: define void @_Z1gPDoFvvE
+void g(void (*p)() noexcept) {
+  // Check that reference typeinfo at call site doesn't have "Do" noexcept
+  // qualifier in its mangled name, either.
+  // CHECK: icmp eq i8* %{{.*}}, bitcast ({ i8*, i8* }* @_ZTIFvvE to i8*), !nosanitize
+  p();
+}
diff --git a/test/CodeGenCXX/value-init.cpp b/test/CodeGenCXX/value-init.cpp
index fc4e0d3a55e0..348cbf78157e 100644
--- a/test/CodeGenCXX/value-init.cpp
+++ b/test/CodeGenCXX/value-init.cpp
@@ -244,7 +244,7 @@ namespace PR11124 {
   struct C : B { C(); };      
   C::C() : A(3), B() {}
   // CHECK-LABEL: define void @_ZN7PR111241CC1Ev
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 12, i32 8, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 12, i1 false)
   // CHECK-NEXT: call void @_ZN7PR111241BC2Ev
   // Make sure C::C doesn't overwrite parts of A while it is zero-initializing B
 
@@ -252,7 +252,7 @@ namespace PR11124 {
   struct C2 : B2 { C2(); };      
   C2::C2() : A(3), B2() {}
   // CHECK-LABEL: define void @_ZN7PR111242C2C1Ev
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* {{.*}}, i64 16, i32 8, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %{{.*}}, i8* align 8 {{.*}}, i64 16, i1 false)
   // CHECK-NEXT: call void @_ZN7PR111242B2C2Ev
 }
 
diff --git a/test/CodeGenCXX/varargs.cpp b/test/CodeGenCXX/varargs.cpp
index e0165994d013..1f82d6098269 100644
--- a/test/CodeGenCXX/varargs.cpp
+++ b/test/CodeGenCXX/varargs.cpp
@@ -35,7 +35,7 @@ namespace test1 {
   // CHECK-NEXT: [[TMP:%.*]] = alloca [[A]], align 4
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[TMP]] to i8*
   // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[X]] to i8*
-  // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 8, i32 4, i1 false)
+  // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T0]], i8* align 4 [[T1]], i64 8, i1 false)
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[TMP]] to i64*
   // CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[T0]], align 4
   // CHECK-NEXT: call void (...) @_ZN5test13fooEz(i64 [[T1]])
diff --git a/test/CodeGenCXX/virtual-bases.cpp b/test/CodeGenCXX/virtual-bases.cpp
index e9c568c31b48..18d4bf0cdedc 100644
--- a/test/CodeGenCXX/virtual-bases.cpp
+++ b/test/CodeGenCXX/virtual-bases.cpp
@@ -46,3 +46,37 @@ struct D : B, C  {
 D::D() { }
 
 }
+
+namespace virtualBaseAlignment {
+
+// Check that the store to B::x in the base constructor has an 8-byte alignment.
+
+// CHECK: define linkonce_odr void @_ZN20virtualBaseAlignment1BC1Ev(%[[STRUCT_B:.*]]* %[[THIS:.*]])
+// CHECK: %[[THIS_ADDR:.*]] = alloca %[[STRUCT_B]]*, align 8
+// CHECK: store %[[STRUCT_B]]* %[[THIS]], %[[STRUCT_B]]** %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load %[[STRUCT_B]]*, %[[STRUCT_B]]** %[[THIS_ADDR]], align 8
+// CHECK: %[[X:.*]] = getelementptr inbounds %[[STRUCT_B]], %[[STRUCT_B]]* %[[THIS1]], i32 0, i32 2
+// CHECK: store i32 123, i32* %[[X]], align 16
+
+// CHECK: define linkonce_odr void @_ZN20virtualBaseAlignment1BC2Ev(%[[STRUCT_B]]* %[[THIS:.*]], i8** %{{.*}})
+// CHECK: %[[THIS_ADDR:.*]] = alloca %[[STRUCT_B]]*, align 8
+// CHECK: store %[[STRUCT_B]]* %[[THIS]], %[[STRUCT_B]]** %[[THIS_ADDR]], align 8
+// CHECK: %[[THIS1:.*]] = load %[[STRUCT_B]]*, %[[STRUCT_B]]** %[[THIS_ADDR]], align 8
+// CHECK: %[[X:.*]] = getelementptr inbounds %[[STRUCT_B]], %[[STRUCT_B]]* %[[THIS1]], i32 0, i32 2
+// CHECK: store i32 123, i32* %[[X]], align 8
+
+struct A {
+  __attribute__((aligned(16))) double data1;
+};
+
+struct B : public virtual A {
+  B() : x(123) {}
+  double a;
+  int x;
+};
+
+struct C : public virtual B {};
+
+void test() { B b; C c; }
+
+}
diff --git a/test/CodeGenCoroutines/coro-params.cpp b/test/CodeGenCoroutines/coro-params.cpp
index 540f84585c8e..45b8aeffd11f 100644
--- a/test/CodeGenCoroutines/coro-params.cpp
+++ b/test/CodeGenCoroutines/coro-params.cpp
@@ -1,6 +1,7 @@
 // Verifies that parameters are copied with move constructors
 // Verifies that parameter copies are destroyed
 // Vefifies that parameter copies are used in the body of the coroutine
+// Verifies that parameter copies are used to construct the promise type, if that type has a matching constructor
 // RUN: %clang_cc1 -std=c++1z -fcoroutines-ts -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s -disable-llvm-passes -fexceptions | FileCheck %s
 
 namespace std::experimental {
@@ -127,3 +128,31 @@ struct B {
 void call_dependent_params() {
   dependent_params(A{}, B{}, B{});
 }
+
+// Test that, when the promise type has a constructor whose signature matches
+// that of the coroutine function, that constructor is used. This is an
+// experimental feature that will be proposed for the Coroutines TS.
+
+struct promise_matching_constructor {};
+
+template<>
+struct std::experimental::coroutine_traits<void, promise_matching_constructor, int, float, double> {
+  struct promise_type {
+    promise_type(promise_matching_constructor, int, float, double) {}
+    promise_type() = delete;
+    void get_return_object() {}
+    suspend_always initial_suspend() { return {}; }
+    suspend_always final_suspend() { return {}; }
+    void return_void() {}
+    void unhandled_exception() {}
+  };
+};
+
+// CHECK-LABEL: void @_Z38coroutine_matching_promise_constructor28promise_matching_constructorifd(i32, float, double)
+void coroutine_matching_promise_constructor(promise_matching_constructor, int, float, double) {
+  // CHECK: %[[INT:.+]] = load i32, i32* %.addr, align 4
+  // CHECK: %[[FLOAT:.+]] = load float, float* %.addr1, align 4
+  // CHECK: %[[DOUBLE:.+]] = load double, double* %.addr2, align 8
+  // CHECK: invoke void @_ZNSt12experimental16coroutine_traitsIJv28promise_matching_constructorifdEE12promise_typeC1ES1_ifd(%"struct.std::experimental::coroutine_traits<void, promise_matching_constructor, int, float, double>::promise_type"* %__promise, i32 %[[INT]], float %[[FLOAT]], double %[[DOUBLE]])
+  co_return;
+}
diff --git a/test/CodeGenObjC/arc-foreach.m b/test/CodeGenObjC/arc-foreach.m
index 77cb068187fa..ffeb37247b1b 100644
--- a/test/CodeGenObjC/arc-foreach.m
+++ b/test/CodeGenObjC/arc-foreach.m
@@ -42,7 +42,7 @@ void test0(NSArray *array) {
 
 // Initialize the fast enumaration state.
 // CHECK-LP64-NEXT: [[T0:%.*]] = bitcast [[STATE_T]]* [[STATE]] to i8*
-// CHECK-LP64-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 64, i32 8, i1 false)
+// CHECK-LP64-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 64, i1 false)
 
 // Evaluate the collection expression and retain.
 // CHECK-LP64-NEXT: [[T0:%.*]] = load [[ARRAY_T]]*, [[ARRAY_T]]** [[ARRAY]], align 8
diff --git a/test/CodeGenObjC/arc.m b/test/CodeGenObjC/arc.m
index d34156ee3473..8772ac5d6d6a 100644
--- a/test/CodeGenObjC/arc.m
+++ b/test/CodeGenObjC/arc.m
@@ -7,30 +7,30 @@
 // RUN: %clang_cc1 -fobjc-runtime=macosx-10.7.0 -triple x86_64-apple-darwin11 -Wno-objc-root-class -Wno-incompatible-pointer-types -Wno-arc-unsafe-retained-assign -emit-llvm -fblocks -fobjc-arc -fobjc-runtime-has-weak -o - %s | FileCheck -check-prefix=ARC-NATIVE %s
 
 // ARC-ALIEN: declare extern_weak void @objc_storeStrong(i8**, i8*)
-// ARC-ALIEN: declare extern_weak i8* @objc_retain(i8* returned)
-// ARC-ALIEN: declare extern_weak i8* @objc_autoreleaseReturnValue(i8* returned)
+// ARC-ALIEN: declare extern_weak i8* @objc_retain(i8*)
+// ARC-ALIEN: declare extern_weak i8* @objc_autoreleaseReturnValue(i8*)
 // ARC-ALIEN: declare i8* @objc_msgSend(i8*, i8*, ...) [[NLB:#[0-9]+]]
 // ARC-ALIEN: declare extern_weak void @objc_release(i8*)
-// ARC-ALIEN: declare extern_weak i8* @objc_retainAutoreleasedReturnValue(i8* returned)
+// ARC-ALIEN: declare extern_weak i8* @objc_retainAutoreleasedReturnValue(i8*)
 // ARC-ALIEN: declare extern_weak i8* @objc_initWeak(i8**, i8*)
 // ARC-ALIEN: declare extern_weak i8* @objc_storeWeak(i8**, i8*)
 // ARC-ALIEN: declare extern_weak i8* @objc_loadWeakRetained(i8**)
 // ARC-ALIEN: declare extern_weak void @objc_destroyWeak(i8**)
-// declare extern_weak i8* @objc_autorelease(i8*)
-// ARC-ALIEN: declare extern_weak i8* @objc_retainAutorelease(i8* returned)
+// ARC-ALIEN: declare extern_weak i8* @objc_autorelease(i8*)
+// ARC-ALIEN: declare extern_weak i8* @objc_retainAutorelease(i8*)
 
 // ARC-NATIVE: declare void @objc_storeStrong(i8**, i8*)
-// ARC-NATIVE: declare i8* @objc_retain(i8* returned) [[NLB:#[0-9]+]]
-// ARC-NATIVE: declare i8* @objc_autoreleaseReturnValue(i8* returned)
+// ARC-NATIVE: declare i8* @objc_retain(i8*) [[NLB:#[0-9]+]]
+// ARC-NATIVE: declare i8* @objc_autoreleaseReturnValue(i8*)
 // ARC-NATIVE: declare i8* @objc_msgSend(i8*, i8*, ...) [[NLB]]
 // ARC-NATIVE: declare void @objc_release(i8*) [[NLB]]
-// ARC-NATIVE: declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
+// ARC-NATIVE: declare i8* @objc_retainAutoreleasedReturnValue(i8*)
 // ARC-NATIVE: declare i8* @objc_initWeak(i8**, i8*)
 // ARC-NATIVE: declare i8* @objc_storeWeak(i8**, i8*)
 // ARC-NATIVE: declare i8* @objc_loadWeakRetained(i8**)
 // ARC-NATIVE: declare void @objc_destroyWeak(i8**)
-// declare i8* @objc_autorelease(i8*)
-// ARC-NATIVE: declare i8* @objc_retainAutorelease(i8* returned)
+// ARC-NATIVE: declare i8* @objc_autorelease(i8*)
+// ARC-NATIVE: declare i8* @objc_retainAutorelease(i8*)
 
 // CHECK-LABEL: define void @test0
 void test0(id x) {
@@ -507,7 +507,7 @@ void test19() {
   // CHECK:      [[X:%.*]] = alloca [5 x i8*], align 16
   // CHECK: call void @llvm.lifetime.start
   // CHECK-NEXT: [[T0:%.*]] = bitcast [5 x i8*]* [[X]] to i8*
-  // CHECK: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 40, i32 16, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 40, i1 false)
   id x[5];
 
   extern id test19_helper(void);
@@ -556,7 +556,7 @@ void test20(unsigned n) {
   // Zero-initialize.
   // CHECK-NEXT: [[T0:%.*]] = bitcast i8** [[VLA]] to i8*
   // CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[DIM]], 8
-  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 [[T1]], i32 16, i1 false)
+  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 [[T1]], i1 false)
 
   // Destroy.
   // CHECK-NEXT: [[END:%.*]] = getelementptr inbounds i8*, i8** [[VLA]], i64 [[DIM]]
@@ -599,7 +599,7 @@ void test21(unsigned n) {
   // CHECK-NEXT: [[T0:%.*]] = bitcast [3 x i8*]* [[VLA]] to i8*
   // CHECK-NEXT: [[T1:%.*]] = mul nuw i64 2, [[DIM]]
   // CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[T1]], 24
-  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 [[T2]], i32 16, i1 false)
+  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 [[T2]], i1 false)
 
   // Destroy.
   // CHECK-NEXT: [[T0:%.*]] = mul nuw i64 2, [[DIM]]
@@ -1504,9 +1504,7 @@ - (id) foo { return self; }
 // CHECK:      [[SELF:%.*]] = alloca [[TEST69:%.*]]*, align 8
 // CHECK:      [[T0:%.*]] = load [[TEST69]]*, [[TEST69]]** [[SELF]], align 8
 // CHECK-NEXT: [[T1:%.*]] = bitcast [[TEST69]]* [[T0]] to i8*
-// CHECK-NEXT: [[RETAIN:%.*]] = call i8* @objc_retain(i8* [[T1]])
-// CHECK-NEXT: [[AUTORELEASE:%.*]] = tail call i8* @objc_autoreleaseReturnValue(i8* [[RETAIN]])
-// CHECK-NEXT: ret i8* [[AUTORELEASE]]
+// CHECK-NEXT: ret i8* [[T1]]
 
 // rdar://problem/10907547
 void test70(id i) {
diff --git a/test/CodeGenObjC/builtin-memfns.m b/test/CodeGenObjC/builtin-memfns.m
index bb425c037c64..6cc91b109cf5 100644
--- a/test/CodeGenObjC/builtin-memfns.m
+++ b/test/CodeGenObjC/builtin-memfns.m
@@ -5,6 +5,6 @@
 // PR13697
 void test1(int *a, id b) {
   // CHECK: @test1
-  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 1, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
   memcpy(a, b, 8);
 }
diff --git a/test/CodeGenObjC/forward-protocol-metadata-symbols.m b/test/CodeGenObjC/forward-protocol-metadata-symbols.m
index 78c51e490137..16d33ec15d82 100644
--- a/test/CodeGenObjC/forward-protocol-metadata-symbols.m
+++ b/test/CodeGenObjC/forward-protocol-metadata-symbols.m
@@ -23,4 +23,17 @@ int main() {
 // CHECK: @"\01l_OBJC_LABEL_PROTOCOL_$_P0" = weak hidden global
 // CHECK: @"\01l_OBJC_PROTOCOL_REFERENCE_$_P0" = weak hidden global
 
-// CHECK: llvm.compiler.used = appending global [10 x i8*] {{[^"]*}}OBJC_CLASS_NAME_{{[^"]*}}OBJC_METH_VAR_NAME_{{[^"]*}}OBJC_METH_VAR_TYPE_{{[^"]*}}"\01l_OBJC_$_CLASS_METHODS_A"{{[^"]*}}"\01l_OBJC_CLASS_PROTOCOLS_$_A"{{[^"]*}}OBJC_CLASS_NAME_.1{{[^"]*}}"\01l_OBJC_PROTOCOL_$_P0"{{[^"]*}}"\01l_OBJC_LABEL_PROTOCOL_$_P0"{{[^"]*}}"\01l_OBJC_PROTOCOL_REFERENCE_$_P0"{{[^"]*}}"OBJC_LABEL_CLASS_$"{{[^"]*}} section "llvm.metadata"
+// CHECK: llvm.used = appending global [3 x i8*]
+// CHECK-SAME: "\01l_OBJC_PROTOCOL_$_P0"
+// CHECK-SAME: "\01l_OBJC_LABEL_PROTOCOL_$_P0"
+// CHECK-SAME: "\01l_OBJC_PROTOCOL_REFERENCE_$_P0"
+
+// CHECK: llvm.compiler.used = appending global [7 x i8*]
+// CHECK-SAME: OBJC_CLASS_NAME_
+// CHECK-SAME: OBJC_METH_VAR_NAME_
+// CHECK-SAME: OBJC_METH_VAR_TYPE_
+// CHECK-SAME: "\01l_OBJC_$_CLASS_METHODS_A"
+// CHECK-SAME: "\01l_OBJC_CLASS_PROTOCOLS_$_A"
+// CHECK-SAME: OBJC_CLASS_NAME_.1
+// CHECK-SAME: "OBJC_LABEL_CLASS_$"
+// CHECK-SAME: section "llvm.metadata"
diff --git a/test/CodeGenObjC/messages-2.m b/test/CodeGenObjC/messages-2.m
index be66f71f66f7..38c5d5016a92 100644
--- a/test/CodeGenObjC/messages-2.m
+++ b/test/CodeGenObjC/messages-2.m
@@ -157,7 +157,7 @@ void test0(A *x) {
   // CHECK:         call {{.*}} @objc_msgSend_stret to
   // CHECK-NEXT:    br label
   // CHECK:         [[T0:%.*]] = bitcast [[POINT_T]]* [[POINT]] to i8*
-  // CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 4, i1 false)
+  // CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[T0]], i8 0, i64 48, i1 false)
   // CHECK-NEXT:    br label
 
   // CHECK-NF:      [[X:%.*]] = alloca [[A]]*
@@ -169,7 +169,7 @@ void test0(A *x) {
   // CHECK-NF:      call {{.*}} @objc_msgSend_stret to
   // CHECK-NF-NEXT: br label
   // CHECK-NF:      [[T0:%.*]] = bitcast [[POINT_T]]* [[POINT]] to i8*
-  // CHECK-NF-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 4, i1 false)
+  // CHECK-NF-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0]], i8 0, i64 48, i1 false)
   // CHECK-NF-NEXT: br label
   MyPoint point = [x returnAPoint];
 }
diff --git a/test/CodeGenObjC/stret-1.m b/test/CodeGenObjC/stret-1.m
index 2314d5a9ecf5..1122c28a468b 100644
--- a/test/CodeGenObjC/stret-1.m
+++ b/test/CodeGenObjC/stret-1.m
@@ -16,7 +16,7 @@ int main(int argc, const char **argv)
     s = [(id)(argc&~255) method];
     // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T0:%[^,]+]]
     // CHECK: [[T0P:%.*]] = bitcast %struct.stret* [[T0]] to i8*
-    // CHECK: call void @llvm.memset.p0i8.i64(i8* [[T0P]], i8 0, i64 400, i32 4, i1 false)
+    // CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0P]], i8 0, i64 400, i1 false)
 
     s = [Test method];
     // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T1:%[^,]+]]
diff --git a/test/CodeGenObjCXX/arc-exceptions.mm b/test/CodeGenObjCXX/arc-exceptions.mm
index 0ae306943186..3d5046135078 100644
--- a/test/CodeGenObjCXX/arc-exceptions.mm
+++ b/test/CodeGenObjCXX/arc-exceptions.mm
@@ -101,7 +101,7 @@ void test3(void) {
   //   Construct array.
   // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[A]], [[A]]* [[THIS]], i32 0, i32 1
   // CHECK-NEXT: [[T0:%.*]] = bitcast [2 x [3 x i8*]]* [[ARRAY]] to i8*
-  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 8, i1 false)
+  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 48, i1 false)
   //   throw 0;
   // CHECK:      invoke void @__cxa_throw(
   //   Landing pad from throw site:
diff --git a/test/CodeGenObjCXX/msabi-objc-types.mm b/test/CodeGenObjCXX/msabi-objc-types.mm
index 013a9c84daee..6b20d0b3d7ac 100644
--- a/test/CodeGenObjCXX/msabi-objc-types.mm
+++ b/test/CodeGenObjCXX/msabi-objc-types.mm
@@ -33,7 +33,7 @@ void g(id &) {}
 // CHECK-LABEL: "\01?g@@YAXAAPAUobjc_object@@@Z"
 
 void g(const id &) {}
-// CHECK-LABEL: "\01?g@@YAXABPAUobjc_object@@@Z"
+// CHECK-LABEL: "\01?g@@YAXABQAUobjc_object@@@Z"
 
 void g(id &&) {}
 // CHECK-LABEL: "\01?g@@YAX$$QAPAUobjc_object@@@Z"
@@ -45,7 +45,7 @@ void h(Class &) {}
 // CHECK-LABEL: "\01?h@@YAXAAPAUobjc_class@@@Z"
 
 void h(const Class &) {}
-// CHECK-LABEL: "\01?h@@YAXABPAUobjc_class@@@Z"
+// CHECK-LABEL: "\01?h@@YAXABQAUobjc_class@@@Z"
 
 void h(Class &&) {}
 // CHECK-LABEL: "\01?h@@YAX$$QAPAUobjc_class@@@Z"
@@ -62,6 +62,30 @@ void h(Class &&) {}
 const I &l() { return *kI; }
 // CHECK-LABEL: "\01?l@@YAABUI@@XZ"
 
+void m(const id) {}
+// CHECK-LABEL: "\01?m@@YAXQAUobjc_object@@@Z"
+
+void m(const I *) {}
+// CHECK-LABEL: "\01?m@@YAXPBUI@@@Z"
+
+void n(SEL) {}
+// CHECK-LABEL: "\01?n@@YAXPAUobjc_selector@@@Z"
+
+void n(SEL *) {}
+// CHECK-LABEL: "\01?n@@YAXPAPAUobjc_selector@@@Z"
+
+void n(const SEL *) {}
+// CHECK-LABEL: "\01?n@@YAXPBQAUobjc_selector@@@Z"
+
+void n(SEL &) {}
+// CHECK-LABEL: "\01?n@@YAXAAPAUobjc_selector@@@Z"
+
+void n(const SEL &) {}
+// CHECK-LABEL: "\01?n@@YAXABQAUobjc_selector@@@Z"
+
+void n(SEL &&) {}
+// CHECK-LABEL: "\01?n@@YAX$$QAPAUobjc_selector@@@Z"
+
 struct __declspec(dllexport) s {
   struct s &operator=(const struct s &) = delete;
 
@@ -93,16 +117,16 @@ void m(id &&) {}
   // CHECK-LABEL: "\01?m@s@@QAAX$$QAPAUobjc_object@@@Z"
 
   void m(const id &) {}
-  // CHECK-LABEL: "\01?m@s@@QAAXABPAUobjc_object@@@Z"
+  // CHECK-LABEL: "\01?m@s@@QAAXABQAUobjc_object@@@Z"
 
   void m(const id &&) {}
-  // CHECK-LABEL: "\01?m@s@@QAAX$$QBPAUobjc_object@@@Z"
+  // CHECK-LABEL: "\01?m@s@@QAAX$$QBQAUobjc_object@@@Z"
 
   void m(Class *) {}
   // CHECK-LABEL: "\01?m@s@@QAAXPAPAUobjc_class@@@Z"
 
   void m(const Class *) {}
-  // CHECK-LABEL: "\01?m@s@@QAAXPBPAUobjc_class@@@Z"
+  // CHECK-LABEL: "\01?m@s@@QAAXPBQAUobjc_class@@@Z"
 
   void m(Class) {}
   // CHECK-LABEL: "\01?m@s@@QAAXPAUobjc_class@@@Z"
@@ -111,12 +135,58 @@ void m(Class &) {}
   // CHECK-LABEL: "\01?m@s@@QAAXAAPAUobjc_class@@@Z"
 
   void m(const Class &) {}
-  // CHECK-LABEL: "\01?m@s@@QAAXABPAUobjc_class@@@Z"
+  // CHECK-LABEL: "\01?m@s@@QAAXABQAUobjc_class@@@Z"
 
   void m(Class &&) {}
   // CHECK-LABEL: "\01?m@s@@QAAX$$QAPAUobjc_class@@@Z"
 
   void m(const Class &&) {}
-  // CHECK-LABEL: "\01?m@s@@QAAX$$QBPAUobjc_class@@@Z"
+  // CHECK-LABEL: "\01?m@s@@QAAX$$QBQAUobjc_class@@@Z"
+
+  void m(SEL) {}
+  // CHECK-LABEL: "\01?m@s@@QAAXPAUobjc_selector@@@Z"
+
+  void m(SEL *) {}
+  // CHECK-LABEL: "\01?m@s@@QAAXPAPAUobjc_selector@@@Z"
+
+  void m(const SEL *) {}
+  // CHECK-LABEL: "\01?m@s@@QAAXPBQAUobjc_selector@@@Z"
+
+  void m(SEL &) {}
+  // CHECK-LABEL: "\01?m@s@@QAAXAAPAUobjc_selector@@@Z"
+
+  void m(const SEL &) {}
+  // CHECK-LABEL: "\01?m@s@@QAAXABQAUobjc_selector@@@Z"
+
+  void m(SEL &&) {}
+  // CHECK-LABEL: "\01?m@s@@QAAX$$QAPAUobjc_selector@@@Z"
+
+  void m(const SEL &&) {}
+  // CHECK-LABEL: "\01?m@s@@QAAX$$QBQAUobjc_selector@@@Z"
+};
+
+template <typename T>
+struct remove_pointer { typedef T type; };
+
+template <typename T>
+struct remove_pointer<T *> {
+  typedef T type;
 };
 
+template <typename T>
+struct t {
+  t() {}
+};
+
+template struct t<id>;
+// CHECK-LABEL: "\01??0?$t@PAUobjc_object@@@@QAA@XZ"
+
+template struct t<remove_pointer<id>::type>;
+// CHECK-LABEL: "\01??0?$t@Uobjc_object@@@@QAA@XZ"
+
+template struct t<SEL>;
+// CHECK-LABEL: "\01??0?$t@PAUobjc_selector@@@@QAA@XZ"
+
+template struct t<remove_pointer<SEL>::type>;
+// CHECK-LABEL: "\01??0?$t@Uobjc_selector@@@@QAA@XZ"
+
diff --git a/test/CodeGenObjCXX/msabi-protocol-conformance.mm b/test/CodeGenObjCXX/msabi-protocol-conformance.mm
new file mode 100644
index 000000000000..9f668d1f27c8
--- /dev/null
+++ b/test/CodeGenObjCXX/msabi-protocol-conformance.mm
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple thumbv7-windows-msvc -fobjc-runtime=ios-6.0 -o - -emit-llvm %s | FileCheck %s
+
+@protocol P;
+@protocol Q;
+
+@class I;
+
+void f(id<P>, id, id<P>, id) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@PAUobjc_object@@01@Z"
+
+void f(id, id<P>, id<P>, id) {}
+// CHECK-LABEL: "\01?f@@YAXPAUobjc_object@@PAU?$objc_object@YP@@@@10@Z"
+
+void f(id<P>, id<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@0@Z"
+
+void f(id<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@@Z"
+
+void f(id<P, Q>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@YQ@@@@@Z"
+
+void f(Class<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_class@YP@@@@@Z"
+
+void f(Class<P, Q>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_class@YP@@YQ@@@@@Z"
+
+void f(I<P> *) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$I@YP@@@@@Z"
+
+void f(I<P, Q> *) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$I@YP@@YQ@@@@@Z"
+
diff --git a/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
index fefe1c4a41d5..59f38f80dca4 100644
--- a/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
+++ b/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
@@ -62,7 +62,7 @@ void func2(void) {
 // CHECK-LABEL: define void @func3()
 // CHECK: %a = alloca [16 x [1 x float]], align 4, addrspace(5)
 // CHECK: %[[CAST:.+]] = bitcast [16 x [1 x float]] addrspace(5)* %a to i8 addrspace(5)*
-// CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* %[[CAST]], i8 0, i64 64, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 4 %[[CAST]], i8 0, i64 64, i1 false)
 void func3(void) {
   float a[16][1] = {{0.}};
 }
diff --git a/test/CodeGenOpenCL/amdgpu-nullptr.cl b/test/CodeGenOpenCL/amdgpu-nullptr.cl
index 513d56c19d60..6dbd8a6cf520 100644
--- a/test/CodeGenOpenCL/amdgpu-nullptr.cl
+++ b/test/CodeGenOpenCL/amdgpu-nullptr.cl
@@ -143,9 +143,9 @@ void test_static_var_local(void) {
 // NOOPT: store i8* null, i8** %sp3, align 4
 // NOOPT: store i8* null, i8** %sp4, align 4
 // NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8*
-// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false)
+// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* align 8 %[[SS1]], i8 addrspace(2)* align 8 bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i1 false)
 // NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8*
-// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 8, i1 false)
+// NOOPT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SS2]], i8 0, i64 24, i1 false)
 void test_func_scope_var_private(void) {
   private char *sp1 = 0;
   private char *sp2 = NULL;
@@ -163,9 +163,9 @@ void test_func_scope_var_private(void) {
 // NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp3, align 4
 // NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp4, align 4
 // NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8*
-// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false)
+// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* align 8 %[[SS1]], i8 addrspace(2)* align 8 bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i1 false)
 // NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8*
-// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 8, i1 false)
+// NOOPT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SS2]], i8 0, i64 24, i1 false)
 void test_func_scope_var_local(void) {
   local char *sp1 = 0;
   local char *sp2 = NULL;
@@ -510,7 +510,7 @@ typedef struct {
 } StructTy3;
 
 // CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull {{.*}}, i8 0, i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 {{.*}}, i8 0, i64 40, i1 false)
 void test_memset_private(private StructTy3 *ptr) {
   StructTy3 S3 = {0, 0, 0, 0, 0};
   *ptr = S3;
diff --git a/test/CodeGenOpenCL/partial_initializer.cl b/test/CodeGenOpenCL/partial_initializer.cl
index 454c82f89299..cdc469f7d60d 100644
--- a/test/CodeGenOpenCL/partial_initializer.cl
+++ b/test/CodeGenOpenCL/partial_initializer.cl
@@ -36,7 +36,7 @@ void f(void) {
   // CHECK: %[[V2:.*]] = alloca <4 x i32>, align 16
 
   // CHECK: %[[v0:.*]] = bitcast [6 x [6 x float]]* %A to i8*
-  // CHECK: call void @llvm.memset.p0i8.i32(i8* %[[v0]], i8 0, i32 144, i32 4, i1 false)
+  // CHECK: call void @llvm.memset.p0i8.i32(i8* align 4 %[[v0]], i8 0, i32 144, i1 false)
   // CHECK: %[[v1:.*]] = bitcast i8* %[[v0]] to [6 x [6 x float]]*
   // CHECK: %[[v2:.*]] = getelementptr [6 x [6 x float]], [6 x [6 x float]]* %[[v1]], i32 0, i32 0
   // CHECK: %[[v3:.*]] = getelementptr [6 x float], [6 x float]* %[[v2]], i32 0, i32 0
@@ -46,7 +46,7 @@ void f(void) {
   float A[6][6]  = {1.0f, 2.0f};
 
   // CHECK: %[[v5:.*]] = bitcast %struct.StrucTy* %S to i8*
-  // CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* %[[v5]], i8 addrspace(2)* bitcast (%struct.StrucTy addrspace(2)* @f.S to i8 addrspace(2)*), i32 12, i32 4, i1 false)
+  // CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* align 4 %[[v5]], i8 addrspace(2)* align 4 bitcast (%struct.StrucTy addrspace(2)* @f.S to i8 addrspace(2)*), i32 12, i1 false)
   StrucTy S = {1, 2};
 
   // CHECK: store <2 x i32> <i32 1, i32 2>, <2 x i32>* %[[compoundliteral1]], align 8
diff --git a/test/CodeGenOpenCL/private-array-initialization.cl b/test/CodeGenOpenCL/private-array-initialization.cl
index a7d4a9891977..6c2f45f8378e 100644
--- a/test/CodeGenOpenCL/private-array-initialization.cl
+++ b/test/CodeGenOpenCL/private-array-initialization.cl
@@ -5,5 +5,5 @@
 void test() {
   __private int arr[] = {1, 2, 3};
 // CHECK:  %[[arr_i8_ptr:[0-9]+]] = bitcast [3 x i32]* %arr to i8*
-// CHECK:  call void @llvm.memcpy.p0i8.p2i8.i32(i8* %[[arr_i8_ptr]], i8 addrspace(2)* bitcast ([3 x i32] addrspace(2)* @test.arr to i8 addrspace(2)*), i32 12, i32 4, i1 false)
+// CHECK:  call void @llvm.memcpy.p0i8.p2i8.i32(i8* align 4 %[[arr_i8_ptr]], i8 addrspace(2)* align 4 bitcast ([3 x i32] addrspace(2)* @test.arr to i8 addrspace(2)*), i32 12, i1 false)
 }
diff --git a/test/CoverageMapping/classtemplate.cpp b/test/CoverageMapping/classtemplate.cpp
index 0ccdcb2431b5..0dbb0c0ede84 100644
--- a/test/CoverageMapping/classtemplate.cpp
+++ b/test/CoverageMapping/classtemplate.cpp
@@ -2,6 +2,7 @@
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-CONSTRUCTOR
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-GETTER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-SETTER
+// RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-INIT-LIST
 
 template<class TT>
 class Test {
@@ -44,11 +45,51 @@ template <class T> class Test3 {
   void unmangleable(UninstantiatedClassWithTraits<T> x) {}
 };
 
+void abort() __attribute__((noreturn));
+
+namespace std {
+typedef decltype(sizeof(int)) size_t;
+
+template <typename E> struct initializer_list {
+  const E *p;
+  size_t n;
+  initializer_list(const E *p, size_t n) : p(p), n(n) {}
+};
+
+template <typename F, typename S> struct pair {
+  F f;
+  S s;
+  pair(const F &f, const S &s) : f(f), s(s) {}
+};
+
+struct string {
+  const char *str;
+  string() { abort(); }
+  string(const char *S) : str(S) {}
+  ~string() { abort(); }
+};
+
+template<typename K, typename V>
+struct map {
+  using T = pair<K, V>;
+  map(initializer_list<T> i, const string &s = string()) {}
+  ~map() { abort(); }
+};
+
+}; // namespace std
+
+// CHECK-INIT-LIST-LABEL: _Z5Test4v:
+std::map<int, int> Test4() { // CHECK-INIT-LIST: File 0, [[@LINE]]:28 -> [[@LINE+3]]:2 = #0
+  abort();
+  return std::map<int, int>{{0, 0}}; // CHECK-INIT-LIST-NEXT: [[@LINE]]:3 -> [[@LINE]]:36 = 0
+}
+
 int main() {
   Test<unsigned> t;
   t.set(Test<unsigned>::A, 5.5);
   t.set(Test<unsigned>::T, 5.6);
   t.set(Test<unsigned>::G, 5.7);
   t.set(Test<unsigned>::C, 5.8);
+  Test4();
   return 0;
 }
diff --git a/test/Driver/Inputs/resource_dir/asan_blacklist.txt b/test/Driver/Inputs/CUDA-nolibdevice/usr/local/cuda/bin/ptxas
old mode 100644
new mode 100755
similarity index 100%
rename from test/Driver/Inputs/resource_dir/asan_blacklist.txt
rename to test/Driver/Inputs/CUDA-nolibdevice/usr/local/cuda/bin/ptxas
diff --git a/test/Driver/Inputs/resource_dir/hwasan_blacklist.txt b/test/Driver/Inputs/CUDA-symlinks/opt/cuda/bin/ptxas
old mode 100644
new mode 100755
similarity index 100%
rename from test/Driver/Inputs/resource_dir/hwasan_blacklist.txt
rename to test/Driver/Inputs/CUDA-symlinks/opt/cuda/bin/ptxas
diff --git a/test/Driver/Inputs/resource_dir/ubsan_blacklist.txt b/test/Driver/Inputs/CUDA-symlinks/opt/cuda/include/.keep
similarity index 100%
rename from test/Driver/Inputs/resource_dir/ubsan_blacklist.txt
rename to test/Driver/Inputs/CUDA-symlinks/opt/cuda/include/.keep
diff --git a/test/Driver/Inputs/resource_dir/vtables_blacklist.txt b/test/Driver/Inputs/CUDA-symlinks/opt/cuda/lib/.keep
similarity index 100%
rename from test/Driver/Inputs/resource_dir/vtables_blacklist.txt
rename to test/Driver/Inputs/CUDA-symlinks/opt/cuda/lib/.keep
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crt1.o b/test/Driver/Inputs/CUDA-symlinks/opt/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crt1.o
rename to test/Driver/Inputs/CUDA-symlinks/opt/cuda/nvvm/libdevice/libdevice.compute_30.10.bc
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbegin.o b/test/Driver/Inputs/CUDA-symlinks/opt/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbegin.o
rename to test/Driver/Inputs/CUDA-symlinks/opt/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
diff --git a/test/Driver/Inputs/CUDA-symlinks/usr/bin/ptxas b/test/Driver/Inputs/CUDA-symlinks/usr/bin/ptxas
new file mode 120000
index 000000000000..59eefd95a902
--- /dev/null
+++ b/test/Driver/Inputs/CUDA-symlinks/usr/bin/ptxas
@@ -0,0 +1 @@
+../../opt/cuda/bin/ptxas
\ No newline at end of file
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtend.o b/test/Driver/Inputs/CUDA/usr/local/cuda/bin/ptxas
old mode 100644
new mode 100755
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtend.o
rename to test/Driver/Inputs/CUDA/usr/local/cuda/bin/ptxas
diff --git a/test/Driver/Inputs/config-1.cfg b/test/Driver/Inputs/config-1.cfg
new file mode 100644
index 000000000000..7d1326fe8bc8
--- /dev/null
+++ b/test/Driver/Inputs/config-1.cfg
@@ -0,0 +1,6 @@
+
+# Empty lines and line started with # are ignored
+-Werror
+
+    # Language
+    -std=c99
diff --git a/test/Driver/Inputs/config-2.cfg b/test/Driver/Inputs/config-2.cfg
new file mode 100644
index 000000000000..c803bd2a78f1
--- /dev/null
+++ b/test/Driver/Inputs/config-2.cfg
@@ -0,0 +1,2 @@
+# nested inclusion
+@config-3.cfg
diff --git a/test/Driver/Inputs/config-2a.cfg b/test/Driver/Inputs/config-2a.cfg
new file mode 100644
index 000000000000..ecdc55515682
--- /dev/null
+++ b/test/Driver/Inputs/config-2a.cfg
@@ -0,0 +1,2 @@
+# nested inclusion
+@config/config-4.cfg
diff --git a/test/Driver/Inputs/config-3.cfg b/test/Driver/Inputs/config-3.cfg
new file mode 100644
index 000000000000..d5086a8e4e08
--- /dev/null
+++ b/test/Driver/Inputs/config-3.cfg
@@ -0,0 +1 @@
+-Wundefined-func-template
diff --git a/test/Driver/Inputs/config-4.cfg b/test/Driver/Inputs/config-4.cfg
new file mode 100644
index 000000000000..d8a022ae7592
--- /dev/null
+++ b/test/Driver/Inputs/config-4.cfg
@@ -0,0 +1,2 @@
+-L/usr/local/lib
+-stdlib=libc++
\ No newline at end of file
diff --git a/test/Driver/Inputs/config-5.cfg b/test/Driver/Inputs/config-5.cfg
new file mode 100644
index 000000000000..09787ae6c492
--- /dev/null
+++ b/test/Driver/Inputs/config-5.cfg
@@ -0,0 +1,2 @@
+--serialize-diagnostics diag.ser
+-target
diff --git a/test/Driver/Inputs/config-6.cfg b/test/Driver/Inputs/config-6.cfg
new file mode 100644
index 000000000000..24d93cf8eca6
--- /dev/null
+++ b/test/Driver/Inputs/config-6.cfg
@@ -0,0 +1 @@
+--config config-5
diff --git a/test/Driver/Inputs/config/config-4.cfg b/test/Driver/Inputs/config/config-4.cfg
new file mode 100644
index 000000000000..033e86a52a64
--- /dev/null
+++ b/test/Driver/Inputs/config/config-4.cfg
@@ -0,0 +1 @@
+-isysroot /opt/data
diff --git a/test/Driver/Inputs/config/i386-qqq.cfg b/test/Driver/Inputs/config/i386-qqq.cfg
new file mode 100644
index 000000000000..41e1a9460b63
--- /dev/null
+++ b/test/Driver/Inputs/config/i386-qqq.cfg
@@ -0,0 +1 @@
+-target i386
diff --git a/test/Driver/Inputs/config/i386-qqq3.cfg b/test/Driver/Inputs/config/i386-qqq3.cfg
new file mode 100644
index 000000000000..59928c10cba8
--- /dev/null
+++ b/test/Driver/Inputs/config/i386-qqq3.cfg
@@ -0,0 +1 @@
+-target i383
diff --git a/test/Driver/Inputs/config/x86_64-qqq.cfg b/test/Driver/Inputs/config/x86_64-qqq.cfg
new file mode 100644
index 000000000000..9d8565a336e1
--- /dev/null
+++ b/test/Driver/Inputs/config/x86_64-qqq.cfg
@@ -0,0 +1 @@
+-target x86_64
diff --git a/test/Driver/Inputs/config/x86_64-qqq2.cfg b/test/Driver/Inputs/config/x86_64-qqq2.cfg
new file mode 100644
index 000000000000..9d8565a336e1
--- /dev/null
+++ b/test/Driver/Inputs/config/x86_64-qqq2.cfg
@@ -0,0 +1 @@
+-target x86_64
diff --git a/test/Driver/Inputs/config/x86_64.cfg b/test/Driver/Inputs/config/x86_64.cfg
new file mode 100644
index 000000000000..9d8565a336e1
--- /dev/null
+++ b/test/Driver/Inputs/config/x86_64.cfg
@@ -0,0 +1 @@
+-target x86_64
diff --git a/test/Driver/Inputs/config2/config-4.cfg b/test/Driver/Inputs/config2/config-4.cfg
new file mode 100644
index 000000000000..bd866b6966d0
--- /dev/null
+++ b/test/Driver/Inputs/config2/config-4.cfg
@@ -0,0 +1 @@
+-Wall
diff --git a/test/Driver/Inputs/config2/i386.cfg b/test/Driver/Inputs/config2/i386.cfg
new file mode 100644
index 000000000000..41e1a9460b63
--- /dev/null
+++ b/test/Driver/Inputs/config2/i386.cfg
@@ -0,0 +1 @@
+-target i386
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/crti.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/bin/.keep
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/crti.o
rename to test/Driver/Inputs/multilib_riscv_linux_sdk/bin/.keep
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/crtn.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/include/.keep
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/crtn.o
rename to test/Driver/Inputs/multilib_riscv_linux_sdk/include/.keep
diff --git a/test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/ld.so.1 b/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/crtbegin.o
similarity index 100%
rename from test/Driver/Inputs/sparc-sun-solaris2.11/usr/lib/ld.so.1
rename to test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/crtbegin.o
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32/crtbegin.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32d/crtbegin.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32d/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib64/lp64/crtbegin.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib64/lp64/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib64/lp64d/crtbegin.o b/test/Driver/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib64/lp64d/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/riscv64-unknown-linux-gnu/bin/ld b/test/Driver/Inputs/multilib_riscv_linux_sdk/riscv64-unknown-linux-gnu/bin/ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/test/Driver/Inputs/multilib_riscv_linux_sdk/riscv64-unknown-linux-gnu/bin/ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32d/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32d/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib64/lp64/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib64/lp64/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib64/lp64d/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/lib64/lp64d/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32d/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32d/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib64/lp64/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib64/lp64/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib64/lp64d/.keep b/test/Driver/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib64/lp64d/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/resource_dir/share/asan_blacklist.txt b/test/Driver/Inputs/resource_dir/share/asan_blacklist.txt
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/resource_dir/share/hwasan_blacklist.txt b/test/Driver/Inputs/resource_dir/share/hwasan_blacklist.txt
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/resource_dir/share/ubsan_blacklist.txt b/test/Driver/Inputs/resource_dir/share/ubsan_blacklist.txt
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/resource_dir/share/vtables_blacklist.txt b/test/Driver/Inputs/resource_dir/share/vtables_blacklist.txt
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/include/c++/4.8.2/sparc-sun-solaris2.11/bits/gthr.h b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/include/c++/4.8.2/sparc-sun-solaris2.11/bits/gthr.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/include/c++/4.8.2/typeinfo b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/include/c++/4.8.2/typeinfo
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crt1.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crt1.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbegin.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtend.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crt1.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crt1.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbegin.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtend.o b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/libatomic.a b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/libatomic.a
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/sparcv9/libatomic.a b/test/Driver/Inputs/solaris_sparc_tree/usr/gcc/4.8/lib/sparcv9/libatomic.a
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/crti.o b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/crti.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/crtn.o b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/crtn.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/ld.so.1 b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/ld.so.1
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/crti.o b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/crti.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/crtn.o b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/crtn.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/ld.so.1 b/test/Driver/Inputs/solaris_sparc_tree/usr/lib/sparcv9/ld.so.1
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/include/c++/4.9.4/i386-pc-solaris2.11/bits/gthr.h b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/include/c++/4.9.4/i386-pc-solaris2.11/bits/gthr.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/include/c++/4.9.4/typeinfo b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/include/c++/4.9.4/typeinfo
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/amd64/libatomic.a b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/amd64/libatomic.a
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64/crtbegin.o b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64/crtend.o b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/crtbegin.o b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/crtend.o b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/libatomic.a b/test/Driver/Inputs/solaris_x86_tree/usr/gcc/4.9/lib/libatomic.a
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crt1.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crt1.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crti.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crti.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crtn.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/crtn.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/ld.so.1 b/test/Driver/Inputs/solaris_x86_tree/usr/lib/amd64/ld.so.1
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/crt1.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/crt1.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/crti.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/crti.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/crtn.o b/test/Driver/Inputs/solaris_x86_tree/usr/lib/crtn.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/Inputs/solaris_x86_tree/usr/lib/ld.so.1 b/test/Driver/Inputs/solaris_x86_tree/usr/lib/ld.so.1
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/Driver/ananas.c b/test/Driver/ananas.c
index 2a5b35ed6cec..4edc2a10941c 100644
--- a/test/Driver/ananas.c
+++ b/test/Driver/ananas.c
@@ -7,3 +7,11 @@
 // CHECK-STATIC: crtbegin.o
 // CHECK-STATIC: crtend.o
 // CHECK-STATIC: crtn.o
+
+// RUN: %clang -no-canonical-prefixes -target x86_64-unknown-ananas -shared %s \
+// RUN:   --sysroot=%S/Inputs/ananas-tree -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-SHARED %s
+// CHECK-SHARED: crti.o
+// CHECK-SHARED: crtbeginS.o
+// CHECK-SHARED: crtendS.o
+// CHECK-SHARED: crtn.o
diff --git a/test/Driver/cl-options.c b/test/Driver/cl-options.c
index b8a0166e7ad3..abe55b74946f 100644
--- a/test/Driver/cl-options.c
+++ b/test/Driver/cl-options.c
@@ -344,6 +344,7 @@
 // RUN:    /kernel- \
 // RUN:    /nologo \
 // RUN:    /openmp- \
+// RUN:    /permissive- \
 // RUN:    /RTC1 \
 // RUN:    /sdl \
 // RUN:    /sdl- \
@@ -376,6 +377,8 @@
 // (/Zs is for syntax-only)
 // RUN: %clang_cl /Zs \
 // RUN:     /AIfoo \
+// RUN:     /Bt \
+// RUN:     /Bt+ \
 // RUN:     /clr:pure \
 // RUN:     /docname \
 // RUN:     /EHsc \
diff --git a/test/Driver/cl-pch-search.cpp b/test/Driver/cl-pch-search.cpp
index ca62668ef13b..5c072c3b7f78 100644
--- a/test/Driver/cl-pch-search.cpp
+++ b/test/Driver/cl-pch-search.cpp
@@ -2,5 +2,5 @@
 // command-line option, e.g. on Mac where %s is commonly under /Users.
 
 // REQUIRES: x86-registered-target
-// Check that pchfile.h next to to pchfile.cc is found correctly.
-// RUN: %clang_cl -Werror --target=x86_64 /Ycpchfile.h /FIpchfile.h /c /Fo%t.obj /Fp%t.pch -- %S/Inputs/pchfile.cpp
+// Check that pchfile.h next to pchfile.cc is found correctly.
+// RUN: %clang_cl -Werror --target=x86_64-windows /Ycpchfile.h /FIpchfile.h /c /Fo%t.obj /Fp%t.pch -- %S/Inputs/pchfile.cpp
diff --git a/test/Driver/cl-pch-showincludes.cpp b/test/Driver/cl-pch-showincludes.cpp
index 7e0e1097361d..75a634bc125c 100644
--- a/test/Driver/cl-pch-showincludes.cpp
+++ b/test/Driver/cl-pch-showincludes.cpp
@@ -8,14 +8,14 @@
 
 // When building the pch, header1.h (included by header2.h), header2.h (the pch
 // input itself) and header3.h (included directly, above) should be printed.
-// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Ycheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN: %clang_cl -Werror --target=x86_64-windows /showIncludes /I%S/Inputs /Ycheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
 // RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YC %s
 // CHECK-YC: Note: including file: {{[^ ]*header2.h}}
 // CHECK-YC: Note: including file:  {{[^ ]*header1.h}}
 // CHECK-YC: Note: including file: {{[^ ]*header3.h}}
 
 // When using the pch, only the direct include is printed.
-// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Yuheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN: %clang_cl -Werror --target=x86_64-windows /showIncludes /I%S/Inputs /Yuheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
 // RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YU %s
 // CHECK-YU-NOT: Note: including file: {{.*pch}}
 // CHECK-YU-NOT: Note: including file: {{.*header1.h}}
@@ -23,7 +23,7 @@
 // CHECK-YU: Note: including file: {{[^ ]*header3.h}}
 
 // When not using pch at all, all the /FI files are printed.
-// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /FIheader2.h /c /Fo%t -- %s \
+// RUN: %clang_cl -Werror --target=x86_64-windows /showIncludes /I%S/Inputs /FIheader2.h /c /Fo%t -- %s \
 // RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-FI %s
 // CHECK-FI: Note: including file: {{[^ ]*header2.h}}
 // CHECK-FI: Note: including file:  {{[^ ]*header1.h}}
@@ -32,7 +32,7 @@
 // Also check that /FI arguments before the /Yc / /Yu flags are printed right.
 
 // /FI flags before the /Yc arg should be printed, /FI flags after it shouldn't.
-// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Ycheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN: %clang_cl -Werror --target=x86_64-windows /showIncludes /I%S/Inputs /Ycheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
 // RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YCFI %s
 // CHECK-YCFI: Note: including file: {{[^ ]*header0.h}}
 // CHECK-YCFI: Note: including file: {{[^ ]*header2.h}}
@@ -40,7 +40,7 @@
 // CHECK-YCFI: Note: including file: {{[^ ]*header4.h}}
 // CHECK-YCFI: Note: including file: {{[^ ]*header3.h}}
 
-// RUN: %clang_cl -Werror --target=x86_64 /showIncludes /I%S/Inputs /Yuheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN: %clang_cl -Werror --target=x86_64-windows /showIncludes /I%S/Inputs /Yuheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
 // RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YUFI %s
 // CHECK-YUFI-NOT: Note: including file: {{.*pch}}
 // CHECK-YUFI-NOT: Note: including file: {{.*header0.h}}
diff --git a/test/Driver/cl-x86-flags.c b/test/Driver/cl-x86-flags.c
index 32cd072ec1ef..595959b48828 100644
--- a/test/Driver/cl-x86-flags.c
+++ b/test/Driver/cl-x86-flags.c
@@ -1,88 +1,132 @@
 // REQUIRES: x86-registered-target
 
+// expected-no-diagnostics
+
 // We support -m32 and -m64.  We support all x86 CPU feature flags in gcc's -m
 // flag space.
 // RUN: %clang_cl /Zs /WX -m32 -m64 -msse3 -msse4.1 -mavx -mno-avx \
 // RUN:     --target=i386-pc-win32 -### -- 2>&1 %s | FileCheck -check-prefix=MFLAGS %s
 // MFLAGS-NOT: argument unused during compilation
 
-// RUN: %clang_cl -m32 -arch:IA32 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=IA32 %s
-// IA32: "-target-cpu" "i386"
-// IA32-NOT: -target-feature
-// IA32-NOT: argument unused during compilation
+// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_IA32 -- %s
+#if defined(TEST_32_ARCH_IA32)
+#if _M_IX86_FP || __AVX__ || __AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m32 -arch:ia32 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=ia32 %s
+// arch: args are case-sensitive.
+// RUN: %clang_cl -m32 -arch:ia32 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=ia32 %s
 // ia32: argument unused during compilation
-// ia32-NOT: -target-feature
 
-// RUN: %clang_cl -m64 -arch:IA32 --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=IA3264 %s
+// RUN: %clang_cl -m64 -arch:IA32 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=IA3264 %s
 // IA3264: argument unused during compilation
-// IA3264-NOT: -target-feature
 
-// RUN: %clang_cl -m32 -arch:SSE --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=SSE %s
-// SSE: "-target-cpu" "pentium3"
-// SSE: -target-feature
-// SSE: +sse
-// SSE-NOT: argument unused during compilation
+// RUN: %clang_cl -m32 -arch:SSE --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_SSE -- %s
+#if defined(TEST_32_ARCH_SSE)
+#if _M_IX86_FP != 1 || __AVX__ || __AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m32 -arch:sse --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=sse %s
+// RUN: %clang_cl -m32 -arch:sse --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse %s
 // sse: argument unused during compilation
-// sse-NOT: -target-feature
 
-// RUN: %clang_cl -m32 -arch:SSE2 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=SSE2 %s
-// SSE2: "-target-cpu" "pentium4"
-// SSE2: -target-feature
-// SSE2: +sse2
-// SSE2-NOT: argument unused during compilation
+// RUN: %clang_cl -m32 -arch:SSE2 --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_SSE2 -- %s
+#if defined(TEST_32_ARCH_SSE2)
+#if _M_IX86_FP != 2 || __AVX__ || __AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m32 -arch:sse2 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=sse %s
+// RUN: %clang_cl -m32 -arch:sse2 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse %s
 // sse2: argument unused during compilation
-// sse2-NOT: -target-feature
 
-// RUN: %clang_cl -m64 -arch:SSE --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=SSE64 %s
+// RUN: %clang_cl -m64 -arch:SSE --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=SSE64 %s
 // SSE64: argument unused during compilation
-// SSE64-NOT: -target-feature
-// SSE64-NOT: pentium3
 
-// RUN: %clang_cl -m64 -arch:SSE2 --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=SSE264 %s
+// RUN: %clang_cl -m64 -arch:SSE2 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=SSE264 %s
 // SSE264: argument unused during compilation
-// SSE264-NOT: -target-feature
 
-// RUN: %clang_cl -m32 -arch:AVX --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=AVX %s
-// AVX: "-target-cpu" "sandybridge"
-// AVX: -target-feature
-// AVX: +avx
+// RUN: %clang_cl -m32 -arch:AVX --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX -- %s
+#if defined(TEST_32_ARCH_AVX)
+#if _M_IX86_FP != 2 || !__AVX__ || __AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m32 -arch:avx --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=avx %s
+// RUN: %clang_cl -m32 -arch:avx --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx %s
 // avx: argument unused during compilation
-// avx-NOT: -target-feature
 
-// RUN: %clang_cl -m32 -arch:AVX2 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=AVX2 %s
-// AVX2: "-target-cpu" "haswell"
-// AVX2: -target-feature
-// AVX2: +avx2
+// RUN: %clang_cl -m32 -arch:AVX2 --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX2 -- %s
+#if defined(TEST_32_ARCH_AVX2)
+#if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m32 -arch:avx2 --target=i386 -### -- 2>&1 %s | FileCheck -check-prefix=avx2 %s
+// RUN: %clang_cl -m32 -arch:avx2 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2 %s
 // avx2: argument unused during compilation
-// avx2-NOT: -target-feature
-
-// RUN: %clang_cl -m64 -arch:AVX --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=AVX64 %s
-// AVX64: "-target-cpu" "sandybridge"
-// AVX64: -target-feature
-// AVX64: +avx
 
-// RUN: %clang_cl -m64 -arch:avx --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=avx64 %s
+// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX512F -- %s
+#if defined(TEST_32_ARCH_AVX512F)
+#if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || !__AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
+
+// RUN: %clang_cl -m32 -arch:avx512f --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx512f %s
+// avx512f: argument unused during compilation
+
+// RUN: %clang_cl -m32 -arch:AVX512 --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX512 -- %s
+#if defined(TEST_32_ARCH_AVX512)
+#if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || !__AVX512F__  || !__AVX512BW__
+#error fail
+#endif
+#endif
+
+// RUN: %clang_cl -m32 -arch:avx512 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx512 %s
+// avx512: argument unused during compilation
+
+// RUN: %clang_cl -m64 -arch:AVX --target=x86_64-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX -- %s
+#if defined(TEST_64_ARCH_AVX)
+#if _M_IX86_FP || !__AVX__ || __AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
+
+// RUN: %clang_cl -m64 -arch:avx --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx64 %s
 // avx64: argument unused during compilation
-// avx64-NOT: -target-feature
 
-// RUN: %clang_cl -m64 -arch:AVX2 --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=AVX264 %s
-// AVX264: "-target-cpu" "haswell"
-// AVX264: -target-feature
-// AVX264: +avx2
+// RUN: %clang_cl -m64 -arch:AVX2 --target=x86_64-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX2 -- %s
+#if defined(TEST_64_ARCH_AVX2)
+#if _M_IX86_FP || !__AVX__ || !__AVX2__ || __AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
 
-// RUN: %clang_cl -m64 -arch:avx2 --target=x86_64 -### -- 2>&1 %s | FileCheck -check-prefix=avx264 %s
+// RUN: %clang_cl -m64 -arch:avx2 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx264 %s
 // avx264: argument unused during compilation
-// avx264-NOT: -target-feature
+
+// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX512F -- %s
+#if defined(TEST_64_ARCH_AVX512F)
+#if _M_IX86_FP || !__AVX__ || !__AVX2__ || !__AVX512F__  || __AVX512BW__
+#error fail
+#endif
+#endif
+
+// RUN: %clang_cl -m64 -arch:avx512f --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx512f64 %s
+// avx512f64: argument unused during compilation
+
+// RUN: %clang_cl -m64 -arch:AVX512 --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX512 -- %s
+#if defined(TEST_64_ARCH_AVX512)
+#if _M_IX86_FP || !__AVX__ || !__AVX2__ || !__AVX512F__  || !__AVX512BW__
+#error fail
+#endif
+#endif
+
+// RUN: %clang_cl -m64 -arch:avx512 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx51264 %s
+// avx51264: argument unused during compilation
 
 void f() {
 }
diff --git a/test/Driver/clang_f_opts.c b/test/Driver/clang_f_opts.c
index b22f74fb5557..5755c5d36f50 100644
--- a/test/Driver/clang_f_opts.c
+++ b/test/Driver/clang_f_opts.c
@@ -503,3 +503,17 @@
 // CHECK-WINDOWS-ISO10646: "-fwchar-type=int"
 // CHECK-WINDOWS-ISO10646: "-fsigned-wchar"
 
+// RUN: %clang -### -S -fcf-protection %s 2>&1 | FileCheck -check-prefix=CHECK-CF-PROTECTION-FULL %s
+// RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-CF-PROTECTION-FULL %s
+// RUN: %clang -### -S -fcf-protection=full %s 2>&1 | FileCheck -check-prefix=CHECK-CF-PROTECTION-FULL %s
+// RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-CF-PROTECTION-FULL %s
+// CHECK-CF-PROTECTION-FULL: -fcf-protection=full
+// CHECK-NO-CF-PROTECTION-FULL-NOT: -fcf-protection=full
+// RUN: %clang -### -S -fcf-protection=return %s 2>&1 | FileCheck -check-prefix=CHECK-CF-PROTECTION-RETURN %s
+// RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-CF-PROTECTION-RETURN %s
+// CHECK-CF-PROTECTION-RETURN: -fcf-protection=return
+// CHECK-NO-CF-PROTECTION-RETURN-NOT: -fcf-protection=return
+// RUN: %clang -### -S -fcf-protection=branch %s 2>&1 | FileCheck -check-prefix=CHECK-CF-PROTECTION-BRANCH %s
+// RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-CF-PROTECTION-BRANCH %s
+// CHECK-CF-PROTECTION-BRANCH: -fcf-protection=branch
+// CHECK-NO-CF-PROTECTION-BRANCH-NOT: -fcf-protection=branch
diff --git a/test/Driver/config-file-errs.c b/test/Driver/config-file-errs.c
new file mode 100644
index 000000000000..8db2ea439298
--- /dev/null
+++ b/test/Driver/config-file-errs.c
@@ -0,0 +1,54 @@
+//--- No more than one '--config' may be specified.
+//
+// RUN: not %clang --config 1.cfg --config 2.cfg 2>&1 | FileCheck %s -check-prefix CHECK-DUPLICATE
+// CHECK-DUPLICATE: no more than one option '--config' is allowed
+
+
+//--- '--config' must be followed by config file name.
+//
+// RUN: not %clang --config 2>&1 | FileCheck %s -check-prefix CHECK-MISSING-FILE
+// CHECK-MISSING-FILE: argument to '--config' is missing (expected 1 value)
+
+
+//--- '--config' must not be found in config files.
+//
+// RUN: not %clang --config %S/Inputs/config-6.cfg 2>&1 | FileCheck %s -check-prefix CHECK-NESTED
+// CHECK-NESTED: option '--config' is not allowed inside configuration file
+
+
+//--- Argument of '--config' must be existing file, if it is specified by path.
+//
+// RUN: not %clang --config somewhere/nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NONEXISTENT
+// CHECK-NONEXISTENT: configuration file '{{.*}}somewhere/nonexistent-config-file' does not exist
+
+
+//--- Argument of '--config' must exist somewhere in well-known directories, if it is specified by bare name.
+//
+// RUN: not %clang --config-system-dir= --config-user-dir= --config nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NOTFOUND0
+// CHECK-NOTFOUND0: configuration file 'nonexistent-config-file.cfg' cannot be found
+// CHECK-NOTFOUND0-NEXT: was searched for in the directory:
+// CHECK-NOTFOUND0-NOT: was searched for in the directory:
+//
+// RUN: not %clang --config-system-dir= --config-user-dir=%S/Inputs/config2 --config nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NOTFOUND1
+// CHECK-NOTFOUND1: configuration file 'nonexistent-config-file.cfg' cannot be found
+// CHECK-NOTFOUND1-NEXT: was searched for in the directory: {{.*}}/Inputs/config2
+// CHECK-NOTFOUND1-NEXT: was searched for in the directory:
+// CHECK-NOTFOUND1-NOT: was searched for in the directory:
+//
+// RUN: not %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NOTFOUND2
+// CHECK-NOTFOUND2: configuration file 'nonexistent-config-file.cfg' cannot be found
+// CHECK-NOTFOUND2-NEXT: was searched for in the directory: {{.*}}/Inputs/config
+// CHECK-NOTFOUND2-NEXT: was searched for in the directory:
+// CHECK-NOTFOUND2-NOT: was searched for in the directory:
+//
+// RUN: not %clang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 --config nonexistent-config-file 2>&1 | FileCheck %s -check-prefix CHECK-NOTFOUND3
+// CHECK-NOTFOUND3: configuration file 'nonexistent-config-file.cfg' cannot be found
+// CHECK-NOTFOUND3-NEXT: was searched for in the directory: {{.*}}/Inputs/config2
+// CHECK-NOTFOUND3-NEXT: was searched for in the directory: {{.*}}/Inputs/config
+// CHECK-NOTFOUND3-NEXT: was searched for in the directory:
+
+
+//--- Argument in config file cannot cross the file boundary
+//
+// RUN: not %clang --config %S/Inputs/config-5.cfg x86_64-unknown-linux-gnu -c %s 2>&1 | FileCheck %s -check-prefix CHECK-CROSS
+// CHECK-CROSS: error: argument to '-target' is missing
diff --git a/test/Driver/config-file.c b/test/Driver/config-file.c
new file mode 100644
index 000000000000..f9e6ce62cd58
--- /dev/null
+++ b/test/Driver/config-file.c
@@ -0,0 +1,72 @@
+//--- Config file search directories
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-DIRS
+// CHECK-DIRS: System configuration file directory: {{.*}}/Inputs/config
+// CHECK-DIRS: User configuration file directory: {{.*}}/Inputs/config2
+
+
+//--- Config file (full path) in output of -###
+//
+// RUN: %clang --config %S/Inputs/config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH
+// CHECK-HHH: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+// CHECK-HHH: -Werror
+// CHECK-HHH: -std=c99
+
+
+//--- Config file (full path) in output of -v
+//
+// RUN: %clang --config %S/Inputs/config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V
+// CHECK-V: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+// CHECK-V: -Werror
+// CHECK-V: -std=c99
+
+
+//--- Config file in output of -###
+//
+// RUN: %clang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH2
+// CHECK-HHH2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+// CHECK-HHH2: -Werror
+// CHECK-HHH2: -std=c99
+
+
+//--- Config file in output of -v
+//
+// RUN: %clang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V2
+// CHECK-V2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+// CHECK-V2: -Werror
+// CHECK-V2: -std=c99
+
+
+//--- Nested config files
+//
+// RUN: %clang --config-system-dir=%S/Inputs --config-user-dir= --config config-2.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED
+// CHECK-NESTED: Configuration file: {{.*}}Inputs{{.}}config-2.cfg
+// CHECK-NESTED: -Wundefined-func-template
+
+// RUN: %clang --config-system-dir=%S/Inputs --config-user-dir= --config config-2.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2
+// CHECK-NESTED2: Configuration file: {{.*}}Inputs{{.}}config-2.cfg
+// CHECK-NESTED2: -Wundefined-func-template
+
+
+// RUN: %clang --config %S/Inputs/config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTEDa
+// CHECK-NESTEDa: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+// CHECK-NESTEDa: -isysroot
+// CHECK-NESTEDa-SAME: /opt/data
+
+// RUN: %clang --config-system-dir=%S/Inputs --config-user-dir= --config config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2a
+// CHECK-NESTED2a: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+// CHECK-NESTED2a: -isysroot
+// CHECK-NESTED2a-SAME: /opt/data
+
+
+//--- Unused options in config file do not produce warnings
+//
+// RUN: %clang --config %S/Inputs/config-4.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-UNUSED
+// CHECK-UNUSED-NOT: argument unused during compilation:
+
+
+//--- User directory is searched first.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 --config config-4 -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-PRECEDENCE
+// CHECK-PRECEDENCE: Configuration file: {{.*}}Inputs{{.}}config2{{.}}config-4.cfg
+// CHECK-PRECEDENCE: -Wall
diff --git a/test/Driver/config-file2.c b/test/Driver/config-file2.c
new file mode 100644
index 000000000000..ff19a93697e9
--- /dev/null
+++ b/test/Driver/config-file2.c
@@ -0,0 +1,51 @@
+// REQUIRES: x86-registered-target
+
+//--- Invocation `clang --config x86_64-qqq -m32` loads `i386-qqq.cfg` if the latter exists.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq -m32 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD
+// CHECK-RELOAD: Target: i386
+// CHECK-RELOAD: Configuration file: {{.*}}Inputs{{.}}config{{.}}i386-qqq.cfg
+
+
+//--- Invocation `clang --config x86_64-qqq2 -m32` loads `i386.cfg` if the latter exists in another search directory.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 --config x86_64-qqq2 -m32 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD1
+// CHECK-RELOAD1: Target: i386
+// CHECK-RELOAD1: Configuration file: {{.*}}Inputs{{.}}config2{{.}}i386.cfg
+
+
+//--- Invocation `clang --config x86_64-qqq2 -m32` loads `x86_64-qqq2.cfg` if `i386-qqq2.cfg` and `i386.cfg` do not exist.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq2 -m32 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD2
+// note: target is overriden due to -m32
+// CHECK-RELOAD2: Target: i386
+// CHECK-RELOAD2: Configuration file: {{.*}}Inputs{{.}}config{{.}}x86_64-qqq2.cfg
+
+
+//--- Invocation `clang --config i386-qqq3 -m64` loads `x86_64.cfg` if `x86_64-qqq3.cfg` does not exist.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config i386-qqq3 -m64 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD3
+// CHECK-RELOAD3: Target: x86_64
+// CHECK-RELOAD3: Configuration file: {{.*}}Inputs{{.}}config{{.}}x86_64.cfg
+
+
+//--- Invocation `clang --config x86_64-qqq -target i386` loads `i386-qqq.cfg` if the latter exists.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq -target i386 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD4
+// CHECK-RELOAD4: Target: i386
+// CHECK-RELOAD4: Configuration file: {{.*}}Inputs{{.}}config{{.}}i386-qqq.cfg
+
+
+//--- Invocation `clang --config x86_64-qqq2 -target i386` loads `x86_64-qqq2.cfg` if `i386-qqq2.cfg` and `i386.cfg` do not exist.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq2 -target i386 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD5
+// note: target is overriden due to -target i386
+// CHECK-RELOAD5: Target: i386
+// CHECK-RELOAD5: Configuration file: {{.*}}Inputs{{.}}config{{.}}x86_64-qqq2.cfg
+
+
+//--- Invocation `clang --config x86_64-qqq -target i386 -m64` loads `x86_64-qqq.cfg`.
+//
+// RUN: %clang --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq -target i386 -m64 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD6
+// CHECK-RELOAD6: Target: x86_64
+// CHECK-RELOAD6: Configuration file: {{.*}}Inputs{{.}}config{{.}}x86_64-qqq.cfg
diff --git a/test/Driver/config-file3.c b/test/Driver/config-file3.c
new file mode 100644
index 000000000000..c1b523cbf97e
--- /dev/null
+++ b/test/Driver/config-file3.c
@@ -0,0 +1,98 @@
+// REQUIRES: shell
+// REQUIRES: x86-registered-target
+
+//--- If config file is specified by relative path (workdir/cfg-s2), it is searched for by that path.
+//
+// RUN: mkdir -p %T/workdir
+// RUN: echo "@subdir/cfg-s2" > %T/workdir/cfg-1
+// RUN: mkdir -p %T/workdir/subdir
+// RUN: echo "-Wundefined-var-template" > %T/workdir/subdir/cfg-s2
+//
+// RUN: ( cd %T && %clang --config workdir/cfg-1 -c %s -### 2>&1 | FileCheck %s -check-prefix CHECK-REL )
+//
+// CHECK-REL: Configuration file: {{.*}}/workdir/cfg-1
+// CHECK-REL: -Wundefined-var-template
+
+
+//--- Invocation qqq-clang-g++ tries to find config file qqq-clang-g++.cfg first.
+//
+// RUN: mkdir -p %T/testdmode
+// RUN: [ ! -s %T/testdmode/qqq-clang-g++ ] || rm %T/testdmode/qqq-clang-g++
+// RUN: ln -s %clang %T/testdmode/qqq-clang-g++
+// RUN: echo "-Wundefined-func-template" > %T/testdmode/qqq-clang-g++.cfg
+// RUN: echo "-Werror" > %T/testdmode/qqq.cfg
+// RUN: %T/testdmode/qqq-clang-g++ --config-system-dir= --config-user-dir= -c -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix FULL-NAME
+//
+// FULL-NAME: Configuration file: {{.*}}/testdmode/qqq-clang-g++.cfg
+// FULL-NAME: -Wundefined-func-template
+// FULL-NAME-NOT: -Werror
+//
+//--- File specified by --config overrides config inferred from clang executable.
+//
+// RUN: %T/testdmode/qqq-clang-g++ --config-system-dir=%S/Inputs/config --config-user-dir= --config i386-qqq -c -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-EXPLICIT
+//
+// CHECK-EXPLICIT: Configuration file: {{.*}}/Inputs/config/i386-qqq.cfg
+//
+//--- Invocation qqq-clang-g++ tries to find config file qqq.cfg if qqq-clang-g++.cfg is not found.
+//
+// RUN: rm %T/testdmode/qqq-clang-g++.cfg
+// RUN: %T/testdmode/qqq-clang-g++ --config-system-dir= --config-user-dir= -c -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix SHORT-NAME
+//
+// SHORT-NAME: Configuration file: {{.*}}/testdmode/qqq.cfg
+// SHORT-NAME: -Werror
+// SHORT-NAME-NOT: -Wundefined-func-template
+
+
+//--- Config files are searched for in binary directory as well.
+//
+// RUN: mkdir -p %T/testbin
+// RUN: [ ! -s %T/testbin/clang ] || rm %T/testbin/clang
+// RUN: ln -s %clang %T/testbin/clang
+// RUN: echo "-Werror" > %T/testbin/aaa.cfg
+// RUN: %T/testbin/clang --config-system-dir= --config-user-dir= --config aaa.cfg -c -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-BIN
+//
+// CHECK-BIN: Configuration file: {{.*}}/testbin/aaa.cfg
+// CHECK-BIN: -Werror
+
+
+//--- If command line contains options that change triple (for instance, -m32), clang tries
+//    reloading config file.
+
+//--- When reloading config file, x86_64-clang-g++ tries to find config i386-clang-g++.cfg first.
+//
+// RUN: mkdir -p %T/testreload
+// RUN: [ ! -s %T/testreload/x86_64-clang-g++ ] || rm %T/testreload/x86_64-clang-g++
+// RUN: ln -s %clang %T/testreload/x86_64-clang-g++
+// RUN: echo "-Wundefined-func-template" > %T/testreload/i386-clang-g++.cfg
+// RUN: echo "-Werror" > %T/testreload/i386.cfg
+// RUN: %T/testreload/x86_64-clang-g++ --config-system-dir= --config-user-dir= -c -m32 -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD
+//
+// CHECK-RELOAD: Configuration file: {{.*}}/testreload/i386-clang-g++.cfg
+// CHECK-RELOAD: -Wundefined-func-template
+// CHECK-RELOAD-NOT: -Werror
+
+//--- If config file is specified by --config and its name does not start with architecture, it is used without reloading.
+//
+// RUN: %T/testreload/x86_64-clang-g++ --config-system-dir=%S/Inputs --config-user-dir= --config config-3 -c -m32 -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD1a
+//
+// CHECK-RELOAD1a: Configuration file: {{.*}}/Inputs/config-3.cfg
+//
+// RUN: %T/testreload/x86_64-clang-g++ --config-system-dir=%S/Inputs --config-user-dir= --config config-3 -c -target i386 -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD1b
+//
+// CHECK-RELOAD1b: Configuration file: {{.*}}/Inputs/config-3.cfg
+
+//--- If config file is specified by --config and its name starts with architecture, it is reloaded.
+//
+// RUN: %T/testreload/x86_64-clang-g++ --config-system-dir=%S/Inputs/config --config-user-dir= --config x86_64-qqq -c -m32 -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD1c
+//
+// CHECK-RELOAD1c: Configuration file: {{.*}}/Inputs/config/i386-qqq.cfg
+
+//--- x86_64-clang-g++ tries to find config i386.cfg if i386-clang-g++.cfg is not found.
+//
+// RUN: rm %T/testreload/i386-clang-g++.cfg
+// RUN: %T/testreload/x86_64-clang-g++ --config-system-dir= --config-user-dir= -c -m32 -no-canonical-prefixes %s -### 2>&1 | FileCheck %s -check-prefix CHECK-RELOAD1d
+//
+// CHECK-RELOAD1d: Configuration file: {{.*}}/testreload/i386.cfg
+// CHECK-RELOAD1d: -Werror
+// CHECK-RELOAD1d-NOT: -Wundefined-func-template
+
diff --git a/test/Driver/cuda-detect-path.cu b/test/Driver/cuda-detect-path.cu
new file mode 100644
index 000000000000..61b9400760c2
--- /dev/null
+++ b/test/Driver/cuda-detect-path.cu
@@ -0,0 +1,83 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+// This tests uses the PATH environment variable.
+// REQUIRES: !system-windows
+
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s
+
+
+// Check that we follow ptxas binaries that are symlinks.
+// RUN: env PATH=%S/Inputs/CUDA-symlinks/usr/bin \
+// RUN:    %clang -v --target=i386-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix SYMLINKS
+// RUN: env PATH=%S/Inputs/CUDA-symlinks/usr/bin \
+// RUN:    %clang -v --target=i386-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix SYMLINKS
+// RUN: env PATH=%S/Inputs/CUDA-symlinks/usr/bin \
+// RUN:    %clang -v --target=x86_64-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix SYMLINKS
+// RUN: env PATH=%S/Inputs/CUDA-symlinks/usr/bin \
+// RUN:    %clang -v --target=x86_64-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix SYMLINKS
+
+
+// We only take a CUDA installation from PATH if it contains libdevice.
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-unknown-linux --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-apple-macosx --sysroot=%S/no-cuda-there \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+
+// We even require libdevice if -nocudalib is passed to avoid false positives
+// if the distribution merges CUDA into /usr and ptxas ends up /usr/bin.
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-unknown-linux --sysroot=%S/no-cuda-there -nocudalib \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-apple-macosx --sysroot=%S/no-cuda-there -nocudalib \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-unknown-linux --sysroot=%S/no-cuda-there -nocudalib \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA-nolibdevice/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-apple-macosx --sysroot=%S/no-cuda-there -nocudalib \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+
+
+// Check that the CUDA installation in PATH is not taken when passing
+// the option --cuda-path-ignore-env.
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-unknown-linux --sysroot=%S/no-cuda-there --cuda-path-ignore-env \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=i386-apple-macosx --sysroot=%S/no-cuda-there --cuda-path-ignore-env \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-unknown-linux --sysroot=%S/no-cuda-there --cuda-path-ignore-env \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+// RUN: env PATH=%S/Inputs/CUDA/usr/local/cuda/bin \
+// RUN:    %clang -v --target=x86_64-apple-macosx --sysroot=%S/no-cuda-there --cuda-path-ignore-env \
+// RUN:    2>&1 | FileCheck %s --check-prefix NOCUDA
+
+// CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda
+// SYMLINKS: Found CUDA installation: {{.*}}/Inputs/CUDA-symlinks/opt/cuda
+// NOCUDA-NOT: Found CUDA installation:
diff --git a/test/Driver/cuda-detect.cu b/test/Driver/cuda-detect.cu
index 1db0cfbfa3ce..f086ba4c42e3 100644
--- a/test/Driver/cuda-detect.cu
+++ b/test/Driver/cuda-detect.cu
@@ -4,9 +4,14 @@
 //
 // Check that we properly detect CUDA installation.
 // RUN: %clang -v --target=i386-unknown-linux \
-// RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/no-cuda-there --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
 // RUN: %clang -v --target=i386-apple-macosx \
-// RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/no-cuda-there --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=x86_64-unknown-linux \
+// RUN:   --sysroot=%S/no-cuda-there --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=x86_64-apple-macosx \
+// RUN:   --sysroot=%S/no-cuda-there --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
+
 
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
@@ -20,15 +25,23 @@
 
 // Check that we don't find a CUDA installation without libdevice ...
 // RUN: %clang -v --target=i386-unknown-linux \
-// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
 // RUN: %clang -v --target=i386-apple-macosx \
-// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=x86_64-unknown-linux \
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=x84_64-apple-macosx \
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NOCUDA
 
 // ... unless the user doesn't need libdevice
 // RUN: %clang -v --target=i386-unknown-linux -nocudalib \
-// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
 // RUN: %clang -v --target=i386-apple-macosx -nocudalib \
-// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
+// RUN: %clang -v --target=x86_64-unknown-linux -nocudalib \
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
+// RUN: %clang -v --target=x86_64-apple-macosx -nocudalib \
+// RUN:   --sysroot=%S/Inputs/CUDA-nolibdevice --cuda-path-ignore-env 2>&1 | FileCheck %s -check-prefix NO-LIBDEVICE
 
 
 // Make sure we map libdevice bitcode files to proper GPUs. These
diff --git a/test/Driver/cuda-no-pgo-or-coverage.cu b/test/Driver/cuda-no-pgo-or-coverage.cu
new file mode 100644
index 000000000000..a2cb610f7527
--- /dev/null
+++ b/test/Driver/cuda-no-pgo-or-coverage.cu
@@ -0,0 +1,34 @@
+// Check that profiling/coverage arguments doen't get passed down to device-side
+// compilation.
+//
+// REQUIRES: clang-driver
+//
+// XRUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
+// XRUN:   -fprofile-generate %s 2>&1 | \
+// XRUN: FileCheck --check-prefixes=CHECK,PROF %s
+//
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
+// RUN:   -fprofile-instr-generate %s 2>&1 | \
+// RUN: FileCheck --check-prefixes=CHECK,PROF %s
+//
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
+// RUN:   -coverage %s 2>&1 | \
+// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
+//
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 \
+// RUN:   -ftest-coverage %s 2>&1 | \
+// RUN: FileCheck --check-prefixes=CHECK,GCOV %s
+//
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20   \
+// RUN:   -fprofile-instr-generate -fcoverage-mapping %s 2>&1 | \
+// RUN: FileCheck --check-prefixes=CHECK,PROF,GCOV %s
+//
+//
+// CHECK-NOT: error: unsupported option '-fprofile
+// CHECK-NOT: error: invalid argument
+// CHECK-DAG: "-fcuda-is-device"
+// CHECK-NOT: "-f{{[^"]*coverage.*}}"
+// CHECK-NOT: "-fprofile{{[^"]*}}"
+// CHECK: "-triple" "x86_64--linux-gnu"
+// PROF-DAG: "-fprofile{{.*}}"
+// GCOV-DAG: "-f{{(coverage|emit-coverage).*}}"
diff --git a/test/Driver/cuda-not-found.cu b/test/Driver/cuda-not-found.cu
index b63623ae56c1..48cf19a424cc 100644
--- a/test/Driver/cuda-not-found.cu
+++ b/test/Driver/cuda-not-found.cu
@@ -3,10 +3,10 @@
 // Check that we raise an error if we're trying to compile CUDA code but can't
 // find a CUDA install, unless -nocudainc was passed.
 
-// RUN: %clang -### --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
+// RUN: %clang -### --sysroot=%s/no-cuda-there --cuda-path-ignore-env %s 2>&1 | FileCheck %s --check-prefix ERR
 // RUN: %clang -### --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
 // ERR: cannot find CUDA installation
 
-// RUN: %clang -### -nocudainc --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
+// RUN: %clang -### -nocudainc --sysroot=%s/no-cuda-there --cuda-path-ignore-env %s 2>&1 | FileCheck %s --check-prefix OK
 // RUN: %clang -### -nocudainc --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
 // OK-NOT: cannot find CUDA installation
diff --git a/test/Driver/cuda-version-check.cu b/test/Driver/cuda-version-check.cu
index 46ca72f2ea0c..2fdd9c4afbe4 100644
--- a/test/Driver/cuda-version-check.cu
+++ b/test/Driver/cuda-version-check.cu
@@ -2,40 +2,40 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=OK
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=OK
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=OK
 
 // The installation at Inputs/CUDA is CUDA 7.0, which doesn't support sm_60.
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=ERR_SM60
 
 // This should only complain about sm_60, not sm_35.
 // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \
-// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=OK_SM35
 
 // We should get two errors here, one for sm_60 and one for sm_61.
 // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \
-// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=ERR_SM61
 
 // We should still get an error if we pass -nocudainc, because this compilation
 // would invoke ptxas, and we do a version check on that, too.
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 -nocudainc --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=ERR_SM60
 
 // If with -nocudainc and -E, we don't touch the CUDA install, so we
 // shouldn't get an error.
 // RUN: %clang --target=x86_64-linux -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \
-// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=OK
 
 // --no-cuda-version-check should suppress all of these errors.
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 \
 // RUN:    --no-cuda-version-check %s | \
 // RUN:    FileCheck %s --check-prefix=OK
 
@@ -43,9 +43,9 @@
 // therefore we should not get an error in host-only mode. We use the -S here
 // to avoid the error being produced in case by the assembler tool, which does
 // the same check.
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda -S 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=OK
-// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \
+// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda -S 2>&1 %s | \
 // RUN:    FileCheck %s --check-prefix=ERR_SM60
 
 // OK-NOT: error: GPU arch
diff --git a/test/Driver/frame-pointer.c b/test/Driver/frame-pointer.c
index ecb16af3cf80..cf5c07d542f0 100644
--- a/test/Driver/frame-pointer.c
+++ b/test/Driver/frame-pointer.c
@@ -33,6 +33,18 @@
 // RUN: %clang -target mips64el-linux-gnu -### -S -O0 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK0-32 %s
 // RUN: %clang -target mips64el-linux-gnu -### -S -O1 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK1-32 %s
 
+// RUN: %clang -target riscv32-unknown-elf -### -S -O0 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK0-32 %s
+// RUN: %clang -target riscv32-unknown-elf -### -S -O1 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK1-32 %s
+// RUN: %clang -target riscv32-unknown-elf -### -S -O2 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK2-32 %s
+// RUN: %clang -target riscv32-unknown-elf -### -S -O3 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK3-32 %s
+// RUN: %clang -target riscv32-unknown-elf -### -S -Os %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECKs-32 %s
+
+// RUN: %clang -target riscv64-unknown-elf -### -S -O0 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK0-64 %s
+// RUN: %clang -target riscv64-unknown-elf -### -S -O1 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK1-64 %s
+// RUN: %clang -target riscv64-unknown-elf -### -S -O2 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK2-64 %s
+// RUN: %clang -target riscv64-unknown-elf -### -S -O3 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK3-64 %s
+// RUN: %clang -target riscv64-unknown-elf -### -S -Os %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECKs-64 %s
+
 // CHECK0-32: -mdisable-fp-elim
 // CHECK1-32-NOT: -mdisable-fp-elim
 // CHECK2-32-NOT: -mdisable-fp-elim
diff --git a/test/Driver/fsanitize.c b/test/Driver/fsanitize.c
index 7870f724b6c3..b5a677530a90 100644
--- a/test/Driver/fsanitize.c
+++ b/test/Driver/fsanitize.c
@@ -184,11 +184,11 @@
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-memory-use-after-dtor %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fno-sanitize-memory-use-after-dtor -fsanitize-memory-use-after-dtor %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR
 // CHECK-USE-AFTER-DTOR: -cc1{{.*}}-fsanitize-memory-use-after-dtor
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fno-sanitize-memory-use-after-dtor %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR-OFF
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-memory-use-after-dtor -fno-sanitize-memory-use-after-dtor %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR-OFF
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-DTOR-OFF
 // CHECK-USE-AFTER-DTOR-OFF-NOT: -cc1{{.*}}memory-use-after-dtor
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-address-field-padding=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-FIELD-PADDING-0
@@ -628,6 +628,10 @@
 // RUN: %clang -target arm-linux-androideabi -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
 // RUN: %clang -target i386-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
+// RUN: %clang -target mips64-unknown-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
+// RUN: %clang -target mips64el-unknown-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
+// RUN: %clang -target mips-unknown-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
+// RUN: %clang -target mipsel-unknown-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
 // CHECK-SCUDO: "-fsanitize=scudo"
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO-PIE
diff --git a/test/Driver/fuchsia.c b/test/Driver/fuchsia.c
index b6946ef40664..72157acd011f 100644
--- a/test/Driver/fuchsia.c
+++ b/test/Driver/fuchsia.c
@@ -80,19 +80,34 @@
 // CHECK-ASAN-SHARED: "{{.*[/\\]}}libclang_rt.asan-x86_64.so"
 // CHECK-ASAN-SHARED-NOT: "{{.*[/\\]}}libclang_rt.asan-preinit-x86_64.a"
 
+// RUN: %clang %s -### --target=x86_64-fuchsia \
+// RUN:     -fsanitize=fuzzer 2>&1 \
+// RUN:     | FileCheck %s -check-prefix=CHECK-FUZZER-X86
+// CHECK-FUZZER-X86: "-fsanitize=fuzzer,fuzzer-no-link"
+// CHECK-FUZZER-X86: "{{.*[/\\]}}libclang_rt.fuzzer-x86_64.a"
+
+// RUN: %clang %s -### --target=aarch64-fuchsia \
+// RUN:     -fsanitize=fuzzer 2>&1 \
+// RUN:     | FileCheck %s -check-prefix=CHECK-FUZZER-AARCH64
+// CHECK-FUZZER-AARCH64: "-fsanitize=fuzzer,fuzzer-no-link"
+// CHECK-FUZZER-AARCH64: "{{.*[/\\]}}libclang_rt.fuzzer-aarch64.a"
+
 // RUN: %clang %s -### --target=x86_64-fuchsia \
 // RUN:     -fsanitize=scudo 2>&1 \
 // RUN:     | FileCheck %s -check-prefix=CHECK-SCUDO-X86
 // CHECK-SCUDO-X86: "-fsanitize=scudo"
 // CHECK-SCUDO-X86: "-pie"
+// CHECK-SCUDO-X86: "{{.*[/\\]}}libclang_rt.scudo-x86_64.so"
 
 // RUN: %clang %s -### --target=aarch64-fuchsia \
 // RUN:     -fsanitize=scudo 2>&1 \
 // RUN:     | FileCheck %s -check-prefix=CHECK-SCUDO-AARCH64
 // CHECK-SCUDO-AARCH64: "-fsanitize=scudo"
 // CHECK-SCUDO-AARCH64: "-pie"
+// CHECK-SCUDO-AARCH64: "{{.*[/\\]}}libclang_rt.scudo-aarch64.so"
 
 // RUN: %clang %s -### --target=x86_64-fuchsia \
 // RUN:     -fsanitize=scudo -fPIC -shared 2>&1 \
 // RUN:     | FileCheck %s -check-prefix=CHECK-SCUDO-SHARED
 // CHECK-SCUDO-SHARED: "-fsanitize=scudo"
+// CHECK-SCUDO-SHARED: "{{.*[/\\]}}libclang_rt.scudo-x86_64.so"
diff --git a/test/Driver/global-isel.c b/test/Driver/global-isel.c
new file mode 100644
index 000000000000..f4fc6a758ac2
--- /dev/null
+++ b/test/Driver/global-isel.c
@@ -0,0 +1,24 @@
+// REQUIRES: x86-registered-target,aarch64-registered-target
+
+// RUN: %clang -fexperimental-isel -S -### %s 2>&1 | FileCheck --check-prefix=ENABLED %s
+// RUN: %clang -fno-experimental-isel -S -### %s 2>&1 | FileCheck --check-prefix=DISABLED %s
+
+// RUN: %clang -target aarch64 -fexperimental-isel -S %s -### 2>&1 | FileCheck --check-prefix=ARM64-DEFAULT %s
+// RUN: %clang -target aarch64 -fexperimental-isel -S -O0 %s -### 2>&1 | FileCheck --check-prefix=ARM64-O0 %s
+// RUN: %clang -target aarch64 -fexperimental-isel -S -O2 %s -### 2>&1 | FileCheck --check-prefix=ARM64-O2 %s
+// RUN: %clang -target aarch64 -fexperimental-isel -Wno-experimental-isel -S -O2 %s -### 2>&1 | FileCheck --check-prefix=ARM64-O2-NOWARN %s
+
+// RUN: %clang -target x86_64 -fexperimental-isel -S %s -### 2>&1 | FileCheck --check-prefix=X86_64 %s
+
+// ENABLED: "-mllvm" "-global-isel=1"
+// DISABLED: "-mllvm" "-global-isel=0"
+
+// ARM64-DEFAULT-NOT: warning: -fexperimental-sel
+// ARM64-DEFAULT-NOT: "-global-isel-abort=2"
+// ARM64-O0-NOT: warning: -fexperimental-sel
+// ARM64-O2: warning: -fexperimental-isel support is incomplete for this architecture at the current optimization level
+// ARM64-O2: "-mllvm" "-global-isel-abort=2"
+// ARM64-O2-NOWARN-NOT: warning: -fexperimental-isel
+
+// X86_64: -fexperimental-isel support for the 'x86_64' architecture is incomplete
+// X86_64: "-mllvm" "-global-isel-abort=2"
diff --git a/test/Driver/hexagon-hvx.c b/test/Driver/hexagon-hvx.c
index 171d586760ed..616355944cb0 100644
--- a/test/Driver/hexagon-hvx.c
+++ b/test/Driver/hexagon-hvx.c
@@ -21,6 +21,9 @@
 
 // RUN: %clang -c %s -### -target hexagon-unknown-elf -mv62 -mhvx \
 // RUN:  -mhvx-length=128B 2>&1 | FileCheck -check-prefix=CHECKHVX2 %s
+
+// RUN: %clang -c %s -### -target hexagon-unknown-elf -mv62 -mhvx \
+// RUN:  -mhvx-length=128b 2>&1 | FileCheck -check-prefix=CHECKHVX2 %s
 // CHECKHVX2-NOT: "-target-feature" "+hvx-length64b"
 // CHECKHVX2: "-target-feature" "+hvx-length128b"
 
@@ -65,7 +68,7 @@
 // RUN:  2>&1 | FileCheck -check-prefix=CHECK-HVXEQ %s
 // CHECK-HVXEQ: "-target-feature" "+hvxv62"
 
-// Honor the last occured -mhvx=, -mhvx flag.
+// Honor the last occurred -mhvx=, -mhvx flag.
 // RUN: %clang -c %s -### -target hexagon-unknown-elf -mv60 -mhvx=v62 -mhvx\
 // RUN:  2>&1 | FileCheck -check-prefix=CHECK-HVXEQ-PRE %s
 // CHECK-HVXEQ-PRE-NOT: "-target-feature" "+hvxv62"
@@ -79,8 +82,10 @@
 // The default mode on v60,v62 is 64B.
 // RUN: %clang -c %s -### -target hexagon-unknown-elf -mv60 -mhvx \
 // RUN:  2>&1 | FileCheck -check-prefix=CHECK-HVXLENGTH-64B %s
-// RUN: %clang -c %s -### -target hexagon-unknown-elf -mv60 -mhvx -mhvx-length=64B\
-// RUN:  2>&1 | FileCheck -check-prefix=CHECK-HVXLENGTH-64B %s
+// RUN: %clang -c %s -### -target hexagon-unknown-elf -mv60 -mhvx \
+// RUN:  -mhvx-length=64b 2>&1 | FileCheck -check-prefix=CHECK-HVXLENGTH-64B %s
+// RUN: %clang -c %s -### -target hexagon-unknown-elf -mv60 -mhvx \
+// RUN:  -mhvx-length=64B 2>&1 | FileCheck -check-prefix=CHECK-HVXLENGTH-64B %s
 // CHECK-HVXLENGTH-64B: "-target-feature" "+hvx{{.*}}" "-target-feature" "+hvx-length64b"
 // RUN: %clang -c %s -### -target hexagon-unknown-elf -mv62 -mhvx -mhvx-length=128B\
 // RUN:  2>&1 | FileCheck -check-prefix=CHECK-HVXLENGTH-128B %s
diff --git a/test/Driver/masm.c b/test/Driver/masm.c
index 17c639358b0b..5c7251abae83 100644
--- a/test/Driver/masm.c
+++ b/test/Driver/masm.c
@@ -2,11 +2,13 @@
 // RUN: %clang -target i386-unknown-linux -masm=att -S %s -### 2>&1 | FileCheck --check-prefix=CHECK-ATT %s
 // RUN: %clang -target i386-unknown-linux -S -masm=somerequired %s -### 2>&1 | FileCheck --check-prefix=CHECK-SOMEREQUIRED %s
 // RUN: %clang -target arm-unknown-eabi -S -masm=intel %s -### 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+// RUN: %clang_cl --target=x86_64 /FA -### -- %s 2>&1 | FileCheck --check-prefix=CHECK-CL %s
 
 int f() {
 // CHECK-INTEL: -x86-asm-syntax=intel
 // CHECK-ATT: -x86-asm-syntax=att
 // CHECK-SOMEREQUIRED: error: unsupported argument 'somerequired' to option 'masm='
 // CHECK-ARM: warning: argument unused during compilation: '-masm=intel'
+// CHECK-CL: -x86-asm-syntax=intel
   return 0;
 }
diff --git a/test/Driver/myriad-toolchain.c b/test/Driver/myriad-toolchain.c
index 26b02387093a..215a02fd0dec 100644
--- a/test/Driver/myriad-toolchain.c
+++ b/test/Driver/myriad-toolchain.c
@@ -41,7 +41,7 @@
 // RUN:   | FileCheck %s -check-prefix=MOVICOMPILE
 // MOVICOMPILE: moviCompile{{(.exe)?}}" "-S" "-fno-exceptions" "-DMYRIAD2" "-mcpu=myriad2.2" "-isystem" "somewhere" "-I" "common"
 // MOVICOMPILE: moviAsm{{(.exe)?}}" "-no6thSlotCompression" "-cv:myriad2.2" "-noSPrefixing" "-a"
-// MOVICOMPILE: "-yippee" "-i:somewhere" "-i:common" "-elf"
+// MOVICOMPILE: "-yippee" "-i:somewhere" "-i:common"
 
 // RUN: %clang -target shave-myriad -c -### %s -DEFINE_ME -UNDEFINE_ME 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=DEFINES
diff --git a/test/Driver/riscv-abi.c b/test/Driver/riscv-abi.c
new file mode 100644
index 000000000000..8b79c8462bae
--- /dev/null
+++ b/test/Driver/riscv-abi.c
@@ -0,0 +1,47 @@
+// RUN: %clang -target riscv32-unknown-elf %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ILP32 %s
+// RUN: %clang -target riscv32-unknown-elf %s -### -o %t.o -mabi=ilp32 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ILP32 %s
+
+// CHECK-ILP32: "-target-abi" "ilp32"
+
+// TODO: ilp32f support.
+// RUN: not %clang -target riscv32-unknown-elf %s -o %t.o -mabi=ilp32f 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ILP32F %s
+
+// CHECK-ILP32F: error: unknown target ABI 'ilp32f'
+
+// TODO: ilp32d support.
+// RUN: not %clang -target riscv32-unknown-elf %s -o %t.o -mabi=ilp32d 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ILP32D %s
+
+// CHECK-ILP32D: error: unknown target ABI 'ilp32d'
+
+// RUN: not %clang -target riscv32-unknown-elf %s -o %t.o -mabi=lp64 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-RV32-LP64 %s
+
+// CHECK-RV32-LP64: error: unknown target ABI 'lp64'
+
+// RUN: %clang -target riscv64-unknown-elf %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LP64 %s
+// RUN: %clang -target riscv64-unknown-elf %s -### -o %t.o -mabi=lp64 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LP64 %s
+
+// CHECK-LP64: "-target-abi" "lp64"
+
+// TODO: lp64f support.
+// RUN: not %clang -target riscv64-unknown-elf %s -o %t.o -mabi=lp64f 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LP64F %s
+
+// CHECK-LP64F: error: unknown target ABI 'lp64f'
+
+// TODO: lp64d support.
+// RUN: not %clang -target riscv64-unknown-elf %s -o %t.o -mabi=lp64d 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LP64D %s
+
+// CHECK-LP64D: error: unknown target ABI 'lp64d'
+
+// RUN: not %clang -target riscv64-unknown-elf %s -o %t.o -mabi=ilp32 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-RV64-ILP32 %s
+
+// CHECK-RV64-ILP32: error: unknown target ABI 'ilp32'
diff --git a/test/Driver/riscv-features.c b/test/Driver/riscv-features.c
new file mode 100644
index 000000000000..565596cdd3a8
--- /dev/null
+++ b/test/Driver/riscv-features.c
@@ -0,0 +1,4 @@
+// RUN: %clang -target riscv32-unknown-elf -### %s -fsyntax-only 2>&1 | FileCheck %s
+// RUN: %clang -target riscv64-unknown-elf -### %s -fsyntax-only 2>&1 | FileCheck %s
+
+// CHECK: fno-signed-char
diff --git a/test/Driver/riscv-gnutools.c b/test/Driver/riscv-gnutools.c
new file mode 100644
index 000000000000..afcb5052aa3f
--- /dev/null
+++ b/test/Driver/riscv-gnutools.c
@@ -0,0 +1,19 @@
+// Check gnutools are invoked with propagated values for -mabi and -march.
+
+// RUN: %clang -target riscv32 -fno-integrated-as %s -###  -c \
+// RUN: 2>&1 | FileCheck -check-prefix=MABI-ILP32 %s
+
+// RUN: %clang -target riscv32 -fno-integrated-as -march=rv32g %s -### -c \
+// RUN: 2>&1 | FileCheck -check-prefix=MABI-ILP32-MARCH-G %s
+
+// RUN: %clang -target riscv64 -fno-integrated-as %s -###  -c \
+// RUN: 2>&1 | FileCheck -check-prefix=MABI-ILP64 %s
+
+// RUN: %clang -target riscv64 -fno-integrated-as -march=rv64g %s -### -c \
+// RUN: 2>&1 | FileCheck -check-prefix=MABI-ILP64-MARCH-G %s
+
+// MABI-ILP32: "{{.*}}as{{(.exe)?}}" "-mabi" "ilp32"
+// MABI-ILP32-MARCH-G: "{{.*}}as{{(.exe)?}}" "-mabi" "ilp32" "-march" "rv32g"
+
+// MABI-ILP64: "{{.*}}as{{(.exe)?}}" "-mabi" "lp64"
+// MABI-ILP64-MARCH-G: "{{.*}}as{{(.exe)?}}" "-mabi" "lp64" "-march" "rv64g"
diff --git a/test/Driver/riscv32-toolchain.c b/test/Driver/riscv32-toolchain.c
new file mode 100644
index 000000000000..3839f8e59c9b
--- /dev/null
+++ b/test/Driver/riscv32-toolchain.c
@@ -0,0 +1,122 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 2>&1 | FileCheck -check-prefix=CC1 %s
+// CC1: clang{{.*}} "-cc1" "-triple" "riscv32"
+
+
+// RUN: %clang %s -### -no-canonical-prefixes -fuse-ld=ld \
+// RUN:   -target riscv32-linux-unknown-elf \
+// RUN:   --gcc-toolchain=%S/Inputs/multilib_riscv_linux_sdk \
+// RUN:   --sysroot=%S/Inputs/multilib_riscv_linux_sdk/sysroot 2>&1 \
+// RUN:   | FileCheck -check-prefix=CC1-RV32-LINUX-ILP32 %s
+
+// CC1-RV32-LINUX-ILP32: "{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/../../../../riscv64-unknown-linux-gnu/bin{{/|\\\\}}ld"
+// CC1-RV32-LINUX-ILP32: "--sysroot={{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot"
+// CC1-RV32-LINUX-ILP32: "-m" "elf32lriscv"
+// CC1-RV32-LINUX-ILP32: "-dynamic-linker" "/lib/ld-linux-riscv32-ilp32.so.1"
+// CC1-RV32-LINUX-ILP32: "{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32{{/|\\\\}}crtbegin.o"
+// CC1-RV32-LINUX-ILP32: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32"
+// CC1-RV32-LINUX-ILP32: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32"
+// CC1-RV32-LINUX-ILP32: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32"
+
+// RUN: %clang %s -### -no-canonical-prefixes -fuse-ld=ld \
+// RUN:   -target riscv32-linux-unknown-elf -march=rv32imafd -mabi=ilp32d \
+// RUN:   --gcc-toolchain=%S/Inputs/multilib_riscv_linux_sdk \
+// RUN:   --sysroot=%S/Inputs/multilib_riscv_linux_sdk/sysroot 2>&1 \
+// RUN:   | FileCheck -check-prefix=CC1-RV32-LINUX-ILP32D %s
+
+// CC1-RV32-LINUX-ILP32D: "{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/../../../../riscv64-unknown-linux-gnu/bin{{/|\\\\}}ld"
+// CC1-RV32-LINUX-ILP32D: "--sysroot={{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot"
+// CC1-RV32-LINUX-ILP32D: "-m" "elf32lriscv"
+// CC1-RV32-LINUX-ILP32D: "-dynamic-linker" "/lib/ld-linux-riscv32-ilp32d.so.1"
+// CC1-RV32-LINUX-ILP32D: "{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32d{{/|\\\\}}crtbegin.o"
+// CC1-RV32-LINUX-ILP32D: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/lib/gcc/riscv64-unknown-linux-gnu/7.2.0/lib32/ilp32d"
+// CC1-RV32-LINUX-ILP32D: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot/lib32/ilp32d"
+// CC1-RV32-LINUX-ILP32D: "-L{{.*}}/Inputs/multilib_riscv_linux_sdk/sysroot/usr/lib32/ilp32d"
+
+// RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s
+
+typedef __builtin_va_list va_list;
+typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+typedef __WCHAR_TYPE__ wchar_t;
+
+// CHECK: @align_c = global i32 1
+int align_c = __alignof(char);
+
+// CHECK: @align_s = global i32 2
+int align_s = __alignof(short);
+
+// CHECK: @align_i = global i32 4
+int align_i = __alignof(int);
+
+// CHECK: @align_wc = global i32 4
+int align_wc = __alignof(wchar_t);
+
+// CHECK: @align_l = global i32 4
+int align_l = __alignof(long);
+
+// CHECK: @align_ll = global i32 8
+int align_ll = __alignof(long long);
+
+// CHECK: @align_p = global i32 4
+int align_p = __alignof(void*);
+
+// CHECK: @align_f = global i32 4
+int align_f = __alignof(float);
+
+// CHECK: @align_d = global i32 8
+int align_d = __alignof(double);
+
+// CHECK: @align_ld = global i32 16
+int align_ld = __alignof(long double);
+
+// CHECK: @align_vl = global i32 4
+int align_vl = __alignof(va_list);
+
+// Check types
+
+// CHECK: zeroext i8 @check_char()
+char check_char() { return 0; }
+
+// CHECK: define signext i16 @check_short()
+short check_short() { return 0; }
+
+// CHECK: define i32 @check_int()
+int check_int() { return 0; }
+
+// CHECK: define i32 @check_wchar_t()
+int check_wchar_t() { return 0; }
+
+// CHECK: define i32 @check_long()
+long check_long() { return 0; }
+
+// CHECK: define i64 @check_longlong()
+long long check_longlong() { return 0; }
+
+// CHECK: define zeroext i8 @check_uchar()
+unsigned char check_uchar() { return 0; }
+
+// CHECK: define zeroext i16 @check_ushort()
+unsigned short check_ushort() { return 0; }
+
+// CHECK: define i32 @check_uint()
+unsigned int check_uint() { return 0; }
+
+// CHECK: define i32 @check_ulong()
+unsigned long check_ulong() { return 0; }
+
+// CHECK: define i64 @check_ulonglong()
+unsigned long long check_ulonglong() { return 0; }
+
+// CHECK: define i32 @check_size_t()
+size_t check_size_t() { return 0; }
+
+// CHECK: define float @check_float()
+float check_float() { return 0; }
+
+// CHECK: define double @check_double()
+double check_double() { return 0; }
+
+// CHECK: define fp128 @check_longdouble()
+long double check_longdouble() { return 0; }
diff --git a/test/Driver/riscv64-toolchain.c b/test/Driver/riscv64-toolchain.c
new file mode 100644
index 000000000000..8a3f3228ed05
--- /dev/null
+++ b/test/Driver/riscv64-toolchain.c
@@ -0,0 +1,91 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 2>&1 | FileCheck -check-prefix=CC1 %s
+// CC1: clang{{.*}} "-cc1" "-triple" "riscv64"
+
+// RUN: %clang -target riscv64 %s -emit-llvm -S -o - | FileCheck %s
+
+typedef __builtin_va_list va_list;
+typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+typedef __WCHAR_TYPE__ wchar_t;
+
+// CHECK: @align_c = global i32 1
+int align_c = __alignof(char);
+
+// CHECK: @align_s = global i32 2
+int align_s = __alignof(short);
+
+// CHECK: @align_i = global i32 4
+int align_i = __alignof(int);
+
+// CHECK: @align_wc = global i32 4
+int align_wc = __alignof(wchar_t);
+
+// CHECK: @align_l = global i32 8
+int align_l = __alignof(long);
+
+// CHECK: @align_ll = global i32 8
+int align_ll = __alignof(long long);
+
+// CHECK: @align_p = global i32 8
+int align_p = __alignof(void*);
+
+// CHECK: @align_f = global i32 4
+int align_f = __alignof(float);
+
+// CHECK: @align_d = global i32 8
+int align_d = __alignof(double);
+
+// CHECK: @align_ld = global i32 16
+int align_ld = __alignof(long double);
+
+// CHECK: @align_vl = global i32 8
+int align_vl = __alignof(va_list);
+
+// Check types
+
+// CHECK: define zeroext i8 @check_char()
+char check_char() { return 0; }
+
+// CHECK: define signext i16 @check_short()
+short check_short() { return 0; }
+
+// CHECK: define signext i32 @check_int()
+int check_int() { return 0; }
+
+// CHECK: define signext i32 @check_wchar_t()
+int check_wchar_t() { return 0; }
+
+// CHECK: define i64 @check_long()
+long check_long() { return 0; }
+
+// CHECK: define i64 @check_longlong()
+long long check_longlong() { return 0; }
+
+// CHECK: define zeroext i8 @check_uchar()
+unsigned char check_uchar() { return 0; }
+
+// CHECK: define zeroext i16 @check_ushort()
+unsigned short check_ushort() { return 0; }
+
+// CHECK: define signext i32 @check_uint()
+unsigned int check_uint() { return 0; }
+
+// CHECK: define i64 @check_ulong()
+unsigned long check_ulong() { return 0; }
+
+// CHECK: define i64 @check_ulonglong()
+unsigned long long check_ulonglong() { return 0; }
+
+// CHECK: define i64 @check_size_t()
+size_t check_size_t() { return 0; }
+
+// CHECK: define float @check_float()
+float check_float() { return 0; }
+
+// CHECK: define double @check_double()
+double check_double() { return 0; }
+
+// CHECK: define fp128 @check_longdouble()
+long double check_longdouble() { return 0; }
diff --git a/test/Driver/solaris-header-search.cpp b/test/Driver/solaris-header-search.cpp
index 667b2c05d438..d5c3c0dd7468 100644
--- a/test/Driver/solaris-header-search.cpp
+++ b/test/Driver/solaris-header-search.cpp
@@ -1,11 +1,41 @@
-// Test that the C++ headers are found.
+// Test that the C++ headers are found on Solaris with gcc toolchain detection
 //
-// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
-// RUN:     --target=sparc-sun-solaris2.11 \
-// RUN:     --gcc-toolchain="" \
-// RUN:     --sysroot=%S/Inputs/sparc-sun-solaris2.11 \
-// RUN:   | FileCheck %s
-// CHECK: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
-// CHECK: "-internal-isystem" "{{.*}}/usr/include/c++/v1/support/solaris"
-// CHECK: "-internal-isystem" "{{.*}}/usr/gcc/4.8/include/c++/4.8.2"
-// CHECK: "-internal-isystem" "{{.*}}/usr/gcc/4.8/include/c++/4.8.2/sparc-sun-solaris2.11"
+// Sparc, 32bit
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     --target=sparc-sun-solaris2.11 --stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree \
+// RUN:   | FileCheck --check-prefix=CHECK_SOLARIS_SPARC %s
+// CHECK_SOLARIS_SPARC: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
+// CHECK_SOLARIS_SPARC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK_SOLARIS_SPARC-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../../../include/c++/4.8.2"
+// CHECK_SOLARIS_SPARC-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../../../include/c++/4.8.2/sparc-sun-solaris2.11"
+
+// Sparc, 64bit
+// RUN: %clang -no-canonical-prefixes -m64 %s -### -fsyntax-only 2>&1 \
+// RUN:     --target=sparc-sun-solaris2.11 --stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree \
+// RUN:   | FileCheck --check-prefix=CHECK_SOLARIS_SPARC64 %s
+// CHECK_SOLARIS_SPARC64: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
+// CHECK_SOLARIS_SPARC64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK_SOLARIS_SPARC64-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../../../include/c++/4.8.2"
+// CHECK_SOLARIS_SPARC64-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../../../include/c++/4.8.2/sparc-sun-solaris2.11/sparcv9"
+
+// Intel, 32bit
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     --target=i386-pc-solaris2.11 --stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/solaris_x86_tree \
+// RUN:   | FileCheck --check-prefix=CHECK_SOLARIS_X86 %s
+// CHECK_SOLARIS_X86: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
+// CHECK_SOLARIS_X86-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK_SOLARIS_X86-SAME: "-internal-isystem" "{{.*}}/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../../../include/c++/4.9.4"
+// CHECK_SOLARIS_X86-SAME: "-internal-isystem" "{{.*}}/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../../../include/c++/4.9.4/i386-pc-solaris2.11"
+
+// Intel, 64bit
+// RUN: %clang -no-canonical-prefixes -m64 %s -### -fsyntax-only 2>&1 \
+// RUN:     --target=i386-pc-solaris2.11 --stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/solaris_x86_tree \
+// RUN:   | FileCheck --check-prefix=CHECK_SOLARIS_X64 %s
+// CHECK_SOLARIS_X64: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
+// CHECK_SOLARIS_X64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK_SOLARIS_X64-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../../../include/c++/4.9.4"
+// CHECK_SOLARIS_X64-SAME: "-internal-isystem" "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../../../include/c++/4.9.4/i386-pc-solaris2.11/amd64"
diff --git a/test/Driver/solaris-ld.c b/test/Driver/solaris-ld.c
index d871b592eb18..2fc5c91272aa 100644
--- a/test/Driver/solaris-ld.c
+++ b/test/Driver/solaris-ld.c
@@ -1,33 +1,110 @@
-// Test ld invocation on Solaris targets.
+// General tests that ld invocations on Solaris targets sane. Note that we use
+// sysroot to make these tests independent of the host system.
 
-// Check sparc-sun-solaris2.1
+// Check sparc-sun-solaris2.11, 32bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparc-sun-solaris2.11 \
 // RUN:     --gcc-toolchain="" \
-// RUN:     --sysroot=%S/Inputs/sparc-sun-solaris2.11 \
-// RUN:   | FileCheck %s
-// CHECK: "-cc1" "-triple" "sparc-sun-solaris2.11"
-// CHECK: ld{{.*}}"
-// CHECK: "--dynamic-linker" "{{.*}}/usr/lib/ld.so.1"
-// CHECK: "{{.*}}/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crt1.o"
-// CHECK: "{{.*}}/usr/lib/crti.o"
-// CHECK: "{{.*}}/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtbegin.o"
-// CHECK: "{{.*}}/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtend.o"
-// CHECK: "{{.*}}/usr/lib/crtn.o"
-// CHECK "-lc"
-// CHECK "-lgcc_s"
-// CHECK "-lgcc"
-// CHECK "-lm"
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-SPARC32 %s
+// CHECK-LD-SPARC32-NOT: warning:
+// CHECK-LD-SPARC32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "sparc-sun-solaris2.11"
+// CHECK-LD-SPARC32-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD-SPARC32: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-SPARC32-SAME: "--dynamic-linker" "[[SYSROOT]]/usr/lib{{/|\\\\}}ld.so.1"
+// CHECK-LD-SPARC32-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crt1.o"
+// CHECK-LD-SPARC32-SAME: "[[SYSROOT]]/usr/lib{{/|\\\\}}crti.o"
+// CHECK-LD-SPARC32-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtbegin.o"
+// CHECK-LD-SPARC32-SAME: "-L[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2"
+// CHECK-LD-SPARC32-SAME: "-L[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../.."
+// CHECK-LD-SPARC32-SAME: "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD-SPARC32-SAME: "-lgcc_s"
+// CHECK-LD-SPARC32-SAME: "-lc"
+// CHECK-LD-SPARC32-SAME: "-lgcc"
+// CHECK-LD-SPARC32-SAME: "-lm"
+// CHECK-LD-SPARC32-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtend.o"
+// CHECK-LD-SPARC32-SAME: "[[SYSROOT]]/usr/lib{{/|\\\\}}crtn.o"
+
+// Check sparc-sun-solaris2.11, 64bit
+// RUN: %clang -no-canonical-prefixes -m64 %s -### -o %t.o 2>&1 \
+// RUN:     --target=sparc-sun-solaris2.11 \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-SPARC64 %s
+// CHECK-LD-SPARC64-NOT: warning:
+// CHECK-LD-SPARC64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "sparcv9-sun-solaris2.11"
+// CHECK-LD-SPARC64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD-SPARC64: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-SPARC64-SAME: "--dynamic-linker" "[[SYSROOT]]/usr/lib/sparcv9{{/|\\\\}}ld.so.1"
+// CHECK-LD-SPARC64-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9{{/|\\\\}}crt1.o"
+// CHECK-LD-SPARC64-SAME: "[[SYSROOT]]/usr/lib/sparcv9{{/|\\\\}}crti.o"
+// CHECK-LD-SPARC64-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9{{/|\\\\}}crtbegin.o"
+// CHECK-LD-SPARC64-SAME: "-L[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9"
+// CHECK-LD-SPARC64-SAME: "-L[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/../../../sparcv9"
+// CHECK-LD-SPARC64-SAME: "-L[[SYSROOT]]/usr/lib/sparcv9"
+// CHECK-LD-SPARC64-SAME: "-lgcc_s"
+// CHECK-LD-SPARC64-SAME: "-lc"
+// CHECK-LD-SPARC64-SAME: "-lgcc"
+// CHECK-LD-SPARC64-SAME: "-lm"
+// CHECK-LD-SPARC64-SAME: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2/sparcv9{{/|\\\\}}crtend.o"
+// CHECK-LD-SPARC64-SAME: "[[SYSROOT]]/usr/lib/sparcv9{{/|\\\\}}crtn.o"
+
+// Check i386-pc-solaris2.11, 32bit
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-pc-solaris2.11 \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_x86_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-X32 %s
+// CHECK-LD-X32-NOT: warning:
+// CHECK-LD-X32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "i386-pc-solaris2.11"
+// CHECK-LD-X32-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD-X32: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-X32-SAME: "--dynamic-linker" "[[SYSROOT]]/usr/lib{{/|\\\\}}ld.so.1"
+// CHECK-LD-X32-SAME: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt1.o"
+// CHECK-LD-X32-SAME: "[[SYSROOT]]/usr/lib{{/|\\\\}}crti.o"
+// CHECK-LD-X32-SAME: "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4{{/|\\\\}}crtbegin.o"
+// CHECK-LD-X32-SAME: "-L[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4"
+// CHECK-LD-X32-SAME: "-L[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../.."
+// CHECK-LD-X32-SAME: "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD-X32-SAME: "-lgcc_s"
+// CHECK-LD-X32-SAME: "-lc"
+// CHECK-LD-X32-SAME: "-lgcc"
+// CHECK-LD-X32-SAME: "-lm"
+// CHECK-LD-X32-SAME: "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4{{/|\\\\}}crtend.o"
+// CHECK-LD-X32-SAME: "[[SYSROOT]]/usr/lib{{/|\\\\}}crtn.o"
+
+// Check i386-pc-solaris2.11, 64bit
+// RUN: %clang -no-canonical-prefixes -m64 %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-pc-solaris2.11 \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/solaris_x86_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LD-X64 %s
+// CHECK-LD-X64-NOT: warning:
+// CHECK-LD-X64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "x86_64-pc-solaris2.11"
+// CHECK-LD-X64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD-X64: "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD-X64-SAME: "--dynamic-linker" "[[SYSROOT]]/usr/lib/amd64{{/|\\\\}}ld.so.1"
+// CHECK-LD-X64-SAME: "[[SYSROOT]]/usr/lib/amd64{{/|\\\\}}crt1.o"
+// CHECK-LD-X64-SAME: "[[SYSROOT]]/usr/lib/amd64{{/|\\\\}}crti.o"
+// CHECK-LD-X64-SAME: "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64{{/|\\\\}}crtbegin.o"
+// CHECK-LD-X64-SAME: "-L[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64"
+// CHECK-LD-X64-SAME: "-L[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/../../../amd64"
+// CHECK-LD-X64-SAME: "-L[[SYSROOT]]/usr/lib/amd64"
+// CHECK-LD-X64-SAME: "-lgcc_s"
+// CHECK-LD-X64-SAME: "-lc"
+// CHECK-LD-X64-SAME: "-lgcc"
+// CHECK-LD-X64-SAME: "-lm"
+// CHECK-LD-X64-SAME: "[[SYSROOT]]/usr/gcc/4.9/lib/gcc/i386-pc-solaris2.11/4.9.4/amd64{{/|\\\\}}crtend.o"
+// CHECK-LD-X64-SAME: "[[SYSROOT]]/usr/lib/amd64{{/|\\\\}}crtn.o"
 
 // Check the right -l flags are present with -shared
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o -shared 2>&1 \
 // RUN:     --target=sparc-sun-solaris2.11 \
 // RUN:     --gcc-toolchain="" \
-// RUN:     --sysroot=%S/Inputs/sparc-sun-solaris2.11 \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED %s
-
-// CHECK-SHARED: ld{{.*}}"
-// CHECK-SHARED "-lc"
-// CHECK-SHARED "-lgcc_s"
-// CHECK-SHARED-NOT "-lgcc"
-// CHECK-SHARED-NOT: "-lm"
+// RUN:     --sysroot=%S/Inputs/solaris_sparc_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-SPARC32-SHARED %s
+// CHECK-SPARC32-SHARED: "{{.*}}ld{{(.exe)?}}"
+// CHECK-SPARC32-SHARED-SAME: "-lgcc_s"
+// CHECK-SPARC32-SHARED-SAME: "-lc"
+// CHECK-SPARC32-SHARED-NOT: "-lgcc"
+// CHECK-SPARC32-SHARED-NOT: "-lm"
diff --git a/test/Driver/stack-size-section.c b/test/Driver/stack-size-section.c
new file mode 100644
index 000000000000..461f0b5c9aee
--- /dev/null
+++ b/test/Driver/stack-size-section.c
@@ -0,0 +1,9 @@
+// RUN: %clang -target x86_64-unknown %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
+// RUN: %clang -target x86_64-scei-ps4 -fno-stack-size-section %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
+// CHECK-ABSENT-NOT: -fstack-size-section
+
+// RUN: %clang -target x86_64-unknown -fstack-size-section -### 2>&1 | FileCheck %s --check-prefix=CHECK-PRESENT
+// RUN: %clang -target x86_64-scei-ps4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-PRESENT
+// CHECK-PRESENT: -fstack-size-section
+
+int foo() { return 42; }
diff --git a/test/Driver/unknown-arg.c b/test/Driver/unknown-arg.c
index 9bba74a942eb..d6bb5f7ae307 100644
--- a/test/Driver/unknown-arg.c
+++ b/test/Driver/unknown-arg.c
@@ -1,11 +1,21 @@
 // RUN: not %clang %s -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### 2>&1 | \
 // RUN: FileCheck %s
+// RUN: not %clang %s -stdlibs=foo -hell -version -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=DID-YOU-MEAN
 // RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### -c -- %s 2>&1 | \
 // RUN: FileCheck %s --check-prefix=CL
+// RUN: %clang_cl -Brepo -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL-DID-YOU-MEAN
 // RUN: not %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Werror=unknown-argument -### -- %s 2>&1 | \
 // RUN: FileCheck %s --check-prefix=CL-ERROR
+// RUN: not %clang_cl -helo -Werror=unknown-argument -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL-ERROR-DID-YOU-MEAN
 // RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Wno-unknown-argument -### -- %s 2>&1 | \
 // RUN: FileCheck %s --check-prefix=SILENT
+// RUN: not %clang -cc1as -hell --version -debug-info-macros 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CC1AS-DID-YOU-MEAN
+// RUN: not %clang -cc1asphalt -help 2>&1 | \
+// RUN: FileCheck %s --check-prefix=UNKNOWN-INTEGRATED
 
 // CHECK: error: unknown argument: '-cake-is-lie'
 // CHECK: error: unknown argument: '-%0'
@@ -14,6 +24,9 @@
 // CHECK: error: unknown argument: '-munknown-to-clang-option'
 // CHECK: error: unknown argument: '-print-stats'
 // CHECK: error: unknown argument: '-funknown-to-clang-option'
+// DID-YOU-MEAN: error: unknown argument '-stdlibs=foo', did you mean '-stdlib=foo'?
+// DID-YOU-MEAN: error: unknown argument '-hell', did you mean '-help'?
+// DID-YOU-MEAN: error: unknown argument '-version', did you mean '--version'?
 // CL: warning: unknown argument ignored in clang-cl: '-cake-is-lie'
 // CL: warning: unknown argument ignored in clang-cl: '-%0'
 // CL: warning: unknown argument ignored in clang-cl: '-%d'
@@ -21,6 +34,7 @@
 // CL: warning: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
 // CL: warning: unknown argument ignored in clang-cl: '-print-stats'
 // CL: warning: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// CL-DID-YOU-MEAN: warning: unknown argument ignored in clang-cl '-Brepo' (did you mean '-Brepro'?)
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-cake-is-lie'
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-%0'
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-%d'
@@ -28,9 +42,13 @@
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-print-stats'
 // CL-ERROR: error: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// CL-ERROR-DID-YOU-MEAN: error: unknown argument ignored in clang-cl '-helo' (did you mean '-help'?)
 // SILENT-NOT: error:
 // SILENT-NOT: warning:
-
+// CC1AS-DID-YOU-MEAN: error: unknown argument '-hell', did you mean '-help'?
+// CC1AS-DID-YOU-MEAN: error: unknown argument '--version', did you mean '-version'?
+// CC1AS-DID-YOU-MEAN: error: unknown argument '-debug-info-macros', did you mean '-debug-info-macro'?
+// UNKNOWN-INTEGRATED: error: unknown integrated tool 'asphalt'. Valid tools include '-cc1' and '-cc1as'.
 
 // RUN: %clang -S %s -o %t.s  -Wunknown-to-clang-option 2>&1 | FileCheck --check-prefix=IGNORED %s
 
diff --git a/test/Driver/unsupported-option.c b/test/Driver/unsupported-option.c
new file mode 100644
index 000000000000..39f135e68348
--- /dev/null
+++ b/test/Driver/unsupported-option.c
@@ -0,0 +1,7 @@
+// RUN: not %clang %s --hedonism -### 2>&1 | \
+// RUN: FileCheck %s
+// RUN: not %clang %s --hell -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=DID-YOU-MEAN
+
+// CHECK: error: unsupported option '--hedonism'
+// DID-YOU-MEAN: error: unsupported option '--hell', did you mean '--help'?
diff --git a/test/Driver/wasm-toolchain.c b/test/Driver/wasm-toolchain.c
index e9862f0b7d8d..b1f5f48f874d 100644
--- a/test/Driver/wasm-toolchain.c
+++ b/test/Driver/wasm-toolchain.c
@@ -1,21 +1,8 @@
 // A basic clang -cc1 command-line. WebAssembly is somewhat special in
-// enabling -ffunction-sections, -fdata-sections, and -fvisibility=hidden by
-// default.
+// enabling -fvisibility=hidden by default.
 
 // RUN: %clang %s -### -no-canonical-prefixes -target wasm32-unknown-unknown 2>&1 | FileCheck -check-prefix=CC1 %s
-// CC1: clang{{.*}} "-cc1" "-triple" "wasm32-unknown-unknown" {{.*}} "-fvisibility" "hidden" {{.*}} "-ffunction-sections" "-fdata-sections"
-
-// Ditto, but ensure that a user -fno-function-sections disables the
-// default -ffunction-sections.
-
-// RUN: %clang %s -### -target wasm32-unknown-unknown -fno-function-sections 2>&1 | FileCheck -check-prefix=NO_FUNCTION_SECTIONS %s
-// NO_FUNCTION_SECTIONS-NOT: function-sections
-
-// Ditto, but ensure that a user -fno-data-sections disables the
-// default -fdata-sections.
-
-// RUN: %clang %s -### -target wasm32-unknown-unknown -fno-data-sections 2>&1 | FileCheck -check-prefix=NO_DATA_SECTIONS %s
-// NO_DATA_SECTIONS-NOT: data-sections
+// CC1: clang{{.*}} "-cc1" "-triple" "wasm32-unknown-unknown" {{.*}} "-fvisibility" "hidden" {{.*}}
 
 // Ditto, but ensure that a user -fvisibility=default disables the default
 // -fvisibility=hidden.
@@ -27,10 +14,10 @@
 
 // RUN: %clang -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK %s
 // LINK: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-allow-undefined-file" "wasm.syms" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+// LINK: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
 
 // A basic C link command-line with optimization.
 
 // RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK_OPT %s
 // LINK_OPT: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK_OPT: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-allow-undefined-file" "wasm.syms" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+// LINK_OPT: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
diff --git a/test/Driver/wasm-toolchain.cpp b/test/Driver/wasm-toolchain.cpp
new file mode 100644
index 000000000000..ea3aa313985c
--- /dev/null
+++ b/test/Driver/wasm-toolchain.cpp
@@ -0,0 +1,23 @@
+// A basic clang -cc1 command-line. WebAssembly is somewhat special in
+// enabling -fvisibility=hidden by default.
+
+// RUN: %clangxx %s -### -no-canonical-prefixes -target wasm32-unknown-unknown 2>&1 | FileCheck -check-prefix=CC1 %s
+// CC1: clang{{.*}} "-cc1" "-triple" "wasm32-unknown-unknown" {{.*}} "-fvisibility" "hidden" {{.*}}
+
+// Ditto, but ensure that a user -fvisibility=default disables the default
+// -fvisibility=hidden.
+
+// RUN: %clangxx %s -### -target wasm32-unknown-unknown -fvisibility=default 2>&1 | FileCheck -check-prefix=FVISIBILITY_DEFAULT %s
+// FVISIBILITY_DEFAULT-NOT: hidden
+
+// A basic C++ link command-line.
+
+// RUN: %clangxx -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo --stdlib=c++ %s 2>&1 | FileCheck -check-prefix=LINK %s
+// LINK: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-lc++" "-lc++abi" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+
+// A basic C++ link command-line with optimization.
+
+// RUN: %clangxx -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s --stdlib=c++ 2>&1 | FileCheck -check-prefix=LINK_OPT %s
+// LINK_OPT: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_OPT: lld{{.*}}" "-flavor" "wasm" "-L/foo/lib" "crt1.o" "[[temp]]" "-lc++" "-lc++abi" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
diff --git a/test/Driver/x86-march.c b/test/Driver/x86-march.c
index f08c9dfadc09..9c9235dc0fdc 100644
--- a/test/Driver/x86-march.c
+++ b/test/Driver/x86-march.c
@@ -76,6 +76,10 @@
 // RUN:   | FileCheck %s -check-prefix=silvermont
 // silvermont: "-target-cpu" "silvermont"
 //
+// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=goldmont 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=goldmont
+// goldmont: "-target-cpu" "goldmont"
+//
 // RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=k8 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=k8
 // k8: "-target-cpu" "k8"
diff --git a/test/Driver/x86-target-features.c b/test/Driver/x86-target-features.c
index 1289823d1dbe..530ba04e196a 100644
--- a/test/Driver/x86-target-features.c
+++ b/test/Driver/x86-target-features.c
@@ -125,3 +125,17 @@
 // VBMI2: "-target-feature" "+avx512vbmi2"
 // NO-VBMI2: "-target-feature" "-avx512vbmi2"
 
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mrdpid %s -### -o %t.o 2>&1 | FileCheck -check-prefix=RDPID %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-rdpid %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-RDPID %s
+// RDPID: "-target-feature" "+rdpid"
+// NO-RDPID: "-target-feature" "-rdpid"
+
+// RUN: %clang -target i386-linux-gnu -mretpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=RETPOLINE %s
+// RUN: %clang -target i386-linux-gnu -mno-retpoline %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-RETPOLINE %s
+// RETPOLINE: "-target-feature" "+retpoline"
+// NO-RETPOLINE: "-target-feature" "-retpoline"
+
+// RUN: %clang -target i386-linux-gnu -mretpoline -mretpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=RETPOLINE-EXTERNAL-THUNK %s
+// RUN: %clang -target i386-linux-gnu -mretpoline -mno-retpoline-external-thunk %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-RETPOLINE-EXTERNAL-THUNK %s
+// RETPOLINE-EXTERNAL-THUNK: "-target-feature" "+retpoline-external-thunk"
+// NO-RETPOLINE-EXTERNAL-THUNK: "-target-feature" "-retpoline-external-thunk"
diff --git a/test/Format/dump-config-cxx.h b/test/Format/dump-config-cxx.h
new file mode 100644
index 000000000000..57f3c975f96f
--- /dev/null
+++ b/test/Format/dump-config-cxx.h
@@ -0,0 +1,3 @@
+// RUN: clang-format -dump-config %s | FileCheck %s
+
+// CHECK: Language: Cpp
diff --git a/test/Format/dump-config-objc.h b/test/Format/dump-config-objc.h
new file mode 100644
index 000000000000..d33fd366a403
--- /dev/null
+++ b/test/Format/dump-config-objc.h
@@ -0,0 +1,5 @@
+// RUN: clang-format -dump-config %s | FileCheck %s
+
+// CHECK: Language: ObjC
+@interface Foo
+@end
diff --git a/test/Format/lit.local.cfg b/test/Format/lit.local.cfg
new file mode 100644
index 000000000000..abf4e8a65cc3
--- /dev/null
+++ b/test/Format/lit.local.cfg
@@ -0,0 +1,4 @@
+# Suffixes supported by clang-format.
+config.suffixes = ['.c', '.cc', '.cpp', '.h', '.m', '.mm', '.java', '.js',
+                   '.ts', '.proto', '.protodevel', '.pb.txt', '.textproto',
+                   '.textpb', '.asciipb', '.td']
diff --git a/test/Frontend/region-pragmas.c b/test/Frontend/region-pragmas.c
new file mode 100644
index 000000000000..8d929e8414fd
--- /dev/null
+++ b/test/Frontend/region-pragmas.c
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -Wall -verify %s
+// expected-no-diagnostics
+
+#pragma region foo
+#pragma endregion foo
diff --git a/test/Frontend/rewrite-includes-messages.c b/test/Frontend/rewrite-includes-messages.c
index f93fe72e58d5..a3a5cb2eae6c 100644
--- a/test/Frontend/rewrite-includes-messages.c
+++ b/test/Frontend/rewrite-includes-messages.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -E -frewrite-includes %s -I%S/Inputs/ | %clang_cc1 -Wall -fsyntax-only -Wunused-macros -x c - 2>&1 > %t.1
 // RUN: %clang_cc1 -I%S/Inputs/ -Wall -Wunused-macros -fsyntax-only %s 2>&1 > %t.2
-// RUN: diff %t.1 %t.2 -u
+// RUN: diff -u %t.1 %t.2
 // expected-no-diagnostics
 
 #include "rewrite-includes-messages.h"
diff --git a/test/Frontend/unknown-arg.c b/test/Frontend/unknown-arg.c
new file mode 100644
index 000000000000..00f2da6242c7
--- /dev/null
+++ b/test/Frontend/unknown-arg.c
@@ -0,0 +1,9 @@
+// RUN: not %clang_cc1 %s -E --helium 2>&1 | \
+// RUN: FileCheck %s
+// RUN: not %clang_cc1 %s -E --hel[ 2>&1 | \
+// RUN: FileCheck %s --check-prefix=DID-YOU-MEAN
+// RUN: not %clang %s -E -Xclang --hel[ 2>&1 | \
+// RUN: FileCheck %s --check-prefix=DID-YOU-MEAN
+
+// CHECK: error: unknown argument: '--helium'
+// DID-YOU-MEAN: error: unknown argument '--hel[', did you mean '--help'?
diff --git a/test/Import/template-specialization/Inputs/T.cpp b/test/Import/template-specialization/Inputs/T.cpp
index b31e2439efeb..7eea95829048 100644
--- a/test/Import/template-specialization/Inputs/T.cpp
+++ b/test/Import/template-specialization/Inputs/T.cpp
@@ -12,3 +12,7 @@ template <> struct A<bool> {
     int g;
   };
 };
+
+
+template <typename T> constexpr int f() { return 0; }
+template <> constexpr int f<int>() { return 4; }
diff --git a/test/Import/template-specialization/test.cpp b/test/Import/template-specialization/test.cpp
index 43996c53a77e..30df8187602d 100644
--- a/test/Import/template-specialization/test.cpp
+++ b/test/Import/template-specialization/test.cpp
@@ -1,7 +1,10 @@
 // RUN: clang-import-test -import %S/Inputs/T.cpp -expression %s
-// XFAIL: *
+
 void expr() {
   A<int>::B b1;
   A<bool>::B b2;
   b1.f + b2.g;
 }
+
+static_assert(f<char>() == 0, "");
+static_assert(f<int>() == 4, "");
diff --git a/test/Index/Core/index-pch.cpp b/test/Index/Core/index-pch.cpp
new file mode 100644
index 000000000000..8c5a92b612c4
--- /dev/null
+++ b/test/Index/Core/index-pch.cpp
@@ -0,0 +1,17 @@
+// RUN: c-index-test core -print-source-symbols -- %s -std=c++14 | FileCheck %s
+// RUN: %clang_cc1 -emit-pch %s -std=c++14 -o %t.pch
+// RUN: c-index-test core -print-source-symbols -module-file %t.pch | FileCheck %s
+
+// CHECK: [[@LINE+2]]:8 | struct(Gen)/C++ | DETECTOR | [[DETECTOR_USR:.*]] | {{.*}} | Def | rel: 0
+template <class _Default, class _AlwaysVoid, template <class...> class _Op, class... _Args>
+struct DETECTOR {
+ using value_t = int;
+};
+
+struct nonesuch {};
+
+// CHECK: [[@LINE+4]]:9 | type-alias/C++ | is_detected
+// CHECK: [[@LINE+3]]:32 | struct(Gen)/C++ | DETECTOR | [[DETECTOR_USR]] | {{.*}} | Ref,RelCont | rel: 1
+// CHECK-NEXT:	RelCont | is_detected
+template <template<class...> class _Op, class... _Args>
+  using is_detected = typename DETECTOR<nonesuch, void, _Op, _Args...>::value_t;
diff --git a/test/Index/annotate-tokens.cpp b/test/Index/annotate-tokens.cpp
index 460ab51dd959..67bdf6317aa6 100644
--- a/test/Index/annotate-tokens.cpp
+++ b/test/Index/annotate-tokens.cpp
@@ -37,7 +37,9 @@ class C {
   ~C();
 };
 
-// RUN: c-index-test -test-annotate-tokens=%s:1:1:38:1 %s -fno-delayed-template-parsing | FileCheck %s
+auto test5(X) -> X;
+
+// RUN: c-index-test -test-annotate-tokens=%s:1:1:41:1 %s -std=c++14 -fno-delayed-template-parsing | FileCheck %s
 // CHECK: Keyword: "struct" [1:1 - 1:7] StructDecl=bonk:1:8 (Definition)
 // CHECK: Identifier: "bonk" [1:8 - 1:12] StructDecl=bonk:1:8 (Definition)
 // CHECK: Punctuation: "{" [1:13 - 1:14] StructDecl=bonk:1:8 (Definition)
@@ -184,6 +186,14 @@ class C {
 // CHECK: Punctuation: "}" [29:22 - 29:23] CompoundStmt=
 // CHECK: Punctuation: "~" [37:3 - 37:4] CXXDestructor=~C:37:3
 // CHECK: Identifier: "C" [37:4 - 37:5] CXXDestructor=~C:37:3
+// CHECK: Keyword: "auto" [40:1 - 40:5] FunctionDecl=test5:40:6
+// CHECK: Identifier: "test5" [40:6 - 40:11] FunctionDecl=test5:40:6
+// CHECK: Punctuation: "(" [40:11 - 40:12] FunctionDecl=test5:40:6
+// CHECK: Identifier: "X" [40:12 - 40:13] TypeRef=struct X:7:8
+// CHECK: Punctuation: ")" [40:13 - 40:14] FunctionDecl=test5:40:6
+// CHECK: Punctuation: "->" [40:15 - 40:17] FunctionDecl=test5:40:6
+// CHECK: Identifier: "X" [40:18 - 40:19] TypeRef=struct X:7:8
+// CHECK: Punctuation: ";" [40:19 - 40:20]
 
 // RUN: env LIBCLANG_DISABLE_CRASH_RECOVERY=1 c-index-test -test-annotate-tokens=%s:32:1:32:13 %s | FileCheck %s -check-prefix=CHECK2
 // CHECK2: Keyword: "if" [32:3 - 32:5] IfStmt=
diff --git a/test/Index/complete-exprs.cpp b/test/Index/complete-exprs.cpp
index 3f3bd5c55a6b..fc60cc560b5c 100644
--- a/test/Index/complete-exprs.cpp
+++ b/test/Index/complete-exprs.cpp
@@ -58,7 +58,7 @@ namespace N {
 // CHECK-CC1: CXXConstructor:{TypedText string}{LeftParen (}{Placeholder const char *}{Comma , }{Placeholder int n}{RightParen )} (50)
 // CHECK-CC1: ClassTemplate:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >} (50)
 // CHECK-CC1: CXXConstructor:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder const T &}{Comma , }{Placeholder unsigned int n}{RightParen )} (50)
-// CHECK-CC1: FunctionTemplate:{ResultType void}{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder InputIterator first}{Comma , }{Placeholder InputIterator last}{RightParen )} (50)
+// CHECK-CC1: FunctionTemplate:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder InputIterator first}{Comma , }{Placeholder InputIterator last}{RightParen )} (50)
 
 // RUN: c-index-test -code-completion-at=%s:19:1 %s | FileCheck -check-prefix=CHECK-CC2 %s
 // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:19:1 %s | FileCheck -check-prefix=CHECK-CC2 %s
@@ -72,7 +72,7 @@ namespace N {
 // CHECK-CC3: FunctionDecl:{ResultType void}{TypedText g}{LeftParen (}{RightParen )} (50)
 // CHECK-CC3: ClassTemplate:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >} (50)
 // CHECK-CC3: CXXConstructor:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder const T &}{Comma , }{Placeholder unsigned int n}{RightParen )} (50)
-// CHECK-CC3: FunctionTemplate:{ResultType void}{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder InputIterator first}{Comma , }{Placeholder InputIterator last}{RightParen )} (50)
+// CHECK-CC3: FunctionTemplate:{TypedText vector}{LeftAngle <}{Placeholder typename T}{RightAngle >}{LeftParen (}{Placeholder InputIterator first}{Comma , }{Placeholder InputIterator last}{RightParen )} (50)
 
 // RUN: c-index-test -code-completion-at=%s:34:1 %s -std=c++0x | FileCheck -check-prefix=CHECK-CC4 %s
 // CHECK-CC4: NotImplemented:{ResultType const X *}{TypedText this} (40)
diff --git a/test/Index/complete-pch-skip.cpp b/test/Index/complete-pch-skip.cpp
new file mode 100644
index 000000000000..6abf40df36c1
--- /dev/null
+++ b/test/Index/complete-pch-skip.cpp
@@ -0,0 +1,30 @@
+namespace ns {
+int bar;
+}
+
+int main() { return ns:: }
+int main2() { return ns::foo(). }
+
+// RUN: echo "namespace ns { struct foo { int baz }; }" > %t.h
+// RUN: c-index-test -write-pch %t.h.pch -x c++-header %t.h
+//
+// RUN: c-index-test -code-completion-at=%s:5:26 -include %t.h %s | FileCheck -check-prefix=WITH-PCH %s
+// WITH-PCH: {TypedText bar}
+// WITH-PCH: {TypedText foo}
+
+// RUN: env CINDEXTEST_COMPLETION_SKIP_PREAMBLE=1 c-index-test -code-completion-at=%s:5:26 -include %t.h %s | FileCheck -check-prefix=SKIP-PCH %s
+// SKIP-PCH-NOT: foo
+// SKIP-PCH: {TypedText bar}
+// SKIP-PCH-NOT: foo
+
+// Verify that with *no* preamble (no -include flag) we still get local results.
+// SkipPreamble used to break this, by making lookup *too* lazy.
+// RUN: env CINDEXTEST_COMPLETION_SKIP_PREAMBLE=1 c-index-test -code-completion-at=%s:5:26 %s | FileCheck -check-prefix=NO-PCH %s
+// NO-PCH-NOT: foo
+// NO-PCH: {TypedText bar}
+// NO-PCH-NOT: foo
+
+// Verify that we still get member results from the preamble.
+// RUN: env CINDEXTEST_COMPLETION_SKIP_PREAMBLE=1 c-index-test -code-completion-at=%s:6:32 -include %t.h %s | FileCheck -check-prefix=MEMBER %s
+// MEMBER: {TypedText baz}
+
diff --git a/test/Index/opencl-types.cl b/test/Index/opencl-types.cl
index fe0042aa2097..d71893a22085 100644
--- a/test/Index/opencl-types.cl
+++ b/test/Index/opencl-types.cl
@@ -16,11 +16,11 @@ void kernel testFloatTypes() {
   double4 vectorDouble;
 }
 
-// CHECK: VarDecl=scalarHalf:11:8 (Definition) [type=half] [typekind=Half] [isPOD=1]
+// CHECK: VarDecl=scalarHalf:11:8 (Definition){{( \(invalid\))?}} [type=half] [typekind=Half] [isPOD=1]
 // CHECK: VarDecl=vectorHalf:12:9 (Definition) [type=half4] [typekind=Typedef] [canonicaltype=half __attribute__((ext_vector_type(4)))] [canonicaltypekind=Unexposed] [isPOD=1]
 // CHECK: VarDecl=scalarFloat:13:9 (Definition) [type=float] [typekind=Float] [isPOD=1]
 // CHECK: VarDecl=vectorFloat:14:10 (Definition) [type=float4] [typekind=Typedef] [canonicaltype=float __attribute__((ext_vector_type(4)))] [canonicaltypekind=Unexposed] [isPOD=1]
-// CHECK: VarDecl=scalarDouble:15:10 (Definition) [type=double] [typekind=Double] [isPOD=1]
+// CHECK: VarDecl=scalarDouble:15:10 (Definition){{( \(invalid\))?}} [type=double] [typekind=Double] [isPOD=1]
 // CHECK: VarDecl=vectorDouble:16:11 (Definition) [type=double4] [typekind=Typedef] [canonicaltype=double __attribute__((ext_vector_type(4)))] [canonicaltypekind=Unexposed] [isPOD=1]
 
 #pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
@@ -45,10 +45,10 @@ void kernel OCLImage3dROTest(read_only image3d_t scalarOCLImage3dRO);
 // CHECK: ParmDecl=scalarOCLImage2dArrayRO:32:61 (Definition) [type=__read_only image2d_array_t] [typekind=OCLImage2dArrayRO] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dDepthRO:33:61 (Definition) [type=__read_only image2d_depth_t] [typekind=OCLImage2dDepthRO] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dArrayDepthRO:34:72 (Definition) [type=__read_only image2d_array_depth_t] [typekind=OCLImage2dArrayDepthRO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAARO:35:59 (Definition) [type=__read_only image2d_msaa_t] [typekind=OCLImage2dMSAARO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAARO:36:70 (Definition) [type=__read_only image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAARO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAADepthRO:37:70 (Definition) [type=__read_only image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthRO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthRO:38:81 (Definition) [type=__read_only image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthRO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAARO:35:59 (Definition){{( \(invalid\))?}} [type=__read_only image2d_msaa_t] [typekind=OCLImage2dMSAARO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAARO:36:70 (Definition){{( \(invalid\))?}} [type=__read_only image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAARO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAADepthRO:37:70 (Definition){{( \(invalid\))?}} [type=__read_only image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthRO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthRO:38:81 (Definition){{( \(invalid\))?}} [type=__read_only image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthRO] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage3dRO:39:50 (Definition) [type=__read_only image3d_t] [typekind=OCLImage3dRO] [isPOD=1]
 
 void kernel OCLImage1dWOTest(write_only image1d_t scalarOCLImage1dWO);
@@ -71,11 +71,11 @@ void kernel OCLImage3dWOTest(write_only image3d_t scalarOCLImage3dWO);
 // CHECK: ParmDecl=scalarOCLImage2dArrayWO:58:62 (Definition) [type=__write_only image2d_array_t] [typekind=OCLImage2dArrayWO] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dDepthWO:59:62 (Definition) [type=__write_only image2d_depth_t] [typekind=OCLImage2dDepthWO] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dArrayDepthWO:60:73 (Definition) [type=__write_only image2d_array_depth_t] [typekind=OCLImage2dArrayDepthWO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAAWO:61:60 (Definition) [type=__write_only image2d_msaa_t] [typekind=OCLImage2dMSAAWO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAAWO:62:71 (Definition) [type=__write_only image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAAWO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAADepthWO:63:71 (Definition) [type=__write_only image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthWO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthWO:64:82 (Definition) [type=__write_only image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthWO] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage3dWO:65:51 (Definition) [type=__write_only image3d_t] [typekind=OCLImage3dWO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAAWO:61:60 (Definition){{( \(invalid\))?}} [type=__write_only image2d_msaa_t] [typekind=OCLImage2dMSAAWO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAAWO:62:71 (Definition){{( \(invalid\))?}} [type=__write_only image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAAWO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAADepthWO:63:71 (Definition){{( \(invalid\))?}} [type=__write_only image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthWO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthWO:64:82 (Definition){{( \(invalid\))?}} [type=__write_only image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthWO] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage3dWO:65:51 (Definition){{( \(invalid\))?}} [type=__write_only image3d_t] [typekind=OCLImage3dWO] [isPOD=1]
 
 void kernel OCLImage1dRWTest(read_write image1d_t scalarOCLImage1dRW);
 void kernel OCLImage1dArrayRWTest(read_write image1d_array_t scalarOCLImage1dArrayRW);
@@ -97,10 +97,10 @@ void kernel OCLImage3dRWTest(read_write image3d_t scalarOCLImage3dRW);
 // CHECK: ParmDecl=scalarOCLImage2dArrayRW:84:62 (Definition) [type=__read_write image2d_array_t] [typekind=OCLImage2dArrayRW] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dDepthRW:85:62 (Definition) [type=__read_write image2d_depth_t] [typekind=OCLImage2dDepthRW] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage2dArrayDepthRW:86:73 (Definition) [type=__read_write image2d_array_depth_t] [typekind=OCLImage2dArrayDepthRW] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAARW:87:60 (Definition) [type=__read_write image2d_msaa_t] [typekind=OCLImage2dMSAARW] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAARW:88:71 (Definition) [type=__read_write image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAARW] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dMSAADepthRW:89:71 (Definition) [type=__read_write image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthRW] [isPOD=1]
-// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthRW:90:82 (Definition) [type=__read_write image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthRW] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAARW:87:60 (Definition){{( \(invalid\))?}} [type=__read_write image2d_msaa_t] [typekind=OCLImage2dMSAARW] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAARW:88:71 (Definition){{( \(invalid\))?}} [type=__read_write image2d_array_msaa_t] [typekind=OCLImage2dArrayMSAARW] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dMSAADepthRW:89:71 (Definition){{( \(invalid\))?}} [type=__read_write image2d_msaa_depth_t] [typekind=OCLImage2dMSAADepthRW] [isPOD=1]
+// CHECK: ParmDecl=scalarOCLImage2dArrayMSAADepthRW:90:82 (Definition){{( \(invalid\))?}} [type=__read_write image2d_array_msaa_depth_t] [typekind=OCLImage2dArrayMSAADepthRW] [isPOD=1]
 // CHECK: ParmDecl=scalarOCLImage3dRW:91:51 (Definition) [type=__read_write image3d_t] [typekind=OCLImage3dRW] [isPOD=1]
 
 void kernel intPipeTestRO(read_only pipe int scalarPipe);
diff --git a/test/Index/print-display-names.cpp b/test/Index/print-display-names.cpp
index 94fe4665e630..5ba10e43bc23 100644
--- a/test/Index/print-display-names.cpp
+++ b/test/Index/print-display-names.cpp
@@ -12,9 +12,20 @@ void g(ClassTmpl<T, T>);
 
 template<> void g<int>(ClassTmpl<int, int>);
 
-// RUN: c-index-test -test-load-source all-display %s | FileCheck %s
-// CHECK: print-display-names.cpp:2:7: ClassTemplate=ClassTmpl<T, typename>:2:7
-// CHECK: print-display-names.cpp:6:16: ClassDecl=ClassTmpl<Integer, Integer>:6:16 (Definition)
-// CHECK: print-display-names.cpp:8:6: FunctionDecl=f(ClassTmpl<float, Integer>):8:6
-// CHECK: print-display-names.cpp:11:6: FunctionTemplate=g(ClassTmpl<T, T>):11:6
-// CHECK: print-display-names.cpp:13:17: FunctionDecl=g<>(ClassTmpl<int, int>):13:17 [Specialization of g:11:6]
+// RUN: c-index-test -test-load-source all-display %s | FileCheck %s --check-prefix=DISPLAY_NAME
+// DISPLAY_NAME: print-display-names.cpp:2:7: ClassTemplate=ClassTmpl<T, typename>:2:7
+// DISPLAY_NAME: print-display-names.cpp:6:16: ClassDecl=ClassTmpl<Integer, Integer>:6:16 (Definition)
+// DISPLAY_NAME: print-display-names.cpp:8:6: FunctionDecl=f(ClassTmpl<float, Integer>):8:6
+// DISPLAY_NAME: print-display-names.cpp:11:6: FunctionTemplate=g(ClassTmpl<T, T>):11:6
+// DISPLAY_NAME: print-display-names.cpp:13:17: FunctionDecl=g<>(ClassTmpl<int, int>):13:17 [Specialization of g:11:6]
+
+// RUN: env CINDEXTEST_PRINTINGPOLICY_TERSEOUTPUT=1 c-index-test -test-load-source all-pretty %s | FileCheck %s --check-prefix=PRETTY
+// PRETTY: print-display-names.cpp:2:7: ClassTemplate=template <typename T, typename > class ClassTmpl {}:2:7 (Definition) Extent=[1:1 - 2:20]
+// PRETTY: print-display-names.cpp:4:13: TypedefDecl=typedef int Integer:4:13 (Definition) Extent=[4:1 - 4:20]
+// PRETTY: print-display-names.cpp:6:16: ClassDecl=template<> class ClassTmpl<int, int> {}:6:16 (Definition) [Specialization of ClassTmpl:2:7] Extent=[6:1 - 6:43]
+// PRETTY: print-display-names.cpp:8:6: FunctionDecl=void f(ClassTmpl<float, Integer> p):8:6 Extent=[8:1 - 8:36]
+// PRETTY: print-display-names.cpp:8:34: ParmDecl=ClassTmpl<float, Integer> p:8:34 (Definition) Extent=[8:8 - 8:35]
+// PRETTY: print-display-names.cpp:11:6: FunctionTemplate=template <typename T> void g(ClassTmpl<T, T>):11:6 Extent=[10:1 - 11:24]
+// PRETTY: print-display-names.cpp:11:23: ParmDecl=ClassTmpl<T, T>:11:23 (Definition) Extent=[11:8 - 11:23]
+// PRETTY: print-display-names.cpp:13:17: FunctionDecl=template<> void g<int>(ClassTmpl<int, int>):13:17 [Specialization of g:11:6] [Template arg 0: kind: 1, type: int] Extent=[13:1 - 13:44]
+// PRETTY: print-display-names.cpp:13:43: ParmDecl=ClassTmpl<int, int>:13:43 (Definition) Extent=[13:24 - 13:43]
diff --git a/test/Index/print-type-size.cpp b/test/Index/print-type-size.cpp
index 45de93f30824..1ea53462733f 100644
--- a/test/Index/print-type-size.cpp
+++ b/test/Index/print-type-size.cpp
@@ -4,8 +4,8 @@
 
 namespace basic {
 
-// CHECK64: VarDecl=v:[[@LINE+2]]:6 (Definition) [type=void] [typekind=Void]
-// CHECK32: VarDecl=v:[[@LINE+1]]:6 (Definition) [type=void] [typekind=Void]
+// CHECK64: VarDecl=v:[[@LINE+2]]:6 (Definition) (invalid) [type=void] [typekind=Void]
+// CHECK32: VarDecl=v:[[@LINE+1]]:6 (Definition) (invalid) [type=void] [typekind=Void]
 void v;
 
 // CHECK64: VarDecl=v1:[[@LINE+2]]:7 (Definition) [type=void *] [typekind=Pointer] [sizeof=8] [alignof=8]
diff --git a/test/Lexer/null-character-in-literal.c b/test/Lexer/null-character-in-literal.c
new file mode 100644
index 000000000000..a47954753676
Binary files /dev/null and b/test/Lexer/null-character-in-literal.c differ
diff --git a/test/Misc/ast-dump-stmt.c b/test/Misc/ast-dump-stmt.c
index 1f21cf03f695..461893188a16 100644
--- a/test/Misc/ast-dump-stmt.c
+++ b/test/Misc/ast-dump-stmt.c
@@ -30,6 +30,38 @@ int TestOpaqueValueExpr = 0 ?: 1;
 // CHECK-NEXT:   IntegerLiteral
 // CHECK-NEXT:   OpaqueValueExpr
 // CHECK-NEXT:     IntegerLiteral
-// CHECK-NEXT:   OpaqueValueExpr
-// CHECK-NEXT:     IntegerLiteral
-// CHECK-NEXT:   IntegerLiteral
+// CHECK-NEXT:   OpaqueValueExpr
+// CHECK-NEXT:     IntegerLiteral
+// CHECK-NEXT:   IntegerLiteral
+
+void TestUnaryOperatorExpr(void) {
+  char T1 = 1;
+  int T2 = 1;
+
+  T1++;
+  T2++;
+  // CHECK:      UnaryOperator{{.*}}postfix '++' cannot overflow
+  // CHECK-NEXT:   DeclRefExpr{{.*}}'T1' 'char'
+  // CHECK-NOT:  UnaryOperator{{.*}}postfix '++' cannot overflow
+  // CHECK:        DeclRefExpr{{.*}}'T2' 'int'
+
+  -T1;
+  -T2;
+  // CHECK:      UnaryOperator{{.*}}prefix '-' cannot overflow
+  // CHECK-NEXT:   ImplicitCastExpr
+  // CHECK-NEXT:     ImplicitCastExpr
+  // CHECK-NEXT:       DeclRefExpr{{.*}}'T1' 'char'
+  // CHECK-NOT:  UnaryOperator{{.*}}prefix '-' cannot overflow
+  // CHECK:        ImplicitCastExpr
+  // CHECK:          DeclRefExpr{{.*}}'T2' 'int'
+
+  ~T1;
+  ~T2;
+  // CHECK:      UnaryOperator{{.*}}prefix '~' cannot overflow
+  // CHECK-NEXT:   ImplicitCastExpr
+  // CHECK-NEXT:     ImplicitCastExpr
+  // CHECK-NEXT:       DeclRefExpr{{.*}}'T1' 'char'
+  // CHECK:  	 UnaryOperator{{.*}}prefix '~' cannot overflow
+  // CHECK-NEXT:     ImplicitCastExpr
+  // CHECK-NEXT:       DeclRefExpr{{.*}}'T2' 'int'
+}
diff --git a/test/Modules/ExtDebugInfo.cpp b/test/Modules/ExtDebugInfo.cpp
index 97386bc4d007..c57f1f034eb2 100644
--- a/test/Modules/ExtDebugInfo.cpp
+++ b/test/Modules/ExtDebugInfo.cpp
@@ -187,7 +187,7 @@ void foo() {
 
 // CHECK: !DIGlobalVariable(name: "anon_enum", {{.*}}, type: ![[ANON_ENUM:[0-9]+]]
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, scope: ![[NS]],
-// CHECK-SAME:             line: 16
+// CHECK-SAME:             line: 19
 
 // CHECK: !DIGlobalVariable(name: "GlobalUnion",
 // CHECK-SAME:              type: ![[GLOBAL_UNION:[0-9]+]]
diff --git a/test/Modules/Inputs/DebugCXX.h b/test/Modules/Inputs/DebugCXX.h
index 1ccf8d302f13..8f83c0bc69db 100644
--- a/test/Modules/Inputs/DebugCXX.h
+++ b/test/Modules/Inputs/DebugCXX.h
@@ -1,4 +1,7 @@
 /* -*- C++ -*- */
+
+#include "dummy.h"
+
 namespace DebugCXX {
   // Records.
   struct Struct {
diff --git a/test/Modules/Inputs/odr_hash-Friend/Box.h b/test/Modules/Inputs/odr_hash-Friend/Box.h
new file mode 100644
index 000000000000..01ab90d601c2
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Friend/Box.h
@@ -0,0 +1,14 @@
+template <class T>
+struct iterator {
+  void Compare(const iterator &x) { }
+  friend void Check(iterator) {}
+};
+
+template <class T = int> struct Box {
+  iterator<T> I;
+
+  void test() {
+    Check(I);
+    I.Compare(I);
+  }
+};
diff --git a/test/Modules/Inputs/odr_hash-Friend/M1.h b/test/Modules/Inputs/odr_hash-Friend/M1.h
new file mode 100644
index 000000000000..202ad06c3488
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Friend/M1.h
@@ -0,0 +1,6 @@
+#include "Box.h"
+
+void Peek() {
+  Box<> Gift;
+  Gift.test();
+}
diff --git a/test/Modules/Inputs/odr_hash-Friend/M2.h b/test/Modules/Inputs/odr_hash-Friend/M2.h
new file mode 100644
index 000000000000..69f08a957ede
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Friend/M2.h
@@ -0,0 +1,5 @@
+#include "Box.h"
+void x() {
+  Box<> Unused;
+  //Unused.test();
+}
diff --git a/test/Modules/Inputs/odr_hash-Friend/M3.h b/test/Modules/Inputs/odr_hash-Friend/M3.h
new file mode 100644
index 000000000000..ab457e0c08f2
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Friend/M3.h
@@ -0,0 +1,7 @@
+#include "Box.h"
+#include "M2.h"
+
+void Party() {
+  Box<> Present;
+  Present.test();
+}
diff --git a/test/Modules/Inputs/odr_hash-Friend/module.modulemap b/test/Modules/Inputs/odr_hash-Friend/module.modulemap
new file mode 100644
index 000000000000..28e1832e30e9
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Friend/module.modulemap
@@ -0,0 +1,15 @@
+module Box {
+  header "Box.h"
+}
+
+module Module1 {
+  header "M1.h"
+}
+
+module Module2 {
+  header "M2.h"
+}
+
+module Module3 {
+  header "M3.h"
+}
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Module2/include.h b/test/Modules/Inputs/odr_hash-Unresolved/Module2/include.h
new file mode 100644
index 000000000000..fa7c017228e7
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Module2/include.h
@@ -0,0 +1,3 @@
+// include.h
+#include "Sub1/Z.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Module2/not-include.h b/test/Modules/Inputs/odr_hash-Unresolved/Module2/not-include.h
new file mode 100644
index 000000000000..98cd1ef26fce
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Module2/not-include.h
@@ -0,0 +1,5 @@
+// not-include.h
+
+#include "function.h"
+#include "class.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Sub1/X.h b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/X.h
new file mode 100644
index 000000000000..f610e665e5cb
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/X.h
@@ -0,0 +1,3 @@
+// X.h
+#include "Sub1/Z.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Y.h b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Y.h
new file mode 100644
index 000000000000..f41ddd639e76
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Y.h
@@ -0,0 +1,4 @@
+// Y.h
+#include "Sub1/Z.h"
+#include "class.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Z.h b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Z.h
new file mode 100644
index 000000000000..b6bdc140f997
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Sub1/Z.h
@@ -0,0 +1,4 @@
+// Z.h
+#include "Sub2/A.h"
+#include "Sub2/B.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Sub2/A.h b/test/Modules/Inputs/odr_hash-Unresolved/Sub2/A.h
new file mode 100644
index 000000000000..a65b370beae3
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Sub2/A.h
@@ -0,0 +1,3 @@
+// A.h
+#include "function.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/Sub2/B.h b/test/Modules/Inputs/odr_hash-Unresolved/Sub2/B.h
new file mode 100644
index 000000000000..fa89f216b6ce
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/Sub2/B.h
@@ -0,0 +1,3 @@
+// B.h
+#include "function.h"
+
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/class.h b/test/Modules/Inputs/odr_hash-Unresolved/class.h
new file mode 100644
index 000000000000..fe3a7116f9bf
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/class.h
@@ -0,0 +1,11 @@
+#ifndef Class
+#define Class
+template <class T>
+class S {
+  int Field;
+  void run() {
+    int x;
+    A::Check(&Field, 1);
+  }
+};
+#endif
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/function.h b/test/Modules/Inputs/odr_hash-Unresolved/function.h
new file mode 100644
index 000000000000..de75b2c92ef6
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/function.h
@@ -0,0 +1,6 @@
+#ifndef Function
+#define Function
+namespace A {
+static void Check(int*, int) {}
+}
+#endif
diff --git a/test/Modules/Inputs/odr_hash-Unresolved/module.modulemap b/test/Modules/Inputs/odr_hash-Unresolved/module.modulemap
new file mode 100644
index 000000000000..ac7fc43cae63
--- /dev/null
+++ b/test/Modules/Inputs/odr_hash-Unresolved/module.modulemap
@@ -0,0 +1,21 @@
+module Module1 {
+  module Sub1 {
+    umbrella "Sub1"
+    module * { export * }
+  }
+
+  module Sub2 {
+    umbrella "Sub2"
+    module * { export * }
+  }
+}
+
+module Module2 {
+  umbrella "Module2"
+  module * { export * }
+}
+
+module Other {
+  textual header "function.h"
+  textual header "class.h"
+}
diff --git a/test/Modules/Inputs/shadow/A1/A.h b/test/Modules/Inputs/shadow/A1/A.h
new file mode 100644
index 000000000000..f07c681c2aa5
--- /dev/null
+++ b/test/Modules/Inputs/shadow/A1/A.h
@@ -0,0 +1 @@
+#define A1_A_h
diff --git a/test/Modules/Inputs/shadow/A1/module.modulemap b/test/Modules/Inputs/shadow/A1/module.modulemap
new file mode 100644
index 000000000000..9439a431b1db
--- /dev/null
+++ b/test/Modules/Inputs/shadow/A1/module.modulemap
@@ -0,0 +1,5 @@
+module A {
+  header "A.h"
+}
+
+module A1 {}
diff --git a/test/Modules/Inputs/shadow/A2/A.h b/test/Modules/Inputs/shadow/A2/A.h
new file mode 100644
index 000000000000..9880ed010f56
--- /dev/null
+++ b/test/Modules/Inputs/shadow/A2/A.h
@@ -0,0 +1 @@
+#define A2_A_h
diff --git a/test/Modules/Inputs/shadow/A2/module.modulemap b/test/Modules/Inputs/shadow/A2/module.modulemap
new file mode 100644
index 000000000000..935d89bb425e
--- /dev/null
+++ b/test/Modules/Inputs/shadow/A2/module.modulemap
@@ -0,0 +1,5 @@
+module A {
+  header "A.h"
+}
+
+module A2 {}
diff --git a/test/Modules/Inputs/shadowed-submodule/A1/Foo.h b/test/Modules/Inputs/shadowed-submodule/A1/Foo.h
new file mode 100644
index 000000000000..57775d1a29dd
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A1/Foo.h
@@ -0,0 +1 @@
+#include <stdarg.h> // expected-error {{could not build module 'A'}}
diff --git a/test/Modules/Inputs/shadowed-submodule/A1/module.modulemap b/test/Modules/Inputs/shadowed-submodule/A1/module.modulemap
new file mode 100644
index 000000000000..7afbc4713617
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A1/module.modulemap
@@ -0,0 +1,14 @@
+module A [system] { // expected-note {{previous definition is here}}
+  module sub {
+    header "sys/A.h"
+  }
+  module sub2 {
+    header "sys/A2.h"
+  }
+  module stdarg {
+    header "stdarg.h"
+    export *
+  }
+}
+
+module A2 {}
diff --git a/test/Modules/Inputs/shadowed-submodule/A1/sys/A.h b/test/Modules/Inputs/shadowed-submodule/A1/sys/A.h
new file mode 100644
index 000000000000..4fc3e8ea9e63
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A1/sys/A.h
@@ -0,0 +1 @@
+#include <sys/A2.h>
diff --git a/test/Modules/Inputs/shadowed-submodule/A1/sys/A2.h b/test/Modules/Inputs/shadowed-submodule/A1/sys/A2.h
new file mode 100644
index 000000000000..e9b6a44cfe21
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A1/sys/A2.h
@@ -0,0 +1 @@
+// nothing
diff --git a/test/Modules/Inputs/shadowed-submodule/A2/Foo.h b/test/Modules/Inputs/shadowed-submodule/A2/Foo.h
new file mode 100644
index 000000000000..38ade6d9f505
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A2/Foo.h
@@ -0,0 +1 @@
+#include <stdarg.h>
diff --git a/test/Modules/Inputs/shadowed-submodule/A2/module.modulemap b/test/Modules/Inputs/shadowed-submodule/A2/module.modulemap
new file mode 100644
index 000000000000..c4e44b074aa1
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A2/module.modulemap
@@ -0,0 +1,14 @@
+module A [system] {
+  module sub {
+    header "sys/A.h"
+  }
+  module sub2 { // expected-error {{build a shadowed submodule 'A.sub2'}}
+    header "sys/A2.h"
+  }
+  module stdarg {
+    header "stdarg.h"
+    export *
+  }
+}
+
+module A2 {}
diff --git a/test/Modules/Inputs/shadowed-submodule/A2/sys/A.h b/test/Modules/Inputs/shadowed-submodule/A2/sys/A.h
new file mode 100644
index 000000000000..4fc3e8ea9e63
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A2/sys/A.h
@@ -0,0 +1 @@
+#include <sys/A2.h>
diff --git a/test/Modules/Inputs/shadowed-submodule/A2/sys/A2.h b/test/Modules/Inputs/shadowed-submodule/A2/sys/A2.h
new file mode 100644
index 000000000000..e9b6a44cfe21
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/A2/sys/A2.h
@@ -0,0 +1 @@
+// nothing
diff --git a/test/Modules/Inputs/shadowed-submodule/Foo/module.modulemap b/test/Modules/Inputs/shadowed-submodule/Foo/module.modulemap
new file mode 100644
index 000000000000..11db9cb05da2
--- /dev/null
+++ b/test/Modules/Inputs/shadowed-submodule/Foo/module.modulemap
@@ -0,0 +1,3 @@
+module Foo {
+  header "../A1/Foo.h"
+}
diff --git a/test/Modules/ModuleDebugInfo.cpp b/test/Modules/ModuleDebugInfo.cpp
index 008b3e4f2bab..f0d883767045 100644
--- a/test/Modules/ModuleDebugInfo.cpp
+++ b/test/Modules/ModuleDebugInfo.cpp
@@ -5,12 +5,13 @@
 
 // Modules:
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -triple %itanium_abi_triple -x objective-c++ -std=c++11 -debug-info-kind=limited -fmodules -fmodule-format=obj -fimplicit-module-maps -DMODULES -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t.ll -mllvm -debug-only=pchcontainer &>%t-mod.ll
+// RUN: %clang_cc1 -triple %itanium_abi_triple -x objective-c++ -std=c++11 -debugger-tuning=lldb -debug-info-kind=limited -fmodules -fmodule-format=obj -fimplicit-module-maps -DMODULES -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t.ll -mllvm -debug-only=pchcontainer &>%t-mod.ll
 // RUN: cat %t-mod.ll | FileCheck %s
 // RUN: cat %t-mod.ll | FileCheck --check-prefix=CHECK-NEG %s
+// RUN: cat %t-mod.ll | FileCheck --check-prefix=CHECK-MOD %s
 
 // PCH:
-// RUN: %clang_cc1 -triple %itanium_abi_triple -x c++ -std=c++11 -emit-pch -fmodule-format=obj -I %S/Inputs -o %t.pch %S/Inputs/DebugCXX.h -mllvm -debug-only=pchcontainer &>%t-pch.ll
+// RUN: %clang_cc1 -triple %itanium_abi_triple -x c++ -std=c++11  -debugger-tuning=lldb -emit-pch -fmodule-format=obj -I %S/Inputs -o %t.pch %S/Inputs/DebugCXX.h -mllvm -debug-only=pchcontainer &>%t-pch.ll
 // RUN: cat %t-pch.ll | FileCheck %s
 // RUN: cat %t-pch.ll | FileCheck --check-prefix=CHECK-NEG %s
 
@@ -18,6 +19,9 @@
 @import DebugCXX;
 #endif
 
+// CHECK-MOD: distinct !DICompileUnit(language: DW_LANG_{{.*}}C_plus_plus,
+// CHECK-MOD: distinct !DICompileUnit(language: DW_LANG_{{.*}}C_plus_plus,
+
 // CHECK: distinct !DICompileUnit(language: DW_LANG_{{.*}}C_plus_plus,
 // CHECK-SAME:                    isOptimized: false,
 // CHECK-NOT:                     splitDebugFilename:
@@ -27,6 +31,8 @@
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX4EnumE")
 // CHECK: !DINamespace(name: "DebugCXX"
 
+// CHECK-MOD: ![[DEBUGCXX:.*]] = !DIModule(scope: null, name: "DebugCXX
+
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
 // CHECK-NOT:              name:
 // CHECK-SAME:             )
@@ -150,4 +156,11 @@
 // CHECK-SAME:                             name: "WithSpecializedBase<float>",
 // CHECK-SAME:                             flags: DIFlagFwdDecl,
 
+// CHECK-MOD: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: ![[DEBUGCXX]],
+// CHECK-MOD-SAME:              entity: ![[DUMMY:[0-9]+]],
+// CHECK-MOD-SAME:              line: 3)
+// CHECK-MOD: ![[DUMMY]] = !DIModule(scope: null, name: "dummy",
+// CHECK-MOD: distinct !DICompileUnit(language: DW_LANG_ObjC_plus_plus,
+// CHECK-MOD-SAME:  splitDebugFilename: "{{.*}}dummy{{.*}}.pcm",
+
 // CHECK-NEG-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "PureForwardDecl"
diff --git a/test/Modules/crash-vfs-path-emptydir-entries.m b/test/Modules/crash-vfs-path-emptydir-entries.m
index d96adbbf99ba..a50ea869f476 100644
--- a/test/Modules/crash-vfs-path-emptydir-entries.m
+++ b/test/Modules/crash-vfs-path-emptydir-entries.m
@@ -8,7 +8,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/i %t/m %t %t/sysroot
-// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+// RUN: cp -R %S/Inputs/crash-recovery/usr %t/i/
 
 // RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
 // RUN: %clang -fsyntax-only %s -I %/t/i -isysroot %/t/sysroot/ \
diff --git a/test/Modules/crash-vfs-path-symlink-component.m b/test/Modules/crash-vfs-path-symlink-component.m
index 4723a77c7e93..565a64fb5c23 100644
--- a/test/Modules/crash-vfs-path-symlink-component.m
+++ b/test/Modules/crash-vfs-path-symlink-component.m
@@ -8,7 +8,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/i %t/m %t %t/sysroot
-// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+// RUN: cp -R %S/Inputs/crash-recovery/usr %t/i/
 // RUN: ln -s include/tcl-private %t/i/usr/x
 
 // RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
diff --git a/test/Modules/crash-vfs-path-symlink-topheader.m b/test/Modules/crash-vfs-path-symlink-topheader.m
index 8e0c2d4c9226..fea1f01c02ae 100644
--- a/test/Modules/crash-vfs-path-symlink-topheader.m
+++ b/test/Modules/crash-vfs-path-symlink-topheader.m
@@ -8,7 +8,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/i %t/m %t %t/sysroot
-// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+// RUN: cp -R %S/Inputs/crash-recovery/usr %t/i/
 // RUN: rm -f %t/i/usr/include/pthread_impl.h
 // RUN: ln -s pthread/pthread_impl.h %t/i/usr/include/pthread_impl.h
 
diff --git a/test/Modules/crash-vfs-umbrella-frameworks.m b/test/Modules/crash-vfs-umbrella-frameworks.m
index 0c3981ddaa88..a18acf576ea3 100644
--- a/test/Modules/crash-vfs-umbrella-frameworks.m
+++ b/test/Modules/crash-vfs-umbrella-frameworks.m
@@ -5,7 +5,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/i %t/m %t
-// RUN: cp -a %S/Inputs/crash-recovery/Frameworks %t/i/
+// RUN: cp -R %S/Inputs/crash-recovery/Frameworks %t/i/
 // RUN: mkdir -p %t/i/Frameworks/A.framework/Frameworks
 // RUN: ln -s ../../B.framework %t/i/Frameworks/A.framework/Frameworks/B.framework
 
diff --git a/test/Modules/modify-module.m b/test/Modules/modify-module.m
index f9a70b25c877..d59a4799104f 100644
--- a/test/Modules/modify-module.m
+++ b/test/Modules/modify-module.m
@@ -3,9 +3,9 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/include
-// RUN: cp %S/Inputs/Modified/A.h %t/include
-// RUN: cp %S/Inputs/Modified/B.h %t/include
-// RUN: cp %S/Inputs/Modified/module.map %t/include
+// RUN: cat %S/Inputs/Modified/A.h > %t/include/A.h
+// RUN: cat %S/Inputs/Modified/B.h > %t/include/B.h
+// RUN: cat %S/Inputs/Modified/module.map > %t/include/module.map
 // RUN: %clang_cc1 -fdisable-module-hash -fmodules-cache-path=%t/cache -fmodules -fimplicit-module-maps -I %t/include %s -verify
 // RUN: echo '' >> %t/include/B.h
 // RUN: %clang_cc1 -fdisable-module-hash -fmodules-cache-path=%t/cache -fmodules -fimplicit-module-maps -I %t/include %s -verify
diff --git a/test/Modules/odr_hash-Friend.cpp b/test/Modules/odr_hash-Friend.cpp
new file mode 100644
index 000000000000..39c0c4b762c2
--- /dev/null
+++ b/test/Modules/odr_hash-Friend.cpp
@@ -0,0 +1,22 @@
+// RUN: rm -rf %t
+
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t/modules.cache \
+// RUN:  -I %S/Inputs/odr_hash-Friend \
+// RUN:  -emit-obj -o /dev/null \
+// RUN:  -fmodules \
+// RUN:  -fimplicit-module-maps \
+// RUN:  -fmodules-cache-path=%t/modules.cache \
+// RUN:  -std=c++11 -x c++ %s -verify
+
+// PR35939: MicrosoftMangle.cpp triggers an assertion failure on this test.
+// UNSUPPORTED: system-windows
+
+// expected-no-diagnostics
+
+#include "Box.h"
+#include "M1.h"
+#include "M3.h"
+
+void Run() {
+  Box<> Present;
+}
diff --git a/test/Modules/odr_hash-Unresolved.cpp b/test/Modules/odr_hash-Unresolved.cpp
new file mode 100644
index 000000000000..0c73b50f7085
--- /dev/null
+++ b/test/Modules/odr_hash-Unresolved.cpp
@@ -0,0 +1,14 @@
+// RUN: rm -rf %t
+
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t/modules.cache \
+// RUN:  -I %S/Inputs/odr_hash-Unresolved \
+// RUN:  -fmodules \
+// RUN:  -fimplicit-module-maps \
+// RUN:  -fmodules-cache-path=%t/modules.cache \
+// RUN:  -fmodules-local-submodule-visibility \
+// RUN:  -std=c++11 -x c++ %s -fsyntax-only
+
+// Note: There is no -verify in the run line because some error messages are
+// not captured from the module building stage.
+
+#include "Module2/include.h"
diff --git a/test/Modules/odr_hash-blocks.cpp b/test/Modules/odr_hash-blocks.cpp
new file mode 100644
index 000000000000..07dfa4ce2ac8
--- /dev/null
+++ b/test/Modules/odr_hash-blocks.cpp
@@ -0,0 +1,119 @@
+// Clear and create directories
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: mkdir %t/cache
+// RUN: mkdir %t/Inputs
+
+// Build first header file
+// RUN: echo "#define FIRST" >> %t/Inputs/first.h
+// RUN: cat %s               >> %t/Inputs/first.h
+
+// Build second header file
+// RUN: echo "#define SECOND" >> %t/Inputs/second.h
+// RUN: cat %s                >> %t/Inputs/second.h
+
+// Test that each header can compile
+// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++11 -fblocks %t/Inputs/first.h
+// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++11 -fblocks %t/Inputs/second.h
+
+// Build module map file
+// RUN: echo "module FirstModule {"     >> %t/Inputs/module.map
+// RUN: echo "    header \"first.h\""   >> %t/Inputs/module.map
+// RUN: echo "}"                        >> %t/Inputs/module.map
+// RUN: echo "module SecondModule {"    >> %t/Inputs/module.map
+// RUN: echo "    header \"second.h\""  >> %t/Inputs/module.map
+// RUN: echo "}"                        >> %t/Inputs/module.map
+
+// Run test
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps \
+// RUN:            -fmodules-cache-path=%t/cache -x c++ -I%t/Inputs \
+// RUN:            -verify %s -std=c++11 -fblocks
+
+#if !defined(FIRST) && !defined(SECOND)
+#include "first.h"
+#include "second.h"
+#endif
+
+// Used for testing
+#if defined(FIRST)
+#define ACCESS public:
+#elif defined(SECOND)
+#define ACCESS private:
+#endif
+
+// TODO: S1, S2, and S3 should generate errors.
+namespace Blocks {
+#if defined(FIRST)
+struct S1 {
+  void (^block)(int x) = ^(int x) { };
+};
+#elif defined(SECOND)
+struct S1 {
+  void (^block)(int x) = ^(int y) { };
+};
+#else
+S1 s1;
+#endif
+
+#if defined(FIRST)
+struct S2 {
+  int (^block)(int x) = ^(int x) { return x + 1; };
+};
+#elif defined(SECOND)
+struct S2 {
+  int (^block)(int x) = ^(int x) { return x; };
+};
+#else
+S2 s2;
+#endif
+
+#if defined(FIRST)
+struct S3 {
+  void run(int (^block)(int x));
+};
+#elif defined(SECOND)
+struct S3 {
+  void run(int (^block)(int x, int y));
+};
+#else
+S3 s3;
+#endif
+
+#define DECLS                                       \
+  int (^block)(int x) = ^(int x) { return x + x; }; \
+  void run(int (^block)(int x, int y));
+
+#if defined(FIRST) || defined(SECOND)
+struct Valid1 {
+  DECLS
+};
+#else
+Valid1 v1;
+#endif
+
+#if defined(FIRST) || defined(SECOND)
+struct Invalid1 {
+  DECLS
+  ACCESS
+};
+#else
+Invalid1 i1;
+// expected-error@second.h:* {{'Blocks::Invalid1' has different definitions in different modules; first difference is definition in module 'SecondModule' found private access specifier}}
+// expected-note@first.h:* {{but in 'FirstModule' found public access specifier}}
+#endif
+
+#undef DECLS
+}
+
+// Keep macros contained to one file.
+#ifdef FIRST
+#undef FIRST
+#endif
+
+#ifdef SECOND
+#undef SECOND
+#endif
+
+#ifdef ACCESS
+#undef ACCESS
+#endif
diff --git a/test/Modules/shadow.m b/test/Modules/shadow.m
new file mode 100644
index 000000000000..44320af2b0c6
--- /dev/null
+++ b/test/Modules/shadow.m
@@ -0,0 +1,21 @@
+// RUN: rm -rf %t
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs/shadow/A1 -I %S/Inputs/shadow/A2 %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -fmodule-map-file=%S/Inputs/shadow/A2/module.modulemap %s -fsyntax-only 2>&1 | FileCheck %s -check-prefix=REDEFINITION
+// REDEFINITION: error: redefinition of module 'A'
+// REDEFINITION: note: previously defined
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fmodule-map-file=%S/Inputs/shadow/A1/module.modulemap -I %S/Inputs/shadow %s -verify
+
+@import A1;
+@import A2;
+@import A;
+
+#import "A2/A.h" // expected-note {{implicitly imported}}
+// expected-error@A2/module.modulemap:1 {{import of shadowed module 'A'}}
+// expected-note@A1/module.modulemap:1 {{previous definition}}
+
+#if defined(A2_A_h)
+#error got the wrong definition of module A
+#elif !defined(A1_A_h)
+#error missing definition from A1
+#endif
diff --git a/test/Modules/shadowed-submodule.m b/test/Modules/shadowed-submodule.m
new file mode 100644
index 000000000000..c9c77bd13a1c
--- /dev/null
+++ b/test/Modules/shadowed-submodule.m
@@ -0,0 +1,5 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs/shadowed-submodule/Foo -I %S/Inputs/shadowed-submodule/A2 %s -verify
+
+@import Foo; // expected-error {{module 'A' was built in directory}}
+             // expected-note@shadowed-submodule.m:4 {{imported by module 'Foo'}}
diff --git a/test/Modules/templates.mm b/test/Modules/templates.mm
index 79e975a1ad12..9c3f327acb71 100644
--- a/test/Modules/templates.mm
+++ b/test/Modules/templates.mm
@@ -77,10 +77,10 @@ unsigned testMixedStruct() {
   // CHECK: %[[l:.*]] = alloca %[[ListInt:[^ ]*]], align 8
   // CHECK: %[[r:.*]] = alloca %[[ListInt]], align 8
 
-  // CHECK: call {{.*}}memcpy{{.*}}(i8* %{{.*}}, i8* bitcast ({{.*}}* @_ZZ15testMixedStructvE1l to i8*), i64 16,
+  // CHECK: call {{.*}}memcpy{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* align {{[0-9]+}} bitcast ({{.*}}* @_ZZ15testMixedStructvE1l to i8*), i64 16,
   ListInt_left l{0, 1};
 
-  // CHECK: call {{.*}}memcpy{{.*}}(i8* %{{.*}}, i8* bitcast ({{.*}}* @_ZZ15testMixedStructvE1r to i8*), i64 16,
+  // CHECK: call {{.*}}memcpy{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* align {{[0-9]+}} bitcast ({{.*}}* @_ZZ15testMixedStructvE1r to i8*), i64 16,
   ListInt_right r{0, 2};
 
   // CHECK: call void @_Z10useListIntR4ListIiE(%[[ListInt]]* dereferenceable({{[0-9]+}}) %[[l]])
diff --git a/test/Modules/using-decl-friend-2.cpp b/test/Modules/using-decl-friend-2.cpp
new file mode 100644
index 000000000000..0c71c37765f3
--- /dev/null
+++ b/test/Modules/using-decl-friend-2.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -fmodules %s -verify
+// expected-no-diagnostics
+
+#pragma clang module build A
+module A {}
+#pragma clang module contents
+#pragma clang module begin A
+namespace N { class X; }
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module build B
+module B {}
+#pragma clang module contents
+#pragma clang module begin B
+namespace N { class Friendly { friend class X; }; }
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module build C
+module C {}
+#pragma clang module contents
+#pragma clang module begin C
+#pragma clang module import A
+void use_X(N::X *p);
+#pragma clang module import B
+// UsingShadowDecl names the friend declaration
+using N::X;
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module import B
+namespace N { class AlsoFriendly { friend class X; }; }
+#pragma clang module import A
+#pragma clang module import C
+// The friend declaration from N::Friendly is now the first in the redecl
+// chain, so is not ordinarily visible. We need the IDNS of the UsingShadowDecl
+// to still consider it to be visible, though.
+X *p;
diff --git a/test/Modules/using-decl-friend.cpp b/test/Modules/using-decl-friend.cpp
new file mode 100644
index 000000000000..f11d0021a8ad
--- /dev/null
+++ b/test/Modules/using-decl-friend.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fmodules %s -verify
+// expected-no-diagnostics
+
+#pragma clang module build A
+module A {}
+#pragma clang module contents
+#pragma clang module begin A
+namespace N {
+  class X;
+}
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module build B
+module B {
+  module X {}
+  module Y {}
+}
+#pragma clang module contents
+#pragma clang module begin B.X
+namespace N {
+  class Friendly {
+    friend class X;
+  };
+}
+#pragma clang module end
+#pragma clang module begin B.Y
+namespace N {
+  class X;
+}
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module import A
+#pragma clang module import B.X
+using N::X;
+X *p;
diff --git a/test/OpenMP/atomic_write_codegen.c b/test/OpenMP/atomic_write_codegen.c
index 4d5ad1e27d7e..242708cd7cf5 100644
--- a/test/OpenMP/atomic_write_codegen.c
+++ b/test/OpenMP/atomic_write_codegen.c
@@ -142,7 +142,7 @@ int main() {
   dx = dv;
 // CHECK: [[LD:%.+]] = load x86_fp80, x86_fp80*
 // CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[BITCAST]], i8 0, i64 16, i1 false)
 // CHECK: store x86_fp80 [[LD]], x86_fp80* [[LDTEMP]]
 // CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i128*
 // CHECK: [[LD:%.+]] = load i128, i128* [[BITCAST]]
@@ -231,7 +231,7 @@ int main() {
 // CHECK: load i64, i64*
 // CHECK: [[VAL:%.+]] = uitofp i64 %{{.+}} to x86_fp80
 // CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[BITCAST]], i8 0, i64 16, i1 false)
 // CHECK: store x86_fp80 [[VAL]], x86_fp80* [[TEMP]]
 // CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP]] to i128*
 // CHECK: [[VAL:%.+]] = load i128, i128* [[BITCAST]]
diff --git a/test/OpenMP/distribute_firstprivate_codegen.cpp b/test/OpenMP/distribute_firstprivate_codegen.cpp
index 8404def86e5c..29d44fde92f2 100644
--- a/test/OpenMP/distribute_firstprivate_codegen.cpp
+++ b/test/OpenMP/distribute_firstprivate_codegen.cpp
@@ -255,7 +255,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
 
 // init s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
@@ -270,7 +270,7 @@ int main() {
 // CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
 // CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
 // CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
@@ -283,7 +283,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
 // init svar
@@ -350,7 +350,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
 
 // init s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
@@ -365,7 +365,7 @@ int main() {
 // CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
 // CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
 // CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
@@ -378,7 +378,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
 
 // CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}},
diff --git a/test/OpenMP/distribute_lastprivate_codegen.cpp b/test/OpenMP/distribute_lastprivate_codegen.cpp
index 115e91a02a7f..66a860b7152e 100644
--- a/test/OpenMP/distribute_lastprivate_codegen.cpp
+++ b/test/OpenMP/distribute_lastprivate_codegen.cpp
@@ -256,7 +256,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -269,7 +269,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -280,7 +280,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -289,7 +289,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -358,7 +358,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -371,7 +371,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -382,7 +382,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -391,6 +391,6 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 #endif
diff --git a/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
index 70bc8eb7c6cc..e4915acf69b6 100644
--- a/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
+++ b/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -328,7 +328,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -345,7 +345,7 @@ int main() {
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // svar
@@ -421,7 +421,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -438,7 +438,7 @@ int main() {
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -510,7 +510,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -527,7 +527,7 @@ int main() {
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -591,7 +591,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -608,7 +608,7 @@ int main() {
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
diff --git a/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
index 4c5903182078..dda8c932908e 100644
--- a/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
+++ b/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -349,7 +349,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -360,7 +360,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -369,7 +369,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -441,7 +441,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -455,7 +455,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -466,7 +466,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -475,7 +475,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -549,7 +549,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -560,7 +560,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -569,7 +569,7 @@ int main() {
 // CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
 // CHECK: ret void
 
 // outlined function for 'parallel for'
@@ -635,7 +635,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -649,7 +649,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -660,7 +660,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -669,7 +669,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 #endif
diff --git a/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
index e40ab461e0a3..1447b5ff2ee2 100644
--- a/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -326,7 +326,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -343,7 +343,7 @@ int main() {
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // svar
@@ -419,7 +419,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -436,7 +436,7 @@ int main() {
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -508,7 +508,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -525,7 +525,7 @@ int main() {
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -589,7 +589,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
 // CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
 
 // s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
@@ -606,7 +606,7 @@ int main() {
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
diff --git a/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp b/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
index f47a6d014dc0..af74ab7b72a5 100644
--- a/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -359,7 +359,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -370,7 +370,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -379,7 +379,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -451,7 +451,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -465,7 +465,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -476,7 +476,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -485,7 +485,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -559,7 +559,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -570,7 +570,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -579,7 +579,7 @@ int main() {
 // CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
 // CHECK: ret void
 
 // outlined function for 'parallel for'
@@ -645,7 +645,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -659,7 +659,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -670,7 +670,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -679,7 +679,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
index 3f45c6299271..919a12284131 100644
--- a/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
@@ -257,7 +257,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
 
 // init s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
@@ -272,7 +272,7 @@ int main() {
 // CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
 // CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
 // CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
@@ -285,7 +285,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
 // init svar
@@ -352,7 +352,7 @@ int main() {
 // CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
 // CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
 
 // init s_arr
 // CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
@@ -367,7 +367,7 @@ int main() {
 // CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
 // CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
 // CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
 // CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
@@ -380,7 +380,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
 
 // CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}},
diff --git a/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
index 6935deae8c4c..cde7c81d50f9 100644
--- a/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
@@ -267,7 +267,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -280,7 +280,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -291,7 +291,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -300,7 +300,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -369,7 +369,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -382,7 +382,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -393,7 +393,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -402,7 +402,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/OpenMP/distribute_simd_loop_messages.cpp b/test/OpenMP/distribute_simd_loop_messages.cpp
index 1ae28a97872e..1977ca789dcf 100644
--- a/test/OpenMP/distribute_simd_loop_messages.cpp
+++ b/test/OpenMP/distribute_simd_loop_messages.cpp
@@ -324,9 +324,7 @@ int test_iteration_spaces() {
 
   #pragma omp target
   #pragma omp teams
-  // expected-error@+3 {{unexpected OpenMP clause 'shared' in directive '#pragma omp distribute simd'}}
-  // expected-note@+2  {{defined as shared}}
-  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be shared, predetermined as linear}}
+  // expected-error@+1 {{unexpected OpenMP clause 'shared' in directive '#pragma omp distribute simd'}}
   #pragma omp distribute simd shared(ii)
   for (ii = 0; ii < 10; ii++)
     c[ii] = a[ii];
diff --git a/test/OpenMP/flush_codegen.cpp b/test/OpenMP/flush_codegen.cpp
index 050936bb99d9..543524c220be 100644
--- a/test/OpenMP/flush_codegen.cpp
+++ b/test/OpenMP/flush_codegen.cpp
@@ -35,4 +35,6 @@ int main() {
 // CHECK: call {{.*}}void @__kmpc_flush(%{{.+}}* {{(@|%).+}})
 // CHECK: ret
 
+// CHECK-NOT: line: 0,
+
 #endif
diff --git a/test/OpenMP/for_firstprivate_codegen.cpp b/test/OpenMP/for_firstprivate_codegen.cpp
index adb33ec9703e..fc3caa9a2603 100644
--- a/test/OpenMP/for_firstprivate_codegen.cpp
+++ b/test/OpenMP/for_firstprivate_codegen.cpp
@@ -241,7 +241,7 @@ int main() {
 
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -309,7 +309,7 @@ int main() {
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
diff --git a/test/OpenMP/for_lastprivate_codegen.cpp b/test/OpenMP/for_lastprivate_codegen.cpp
index 2ef3dc70debf..0061de632380 100644
--- a/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/test/OpenMP/for_lastprivate_codegen.cpp
@@ -523,7 +523,7 @@ int main() {
 // original vec[]=private_vec[];
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // original s_arr[]=private_s_arr[];
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -752,7 +752,7 @@ int main() {
 // original vec[]=private_vec[];
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // original s_arr[]=private_s_arr[];
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
diff --git a/test/OpenMP/for_misc_messages.c b/test/OpenMP/for_misc_messages.c
index ffcc639e8ebe..901d66dd688c 100644
--- a/test/OpenMP/for_misc_messages.c
+++ b/test/OpenMP/for_misc_messages.c
@@ -54,6 +54,20 @@ void test_invalid_clause() {
 #pragma omp for foo bar
   for (i = 0; i < 16; ++i)
     ;
+// At one time, this failed an assert.
+// expected-error@+1 {{unexpected OpenMP clause 'num_teams' in directive '#pragma omp for'}}
+#pragma omp for num_teams(3)
+  for (i = 0; i < 16; ++i)
+    ;
+// At one time, this error was reported twice.
+// expected-error@+1 {{unexpected OpenMP clause 'uniform' in directive '#pragma omp for'}}
+#pragma omp for uniform
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{unexpected OpenMP clause 'if' in directive '#pragma omp for'}}
+#pragma omp for if(0)
+  for (i = 0; i < 16; ++i)
+    ;
 }
 
 void test_non_identifiers() {
diff --git a/test/OpenMP/for_reduction_codegen.cpp b/test/OpenMP/for_reduction_codegen.cpp
index b0e9c59cef60..323692af28a1 100644
--- a/test/OpenMP/for_reduction_codegen.cpp
+++ b/test/OpenMP/for_reduction_codegen.cpp
@@ -352,7 +352,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1 = var1.operator &&(var1_reduction);
 // CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
@@ -368,7 +368,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1 = min(t_var1, t_var1_reduction);
 // CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
@@ -406,7 +406,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // var1 = var1.operator &&(var1_reduction);
@@ -424,7 +424,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // t_var1 = min(t_var1, t_var1_reduction);
@@ -510,7 +510,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1_lhs = var1_lhs.operator &&(var1_rhs);
 // CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
@@ -526,7 +526,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
 // CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
@@ -629,7 +629,7 @@ int main() {
 // CHECK: phi [[S_FLOAT_TY]]*
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
 
@@ -659,7 +659,7 @@ int main() {
 // CHECK: call void @__kmpc_critical(
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
@@ -732,7 +732,7 @@ int main() {
 // CHECK: phi [[S_FLOAT_TY]]*
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
 
@@ -823,7 +823,7 @@ int main() {
 // CHECK: phi [[S_FLOAT_TY]]*
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
 
@@ -853,7 +853,7 @@ int main() {
 // CHECK: call void @__kmpc_critical(
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
@@ -922,7 +922,7 @@ int main() {
 // CHECK: phi [[S_FLOAT_TY]]*
 // CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
 // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
 // CHECK: br i1 [[DONE]],
 
@@ -1292,7 +1292,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1 = var1.operator &&(var1_reduction);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
@@ -1308,7 +1308,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1 = min(t_var1, t_var1_reduction);
 // CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
@@ -1334,7 +1334,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // var1 = var1.operator &&(var1_reduction);
@@ -1352,7 +1352,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // t_var1 = min(t_var1, t_var1_reduction);
@@ -1419,7 +1419,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1_lhs = var1_lhs.operator &&(var1_rhs);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
@@ -1435,7 +1435,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
 // CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
diff --git a/test/OpenMP/for_simd_codegen.cpp b/test/OpenMP/for_simd_codegen.cpp
index 705a8b308a31..3cd06d44d635 100644
--- a/test/OpenMP/for_simd_codegen.cpp
+++ b/test/OpenMP/for_simd_codegen.cpp
@@ -690,4 +690,5 @@ void parallel_simd(float *a) {
     a[i] += bar();
 }
 // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
+// TERM_DEBUG-NOT: line: 0,
 #endif // HEADER
diff --git a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
index f750be03e052..2ca2073b5c20 100644
--- a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
+++ b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -72,17 +72,17 @@ int foo(int n, double *ptr) {
   //  firstprivate(b): memcpy(b_priv,b_in)
   // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
   // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_ADDR_REF_BCAST]], {{.+}})
 
   // firstprivate(c)
   // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
   // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
 
   // firstprivate(d)
   // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
   // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}})
 
   // TCHECK: load i16, i16* [[A2_ADDR]],
 
@@ -151,7 +151,7 @@ static int fstatic(int n) {
 // firstprivate(b)
 // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
 // TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
 
 // TCHECK:  ret void
 
@@ -214,8 +214,14 @@ int bar(int n, double *ptr) {
 // firstprivate(b)
 // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
 // TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
 
 // TCHECK: ret void
 
 #endif
+
+// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
+// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
+// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
+// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
+// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
diff --git a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index c4c3e977b0e2..d636240f44d0 100644
--- a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -168,9 +168,9 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
@@ -405,9 +405,9 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
@@ -714,18 +714,18 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
   //
   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
diff --git a/test/OpenMP/nvptx_target_teams_codegen.cpp b/test/OpenMP/nvptx_target_teams_codegen.cpp
index 8b016cc307d5..ff5967b5612b 100644
--- a/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -121,9 +121,7 @@ int bar(int n){
   // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align
   // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[A_ADDR_T:%.+]], align
   // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[A_ADDR_T]] to i8*
-  // CHECK: [[LD_CONV2:%.+]] = load i8, i8* [[CONV2]],
-  // CHECK: store i8 [[LD_CONV2]], i8* [[A_PRIV:%[^,]+]],
-  // CHECK: store i8 49, i8* [[A_PRIV]], align
+  // CHECK: store i8 49, i8* [[CONV2]], align
   // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
   // CHECK: [[TERMINATE]]
@@ -209,9 +207,7 @@ int bar(int n){
   // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align
   // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[AA_ADDR_T:%.+]], align
   // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[AA_ADDR_T]] to i16*
-  // CHECK: [[LD_CONV2:%.+]] = load i16, i16* [[CONV2]],
-  // CHECK: store i16 [[LD_CONV2]], i16* [[A_PRIV:%[^,]+]],
-  // CHECK: store i16 1, i16* [[A_PRIV]], align
+  // CHECK: store i16 1, i16* [[CONV2]], align
   // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
   // CHECK: [[TERMINATE]]
diff --git a/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index d77231807fb4..696940bcf14f 100644
--- a/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -168,9 +168,9 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
@@ -249,9 +249,9 @@ int bar(int n){
   // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]]
   // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
   // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
+  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
   // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align
   //
   // CHECK: ret
@@ -298,25 +298,15 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
   // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
   // CHECK: br label {{%?}}[[REDUCE_CONT]]
   //
   // CHECK: [[REDUCE_CONT]]
   // CHECK: ret
 
-
-
-
-
-
-
-
-
-
-
   // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
 
   // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
@@ -480,9 +470,9 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
@@ -617,9 +607,9 @@ int bar(int n){
   // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
   // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
   // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
+  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
   // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align
   //
   // CHECK: ret
@@ -690,24 +680,15 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
   // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
   // CHECK: br label {{%?}}[[REDUCE_CONT]]
   //
   // CHECK: [[REDUCE_CONT]]
   // CHECK: ret
 
-
-
-
-
-
-
-
-
-
   // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
 
   // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
@@ -903,18 +884,18 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
   //
   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
@@ -1035,9 +1016,9 @@ int bar(int n){
   // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
   // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
   // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
   // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
   //
   // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4
@@ -1053,9 +1034,9 @@ int bar(int n){
   // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]]
   // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
   // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
-  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+  // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
   // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align
   //
   // CHECK: ret
@@ -1121,18 +1102,18 @@ int bar(int n){
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
   //
   // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
   // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
   // CHECK: br label {{%?}}[[REDUCE_CONT]]
   //
diff --git a/test/OpenMP/openmp_win_codegen.cpp b/test/OpenMP/openmp_win_codegen.cpp
index 9efc7dea948b..249832b6a8d6 100644
--- a/test/OpenMP/openmp_win_codegen.cpp
+++ b/test/OpenMP/openmp_win_codegen.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-windows-msvc18.0.0 -std=c++11 -fms-compatibility-version=18 -fms-extensions -emit-llvm %s -fexceptions -fcxx-exceptions -o - -O1 | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-windows-msvc18.0.0 -std=c++11 -fms-compatibility-version=18 -fms-extensions -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-pc-windows-msvc18.0.0 -std=c++11 -fms-compatibility-version=18 -fms-extensions -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 
-// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-pc-windows-msvc18.0.0 -std=c++11 -fms-compatibility-version=18 -fms-extensions -emit-llvm %s -fexceptions -fcxx-exceptions -o - -O1 | FileCheck --check-prefix SIMD-ONLY0 %s
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
-// REQUIRES: x86-registered-target
 // expected-no-diagnostics
 
 void foo();
@@ -52,13 +52,19 @@ int main() {
 // CHECK: define internal void [[OUTLINED]](
 // CHECK: [[GID:%.+]] = {{.*}}call i32 @__kmpc_global_thread_num(%ident_t* {{.*}}@0)
 // CHECK: invoke void @{{.+}}foo
-// CHECK: catchswitch within
-// CHECK: catchpad within
+// CHECK: [[CATCHSWITCH:%.+]] = catchswitch within none
+// CHECK: [[CATCHPAD:%.+]] = catchpad within [[CATCHSWITCH]]
 // CHECK: call void @__kmpc_critical(%ident_t* {{.*}}@0, i32 [[GID]],
 // CHECK: invoke void @{{.+}}bar
 // CHECK: call void @__kmpc_end_critical(%ident_t* {{.*}}@0, i32 [[GID]],
-// CHECK: catchret from
-// CHECK: cleanuppad within
-// CHECK: call void @__kmpc_end_critical(%ident_t* {{.*}}@0, i32 [[GID]],
-// CHECK: cleanupret from
-
+// CHECK: catchret from [[CATCHPAD]] to
+// CHECK:      cleanuppad within [[CATCHPAD]] []
+// CHECK-NEXT: call void @__kmpc_end_critical(%ident_t* {{.*}}@0, i32 [[GID]],
+// CHECK-NEXT: cleanupret from {{.*}} unwind label %[[CATCHTERM:[^ ]+]]
+// CHECK:      cleanuppad within none []
+// CHECK-NEXT: call void @"\01?terminate@@YAXXZ"() #5 [ "funclet"(token %{{.*}}) ]
+// CHECK-NEXT: unreachable
+// CHECK:      [[CATCHTERM]]
+// CHECK-NEXT: cleanuppad within [[CATCHPAD]] []
+// CHECK-NEXT: call void @"\01?terminate@@YAXXZ"() #5 [ "funclet"(token %{{.*}}) ]
+// CHECK-NEXT: unreachable
diff --git a/test/OpenMP/ordered_doacross_codegen.cpp b/test/OpenMP/ordered_doacross_codegen.cpp
index 8233bb68db6b..60b0353b711e 100644
--- a/test/OpenMP/ordered_doacross_codegen.cpp
+++ b/test/OpenMP/ordered_doacross_codegen.cpp
@@ -24,7 +24,7 @@ int main() {
 // CHECK: icmp
 // CHECK-NEXT: br i1 %
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 [[CAST]], i8 0, i64 24, i1 false)
 // CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
 // CHECK: store i64 %{{.+}}, i64* %
 // CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
@@ -81,7 +81,7 @@ struct TestStruct {
 // CHECK: icmp
 // CHECK-NEXT: br i1 %
 // CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 [[CAST]], i8 0, i64 24, i1 false)
 // CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
 // CHECK: store i64 %{{.+}}, i64* %
 // CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
diff --git a/test/OpenMP/parallel_codegen.cpp b/test/OpenMP/parallel_codegen.cpp
index a91b41e6b713..9d9bb7151f69 100644
--- a/test/OpenMP/parallel_codegen.cpp
+++ b/test/OpenMP/parallel_codegen.cpp
@@ -47,7 +47,7 @@ int main (int argc, char **argv) {
 // CHECK-DEBUG:       [[LOC_2_ADDR:%.+]] = alloca %ident_t
 // CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
 // CHECK-DEBUG-NEXT:  [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i32 8, i1 false)
+// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i1 false)
 // CHECK-DEBUG:       store i32 %argc, i32* [[ARGC_ADDR:%.+]],
 // CHECK-DEBUG:       [[VLA:%.+]] = alloca i32, i64 [[VLA_SIZE:%[^,]+]],
 // CHECK-DEBUG:       [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
@@ -97,7 +97,7 @@ int main (int argc, char **argv) {
 // CHECK-DEBUG-DAG:   [[LOC_2_ADDR:%.+]] = alloca %ident_t
 // CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
 // CHECK-DEBUG-NEXT:  [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i32 8, i1 false)
+// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i1 false)
 // CHECK-DEBUG-NEXT:  store i8** %argc, i8*** [[ARGC_ADDR:%.+]],
 // CHECK-DEBUG:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
 // CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
diff --git a/test/OpenMP/parallel_copyin_codegen.cpp b/test/OpenMP/parallel_copyin_codegen.cpp
index 66af9ae56efd..d916e5d14c42 100644
--- a/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/test/OpenMP/parallel_copyin_codegen.cpp
@@ -294,10 +294,10 @@ int main() {
 
 // threadprivate_vec = vec;
 // CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VEC]]
-// CHECK: call void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}}  %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF2]] to i8*
-// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*), i8* [[MASTER_CAST]]
+// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
 
 // threadprivate_s_arr = s_arr;
 // CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[S_ARR]]
@@ -415,10 +415,10 @@ int main() {
 
 // threadprivate_vec = vec;
 // CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VEC]]
-// CHECK: call {{.*}}void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
+// CHECK: call {{.*}}void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
 
 // TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF1]] to i8*
-// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*), i8* [[MASTER_CAST]]
+// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
 
 // threadprivate_s_arr = s_arr;
 // CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_S_ARR]]
@@ -508,13 +508,13 @@ void array_func() {
   static St s[2];
 // ARRAY: @__kmpc_fork_call(
 // ARRAY: call i8* @__kmpc_threadprivate_cached(
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* bitcast ([2 x i32]* @{{.+}} to i8*), i64 8, i32 4, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 bitcast ([2 x i32]* @{{.+}} to i8*), i64 8, i1 false)
 // ARRAY: call dereferenceable(8) %struct.St* @{{.+}}(%struct.St* %{{.+}}, %struct.St* dereferenceable(8) %{{.+}})
 
 // TLS-ARRAY: @__kmpc_fork_call(
 // TLS-ARRAY: [[REFT:%.+]] = load [2 x i32]*, [2 x i32]** [[ADDR:%.+]],
 // TLS-ARRAY: [[REF:%.+]] = bitcast [2 x i32]* [[REFT]] to i8*
-// TLS-ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([2 x i32]* @{{.+}} to i8*), i8* [[REF]], i64 8, i32 4, i1 false)
+// TLS-ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 bitcast ([2 x i32]* @{{.+}} to i8*), i8* align 4 [[REF]], i64 8, i1 false)
 // TLS-ARRAY: call dereferenceable(8) %struct.St* @{{.+}}(%struct.St* %{{.+}}, %struct.St* dereferenceable(8) %{{.+}})
 
 #pragma omp threadprivate(a, s)
diff --git a/test/OpenMP/parallel_firstprivate_codegen.cpp b/test/OpenMP/parallel_firstprivate_codegen.cpp
index e26173e46fa7..6772f61779ef 100644
--- a/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -366,7 +366,7 @@ int main() {
 // CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
@@ -459,7 +459,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 128 [[VEC_DEST]], i8* align 128 [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i1
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
@@ -507,7 +507,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* %{{.+}}, i64 [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
 #pragma omp parallel firstprivate(a, s, vla1, vla2)
   s[0].St_func(s, n, vla1);
   ;
@@ -523,7 +523,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* %{{.+}}, i64 [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
 #endif
 
 
diff --git a/test/OpenMP/parallel_for_simd_codegen.cpp b/test/OpenMP/parallel_for_simd_codegen.cpp
index 7f222df0dea8..2c9d9a9ea94b 100644
--- a/test/OpenMP/parallel_for_simd_codegen.cpp
+++ b/test/OpenMP/parallel_for_simd_codegen.cpp
@@ -685,5 +685,6 @@ void parallel_simd(float *a) {
     a[i] += bar();
 }
 // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
+// TERM_DEBUG-NOT: line: 0,
 #endif // HEADER
 
diff --git a/test/OpenMP/parallel_messages.cpp b/test/OpenMP/parallel_messages.cpp
index 959ce11be883..8b0c0353c20d 100644
--- a/test/OpenMP/parallel_messages.cpp
+++ b/test/OpenMP/parallel_messages.cpp
@@ -31,6 +31,8 @@ int main(int argc, char **argv) {
   foo();
   L1:
     foo();
+  #pragma omp parallel ordered // expected-error {{unexpected OpenMP clause 'ordered' in directive '#pragma omp parallel'}}
+    ;
   #pragma omp parallel
   ;
   #pragma omp parallel
diff --git a/test/OpenMP/parallel_reduction_codegen.cpp b/test/OpenMP/parallel_reduction_codegen.cpp
index 3dab2e915feb..c064ed071210 100644
--- a/test/OpenMP/parallel_reduction_codegen.cpp
+++ b/test/OpenMP/parallel_reduction_codegen.cpp
@@ -414,7 +414,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1 = var1.operator &&(var1_reduction);
 // CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
@@ -430,7 +430,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1 = min(t_var1, t_var1_reduction);
 // CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
@@ -468,7 +468,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // var1 = var1.operator &&(var1_reduction);
@@ -486,7 +486,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // t_var1 = min(t_var1, t_var1_reduction);
@@ -568,7 +568,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1_lhs = var1_lhs.operator &&(var1_rhs);
 // CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
@@ -584,7 +584,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
 // CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
@@ -737,7 +737,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1 = var1.operator &&(var1_reduction);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
@@ -752,7 +752,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1 = min(t_var1, t_var1_reduction);
 // CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
@@ -778,7 +778,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // var1 = var1.operator &&(var1_reduction);
@@ -796,7 +796,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // t_var1 = min(t_var1, t_var1_reduction);
@@ -864,7 +864,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1_lhs = var1_lhs.operator &&(var1_rhs);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
@@ -880,7 +880,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
 // CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
diff --git a/test/OpenMP/sections_firstprivate_codegen.cpp b/test/OpenMP/sections_firstprivate_codegen.cpp
index 94b061dd597b..467c10f242c6 100644
--- a/test/OpenMP/sections_firstprivate_codegen.cpp
+++ b/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -225,7 +225,7 @@ int main() {
 
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -292,7 +292,7 @@ int main() {
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
diff --git a/test/OpenMP/sections_lastprivate_codegen.cpp b/test/OpenMP/sections_lastprivate_codegen.cpp
index cb76f16a2eb8..bb6a54f00aef 100644
--- a/test/OpenMP/sections_lastprivate_codegen.cpp
+++ b/test/OpenMP/sections_lastprivate_codegen.cpp
@@ -342,7 +342,7 @@ int main() {
 // original vec[]=private_vec[];
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // original s_arr[]=private_s_arr[];
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
diff --git a/test/OpenMP/sections_reduction_codegen.cpp b/test/OpenMP/sections_reduction_codegen.cpp
index d8871700ced5..c76e3e1b71d6 100644
--- a/test/OpenMP/sections_reduction_codegen.cpp
+++ b/test/OpenMP/sections_reduction_codegen.cpp
@@ -303,7 +303,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1 = var1.operator &&(var1_reduction);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
@@ -319,7 +319,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1 = min(t_var1, t_var1_reduction);
 // CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
@@ -345,7 +345,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // var1 = var1.operator &&(var1_reduction);
@@ -363,7 +363,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 // CHECK: call void @__kmpc_end_critical(
 
 // t_var1 = min(t_var1, t_var1_reduction);
@@ -430,7 +430,7 @@ int main() {
 // CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // var1_lhs = var1_lhs.operator &&(var1_rhs);
 // CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
@@ -446,7 +446,7 @@ int main() {
 // CHECK:  call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
 // CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
 // CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
 
 // t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
 // CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
diff --git a/test/OpenMP/simd_codegen.cpp b/test/OpenMP/simd_codegen.cpp
index ad5679235f6b..883273099ad8 100644
--- a/test/OpenMP/simd_codegen.cpp
+++ b/test/OpenMP/simd_codegen.cpp
@@ -708,5 +708,8 @@ class S8 : private IterDouble, public S7<SS> {
 };
 S8 s8(0);
 
+// TERM_DEBUG-NOT: line: 0,
+// TERM_DEBUG: distinct !DISubprogram(linkageName: "_GLOBAL__sub_I_simd_codegen.cpp",
+
 #endif // HEADER
 
diff --git a/test/OpenMP/simd_loop_messages.cpp b/test/OpenMP/simd_loop_messages.cpp
index 5a2df073abbf..b9d146c2150a 100644
--- a/test/OpenMP/simd_loop_messages.cpp
+++ b/test/OpenMP/simd_loop_messages.cpp
@@ -236,9 +236,7 @@ int test_iteration_spaces() {
   for (ii = 0; ii < 10; ii++)
     c[ii] = a[ii];
 
-  // expected-error@+3 {{unexpected OpenMP clause 'shared' in directive '#pragma omp simd'}}
-  // expected-note@+2  {{defined as shared}}
-  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp simd' directive may not be shared, predetermined as linear}}
+  // expected-error@+1 {{unexpected OpenMP clause 'shared' in directive '#pragma omp simd'}}
   #pragma omp simd shared(ii)
   for (ii = 0; ii < 10; ii++)
     c[ii] = a[ii];
diff --git a/test/OpenMP/single_codegen.cpp b/test/OpenMP/single_codegen.cpp
index cc79beee8a91..108604a74e34 100644
--- a/test/OpenMP/single_codegen.cpp
+++ b/test/OpenMP/single_codegen.cpp
@@ -180,7 +180,7 @@ int main() {
 // CHECK: [[DST_A2_ADDR:%.+]] = load i8*, i8** [[DST_A2_ADDR_REF]],
 // CHECK: [[SRC_A2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
 // CHECK: [[SRC_A2_ADDR:%.+]] = load i8*, i8** [[SRC_A2_ADDR_REF]],
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[DST_A2_ADDR]], i8* [[SRC_A2_ADDR]], i{{[0-9]+}} 2, i{{[0-9]+}} 1, i1 false)
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 1 [[DST_A2_ADDR]], i8* align 1 [[SRC_A2_ADDR]], i{{[0-9]+}} 2, i1 false)
 // CHECK: [[DST_TC2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
 // CHECK: [[DST_TC2_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_TC2_ADDR_REF]],
 // CHECK: [[DST_TC2_ADDR:%.+]] = bitcast i8* [[DST_TC2_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
diff --git a/test/OpenMP/single_firstprivate_codegen.cpp b/test/OpenMP/single_firstprivate_codegen.cpp
index 0256e6ed9015..23b543511b0f 100644
--- a/test/OpenMP/single_firstprivate_codegen.cpp
+++ b/test/OpenMP/single_firstprivate_codegen.cpp
@@ -192,7 +192,7 @@ int main() {
 
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -256,7 +256,7 @@ int main() {
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 
 // firstprivate s_arr(s_arr)
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
diff --git a/test/OpenMP/target_codegen.cpp b/test/OpenMP/target_codegen.cpp
index 4d1d266ef208..3671f44e58d7 100644
--- a/test/OpenMP/target_codegen.cpp
+++ b/test/OpenMP/target_codegen.cpp
@@ -115,7 +115,9 @@ int foo(int n) {
   static long *plocal;
 
   // CHECK:       [[ADD:%.+]] = add nsw i32
-  // CHECK:       [[DEVICE:%.+]] = sext i32 [[ADD]] to i64
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
   // CHECK:       [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null)
   // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
   // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
@@ -128,7 +130,9 @@ int foo(int n) {
   }
 
   // CHECK-DAG:   [[ADD:%.+]] = add nsw i32
-  // CHECK-DAG:   [[DEVICE:%.+]] = sext i32 [[ADD]] to i64
+  // CHECK-DAG:   store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK-DAG:   [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK-DAG:   [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET]], i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -234,13 +238,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[IFELSE:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0))
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -510,13 +514,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[IFELSE:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[IFELSE:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0))
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_codegen_registration.cpp b/test/OpenMP/target_codegen_registration.cpp
index c576e9693e64..80604eae1caf 100644
--- a/test/OpenMP/target_codegen_registration.cpp
+++ b/test/OpenMP/target_codegen_registration.cpp
@@ -123,54 +123,54 @@
 // CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i
 
 // CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
-// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME1]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
-// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME2]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00"
-// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME3]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00"
-// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME4]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00"
-// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME5]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00"
-// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME6]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00"
-// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME7]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00"
-// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME8]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00"
-// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME9]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00"
-// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME10]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00"
-// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME11]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
-// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: @.omp_offloading.entry.[[NAME12]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 
 // TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
-// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME1]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
-// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME2]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00"
-// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME3]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00"
-// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME4]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00"
-// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME5]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00"
-// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME6]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00"
-// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME7]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00"
-// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME8]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00"
-// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME9]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00"
-// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME10]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00"
-// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME11]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
-// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: @.omp_offloading.entry.[[NAME12]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
 
 // CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
 // CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
diff --git a/test/OpenMP/target_depend_codegen.cpp b/test/OpenMP/target_depend_codegen.cpp
new file mode 100644
index 000000000000..d7eb93d87adf
--- /dev/null
+++ b/test/OpenMP/target_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target device(global + a) nowait depend(inout: global, a, bn) if(a)
+  {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target if(0) firstprivate(global) depend(out:global)
+  {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_enter_data_depend_codegen.cpp b/test/OpenMP/target_enter_data_depend_codegen.cpp
index 44501d647dd9..2e4440cd4051 100644
--- a/test/OpenMP/target_enter_data_depend_codegen.cpp
+++ b/test/OpenMP/target_enter_data_depend_codegen.cpp
@@ -68,19 +68,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -124,19 +124,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -198,15 +198,15 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
   // CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
@@ -272,14 +272,14 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
diff --git a/test/OpenMP/target_exit_data_depend_codegen.cpp b/test/OpenMP/target_exit_data_depend_codegen.cpp
index f7ad191d06ac..ad8c47522d41 100644
--- a/test/OpenMP/target_exit_data_depend_codegen.cpp
+++ b/test/OpenMP/target_exit_data_depend_codegen.cpp
@@ -68,19 +68,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -124,19 +124,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -198,15 +198,15 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
   // CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
@@ -272,14 +272,14 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
diff --git a/test/OpenMP/target_firstprivate_codegen.cpp b/test/OpenMP/target_firstprivate_codegen.cpp
index ece60d1b05c5..a1599713fab6 100644
--- a/test/OpenMP/target_firstprivate_codegen.cpp
+++ b/test/OpenMP/target_firstprivate_codegen.cpp
@@ -281,7 +281,7 @@ int foo(int n, double *ptr) {
   //  firstprivate(b): memcpy(b_priv,b_in)
   // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
   // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_ADDR_REF_BCAST]], {{.+}})
 
   // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
   // TCHECK:  store i8* [[RET_STACK]], i8** [[SSTACK]],
@@ -291,12 +291,12 @@ int foo(int n, double *ptr) {
   // TCHECK:  [[BN_COPY_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[BN_SZ_VAL]], 4
   // TCHECK:  [[BN_PRIV__BCAST:%.+]] = bitcast float* [[BN_PRIV]] to i8*
   // TCHECK:  [[BN_REF_IN_BCAST:%.+]] = bitcast float* [[BN_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[BN_PRIV__BCAST]], i8* [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BN_PRIV__BCAST]], i8* align {{[0-9]+}} [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
 
   // firstprivate(c)
   // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
   // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
   
   // firstprivate(cn)
   // TCHECK:  [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
@@ -305,12 +305,12 @@ int foo(int n, double *ptr) {
   // TCHECK:  [[CN_SZ2_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ2]], 8
   // TCHECK:  [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8*
   // TCHECK:  [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[CN_PRIV_BCAST]], i8* [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[CN_PRIV_BCAST]], i8* align {{[0-9]+}} [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
   
   // firstprivate(d)
   // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
   // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}})
 
   
   #pragma omp target firstprivate(ptr)
@@ -391,7 +391,7 @@ int fstatic(int n) {
 // firstprivate(b)
 // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
 // TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
 
 // TCHECK:  ret void
 
@@ -499,7 +499,7 @@ struct S1 {
   // TCHECK:  [[C_SZ_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[C_SZ2]],  2
   // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_PRIV]] to i8*
   // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_ADDR_REF]] to i8*
-  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
 
   // finish
   // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
@@ -594,7 +594,7 @@ int bar(int n, double *ptr){
 // firstprivate(b)
 // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
 // TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
 
 // TCHECK: ret void
 
diff --git a/test/OpenMP/target_parallel_codegen.cpp b/test/OpenMP/target_parallel_codegen.cpp
index 2689e9a2f8f6..7a76317efe74 100644
--- a/test/OpenMP/target_parallel_codegen.cpp
+++ b/test/OpenMP/target_parallel_codegen.cpp
@@ -197,13 +197,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0), i32 1, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -531,13 +531,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_parallel_debug_codegen.cpp b/test/OpenMP/target_parallel_debug_codegen.cpp
index 385f7a29cb8f..4355abf6c7ac 100644
--- a/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -92,18 +92,18 @@ int main() {
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
 // CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-// CHECK: call void [[NONDEBUG_WRAPPER:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
+// CHECK: call void @[[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
 
-// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
+// CHECK: define internal void @[[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
 // CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
 
-// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
+// CHECK: define internal void @[[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
 // CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)*
 // CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void [[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
+// CHECK: call void @[[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
 
 // CHECK: define void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
@@ -116,8 +116,11 @@ int main() {
 // CHECK: !DILocalVariable(name: ".bound_tid.",
 // CHECK-SAME: DIFlagArtificial
 // CHECK: !DILocalVariable(name: "c",
-// CHECK-SAMEi-NOT: DIFlagArtificial
+// CHECK-SAME: line: 11
 // CHECK: !DILocalVariable(name: "a",
-// CHECK-SAMEi-NOT: DIFlagArtificial
+// CHECK-SAME: line: 9
 // CHECK: !DILocalVariable(name: "b",
-// CHECK-SAMEi-NOT: DIFlagArtificial
+// CHECK-SAME: line: 10
+
+// CHECK-DAG: distinct !DISubprogram(name: "[[NONDEBUG_WRAPPER]]",
+// CHECK-DAG: distinct !DISubprogram(name: "[[DEBUG_PARALLEL]]",
diff --git a/test/OpenMP/target_parallel_depend_codegen.cpp b/test/OpenMP/target_parallel_depend_codegen.cpp
new file mode 100644
index 000000000000..108535a784f5
--- /dev/null
+++ b/test/OpenMP/target_parallel_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target parallel device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+  {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel if(0) firstprivate(global) depend(out:global)
+  {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 1, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 1, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_codegen.cpp b/test/OpenMP/target_parallel_for_codegen.cpp
index de99e6d18c3f..1bd9abb4d6d9 100644
--- a/test/OpenMP/target_parallel_for_codegen.cpp
+++ b/test/OpenMP/target_parallel_for_codegen.cpp
@@ -218,13 +218,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 10, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([10 x i64], [10 x i64]* [[MAPT4]], i32 0, i32 0), i32 1, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -558,13 +558,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_parallel_for_debug_codegen.cpp b/test/OpenMP/target_parallel_for_debug_codegen.cpp
index 0e0bb7e85306..9b119f2503bf 100644
--- a/test/OpenMP/target_parallel_for_debug_codegen.cpp
+++ b/test/OpenMP/target_parallel_for_debug_codegen.cpp
@@ -58,7 +58,7 @@ int main() {
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
 
-// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
+// CHECK: define internal void [[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 
 // CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
@@ -92,18 +92,18 @@ int main() {
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
 // CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
-// CHECK: call void [[NONDEBUG_WRAPPER:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
+// CHECK: call void @[[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
 
-// CHECK: define internal void [[DEBUG_PARALLEL:@.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
+// CHECK: define internal void @[[DEBUG_PARALLEL:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* noalias{{[^,]+}}, i32 addrspace(1)* noalias{{[^,]+}}, [10 x [10 x i32]] addrspace(1)* noalias{{[^,]+}}, i8 addrspace(1)* noalias{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: addrspacecast i32 addrspace(1)* %{{.+}} to i32*
 // CHECK: addrspacecast [10 x [10 x i32]] addrspace(1)* %{{.+}} to [10 x [10 x i32]]*
 
-// CHECK: define internal void [[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
+// CHECK: define internal void @[[NONDEBUG_WRAPPER]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
 // CHECK: addrspacecast i32* %{{.+}} to i32 addrspace(1)*
 // CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
-// CHECK: call void [[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
+// CHECK: call void @[[DEBUG_PARALLEL]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
 
 // CHECK: define void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]]* dereferenceable{{[^,]+}}, i32* dereferenceable{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{[^,]+}}, i8* dereferenceable{{[^)]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]]* %{{.+}} to [10 x [10 x [10 x i32]]] addrspace(1)*
@@ -111,3 +111,5 @@ int main() {
 // CHECK: addrspacecast [10 x [10 x i32]]* %{{.+}} to [10 x [10 x i32]] addrspace(1)*
 // CHECK: call void @__omp_offloading_{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 addrspace(1)* {{[^,]+}}, [10 x [10 x i32]] addrspace(1)* {{[^,]+}}, i8 addrspace(1)* {{[^)]+}})
 
+// CHECK-DAG: distinct !DISubprogram(name: "[[NONDEBUG_WRAPPER]]",
+// CHECK-DAG: distinct !DISubprogram(name: "[[DEBUG_PARALLEL]]",
diff --git a/test/OpenMP/target_parallel_for_depend_codegen.cpp b/test/OpenMP/target_parallel_for_depend_codegen.cpp
new file mode 100644
index 000000000000..4845ed47ce6d
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[IN]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel for device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target parallel for device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[IN]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel for if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 1, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 1, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_simd_codegen.cpp b/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 600c767f8e48..3da6b81d0952 100644
--- a/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -215,13 +215,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 10, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([10 x i64], [10 x i64]* [[MAPT4]], i32 0, i32 0), i32 1, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -558,13 +558,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 1, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp b/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp
new file mode 100644
index 000000000000..da516995ec36
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[IN]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel for simd device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target parallel for simd device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[IN]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[IN]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[IN]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target parallel for simd if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 1, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 1, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_simd_codegen.cpp b/test/OpenMP/target_simd_codegen.cpp
index 868219d984a2..7ed02435e7e6 100644
--- a/test/OpenMP/target_simd_codegen.cpp
+++ b/test/OpenMP/target_simd_codegen.cpp
@@ -210,13 +210,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0))
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -481,13 +481,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0))
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_simd_depend_codegen.cpp b/test/OpenMP/target_simd_depend_codegen.cpp
new file mode 100644
index 000000000000..abbbc76fdc3b
--- /dev/null
+++ b/test/OpenMP/target_simd_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target simd device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target simd device(global + a) nowait depend(inout: global, a, bn) if(a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target simd if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_codegen.cpp b/test/OpenMP/target_teams_codegen.cpp
index f3cc9352078d..349c0de2a4ef 100644
--- a/test/OpenMP/target_teams_codegen.cpp
+++ b/test/OpenMP/target_teams_codegen.cpp
@@ -220,13 +220,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0), i32 0, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -345,7 +345,8 @@ int foo(int n) {
 //
 //
 // CHECK:       define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] {{[^)]+}})
-// CHECK:       alloca i16,
+// CHECK:       alloca i[[SZ]],
+// CHECK:       bitcast i[[SZ]]* {{.+}} to i16*
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
@@ -555,13 +556,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT7]], i32 0, i32 0), i32 0, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_teams_depend_codegen.cpp b/test/OpenMP/target_teams_depend_codegen.cpp
new file mode 100644
index 000000000000..27734bc796d5
--- /dev/null
+++ b/test/OpenMP/target_teams_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target teams device(global + a) nowait depend(inout: global, a, bn) if(a)
+  {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* @0, i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams if(0) firstprivate(global) depend(out:global)
+  {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 0, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_codegen.cpp b/test/OpenMP/target_teams_distribute_codegen.cpp
index 7233a71537e5..a033bfe2d326 100644
--- a/test/OpenMP/target_teams_distribute_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_codegen.cpp
@@ -62,7 +62,7 @@
 // CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i64] [i64 288]
 // CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2]
 // CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i64] [i64 288, i64 288]
-// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i64] [i64 288, i64 547, i64 288, i64 547, i64 547, i64 288, i64 288, i64 547, i64 547]
+// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [10 x i64] [i64 288, i64 547, i64 288, i64 547, i64 547, i64 288, i64 288, i64 547, i64 547, i64 288]
 // CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [5 x i64] [i64 547, i64 288, i64 288, i64 288, i64 547]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [5 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40]
 // CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [5 x i64] [i64 288, i64 288, i64 288, i64 288, i64 547]
@@ -211,6 +211,7 @@ int foo(int n) {
   }
 
   // We capture 3 VLA sizes in this target region
+  // CHECK:          load i32, i32* %
   // CHECK-64:       [[A_VAL:%.+]] = load i32, i32* %{{.+}},
   // CHECK-64:       [[A_ADDR:%.+]] = bitcast i[[SZ]]* [[A_CADDR:%.+]] to i32*
   // CHECK-64:       store i32 [[A_VAL]], i32* [[A_ADDR]],
@@ -220,45 +221,48 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
-  // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0), i32 0, i32 0)
-  // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
-  // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
-  // CHECK-DAG:   [[SR]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S:%[^,]+]], i32 0, i32 0
-
-  // CHECK-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX0:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX0]]
-  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX0]]
-  // CHECK-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX1:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX1]]
-  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX1]]
-  // CHECK-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX2:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX2]]
-  // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX2]]
-  // CHECK-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX3:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX3]]
-  // CHECK-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX3]]
-  // CHECK-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX4:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX4]]
-  // CHECK-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX4]]
-  // CHECK-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX5:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX5]]
-  // CHECK-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX5]]
-  // CHECK-DAG:   [[SADDR6:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX6:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX6]]
-  // CHECK-DAG:   [[PADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX6]]
-  // CHECK-DAG:   [[SADDR7:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX7:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX7]]
-  // CHECK-DAG:   [[PADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX7]]
-  // CHECK-DAG:   [[SADDR8:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX8:[0-9]+]]
-  // CHECK-DAG:   [[BPADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX8]]
-  // CHECK-DAG:   [[PADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX8]]
+  // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 10, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([10 x i64], [10 x i64]* [[MAPT4]], i32 0, i32 0), i32 0, i32 0)
+  // CHECK-DAG:   [[BPR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
+  // CHECK-DAG:   [[PR]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P:%[^,]+]], i32 0, i32 0
+  // CHECK-DAG:   [[SR]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S:%[^,]+]], i32 0, i32 0
+
+  // CHECK-DAG:   [[SADDR0:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX0:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX0]]
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX0]]
+  // CHECK-DAG:   [[SADDR1:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX1:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX1]]
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX1]]
+  // CHECK-DAG:   [[SADDR2:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX2:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX2]]
+  // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX2]]
+  // CHECK-DAG:   [[SADDR3:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX3:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR3:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX3]]
+  // CHECK-DAG:   [[PADDR3:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX3]]
+  // CHECK-DAG:   [[SADDR4:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX4:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR4:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX4]]
+  // CHECK-DAG:   [[PADDR4:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX4]]
+  // CHECK-DAG:   [[SADDR5:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX5:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR5:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX5]]
+  // CHECK-DAG:   [[PADDR5:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX5]]
+  // CHECK-DAG:   [[SADDR6:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX6:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR6:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX6]]
+  // CHECK-DAG:   [[PADDR6:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX6]]
+  // CHECK-DAG:   [[SADDR7:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX7:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR7:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX7]]
+  // CHECK-DAG:   [[PADDR7:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX7]]
+  // CHECK-DAG:   [[SADDR8:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX8:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR8:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX8]]
+  // CHECK-DAG:   [[PADDR8:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX8]]
+  // CHECK-DAG:   [[SADDR9:%.+]] = getelementptr inbounds [10 x i[[SZ]]], [10 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX9:[0-9]+]]
+  // CHECK-DAG:   [[BPADDR9:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[BP]], i32 0, i32 [[IDX9]]
+  // CHECK-DAG:   [[PADDR9:%.+]] = getelementptr inbounds [10 x i8*], [10 x i8*]* [[P]], i32 0, i32 [[IDX9]]
 
   // The names below are not necessarily consistent with the names used for the
   // addresses above as some are repeated.
@@ -316,14 +320,20 @@ int foo(int n) {
   // CHECK-DAG:   [[CPADDR8]] = bitcast i8** {{%[^,]+}} to [[TT]]**
   // CHECK-DAG:   store i[[SZ]] {{12|16}}, i[[SZ]]* {{%[^,]+}}
 
+  // CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CBPADDR9:%.+]],
+  // CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[CPADDR9:%.+]],
+  // CHECK-DAG:   [[CBPADDR9]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR9]] = bitcast i8** {{%[^,]+}} to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}
+
   // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
   // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
 
   // CHECK:       [[FAIL]]
-  // CHECK:       call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}})
+  // CHECK:       call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}})
   // CHECK-NEXT:  br label %[[END]]
   // CHECK:       [[END]]
-  #pragma omp target teams distribute if(target: n>20)
+  #pragma omp target teams distribute if(target: n>20) dist_schedule(static, n)
   for (int i = 0; i < 10; ++i) {
     a += 1;
     b[2] += 1.0;
@@ -345,7 +355,8 @@ int foo(int n) {
 //
 //
 // CHECK:       define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] {{[^)]+}})
-// CHECK:       alloca i16,
+// CHECK:       alloca i[[SZ]],
+// CHECK:       bitcast i[[SZ]]* {{.+}} to i16*
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
@@ -439,6 +450,7 @@ int foo(int n) {
 // CHECK:       [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]
 // CHECK:       [[LOCAL_CN:%.+]] = alloca double*
 // CHECK:       [[LOCAL_D:%.+]] = alloca [[TT]]*
+// CHECK:       alloca i[[SZ]],
 // CHECK:       [[LOCAL_A_CASTED:%.+]] = alloca i[[SZ]]
 // CHECK-DAG:   store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
 // CHECK-DAG:   store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]]
@@ -467,7 +479,7 @@ int foo(int n) {
 // CHECK-32-DAG:store i32 [[LOCAL_AV]], i32* [[LOCAL_A_CASTED]], align
 // CHECK-DAG:   [[REF_A:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_A_CASTED]],
 
-// CHECK:       call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 9, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], [10 x float]*, i[[SZ]], float*, [5 x [10 x double]]*, i[[SZ]], i[[SZ]], double*, [[TT]]*)* [[OMP_OUTLINED4:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], [10 x float]* [[REF_B]], i[[SZ]] [[VAL_VLA1]], float* [[REF_BN]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] [[VAL_VLA2]], i[[SZ]] [[VAL_VLA3]], double* [[REF_CN]], [[TT]]* [[REF_D]])
+// CHECK:       call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC]], i32 10, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i[[SZ]], [10 x float]*, i[[SZ]], float*, [5 x [10 x double]]*, i[[SZ]], i[[SZ]], double*, [[TT]]*, i[[SZ]])* [[OMP_OUTLINED4:@.+]] to void (i32*, i32*, ...)*), i[[SZ]] [[REF_A]], [10 x float]* [[REF_B]], i[[SZ]] [[VAL_VLA1]], float* [[REF_BN]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] [[VAL_VLA2]], i[[SZ]] [[VAL_VLA3]], double* [[REF_CN]], [[TT]]* [[REF_D]], i[[SZ]] %{{.+}})
 //
 //
 // CHECK:       define internal {{.*}}void [[OMP_OUTLINED4]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] %{{.+}}, [10 x float]* {{.+}}, i[[SZ]] %{{.+}}, float* {{.+}}, [5 x [10 x double]]* {{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, double* {{.+}}, [[TT]]* {{.+}})
@@ -555,13 +567,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT5]], i32 0, i32 0), i32 0, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_teams_distribute_depend_codegen.cpp b/test/OpenMP/target_teams_distribute_depend_codegen.cpp
new file mode 100644
index 000000000000..ebd1f9e7a5c1
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target teams distribute device(global + a) nowait depend(inout: global, a, bn) if(a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 0, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
index a9f64f384fa8..886a5dedd87c 100644
--- a/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
@@ -110,16 +110,7 @@ int main() {
     // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: [[G_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
-    // LAMBDA: [[SIVAR_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
     // skip loop vars
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
@@ -127,15 +118,15 @@ int main() {
     // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
     // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
     // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
-    // LAMBDA-DAG: store{{.+}} [[G1_PRIV_ADDR]], {{.+}} [[G1_TMP]],
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
     g = 1;
     g1 = 1;
     sivar = 2;
     // LAMBDA: call void @__kmpc_for_static_init_4(
-    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV_ADDR]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
     // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
-    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV_ADDR]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
     // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1_REF]],
     // LAMBDA: call void [[INNER_LAMBDA:@.+]](
@@ -236,7 +227,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -322,7 +313,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
index 085733336c14..26a7c0d61da8 100644
--- a/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
@@ -259,7 +259,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -273,7 +273,7 @@ int main() {
 // CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -284,7 +284,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -293,7 +293,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK-64: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK-32: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR]],
@@ -361,7 +361,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -375,7 +375,7 @@ int main() {
 // CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -386,7 +386,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -395,6 +395,6 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 #endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
new file mode 100644
index 000000000000..38bbace690bf
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
@@ -0,0 +1,118 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix HCK1 --check-prefix HCK1-64
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix TCK1 --check-prefix TCK1-64 
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix TCK1 --check-prefix TCK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc 
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix TCK1 --check-prefix TCK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix TCK1 --check-prefix TCK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY1 
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+#ifdef CK1
+
+// HCK1: define{{.*}} i32 @{{.+}}target_teams_fun{{.*}}(
+int target_teams_fun(int *g){
+  int n = 1000;
+  int a[1000];
+  int te = n / 128;
+  int th = 128;
+  // discard n_addr
+  // HCK1: alloca i32,
+  // HCK1: [[TE:%.+]] alloca i32,
+  // HCK1: [[TH:%.+]] = alloca i32,
+  // discard capture expressions for te and th
+  // HCK1: = alloca i32,
+  // HCK1: = alloca i32,
+  // HCK1: [[N_CAST:%.+]] = alloca i{{32|64}},
+  // HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
+  // HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
+  // HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
+  // HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
+  // HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
+  // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, 
+
+  // HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]])
+  #pragma omp target teams distribute parallel for num_teams(te), thread_limit(th)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+    #pragma omp cancel for
+  }
+
+  // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0),
+  // HCK1: call void @[[OFFL2:.+]](i{{64|32}} %{{.+}})
+  {{{
+  #pragma omp target teams distribute parallel for is_device_ptr(g)
+  for(int i = 0; i < n; i++) {
+    a[i] = g[0];
+  }
+  }}}
+
+  // outlined target regions
+  // HCK1: define internal void @[[OFFL1]](i{{32|64}} [[N_ARG:%.+]], i{{32|64}} [[TE_ARG:%.+]], i{{32|64}} [[TH_ARG:%.+]])
+  // TCK1: define void @{{.+}}target_teams_fun{{.*}}(i{{32|64}} [[N_ARG:%.+]], {{.+}}, i{{32|64}} [[TE_ARG:%.+]], i{{32|64}} [[TH_ARG:%.+]])
+  // CK1: [[N_ADDR:%.+]] = alloca i{{32|64}},
+  // CK1: [[TE_ADDR:%.+]] = alloca i{{32|64}},
+  // CK1: [[TH_ADDR:%.+]] = alloca i{{32|64}},
+  // TCK1: store {{.+}} [[N_ARG]], {{.+}} [[N_ADDR]],
+  // CK1: store{{.+}} [[TE_ARG]], {{.+}} [[TE_ADDR]],
+  // CK1: store{{.+}} [[TH_ARG]], {{.+}} [[TH_ADDR]],
+  // CK1-64: [[TE_CONV:%.+]] = bitcast{{.+}} [[TE_ADDR]] to
+  // CK1-64: [[TH_CONV:%.+]] = bitcast{{.+}} [[TH_ADDR]] to
+  // CK1-64: [[TE_VAL:%.+]] = load i32, i32* [[TE_CONV]],
+  // CK1-64: [[TH_VAL:%.+]] = load i32, i32* [[TH_CONV]],
+  // CK1-32: [[TE_VAL:%.+]] = load i32, i32* [[TE_ADDR]],
+  // CK1-32: [[TH_VAL:%.+]] = load i32, i32* [[TH_ADDR]],
+  // CK1: {{%.+}} = call i32 @__kmpc_push_num_teams({{.+}}, {{.+}}, i32 [[TE_VAL]], i32 [[TH_VAL]])
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 2, {{.+}} @[[OUTL1:.+]] to {{.+}}, {{.+}}, {{.+}})
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.+}} @__kmpc_fork_call(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // HCK1: define internal void @[[OFFL2]](
+  // TCK1: define void @{{.+}}target_teams_fun{{.+}}(
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}}, {{.+}}, {{.+}})
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.+}} @__kmpc_fork_call(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  return a[0];
+}
+
+#endif // CK1
+#endif // HEADER 
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
new file mode 100644
index 000000000000..80e38925b39e
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
@@ -0,0 +1,157 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X][Y];
+
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+    // CK1: call i32 @__tgt_target_teams(
+    // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for collapse(2)
+    for(int i = 0; i < X; i++) {
+      for(int j = 0; j < Y; j++) {
+        a[i][j] = (T)0;
+      }
+    }
+    // CK1: define internal void @[[OFFL1]](
+    // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+    // CK1: ret void
+
+    // CK1: define internal void @[[OUTL1]]({{.+}})
+    // discard loop variables not needed here
+    // CK1: [[OMP_UB:%.omp.comb.ub]] = alloca i32,
+    // CK1: store i32 56087, i32* [[OMP_UB]],
+    // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i32* [[OMP_UB]],
+    // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+    // CK1: call void @__kmpc_for_static_fini(
+    // CK1: ret void
+
+    // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+    // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+    // CK1: call void @__kmpc_for_static_fini(
+    // CK1: ret void
+
+    return a[0][0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n, int m>
+int tmain(T argc) {
+  T a[n][m];
+  #pragma omp target teams distribute parallel for collapse(2)
+  for(int i = 0; i < n; i++) {
+    for(int j = 0; j < m; j++) {
+      a[i][j] = (T)0;
+    }
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int m = 2;
+  int a[n][m];
+  #pragma omp target teams distribute parallel for collapse(2)
+  for(int i = 0; i < n; i++) {
+    for(int j = 0; j < m; j++) {
+      a[i][j] = 0;
+    }
+  }
+  return tmain<int, 10, 2>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: [[OMP_UB:%.omp.comb.ub]] = alloca i64,
+// CK2: store i64 {{.+}}, i64* [[OMP_UB]],
+// CK2: call void @__kmpc_for_static_init_8({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i64* [[OMP_UB]],
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_{{[4|8]}}({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// discard loop variables not needed here
+// CK2: [[OMP_UB:%.omp.comb.ub]] = alloca i32,
+// CK2: store i32 {{.+}}, i32* [[OMP_UB]],
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i32* [[OMP_UB]],
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[TPAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp
new file mode 100644
index 000000000000..ca4e6a61bbbe
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute parallel for device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target teams distribute parallel for device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute parallel for if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 0, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
new file mode 100644
index 000000000000..28619f883f9d
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
@@ -0,0 +1,270 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X];
+  float b;
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL2:.+]](
+    #pragma omp target teams distribute parallel for dist_schedule(static)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL3:.+]](
+    #pragma omp target teams distribute parallel for dist_schedule(static, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: define internal void @[[OFFL1]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL2]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL2:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+
+  // CK1: define internal void @[[OFFL3]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL3:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+    return a[0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n>
+int tmain(T argc) {
+  T a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+  return tmain<int, 10>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL3:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}void @[[OFFL2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[OUTL3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT3:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..ce1cb1ec0557
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -0,0 +1,523 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix HCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix HCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY  
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64 
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix TLAMBDA --check-prefix TLAMBDA-64 
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY 
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+#pragma omp target teams distribute parallel for firstprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+// HCHECK-DAG: [[TEST:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// HCHECK-DAG: [[T_VAR:@.+]] = global i{{[0-9]+}} 333,
+int t_var = 333;
+// HCHECK-DAG: [[VEC:@.+]] = global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// HCHECK-DAG: [[S_ARR:@.+]] = global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// HCHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// HCHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // HLAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // HLAMBDA-LABEL: @main
+  // HLAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // HLAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // HLAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+    // HLAMBDA: call void @[[LOFFL1:.+]](i{{64|32}} %{{.+}})
+    // HLAMBDA:  ret
+#pragma omp target teams distribute parallel for firstprivate(g, g1, sivar)
+  for (int i = 0; i < 2; ++i) {
+    // HLAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]](i{{64|32}} {{%.+}}, i{{64|32}} {{%.+}})
+    // TLAMBDA: define void @[[LOFFL1:.+]](i{{64|32}} {{%.+}}, i{{64|32}} {{%.+}})
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA-DAG: [[G_CAST_VAL:%.+]] = load{{.+}} [[G_CAST]],
+    // LAMBDA-DAG: [[G1_CAST_VAL:%.+]] = load{{.+}} [[G1_CAST]],
+    // LAMBDA-DAG: [[SIVAR_CAST_VAL:%.+]] = load{{.+}} [[SIVAR_CAST]],
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[LOUTL1:.+]] to {{.+}}, {{.+}} [[G_CAST_VAL]], {{.+}} [[G1_CAST_VAL]], {{.+}} [[SIVAR_CAST_VAL]])
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}})
+    // Skip global and bound tid vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
+    // skip loop vars
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+    // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
+    // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
+    g = 1;
+    g1 = 1;
+    sivar = 2;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}})
+    // Skip global and bound tid vars, and prev lb and ub vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
+    // skip loop vars
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+    // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
+    // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
+
+    // use of private vars
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
+    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
+    // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      g1 = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+      // LAMBDA: [[G1_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[G1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G1_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G1_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for firstprivate(t_var, vec, s_arr, var, sivar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// HCHECK: define {{.*}}i{{[0-9]+}} @main()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, 
+// HCHECK: call void @[[OFFL1:.+]](
+// HCHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// HCHECK:  ret
+
+// HCHECK: define{{.*}} void @[[OFFL1]](
+// TCHECK: define{{.*}} void @[[OFFL1:.+]](
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_CAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR_CAST:%.+]] = alloca i{{[0-9]+}},
+
+// CHECK-DAG: [[VEC_TE_PAR:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PRIV]],
+// CHECK-DAG: [[T_VAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_CAST]],
+// CHECK-DAG: [[S_ARR_TE_PAR:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_PRIV]],
+// CHECK-DAG: [[VAR_TE_PAR:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PRIV]],
+// CHECK-DAG: [[SIVAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_CAST]],
+
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}} @[[OUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[VEC_TE_PAR]], i{{[0-9]+}} [[T_VAR_TE_PAR]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_TE_PAR]], [[S_FLOAT_TY]]* [[VAR_TE_PAR]], i{{[0-9]+}} [[SIVAR_TE_PAR]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* {{.+}}, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR_ADDR]],
+
+// T_VAR and SIVAR
+// CHECK-DAG-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK-DAG-64: [[CONV_SIVAR:%.+]] = bitcast i64* [[SIVAR_ADDR]] to i32*
+
+// preparation vars
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
+// CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}}], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[PAR_OUTL]]({{.+}})
+// Skip global and bound tid vars, and prev lb ub vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* {{.+}}, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR_ADDR]],
+
+// T_VAR and SIVAR
+// CHECK-DAG-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK-DAG-64: [[CONV_SIVAR:%.+]] = bitcast i64* [[SIVAR_ADDR]] to i32*
+
+// preparation vars
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
+// CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}}], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG-32: {{.+}} = {{.+}} [[T_VAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_TVAR]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VAR_PRIV]]
+// CHECK-DAG-32: {{.+}} = {{.+}} [[SIVAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_SIVAR]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// HCHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+// HCHECK: call void @[[TOFFL1:.+]](
+// HCHECK:  ret
+
+// HCHECK: define {{.*}}void @[[TOFFL1]]({{.+}})
+// TCHECK: define {{.*}}void @[[TOFFL1:.+]]({{.+}})
+// CHECK-DAG: [[TT_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[TVEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK-DAG: [[TS_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[TVAR_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TT_VAR_CAST:%.+]] = alloca i{{[0-9]+}},
+
+// CHECK-DAG: [[TVEC_TE_PAR:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[TVEC_PRIV]],
+// CHECK-DAG: [[TT_VAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[TT_VAR_CAST]],
+// CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
+// CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
+
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* {{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* {{.+}}, [[S_INT_TY]]** [[VAR_ADDR]],
+
+// T_VAR and preparation variables
+// CHECK: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[TPAR_OUTL]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* {{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* {{.+}}, [[S_INT_TY]]** [[VAR_ADDR]],
+
+// T_VAR and preparation variables
+// CHECK: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG-32: {{.+}} = {{.+}} [[T_VAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_TVAR]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[TMP]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
new file mode 100644
index 000000000000..8cd90a610b5c
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void fn1();
+void fn2();
+void fn3();
+void fn4();
+void fn5();
+void fn6();
+
+int Arg;
+
+// CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
+void gtid_test() {
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+#pragma omp target teams distribute parallel for
+  for(int i = 0 ; i < 100; i++) {}
+  // CHECK: define internal void [[OFFLOADING_FUN_0]](
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+  // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
+  // CHECK: call void @__kmpc_for_static_fini(
+
+  // CHECK: define{{.+}} void [[OMP_OUTLINED_0]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: ret
+#pragma omp target teams distribute parallel for if (parallel: false)
+  for(int i = 0 ; i < 100; i++) {
+  // CHECK: define internal void [[OFFLOADING_FUN_1]](
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+  // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @__kmpc_serialized_parallel(
+  // CHECK: call void [[OMP_OUTLINED_1:@.+]](
+  // CHECK: call void @__kmpc_end_serialized_parallel(
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: define{{.+}} void [[OMP_OUTLINED_1]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @{{.+}}gtid_test
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: ret
+    gtid_test();
+  }
+}
+
+
+template <typename T>
+int tmain(T Arg) {
+#pragma omp target teams distribute parallel for if (true)
+  for(int i = 0 ; i < 100; i++) {
+    fn1();
+  }
+#pragma omp target teams distribute parallel for if (false)
+  for(int i = 0 ; i < 100; i++) {
+    fn2();
+  }
+#pragma omp target teams distribute parallel for if (parallel: Arg)
+  for(int i = 0 ; i < 100; i++) {
+    fn3();
+  }
+  return 0;
+}
+
+// CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
+int main() {
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
+// CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
+#pragma omp target teams distribute parallel for if (true)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_0]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_2:@.+]] to void
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_2]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn4
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    fn4();
+  }
+
+#pragma omp target teams distribute parallel for if (false)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_1]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void @__kmpc_serialized_parallel(
+    // CHECK: call void [[OMP_OUTLINED_3:@.+]](
+    // CHECK: call void @__kmpc_end_serialized_parallel(
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_3]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn5
+    // CHECK: call void @__kmpc_for_static_fini(
+    fn5();
+  }
+
+#pragma omp target teams distribute parallel for if (Arg)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_2]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_2:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_2]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_4:@.+]] to void
+    // CHECK: call void @__kmpc_serialized_parallel(
+    // CHECK: call void [[OMP_OUTLINED_4:@.+]](
+    // CHECK: call void @__kmpc_end_serialized_parallel(
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_4]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn6
+    // CHECK: call void @__kmpc_for_static_fini(
+    fn6();
+  }
+
+  return tmain(Arg);
+}
+
+// CHECK-LABEL: define {{.+}} @{{.+}}tmain
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_1:@.+]] to void
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_1]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn1
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// CHECK: call void [[T_OUTLINE_FUN_2:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_2]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn2
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_3:@.+]] to void
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// call void [[T_OUTLINE_FUN_3:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_3]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn3
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
new file mode 100644
index 000000000000..acb1c0b636ec
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -0,0 +1,454 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1  -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1  -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+  #pragma omp target teams distribute parallel for lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int svar;
+  volatile double g;
+  volatile double &g1 = g;
+
+  #ifdef LAMBDA
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    static float sfvar;
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+    // LAMBDA: call void [[OFFLOADING_FUN:@.+]](
+
+    // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]](
+    // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}})
+    #pragma omp target teams distribute parallel for lastprivate(g, g1, svar, sfvar)
+    for (int i = 0; i < 2; ++i) {
+      // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* {{.+}}, i32* {{.+}}, {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]], {{.+}} [[G_IN:%.+]])
+      // skip gbl and bound tid
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: [[G1_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // loop variables
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+       
+      // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+    
+      // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+      // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
+      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
+      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+      // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
+      
+      g1 = 1;
+      svar = 3;
+      sfvar = 4.0;
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+      // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+      // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+      // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+      // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+      // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_TGT]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_TGT]],
+      // LAMBDA-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+      // LAMBDA-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[SFVAR_TGT]],
+      // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+      // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+      // LAMBDA: ret
+
+      // LAMBDA: define{{.*}} internal{{.*}} void @[[LPAR_OUTL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]], {{.+}} [[G_IN:%.+]])
+      // skip tid and prev variables
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: [[G1_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // loop variables
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+       
+      // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+    
+      // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+      // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
+      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
+      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+      // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
+
+
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+      // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+      // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+      // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+      // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+      
+      // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_TGT]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_TGT]],
+      // LAMBDA-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+      // LAMBDA-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[SFVAR_TGT]],
+      // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+      // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+      // LAMBDA: ret
+
+      [&]() {
+	// LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+	// LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+	g = 2;
+	g1 = 2;
+	svar = 4;
+	sfvar = 8.0;
+	// LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+	// LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+	// LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+	// LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+	// LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+	// LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+	// LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+	// LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+	// LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+	// LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+	// LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+	// LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+	// LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+      }();
+    }
+  }();
+  return 0;
+  #else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> &var = test;
+
+  #pragma omp target teams distribute parallel for lastprivate(t_var, vec, s_arr, s_arr, var, var, svar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  int i;
+
+  return tmain<int>();
+  #endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN:@.+]](
+// CHECK: ret
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN]](
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(
+// CHECK: ret
+//
+// CHECK: define internal void [[OMP_OUTLINED:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.+}} [[VAR_IN:%.+]], i{{[0-9]+}}{{.*}} [[S_VAR_IN:%.+]])
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK: store {{.+}} [[T_VAR_IN]], {{.+}} [[T_VAR_ADDR]],
+// CHECK: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK: store {{.+}} [[S_VAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+
+// the distribute loop
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
+// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
+// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+// CHECK: ret void
+
+// CHECK: define internal void [[OMP_OUTLINED:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.+}} [[VAR_IN:%.+]], i{{[0-9]+}}{{.*}} [[S_VAR_IN:%.+]])
+
+// gbl and bound tid vars, prev lb and ub vars
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK: store {{.+}} [[T_VAR_IN]], {{.+}} [[T_VAR_ADDR]],
+// CHECK: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK: store {{.+}} [[S_VAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+
+// the distribute loop
+// CHECK: call void @__kmpc_for_static_init_4(
+// skip body: code generation routine is same as distribute parallel for lastprivate
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
+// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
+// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+// CHECK: ret void
+
+// template tmain
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT:@.+]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_1:@.+]](
+// CHECK: ret
+
+// CHECK: define internal void [[OFFLOAD_FUN_1]](
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4,
+// CHECK: ret
+
+// CHECK: define internal void [[OMP_OUTLINED_1:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
+// skip alloca of global_tid and bound_tid
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR1:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST1:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN1]], {{.+}} [[VEC_ADDR1]],
+// CHECK: store {{.+}} [[T_VAR_IN1]], {{.+}} [[T_VAR_ADDR1]],
+// CHECK: store {{.+}} [[S_ARR_IN1]], {{.+}} [[S_ARR_ADDR1]],
+// CHECK: store {{.+}} [[VAR_IN1]], {{.+}} [[VAR_ADDR1]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]], 
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
+// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
+// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],  
+// CHECK: ret void
+
+// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
+// skip alloca of global_tid and bound_tid, and prev lb and ub vars
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+
+// CHECK: [[VEC_ADDR1:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST1:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN1]], {{.+}} [[VEC_ADDR1]],
+// CHECK: store {{.+}} [[T_VAR_IN1]], {{.+}} [[T_VAR_ADDR1]],
+// CHECK: store {{.+}} [[S_ARR_IN1]], {{.+}} [[S_ARR_ADDR1]],
+// CHECK: store {{.+}} [[VAR_IN1]], {{.+}} [[VAR_ADDR1]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]], 
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// skip body: code generation routine is same as distribute parallel for lastprivate
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
+// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
+// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],  
+// CHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
new file mode 100644
index 000000000000..2523696cffb6
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
@@ -0,0 +1,369 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HHECK --check-prefix HCHECK-64
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64 
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix TLAMBDA --check-prefix TLAMBDA-64 
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+//
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+#pragma omp target teams distribute parallel for private(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+// HCHECK-DAG: [[TEST:@.+]] ={{.*}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// HCHECK-DAG: [[T_VAR:@.+]] ={{.+}} global i{{[0-9]+}} 333,
+int t_var = 333;
+// HCHECK-DAG: [[VEC:@.+]] ={{.+}} global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// HCHECK-DAG: [[S_ARR:@.+]] ={{.+}} global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// HCHECK-DAG: [[VAR:@.+]] ={{.+}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// HCHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // HLAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // HLAMBDA-LABEL: @main
+  // HLAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // HLAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // HLAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0,
+    // HLAMBDA: call void @[[LOFFL1:.+]](
+    // HLAMBDA:  ret
+#pragma omp target teams distribute parallel for private(g, g1, sivar)
+  for (int i = 0; i < 2; ++i) {
+    // HLAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]]()
+    // TLAMBDA: define{{.*}} void @[[LOFFL1:.+]]()
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[LOUTL1:.+]] to {{.+}})
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}})
+    // Skip global, bound tid and loop vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}}
+    // LAMBDA: [[TMP:%.+]] = alloca i{{[0-9]+}}*,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store{{.+}} [[G1_PRIV]], {{.+}} [[TMP]],
+
+    g = 1;
+    g1 = 1;
+    sivar = 2;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}})
+    // Skip global, bound tid and loop vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}}
+    // LAMBDA: [[TMP:%.+]] = alloca i{{[0-9]+}}*,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store{{.+}} [[G1_PRIV]], {{.+}} [[TMP]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[TMP]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1_REF]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      g1 = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+      // LAMBDA: [[G1_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[G1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G1_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G1_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for private(t_var, vec, s_arr, var, sivar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// HCHECK: define {{.*}}i{{[0-9]+}} @main()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, {{.+}} null, {{.+}} null, i32 0, i32 0) 
+// HCHECK: call void @[[OFFL1:.+]]()
+// HCHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// HCHECK:  ret
+
+// HCHECK: define{{.*}} void @[[OFFL1]]()
+// TCHECK: define{{.*}} void @[[OFFL1:.+]]()
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[OUTL1:.+]] to {{.+}})
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-DAG: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[PAR_OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-DAG: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG: {{.+}} = {{.+}} [[T_VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[SIVAR_PRIV]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// HCHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0,
+// HCHECK: call void @[[TOFFL1:.+]]()
+// HCHECK: ret
+
+// HCHECK: define {{.*}}void @[[TOFFL1]]()
+// TCHECK: define void @[[TOFFL1:.+]]()
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[TOUTL1:.+]] to {{.+}})
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+// CHECK-DAG: store{{.+}} [[VAR_PRIV]], {{.+}} [[TMP]]
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL1:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[TPAR_OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// prev lb and ub
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// iter variables
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+// CHECK-DAG: store{{.+}} [[VAR_PRIV]], {{.+}} [[TMP]]
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG: {{.+}} = {{.+}} [[T_VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[TMP]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
new file mode 100644
index 000000000000..855273cb4762
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -0,0 +1,93 @@
+// add -fopenmp-targets
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef __INTPTR_TYPE__ intptr_t;
+
+// CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+void foo();
+
+struct S {
+  intptr_t a, b, c;
+  S(intptr_t a) : a(a) {}
+  operator char() { return a; }
+  ~S() {}
+};
+
+template <typename T>
+T tmain() {
+#pragma omp target teams distribute parallel for proc_bind(master)
+  for(int i = 0; i < 1000; i++) {}
+  return T();
+}
+
+int main() {
+  // CHECK-LABEL: @main
+#pragma omp target teams distribute parallel for proc_bind(spread)
+  for(int i = 0; i < 1000; i++) {}
+#pragma omp target teams distribute parallel for proc_bind(close)
+  for(int i = 0; i < 1000; i++) {}
+  return tmain<int>();
+}
+
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL1:@.+]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL2:@.+]]()
+// CHECK: [[CALL_RET:%.+]] = call{{.+}} i32 [[TMAIN:@.+]]()
+// CHECK: ret i32 [[CALL_RET]]
+
+// CHECK: define{{.+}} void [[OFFL1]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 4)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[OFFL2]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 3)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[TMAIN]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL3:@.+]]()
+
+// CHECK: define{{.+}} [[OFFL3]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_3]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 2)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
new file mode 100644
index 000000000000..6575659637a5
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
@@ -0,0 +1,353 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <typename T>
+T tmain() {
+  T t_var = T();
+  T vec[] = {1, 2};
+#pragma omp target teams distribute parallel for reduction(+: t_var)
+  for (int i = 0; i < 2; ++i) {
+    t_var += (T) i;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[RED_VAR:@.+]] = common global [8 x {{.+}}] zeroinitializer
+
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // LAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+    // LAMBDA: call void @[[LOFFL1:.+]](
+    // LAMBDA:  ret
+#pragma omp target teams distribute parallel for reduction(+: sivar)
+  for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]](i{{64|32}}*{{.*}} [[SIVAR_ARG:%.+]])
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_PAR:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[LOUTL1:.+]] to {{.+}}, {{.+}} [[SIVAR_PAR]])
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+    // Skip global and bound tid vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_REF:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]]
+    // LAMBDA: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+    // LAMBDA: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+    // LAMBDA: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+    // LAMBDA: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+    // LAMBDA: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+    // LAMBDA: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+    // LAMBDA: {{.+}}, label %[[CASE1:.+]]
+    // LAMBDA: {{.+}}, label %[[CASE2:.+]]
+    // LAMBDA: ]
+    // LAMBDA: [[CASE1]]:
+    // LAMBDA-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+    // LAMBDA: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+    // LAMBDA: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+    // LAMBDA: br
+    // LAMBDA: [[CASE2]]:
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+    // LAMBDA: br
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+
+    // Skip global and bound tid vars, and prev lb and ub vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // skip loop vars
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_REF:%.+]] = load {{.+}} [[SIVAR_ADDR]]
+    // LAMBDA: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+     // LAMBDA: store{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+    // LAMBDA: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+    // LAMBDA: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+    // LAMBDA: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+    // LAMBDA: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+    // LAMBDA: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+    // LAMBDA: {{.+}}, label %[[CASE1:.+]]
+    // LAMBDA: {{.+}}, label %[[CASE2:.+]]
+    // LAMBDA: ]
+    // LAMBDA: [[CASE1]]:
+    // LAMBDA-64-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+    // LAMBDA-32-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+    // LAMBDA: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+    // LAMBDA: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+    // LAMBDA: br
+    // LAMBDA: [[CASE2]]:
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+    // LAMBDA: br
+
+    sivar += i;
+
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+
+      sivar += 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+      // LAMBDA: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], 4
+      // LAMBDA: store i{{[0-9]+}} [[SIVAR_INC]], i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for reduction(+: sivar)
+  for (int i = 0; i < 2; ++i) {
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[RED_VAR:@.+]] = common global [8 x {{.+}}] zeroinitializer
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+// CHECK: call void @[[OFFL1:.+]](i{{64|32}}* @{{.+}})
+// CHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// CHECK:  ret
+
+// CHECK: define{{.*}} void @[[OFFL1]](i{{64|32}}*{{.*}} [[SIVAR_ARG:%.+]])
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]]
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}}, {{.+}} [[SIVAR_VAL]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK: [[SIVAR_REF:%.+]] = load{{.+}} [[SIVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+// CHECK: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define internal void @[[PAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+// Skip global and bound tid vars, and prev lb and ub
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// skip loop vars
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK-64: [[SIVAR_REF:%.+]] = load {{.+}} [[SIVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: store{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+// CHECK: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// CHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1,
+// CHECK: call void @[[TOFFL1:.+]]({{.+}})
+// CHECK:  ret
+
+// CHECK: define{{.*}} void @[[TOFFL1]](i{{64|32}}*{{.*}} [[TVAR_ARG:%.+]])
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_ADDR]]
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[TOUTL1:.+]] to {{.+}}, {{.+}} [[TVAR_VAL]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}}, {{.+}}, {{.+}} [[TVAR_ARG:%.+]])
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: [[TVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_REF:%.+]] = load {{.+}} [[TVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[TVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[TVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[TVAR_PRIV]] to
+// CHECK: store{{.+}} [[TVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_REF]],
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[TVAR_INC:%.+]] = add{{.+}} [[TVAR_VAL]], [[TVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[TVAR_INC]], {{.+}} [[TVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[TVAR_REF]], {{.+}} [[TVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define internal void @[[TPAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[TVAR_ARG:%.+]])
+// Skip global and bound tid vars, and prev lb and ub vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// skip loop vars
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: [[TVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_REF:%.+]] = load {{.+}} [[TVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[TVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: store{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[TVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[TVAR_PRIV]] to
+// CHECK: store{{.+}} [[TVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_REF]],
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[TVAR_INC:%.+]] = add{{.+}} [[TVAR_VAL]], [[TVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[TVAR_INC]], {{.+}} [[TVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[TVAR_REF]], {{.+}} [[TVAR_PRIV_VAL]]
+// CHECK: br
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
new file mode 100644
index 000000000000..8eb6c443ca2d
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
@@ -0,0 +1,400 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X];
+  float b;
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL2:.+]](
+    #pragma omp target teams distribute parallel for schedule(static)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL3:.+]](
+    #pragma omp target teams distribute parallel for schedule(static, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL4:.+]](
+    #pragma omp target teams distribute parallel for schedule(dynamic)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL5:.+]](
+    #pragma omp target teams distribute parallel for schedule(dynamic, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: define internal void @[[OFFL1]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL2]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL2:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL3]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL3:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 33,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL4]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL4:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL4]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL4:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL4]]({{.+}})
+  // CK1: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+  // CK1: call {{.+}} @__kmpc_dispatch_next_4(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL5]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL5:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL5]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL5:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL5]]({{.+}})
+  // CK1: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+  // CK1: call {{.+}} @__kmpc_dispatch_next_4(
+  // CK1: ret void
+
+    return a[0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY 
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n>
+int tmain(T argc) {
+  T a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for schedule(dynamic)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for schedule(dynamic, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for schedule(dynamic)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for schedule(dynamic, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+  return tmain<int, 10>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL3:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL4:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL5:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}void @[[OFFL2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[OUTL3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL4]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTL4:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL4]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL4:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL4]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+
+// CK2: define {{.*}}void @[[OFFL5]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTL5:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL5]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL5:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL5]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT3:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT4:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT5:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 33,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT4]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT4:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT4]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT4:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT4]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT5]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT5:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT5]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT5:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT5]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
new file mode 100644
index 000000000000..61cdd3c91d70
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -0,0 +1,131 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix HCK1 --check-prefix HCK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix HCK1 --check-prefix HCK1-64
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix TCK1 --check-prefix TCK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix TCK1 --check-prefix TCK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix TCK1 --check-prefix TCK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix TCK1 --check-prefix TCK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY1
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+#ifdef CK1
+
+// HCK1: define{{.*}} i32 @{{.+}}target_teams_fun{{.*}}(
+int target_teams_fun(int *g){
+  int n = 1000;
+  int a[1000];
+  int te = n / 128;
+  int th = 128;
+// discard n_addr
+// HCK1: alloca i32,
+// HCK1: [[TE:%.+]] alloca i32,
+// HCK1: [[TH:%.+]] = alloca i32,
+// HCK1: [[I:%.+]] = alloca i32,
+// discard capture expressions for te and th
+// HCK1: = alloca i32,
+// HCK1: = alloca i32,
+// HCK1: [[I_CAST:%.+]] = alloca i{{32|64}},
+// HCK1: [[N_CAST:%.+]] = alloca i{{32|64}},
+// HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
+// HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
+// HCK1: [[I_PAR:%.+]] = load{{.+}}, {{.+}} [[I_CAST]],
+// HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
+// HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
+// HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
+// HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}},
+
+// HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[I_PAR]], i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]])
+  int i;
+#pragma omp target teams distribute parallel for simd num_teams(te), thread_limit(th) aligned(a : 8) safelen(16) simdlen(4) linear(i : n)
+  for(i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+
+  // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0),
+  // HCK1: call void @[[OFFL2:.+]](i{{64|32}} %{{.+}})
+  {{{
+  #pragma omp target teams distribute parallel for simd is_device_ptr(g) simdlen(8)
+  for(int i = 0; i < n; i++) {
+    a[i] = g[0];
+  }
+  }}}
+
+  // outlined target regions
+  // HCK1: define internal void @[[OFFL1]](i{{32|64}} [[I_ARG:%.+]], i{{32|64}} [[N_ARG:%.+]], {{.+}}, i{{32|64}} [[TE_ARG:%.+]], i{{32|64}} [[TH_ARG:%.+]])
+  // TCK1: define void @{{.+}}target_teams_fun{{.*}}(i{{32|64}} [[I_ARG:%.+]], i{{32|64}} [[N_ARG:%.+]], {{.+}}, i{{32|64}} [[TE_ARG:%.+]], i{{32|64}} [[TH_ARG:%.+]])
+  // CK1: [[I_ADDR:%.+]] = alloca i{{32|64}},
+  // CK1: [[N_ADDR:%.+]] = alloca i{{32|64}},
+  // CK1: [[TE_ADDR:%.+]] = alloca i{{32|64}},
+  // CK1: [[TH_ADDR:%.+]] = alloca i{{32|64}},
+  // TCK1: store {{.+}} [[N_ARG]], {{.+}} [[N_ADDR]],
+  // CK1: store{{.+}} [[TE_ARG]], {{.+}} [[TE_ADDR]],
+  // CK1: store{{.+}} [[TH_ARG]], {{.+}} [[TH_ADDR]],
+  // CK1-64: [[TE_CONV:%.+]] = bitcast{{.+}} [[TE_ADDR]] to
+  // CK1-64: [[TH_CONV:%.+]] = bitcast{{.+}} [[TH_ADDR]] to
+  // CK1-64: [[TE_VAL:%.+]] = load i32, i32* [[TE_CONV]],
+  // CK1-64: [[TH_VAL:%.+]] = load i32, i32* [[TH_CONV]],
+  // CK1-32: [[TE_VAL:%.+]] = load i32, i32* [[TE_ADDR]],
+  // CK1-32: [[TH_VAL:%.+]] = load i32, i32* [[TH_ADDR]],
+  // CK1: {{%.+}} = call i32 @__kmpc_push_num_teams({{.+}}, {{.+}}, i32 [[TE_VAL]], i32 [[TH_VAL]])
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL1:.+]] to {{.+}}, {{.+}}, {{.+}})
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: [[ARRDECAY:%.+]] = getelementptr inbounds [1000 x i32], [1000 x i32]* %{{.+}}, i32 0, i32 0
+  // CK1: [[ARR_CAST:%.+]] = ptrtoint i32* [[ARRDECAY]] to i{{32|64}}
+  // CK1: [[MASKED_PTR:%.+]] = and i{{32|64}} [[ARR_CAST]], 7
+  // CK1: [[COND:%.+]] = icmp eq i{{32|64}} [[MASKED_PTR]], 0
+  // CK1: call void @llvm.assume(i1 [[COND]])
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.+}} @__kmpc_fork_call(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // HCK1: define internal void @[[OFFL2]](
+  // TCK1: define void @{{.+}}target_teams_fun{{.+}}(
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}}, {{.+}}, {{.+}})
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.+}} @__kmpc_fork_call(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  return a[0];
+}
+
+// CK1-DAG: !{!"llvm.loop.vectorize.width", i32 4}
+// CK1-DAG: !{!"llvm.loop.vectorize.enable", i1 true}
+// CK1-DAG: !{!"llvm.loop.vectorize.width", i32 8}
+
+#endif // CK1
+#endif // HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp
new file mode 100644
index 000000000000..f03971b7ebfd
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp
@@ -0,0 +1,472 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target teams distribute parallel for simd codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// Check that no target code is emmitted if no omptests flag was provided.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s
+// SIMD-ONLY2-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[SA:%.+]] = type { [4 x i32] }
+// CHECK-DAG: [[SB:%.+]] = type { [8 x i32] }
+// CHECK-DAG: [[SC:%.+]] = type { [16 x i32] }
+// CHECK-DAG: [[SD:%.+]] = type { [32 x i32] }
+// CHECK-DAG: [[SE:%.+]] = type { [64 x i32] }
+// CHECK-DAG: [[ST1:%.+]] = type { [228 x i32] }
+// CHECK-DAG: [[ST2:%.+]] = type { [1128 x i32] }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK:    [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[A1:@.+]] = internal global [[SA]]
+// CHECK-DAG: [[A2:@.+]] = global [[SA]]
+// CHECK-DAG: [[B1:@.+]] = global [[SB]]
+// CHECK-DAG: [[B2:@.+]] = global [[SB]]
+// CHECK-DAG: [[C1:@.+]] = internal global [[SC]]
+// CHECK-DAG: [[D1:@.+]] = global [[SD]]
+// CHECK-DAG: [[E1:@.+]] = global [[SE]]
+// CHECK-DAG: [[T1:@.+]] = global [[ST1]]
+// CHECK-DAG: [[T2:@.+]] = global [[ST2]]
+
+// CHECK-NTARGET-DAG: [[SA:%.+]] = type { [4 x i32] }
+// CHECK-NTARGET-DAG: [[SB:%.+]] = type { [8 x i32] }
+// CHECK-NTARGET-DAG: [[SC:%.+]] = type { [16 x i32] }
+// CHECK-NTARGET-DAG: [[SD:%.+]] = type { [32 x i32] }
+// CHECK-NTARGET-DAG: [[SE:%.+]] = type { [64 x i32] }
+// CHECK-NTARGET-DAG: [[ST1:%.+]] = type { [228 x i32] }
+// CHECK-NTARGET-DAG: [[ST2:%.+]] = type { [1128 x i32] }
+// CHECK-NTARGET-NOT: type { i8*, i8*, %
+// CHECK-NTARGET-NOT: type { i32, %
+
+// We have 7 target regions
+
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// TCHECK-NOT: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+// CHECK-DAG: {{@.+}} = private constant i8 0
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i64] [i64 288]
+
+// CHECK-NTARGET-NOT: private constant i8 0
+// CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i
+
+// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
+// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
+// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00"
+// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00"
+// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00"
+// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00"
+// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00"
+// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00"
+// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00"
+// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00"
+// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00"
+// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
+// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+
+// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
+// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
+// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00"
+// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00"
+// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00"
+// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00"
+// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00"
+// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00"
+// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00"
+// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00"
+// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00"
+// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+// TCHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
+// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
+
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function.
+// CHECK: @llvm.global_ctors = appending global [4 x { i32, void ()*, i8* }] [
+// CHECK-SAME: { i32, void ()*, i8* } { i32 500, void ()* [[P500:@[^,]+]], i8* null },
+// CHECK-SAME: { i32, void ()*, i8* } { i32 501, void ()* [[P501:@[^,]+]], i8* null },
+// CHECK-SAME: { i32, void ()*, i8* } { i32 65535, void ()* [[PMAX:@[^,]+]], i8* null },
+// CHECK-SAME: { i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+// CHECK-NTARGET: @llvm.global_ctors = appending global [3   x { i32, void ()*, i8* }] [
+
+extern int *R;
+
+struct SA {
+  int arr[4];
+  void foo() {
+    int a = *R;
+    a += 1;
+    *R = a;
+  }
+  SA() {
+    int a = *R;
+    a += 2;
+    *R = a;
+  }
+  ~SA() {
+    int a = *R;
+    a += 3;
+    *R = a;
+  }
+};
+
+struct SB {
+  int arr[8];
+  void foo() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 4;
+    *R = a;
+  }
+  SB() {
+    int a = *R;
+    a += 5;
+    *R = a;
+  }
+  ~SB() {
+    int a = *R;
+    a += 6;
+    *R = a;
+  }
+};
+
+struct SC {
+  int arr[16];
+  void foo() {
+    int a = *R;
+    a += 7;
+    *R = a;
+  }
+  SC() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 8;
+    *R = a;
+  }
+  ~SC() {
+    int a = *R;
+    a += 9;
+    *R = a;
+  }
+};
+
+struct SD {
+  int arr[32];
+  void foo() {
+    int a = *R;
+    a += 10;
+    *R = a;
+  }
+  SD() {
+    int a = *R;
+    a += 11;
+    *R = a;
+  }
+  ~SD() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 12;
+    *R = a;
+  }
+};
+
+struct SE {
+  int arr[64];
+  void foo() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd if(target: 0)
+    for (int i = 0; i < 10; ++i)
+      a += 13;
+    *R = a;
+  }
+  SE() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 14;
+    *R = a;
+  }
+  ~SE() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 15;
+    *R = a;
+  }
+};
+
+template <int x>
+struct ST {
+  int arr[128 + x];
+  void foo() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 16 + x;
+    *R = a;
+  }
+  ST() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 17 + x;
+    *R = a;
+  }
+  ~ST() {
+    int a = *R;
+    #pragma omp target teams distribute parallel for simd
+    for (int i = 0; i < 10; ++i)
+      a += 18 + x;
+    *R = a;
+  }
+};
+
+// We have to make sure we us all the target regions:
+//CHECK-DAG: define internal void @[[NAME1]](
+//CHECK-DAG: call void @[[NAME1]](
+//CHECK-DAG: define internal void @[[NAME2]](
+//CHECK-DAG: call void @[[NAME2]](
+//CHECK-DAG: define internal void @[[NAME3]](
+//CHECK-DAG: call void @[[NAME3]](
+//CHECK-DAG: define internal void @[[NAME4]](
+//CHECK-DAG: call void @[[NAME4]](
+//CHECK-DAG: define internal void @[[NAME5]](
+//CHECK-DAG: call void @[[NAME5]](
+//CHECK-DAG: define internal void @[[NAME6]](
+//CHECK-DAG: call void @[[NAME6]](
+//CHECK-DAG: define internal void @[[NAME7]](
+//CHECK-DAG: call void @[[NAME7]](
+//CHECK-DAG: define internal void @[[NAME8]](
+//CHECK-DAG: call void @[[NAME8]](
+//CHECK-DAG: define internal void @[[NAME9]](
+//CHECK-DAG: call void @[[NAME9]](
+//CHECK-DAG: define internal void @[[NAME10]](
+//CHECK-DAG: call void @[[NAME10]](
+//CHECK-DAG: define internal void @[[NAME11]](
+//CHECK-DAG: call void @[[NAME11]](
+//CHECK-DAG: define internal void @[[NAME12]](
+//CHECK-DAG: call void @[[NAME12]](
+
+//TCHECK-DAG: define void @[[NAME1]](
+//TCHECK-DAG: define void @[[NAME2]](
+//TCHECK-DAG: define void @[[NAME3]](
+//TCHECK-DAG: define void @[[NAME4]](
+//TCHECK-DAG: define void @[[NAME5]](
+//TCHECK-DAG: define void @[[NAME6]](
+//TCHECK-DAG: define void @[[NAME7]](
+//TCHECK-DAG: define void @[[NAME8]](
+//TCHECK-DAG: define void @[[NAME9]](
+//TCHECK-DAG: define void @[[NAME10]](
+//TCHECK-DAG: define void @[[NAME11]](
+//TCHECK-DAG: define void @[[NAME12]](
+
+// CHECK-NTARGET-NOT: __tgt_target
+// CHECK-NTARGET-NOT: __tgt_register_lib
+// CHECK-NTARGET-NOT: __tgt_unregister_lib
+
+// TCHECK-NOT: __tgt_target
+// TCHECK-NOT: __tgt_register_lib
+// TCHECK-NOT: __tgt_unregister_lib
+
+// We have 2 initializers with priority 500
+//CHECK: define internal void [[P500]](
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK-NOT: call void @{{.+}}()
+//CHECK:     ret void
+
+// We have 1 initializers with priority 501
+//CHECK: define internal void [[P501]](
+//CHECK:     call void @{{.+}}()
+//CHECK-NOT: call void @{{.+}}()
+//CHECK:     ret void
+
+// We have 6 initializers with default priority
+//CHECK: define internal void [[PMAX]](
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK:     call void @{{.+}}()
+//CHECK-NOT: call void @{{.+}}()
+//CHECK:     ret void
+
+// Check registration and unregistration
+
+//CHECK:     define internal void @[[UNREGFN:.+]](i8*)
+//CHECK-SAME: comdat($[[REGFN]]) {
+//CHECK:     call i32 @__tgt_unregister_lib([[DSCTY]]* [[DESC]])
+//CHECK:     ret void
+//CHECK:     declare i32 @__tgt_unregister_lib([[DSCTY]]*)
+
+//CHECK:     define linkonce hidden void @[[REGFN]](i8*)
+//CHECK-SAME: comdat {
+//CHECK:     call i32 @__tgt_register_lib([[DSCTY]]* [[DESC]])
+//CHECK:     call i32 @__cxa_atexit(void (i8*)* @[[UNREGFN]], i8* bitcast ([[DSCTY]]* [[DESC]] to i8*),
+//CHECK:     ret void
+//CHECK:     declare i32 @__tgt_register_lib([[DSCTY]]*)
+
+static __attribute__((init_priority(500))) SA a1;
+SA a2;
+SB __attribute__((init_priority(500))) b1;
+SB __attribute__((init_priority(501))) b2;
+static SC c1;
+SD d1;
+SE e1;
+ST<100> t1;
+ST<1000> t2;
+
+
+int bar(int a){
+  int r = a;
+
+  a1.foo();
+  a2.foo();
+  b1.foo();
+  b2.foo();
+  c1.foo();
+  d1.foo();
+  e1.foo();
+  t1.foo();
+  t2.foo();
+
+  #pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ++r;
+
+  return r + *R;
+}
+
+// Check metadata is properly generated:
+// CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 216, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 268, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 286, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 293, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 305, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 312, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 436, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 319, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 312, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 319, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 305, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 242, i32 {{[0-9]+}}}
+
+// TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 216, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 268, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 286, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 293, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 305, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 312, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 436, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 319, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 312, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 319, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 305, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 242, i32 {{[0-9]+}}}
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration_naming.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration_naming.cpp
new file mode 100644
index 000000000000..faeaeb949018
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration_naming.cpp
@@ -0,0 +1,86 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target teams distribute parallel for simd codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[CA:%.+]] = type { i32* }
+
+// CHECK: define {{.*}}i32 @[[NNAME:.+]](i32 {{.*}}%{{.+}})
+int nested(int a){
+  // CHECK: call void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME]]_l[[T1L:[0-9]+]](
+  #pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ++a;
+
+  // CHECK: call void @"[[LNAME:.+]]"([[CA]]*
+  auto F = [&](){
+    #pragma omp parallel
+    {
+      #pragma omp target teams distribute parallel for simd
+      for (int i = 0; i < 10; ++i)
+        ++a;
+    }
+  };
+
+  F();
+
+  return a;
+}
+
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T1L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME:.+]]_l[[T1L:[0-9]+]](
+
+// CHECK: define {{.*}}void @"[[LNAME]]"(
+// CHECK: call void {{.*}}@__kmpc_fork_call{{.+}}[[PNAME:@.+]] to
+
+// CHECK: define {{.*}}void [[PNAME]](
+// CHECK: call void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L:[0-9]+]](
+
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME:.+]]_l[[T2L:[0-9]+]](
+
+
+// Check metadata is properly generated:
+// CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
+
+// TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
new file mode 100644
index 000000000000..3aca29165cd8
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
@@ -0,0 +1,157 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X][Y];
+
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+    // CK1: call i32 @__tgt_target_teams(
+    // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for simd collapse(2)
+    for(int i = 0; i < X; i++) {
+      for(int j = 0; j < Y; j++) {
+        a[i][j] = (T)0;
+      }
+    }
+    // CK1: define internal void @[[OFFL1]](
+    // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+    // CK1: ret void
+
+    // CK1: define internal void @[[OUTL1]]({{.+}})
+    // discard loop variables not needed here
+    // CK1: [[OMP_UB:%.omp.comb.ub]] = alloca i32,
+    // CK1: store i32 56087, i32* [[OMP_UB]],
+    // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i32* [[OMP_UB]],
+    // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+    // CK1: call void @__kmpc_for_static_fini(
+    // CK1: ret void
+
+    // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+    // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+    // CK1: call void @__kmpc_for_static_fini(
+    // CK1: ret void
+
+    return a[0][0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n, int m>
+int tmain(T argc) {
+  T a[n][m];
+  #pragma omp target teams distribute parallel for simd collapse(2)
+  for(int i = 0; i < n; i++) {
+    for(int j = 0; j < m; j++) {
+      a[i][j] = (T)0;
+    }
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int m = 2;
+  int a[n][m];
+  #pragma omp target teams distribute parallel for simd collapse(2)
+  for(int i = 0; i < n; i++) {
+    for(int j = 0; j < m; j++) {
+      a[i][j] = 0;
+    }
+  }
+  return tmain<int, 10, 2>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: [[OMP_UB:%.omp.comb.ub]] = alloca i64,
+// CK2: store i64 {{.+}}, i64* [[OMP_UB]],
+// CK2: call void @__kmpc_for_static_init_8({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i64* [[OMP_UB]],
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_{{[4|8]}}({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// discard loop variables not needed here
+// CK2: [[OMP_UB:%.omp.comb.ub]] = alloca i32,
+// CK2: store i32 {{.+}}, i32* [[OMP_UB]],
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92, {{.+}}, {{.+}}, i32* [[OMP_UB]],
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[TPAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34, {{.+}}, {{.+}},
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp
new file mode 100644
index 000000000000..e8b4a9359743
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute parallel for simd device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target teams distribute parallel for simd device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute parallel for simd if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 0, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
new file mode 100644
index 000000000000..281a8d742534
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
@@ -0,0 +1,270 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X];
+  float b;
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for simd
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL2:.+]](
+    #pragma omp target teams distribute parallel for simd dist_schedule(static)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL3:.+]](
+    #pragma omp target teams distribute parallel for simd dist_schedule(static, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: define internal void @[[OFFL1]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL2]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL2:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+
+  // CK1: define internal void @[[OFFL3]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL3:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+    return a[0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n>
+int tmain(T argc) {
+  T a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for simd
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for simd
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+  return tmain<int, 10>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL3:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}void @[[OFFL2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[OUTL3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT3:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
new file mode 100644
index 000000000000..05f7f8c34a3a
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -0,0 +1,523 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HCHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix HCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK --check-prefix HCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix TLAMBDA --check-prefix TLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+#pragma omp target teams distribute parallel for simd firstprivate(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+// HCHECK-DAG: [[TEST:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// HCHECK-DAG: [[T_VAR:@.+]] = global i{{[0-9]+}} 333,
+int t_var = 333;
+// HCHECK-DAG: [[VEC:@.+]] = global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// HCHECK-DAG: [[S_ARR:@.+]] = global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// HCHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// HCHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // HLAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // HLAMBDA-LABEL: @main
+  // HLAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // HLAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // HLAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+    // HLAMBDA: call void @[[LOFFL1:.+]](i{{64|32}} %{{.+}})
+    // HLAMBDA:  ret
+#pragma omp target teams distribute parallel for simd firstprivate(g, g1, sivar)
+  for (int i = 0; i < 2; ++i) {
+    // HLAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]](i{{64|32}} {{%.+}}, i{{64|32}} {{%.+}})
+    // TLAMBDA: define void @[[LOFFL1:.+]](i{{64|32}} {{%.+}}, i{{64|32}} {{%.+}})
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_CAST:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA-DAG: [[G_CAST_VAL:%.+]] = load{{.+}} [[G_CAST]],
+    // LAMBDA-DAG: [[G1_CAST_VAL:%.+]] = load{{.+}} [[G1_CAST]],
+    // LAMBDA-DAG: [[SIVAR_CAST_VAL:%.+]] = load{{.+}} [[SIVAR_CAST]],
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[LOUTL1:.+]] to {{.+}}, {{.+}} [[G_CAST_VAL]], {{.+}} [[G1_CAST_VAL]], {{.+}} [[SIVAR_CAST_VAL]])
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}})
+    // Skip global and bound tid vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
+    // skip loop vars
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+    // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
+    // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
+    g = 1;
+    g1 = 1;
+    sivar = 2;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}})
+    // Skip global and bound tid vars, and prev lb and ub vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
+    // skip loop vars
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
+    // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+    // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
+    // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
+
+    // use of private vars
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
+    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
+    // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      g1 = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+      // LAMBDA: [[G1_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[G1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G1_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G1_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for simd firstprivate(t_var, vec, s_arr, var, sivar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// HCHECK: define {{.*}}i{{[0-9]+}} @main()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, 
+// HCHECK: call void @[[OFFL1:.+]](
+// HCHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// HCHECK:  ret
+
+// HCHECK: define{{.*}} void @[[OFFL1]](
+// TCHECK: define{{.*}} void @[[OFFL1:.+]](
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_CAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR_CAST:%.+]] = alloca i{{[0-9]+}},
+
+// CHECK-DAG: [[VEC_TE_PAR:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_PRIV]],
+// CHECK-DAG: [[T_VAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_CAST]],
+// CHECK-DAG: [[S_ARR_TE_PAR:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_PRIV]],
+// CHECK-DAG: [[VAR_TE_PAR:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_PRIV]],
+// CHECK-DAG: [[SIVAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_CAST]],
+
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}} @[[OUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[VEC_TE_PAR]], i{{[0-9]+}} [[T_VAR_TE_PAR]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_TE_PAR]], [[S_FLOAT_TY]]* [[VAR_TE_PAR]], i{{[0-9]+}} [[SIVAR_TE_PAR]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* {{.+}}, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR_ADDR]],
+
+// T_VAR and SIVAR
+// CHECK-DAG-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK-DAG-64: [[CONV_SIVAR:%.+]] = bitcast i64* [[SIVAR_ADDR]] to i32*
+
+// preparation vars
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}}], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[PAR_OUTL]]({{.+}})
+// Skip global and bound tid vars, and prev lb ub vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* {{.+}}, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR_ADDR]],
+
+// T_VAR and SIVAR
+// CHECK-DAG-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK-DAG-64: [[CONV_SIVAR:%.+]] = bitcast i64* [[SIVAR_ADDR]] to i32*
+
+// preparation vars
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}}], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG-32: {{.+}} = {{.+}} [[T_VAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_TVAR]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VAR_PRIV]]
+// CHECK-DAG-32: {{.+}} = {{.+}} [[SIVAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_SIVAR]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// HCHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+// HCHECK: call void @[[TOFFL1:.+]](
+// HCHECK:  ret
+
+// HCHECK: define {{.*}}void @[[TOFFL1]]({{.+}})
+// TCHECK: define {{.*}}void @[[TOFFL1:.+]]({{.+}})
+// CHECK-DAG: [[TT_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[TVEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK-DAG: [[TS_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[TVAR_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TT_VAR_CAST:%.+]] = alloca i{{[0-9]+}},
+
+// CHECK-DAG: [[TVEC_TE_PAR:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[TVEC_PRIV]],
+// CHECK-DAG: [[TT_VAR_TE_PAR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[TT_VAR_CAST]],
+// CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
+// CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
+
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* {{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* {{.+}}, [[S_INT_TY]]** [[VAR_ADDR]],
+
+// T_VAR and preparation variables
+// CHECK: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[TPAR_OUTL]]({{.+}})
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// Skip temp vars for loop
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[AGG_TMP1:%.+]] = alloca [[ST_TY]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[AGG_TMP2:%.+]] = alloca [[ST_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// param copy
+// CHECK: store [2 x i{{[0-9]+}}]* {{.+}}, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* {{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* {{.+}}, [[S_INT_TY]]** [[VAR_ADDR]],
+
+// T_VAR and preparation variables
+// CHECK: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+
+// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
+
+// firstprivate(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BGN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]] to
+// CHECK-DAG: [[S_ARR_FIN:%.+]] = icmp{{.+}} [[S_ARR_PRIV_BGN]],
+// CHECK-DAG: [[S_ARR_SRC_COPY:%.+]] = phi{{.+}} [ [[S_ARR_ADDR_BGN]], {{.+}} ], [ [[S_ARR_SRC:%.+]], {{.+}} ]
+// CHECK-DAG: [[S_ARR_DST_COPY:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_DST:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_DST_COPY]], {{.+}} [[S_ARR_SRC_COPY]], {{.+}} [[AGG_TMP1]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP1]])
+// CHECK-DAG: [[S_ARR_DST]] = getelementptr {{.+}} [[S_ARR_DST_COPY]],
+// CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
+
+// firstprivate(var)
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
+// CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG-32: {{.+}} = {{.+}} [[T_VAR_ADDR]]
+// CHECK-DAG-64: {{.+}} = {{.+}} [[CONV_TVAR]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[TMP]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
new file mode 100644
index 000000000000..a3a14fbf1e68
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void fn1();
+void fn2();
+void fn3();
+void fn4();
+void fn5();
+void fn6();
+
+int Arg;
+
+// CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
+void gtid_test() {
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+#pragma omp target teams distribute parallel for simd
+  for(int i = 0 ; i < 100; i++) {}
+  // CHECK: define internal void [[OFFLOADING_FUN_0]](
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+  // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK:  call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
+  // CHECK: call void @__kmpc_for_static_fini(
+
+  // CHECK: define{{.+}} void [[OMP_OUTLINED_0]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: ret
+#pragma omp target teams distribute parallel for simd if (parallel: false)
+  for(int i = 0 ; i < 100; i++) {
+  // CHECK: define internal void [[OFFLOADING_FUN_1]](
+  // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+  // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @__kmpc_serialized_parallel(
+  // CHECK: call void [[OMP_OUTLINED_1:@.+]](
+  // CHECK: call void @__kmpc_end_serialized_parallel(
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: define{{.+}} void [[OMP_OUTLINED_1]](
+  // CHECK: call void @__kmpc_for_static_init_4(
+  // CHECK: call void @{{.+}}gtid_test
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: ret
+    gtid_test();
+  }
+}
+
+
+template <typename T>
+int tmain(T Arg) {
+#pragma omp target teams distribute parallel for simd if (true)
+  for(int i = 0 ; i < 100; i++) {
+    fn1();
+  }
+#pragma omp target teams distribute parallel for simd if (false)
+  for(int i = 0 ; i < 100; i++) {
+    fn2();
+  }
+#pragma omp target teams distribute parallel for simd if (parallel: Arg)
+  for(int i = 0 ; i < 100; i++) {
+    fn3();
+  }
+  return 0;
+}
+
+// CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
+int main() {
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
+// CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
+#pragma omp target teams distribute parallel for simd if (true)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_0]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_2:@.+]] to void
+    // CHECK: call void @__kmpc_for_static_fini(
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_2]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn4
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    fn4();
+  }
+
+#pragma omp target teams distribute parallel for simd if (false)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_1]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void @__kmpc_serialized_parallel(
+    // CHECK: call void [[OMP_OUTLINED_3:@.+]](
+    // CHECK: call void @__kmpc_end_serialized_parallel(
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_3]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn5
+    // CHECK: call void @__kmpc_for_static_fini(
+    fn5();
+  }
+
+#pragma omp target teams distribute parallel for simd if (Arg)
+  for(int i = 0 ; i < 100; i++) {
+    // CHECK: define internal void [[OFFLOADING_FUN_2]](
+    // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_2:@.+]] to {{.+}})
+    // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_2]](
+
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_4:@.+]] to void
+    // CHECK: call void @__kmpc_serialized_parallel(
+    // CHECK: call void [[OMP_OUTLINED_4:@.+]](
+    // CHECK: call void @__kmpc_end_serialized_parallel(
+    // CHECK: call void @__kmpc_for_static_fini(
+
+    // CHECK: define{{.+}} void [[OMP_OUTLINED_4]](
+    // CHECK: call void @__kmpc_for_static_init_4(
+    // CHECK: call {{.*}}void @{{.+}}fn6
+    // CHECK: call void @__kmpc_for_static_fini(
+    fn6();
+  }
+
+  return tmain(Arg);
+}
+
+// CHECK-LABEL: define {{.+}} @{{.+}}tmain
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_1:@.+]] to void
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_1]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn1
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// CHECK: call void [[T_OUTLINE_FUN_2:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_2]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn2
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_3:@.+]] to void
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// call void [[T_OUTLINE_FUN_3:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_3]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn3
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
new file mode 100644
index 000000000000..c3d38612d63d
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -0,0 +1,456 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1  -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1  -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+  #pragma omp target teams distribute parallel for simd lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int svar;
+  volatile double g;
+  volatile double &g1 = g;
+
+  #ifdef LAMBDA
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    static float sfvar;
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+    // LAMBDA: call void [[OFFLOADING_FUN:@.+]](
+
+    // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]](
+    // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}})
+    #pragma omp target teams distribute parallel for simd lastprivate(g, g1, svar, sfvar)
+    for (int i = 0; i < 2; ++i) {
+      // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* {{.+}}, i32* {{.+}}, {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]], {{.+}} [[G_IN:%.+]])
+      // skip gbl and bound tid
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: [[G1_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // loop variables
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+      // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+
+      // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+      // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
+      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
+      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+      // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
+
+      g1 = 1;
+      svar = 3;
+      sfvar = 4.0;
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+      // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+      // LAMBDA: store i32 2, i32* %
+      // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+      // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+      // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+      // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_TGT]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_TGT]],
+      // LAMBDA-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+      // LAMBDA-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[SFVAR_TGT]],
+      // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+      // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+      // LAMBDA: ret
+
+      // LAMBDA: define{{.*}} internal{{.*}} void @[[LPAR_OUTL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]], {{.+}} [[G_IN:%.+]])
+      // skip tid and prev variables
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: alloca
+      // LAMBDA: [[G1_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
+      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // loop variables
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+      // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+      // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+
+      // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
+      // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
+      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
+      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+      // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
+
+
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+      // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](
+      // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+      // LAMBDA: store i32 2, i32* %
+      // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+      // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+      // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+      // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_TGT]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_TGT]],
+      // LAMBDA-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+      // LAMBDA-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+      // LAMBDA-DAG: store {{.+}}, {{.+}} [[SFVAR_TGT]],
+      // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+      // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+      // LAMBDA: ret
+
+      [&]() {
+        // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+        // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+        g = 2;
+        g1 = 2;
+        svar = 4;
+        sfvar = 8.0;
+        // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+        // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+        // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+        // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+        // LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+        // LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+        // LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+        // LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+        // LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+        // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+        // LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+        // LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+        // LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+      }();
+    }
+  }();
+  return 0;
+  #else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> &var = test;
+
+  #pragma omp target teams distribute parallel for simd lastprivate(t_var, vec, s_arr, s_arr, var, var, svar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  int i;
+
+  return tmain<int>();
+  #endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN:@.+]](
+// CHECK: ret
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN]](
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(
+// CHECK: ret
+//
+// CHECK: define internal void [[OMP_OUTLINED:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.+}} [[VAR_IN:%.+]], i{{[0-9]+}}{{.*}} [[S_VAR_IN:%.+]])
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK: store {{.+}} [[T_VAR_IN]], {{.+}} [[T_VAR_ADDR]],
+// CHECK: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK: store {{.+}} [[S_VAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+
+// the distribute loop
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
+// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
+// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+// CHECK: ret void
+
+// CHECK: define internal void [[OMP_OUTLINED:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.+}} [[VAR_IN:%.+]], i{{[0-9]+}}{{.*}} [[S_VAR_IN:%.+]])
+
+// gbl and bound tid vars, prev lb and ub vars
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK: store {{.+}} [[T_VAR_IN]], {{.+}} [[T_VAR_ADDR]],
+// CHECK: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK: store {{.+}} [[S_VAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
+
+// the distribute loop
+// CHECK: call void @__kmpc_for_static_init_4(
+// skip body: code generation routine is same as distribute parallel for lastprivate
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
+// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
+// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
+// CHECK: ret void
+
+// template tmain
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT:@.+]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_1:@.+]](
+// CHECK: ret
+
+// CHECK: define internal void [[OFFLOAD_FUN_1]](
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4,
+// CHECK: ret
+
+// CHECK: define internal void [[OMP_OUTLINED_1:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
+// skip alloca of global_tid and bound_tid
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR1:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST1:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN1]], {{.+}} [[VEC_ADDR1]],
+// CHECK: store {{.+}} [[T_VAR_IN1]], {{.+}} [[T_VAR_ADDR1]],
+// CHECK: store {{.+}} [[S_ARR_IN1]], {{.+}} [[S_ARR_ADDR1]],
+// CHECK: store {{.+}} [[VAR_IN1]], {{.+}} [[VAR_ADDR1]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
+// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
+// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
+// CHECK: ret void
+
+// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
+// skip alloca of global_tid and bound_tid, and prev lb and ub vars
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+
+// CHECK: [[VEC_ADDR1:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST1:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store {{.+}} [[VEC_IN1]], {{.+}} [[VEC_ADDR1]],
+// CHECK: store {{.+}} [[T_VAR_IN1]], {{.+}} [[T_VAR_ADDR1]],
+// CHECK: store {{.+}} [[S_ARR_IN1]], {{.+}} [[S_ARR_ADDR1]],
+// CHECK: store {{.+}} [[VAR_IN1]], {{.+}} [[VAR_ADDR1]],
+
+// prepare lastprivate targets
+// CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
+// CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
+// CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// skip body: code generation routine is same as distribute parallel for lastprivate
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
+// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
+// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
+// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
+// CHECK: call void @llvm.memcpy.{{.+}}(
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
+// CHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
new file mode 100644
index 000000000000..3792597b82fc
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -0,0 +1,369 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HHECK --check-prefix HCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HHECK --check-prefix HCHECK-64
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix HLAMBDA --check-prefix HLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64 --check-prefix TLAMBDA --check-prefix TLAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY0
+//
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix SIMD-ONLY0
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+#pragma omp target teams distribute parallel for simd private(t_var, vec, s_arr, var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+// HCHECK-DAG: [[TEST:@.+]] ={{.*}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// HCHECK-DAG: [[T_VAR:@.+]] ={{.+}} global i{{[0-9]+}} 333,
+int t_var = 333;
+// HCHECK-DAG: [[VEC:@.+]] ={{.+}} global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// HCHECK-DAG: [[S_ARR:@.+]] ={{.+}} global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// HCHECK-DAG: [[VAR:@.+]] ={{.+}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// HCHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // HLAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // HLAMBDA-LABEL: @main
+  // HLAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // HLAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // HLAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0,
+    // HLAMBDA: call void @[[LOFFL1:.+]](
+    // HLAMBDA:  ret
+#pragma omp target teams distribute parallel for simd private(g, g1, sivar)
+  for (int i = 0; i < 2; ++i) {
+    // HLAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]]()
+    // TLAMBDA: define{{.*}} void @[[LOFFL1:.+]]()
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[LOUTL1:.+]] to {{.+}})
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}})
+    // Skip global, bound tid and loop vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}}
+    // LAMBDA: [[TMP:%.+]] = alloca i{{[0-9]+}}*,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store{{.+}} [[G1_PRIV]], {{.+}} [[TMP]],
+
+    g = 1;
+    g1 = 1;
+    sivar = 2;
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}})
+    // Skip global, bound tid and loop vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}}
+    // LAMBDA: [[TMP:%.+]] = alloca i{{[0-9]+}}*,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store{{.+}} [[G1_PRIV]], {{.+}} [[TMP]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[TMP]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1_REF]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: ret void
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      g1 = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G_REF]]
+      // LAMBDA: [[G1_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[G1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G1_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[G1_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for simd private(t_var, vec, s_arr, var, sivar)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// HCHECK: define {{.*}}i{{[0-9]+}} @main()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, {{.+}} null, {{.+}} null, i32 0, i32 0)
+// HCHECK: call void @[[OFFL1:.+]]()
+// HCHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// HCHECK:  ret
+
+// HCHECK: define{{.*}} void @[[OFFL1]]()
+// TCHECK: define{{.*}} void @[[OFFL1:.+]]()
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[OUTL1:.+]] to {{.+}})
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-DAG: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[PAR_OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK: {{.+}} = alloca i32,
+// CHECK-DAG: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK-DAG: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK-DAG: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-DAG: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-DAG: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG: {{.+}} = {{.+}} [[T_VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[SIVAR_PRIV]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// HCHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// HCHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 0,
+// HCHECK: call void @[[TOFFL1:.+]]()
+// HCHECK: ret
+
+// HCHECK: define {{.*}}void @[[TOFFL1]]()
+// TCHECK: define void @[[TOFFL1:.+]]()
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}} @[[TOUTL1:.+]] to {{.+}})
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+// CHECK-DAG: store{{.+}} [[VAR_PRIV]], {{.+}} [[TMP]]
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL1:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define internal void @[[TPAR_OUTL1]]({{.+}})
+// Skip global, bound tid and loop vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// prev lb and ub
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// iter variables
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i32,
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// private(s_arr)
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_ALLOC:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]], {{.+}} ], [ [[S_ARR_NEXT:%.+]], {{.+}} ]
+// CHECK-DAG: call void @{{.+}}({{.+}} [[S_ARR_PTR_ALLOC]])
+// CHECK-DAG: [[S_ARR_NEXT]] = getelementptr {{.+}} [[S_ARR_PTR_ALLOC]],
+
+// private(var)
+// CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]])
+// CHECK-DAG: store{{.+}} [[VAR_PRIV]], {{.+}} [[TMP]]
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-DAG: {{.+}} = {{.+}} [[T_VAR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[VEC_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[S_ARR_PRIV]]
+// CHECK-DAG: {{.+}} = {{.+}} [[TMP]]
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
new file mode 100644
index 000000000000..b578634c025f
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -0,0 +1,93 @@
+// add -fopenmp-targets
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef __INTPTR_TYPE__ intptr_t;
+
+// CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+void foo();
+
+struct S {
+  intptr_t a, b, c;
+  S(intptr_t a) : a(a) {}
+  operator char() { return a; }
+  ~S() {}
+};
+
+template <typename T>
+T tmain() {
+#pragma omp target teams distribute parallel for simd proc_bind(master)
+  for(int i = 0; i < 1000; i++) {}
+  return T();
+}
+
+int main() {
+  // CHECK-LABEL: @main
+#pragma omp target teams distribute parallel for simd proc_bind(spread)
+  for(int i = 0; i < 1000; i++) {}
+#pragma omp target teams distribute parallel for simd proc_bind(close)
+  for(int i = 0; i < 1000; i++) {}
+  return tmain<int>();
+}
+
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL1:@.+]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL2:@.+]]()
+// CHECK: [[CALL_RET:%.+]] = call{{.+}} i32 [[TMAIN:@.+]]()
+// CHECK: ret i32 [[CALL_RET]]
+
+// CHECK: define{{.+}} void [[OFFL1]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 4)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[OFFL2]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 3)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[TMAIN]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL3:@.+]]()
+
+// CHECK: define{{.+}} [[OFFL3]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_3]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 2)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
new file mode 100644
index 000000000000..e938a7746313
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
@@ -0,0 +1,353 @@
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++  -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <typename T>
+T tmain() {
+  T t_var = T();
+  T vec[] = {1, 2};
+#pragma omp target teams distribute parallel for simd reduction(+: t_var)
+  for (int i = 0; i < 2; ++i) {
+    t_var += (T) i;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[RED_VAR:@.+]] = common global [8 x {{.+}}] zeroinitializer
+
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+    // LAMBDA: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+    // LAMBDA: call void @[[LOFFL1:.+]](
+    // LAMBDA:  ret
+#pragma omp target teams distribute parallel for simd reduction(+: sivar)
+  for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define{{.*}} internal{{.*}} void @[[LOFFL1]](i{{64|32}}*{{.*}} [[SIVAR_ARG:%.+]])
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_PAR:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[LOUTL1:.+]] to {{.+}}, {{.+}} [[SIVAR_PAR]])
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @[[LOUTL1]]({{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+    // Skip global and bound tid vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_REF:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]]
+    // LAMBDA: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[LPAR_OUTL:.+]] to
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+    // LAMBDA: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+    // LAMBDA: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+    // LAMBDA: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+    // LAMBDA: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+    // LAMBDA: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+    // LAMBDA: {{.+}}, label %[[CASE1:.+]]
+    // LAMBDA: {{.+}}, label %[[CASE2:.+]]
+    // LAMBDA: ]
+    // LAMBDA: [[CASE1]]:
+    // LAMBDA-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+    // LAMBDA: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+    // LAMBDA: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+    // LAMBDA: br
+    // LAMBDA: [[CASE2]]:
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+    // LAMBDA: br
+
+    // LAMBDA: define internal void @[[LPAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+
+    // Skip global and bound tid vars, and prev lb and ub vars
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: {{.+}} = alloca i32*,
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+    // skip loop vars
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: alloca i32,
+    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+    // LAMBDA: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+    // LAMBDA: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA: [[SIVAR_REF:%.+]] = load {{.+}} [[SIVAR_ADDR]]
+    // LAMBDA: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+     // LAMBDA: store{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA: call void [[INNER_LAMBDA:@.+]](
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+    // LAMBDA: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+    // LAMBDA: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+    // LAMBDA: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+    // LAMBDA: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+    // LAMBDA: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+    // LAMBDA: {{.+}}, label %[[CASE1:.+]]
+    // LAMBDA: {{.+}}, label %[[CASE2:.+]]
+    // LAMBDA: ]
+    // LAMBDA: [[CASE1]]:
+    // LAMBDA-64-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+    // LAMBDA-32-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]],
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+    // LAMBDA: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+    // LAMBDA: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+    // LAMBDA: br
+    // LAMBDA: [[CASE2]]:
+    // LAMBDA-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+    // LAMBDA: br
+
+    sivar += i;
+
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+
+      sivar += 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+      // LAMBDA: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], 4
+      // LAMBDA: store i{{[0-9]+}} [[SIVAR_INC]], i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+#pragma omp target teams distribute parallel for simd reduction(+: sivar)
+  for (int i = 0; i < 2; ++i) {
+    sivar += i;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[RED_VAR:@.+]] = common global [8 x {{.+}}] zeroinitializer
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+// CHECK: call void @[[OFFL1:.+]](i{{64|32}}* @{{.+}})
+// CHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// CHECK:  ret
+
+// CHECK: define{{.*}} void @[[OFFL1]](i{{64|32}}*{{.*}} [[SIVAR_ARG:%.+]])
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_ADDR]]
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}}, {{.+}} [[SIVAR_VAL]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTL1]]({{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK: [[SIVAR_REF:%.+]] = load{{.+}} [[SIVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+// CHECK: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define internal void @[[PAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[SIVAR_ARG:%.+]])
+// Skip global and bound tid vars, and prev lb and ub
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[SIVAR_ADDR:%.+]] = alloca i{{.+}},
+// skip loop vars
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[SIVAR_ARG]], {{.+}} [[SIVAR_ADDR]],
+// CHECK-64: [[SIVAR_REF:%.+]] = load {{.+}} [[SIVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[SIVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: store{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[SIVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[SIVAR_PRIV]] to
+// CHECK: store{{.+}} [[SIVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[SIVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_REF]],
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[SIVAR_INC:%.+]] = add{{.+}} [[SIVAR_VAL]], [[SIVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[SIVAR_INC]], {{.+}} [[SIVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[SIVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[SIVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[SIVAR_REF]], {{.+}} [[SIVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// CHECK: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 1,
+// CHECK: call void @[[TOFFL1:.+]]({{.+}})
+// CHECK:  ret
+
+// CHECK: define{{.*}} void @[[TOFFL1]](i{{64|32}}*{{.*}} [[TVAR_ARG:%.+]])
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_ADDR]]
+// CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[TOUTL1:.+]] to {{.+}}, {{.+}} [[TVAR_VAL]])
+// CHECK: ret void
+
+// CHECK: define internal void @[[TOUTL1]]({{.+}}, {{.+}}, {{.+}} [[TVAR_ARG:%.+]])
+// Skip global and bound tid vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// CHECK: [[TVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_REF:%.+]] = load {{.+}} [[TVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[TVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[TVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[TVAR_PRIV]] to
+// CHECK: store{{.+}} [[TVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_REF]],
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[TVAR_INC:%.+]] = add{{.+}} [[TVAR_VAL]], [[TVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[TVAR_INC]], {{.+}} [[TVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[TVAR_REF]], {{.+}} [[TVAR_PRIV_VAL]]
+// CHECK: br
+
+// CHECK: define internal void @[[TPAR_OUTL]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[TVAR_ARG:%.+]])
+// Skip global and bound tid vars, and prev lb and ub vars
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: {{.+}} = alloca i32*,
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[TVAR_ADDR:%.+]] = alloca i{{.+}},
+// skip loop vars
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: alloca i32,
+// CHECK: [[TVAR_PRIV:%.+]] = alloca i{{.+}},
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x {{.+}}],
+// CHECK: store{{.+}} [[TVAR_ARG]], {{.+}} [[TVAR_ADDR]],
+// CHECK: [[TVAR_REF:%.+]] = load {{.+}} [[TVAR_ADDR]],
+// CHECK: store{{.+}} 0, {{.+}} [[TVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: store{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: [[RED_LIST_GEP:%.+]] = getelementptr{{.+}} [[RED_LIST]],
+// CHECK: [[TVAR_PRIV_CAST:%.+]] = bitcast{{.+}} [[TVAR_PRIV]] to
+// CHECK: store{{.+}} [[TVAR_PRIV_CAST]], {{.+}} [[RED_LIST_GEP]],
+// CHECK: [[RED_LIST_BCAST:%.+]] = bitcast{{.+}} [[RED_LIST]] to
+// CHECK: [[K_RED_RET:%.+]] = call{{.+}} @__kmpc_reduce_nowait({{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}} [[RED_LIST_BCAST]], {{.+}} [[RED_FUN:@.+]], {{.+}} [[RED_VAR]])
+// CHECK: switch{{.+}} [[K_RED_RET]], label{{.+}} [
+// CHECK: {{.+}}, label %[[CASE1:.+]]
+// CHECK: {{.+}}, label %[[CASE2:.+]]
+// CHECK: ]
+// CHECK: [[CASE1]]:
+// CHECK-DAG: [[TVAR_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_REF]],
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[TVAR_INC:%.+]] = add{{.+}} [[TVAR_VAL]], [[TVAR_PRIV_VAL]]
+// CHECK: store{{.+}} [[TVAR_INC]], {{.+}} [[TVAR_REF]],
+// CHECK: call void @__kmpc_end_reduce_nowait({{.+}}, {{.+}}, {{.+}} [[RED_VAR]])
+// CHECK: br
+// CHECK: [[CASE2]]:
+// CHECK-DAG: [[TVAR_PRIV_VAL:%.+]] = load{{.+}}, {{.+}} [[TVAR_PRIV]],
+// CHECK-DAG: [[ATOMIC_RES:%.+]] = atomicrmw add{{.+}} [[TVAR_REF]], {{.+}} [[TVAR_PRIV_VAL]]
+// CHECK: br
+#endif
diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
new file mode 100644
index 000000000000..799769739d97
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
@@ -0,0 +1,400 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a[X];
+  float b;
+  // CK1: define {{.*}}i32 @{{.+}}foo{{.+}}(
+  int foo(void) {
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL1:.+]](
+    #pragma omp target teams distribute parallel for simd
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL2:.+]](
+    #pragma omp target teams distribute parallel for simd schedule(static)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL3:.+]](
+    #pragma omp target teams distribute parallel for simd schedule(static, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL4:.+]](
+    #pragma omp target teams distribute parallel for simd schedule(dynamic)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: call i32 @__tgt_target_teams(
+  // CK1: call void @[[OFFL5:.+]](
+    #pragma omp target teams distribute parallel for simd schedule(dynamic, X/2)
+    for(int i = 0; i < X; i++) {
+      a[i] = (T)0;
+    }
+
+  // CK1: define internal void @[[OFFL1]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL1:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL1]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL2]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL2:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL2]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL3]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL3:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL3]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 33,
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL4]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL4:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL4]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL4:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL4]]({{.+}})
+  // CK1: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+  // CK1: call {{.+}} @__kmpc_dispatch_next_4(
+  // CK1: ret void
+
+  // CK1: define internal void @[[OFFL5]](
+  // CK1: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTL5:.+]] to {{.+}},
+  // CK1: ret void
+
+  // CK1: define internal void @[[OUTL5]]({{.+}})
+  // CK1: call void @__kmpc_for_static_init_4(
+  // CK1: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL5:.+]] to
+  // CK1: call void @__kmpc_for_static_fini(
+  // CK1: ret void
+
+  // CK1: define internal void @[[PAR_OUTL5]]({{.+}})
+  // CK1: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+  // CK1: call {{.+}} @__kmpc_dispatch_next_4(
+  // CK1: ret void
+
+    return a[0];
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix SIMD-ONLY
+// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
+#ifdef CK2
+
+template <typename T, int n>
+int tmain(T argc) {
+  T a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for simd
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(dynamic)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(dynamic, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = (T)0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int n = 100;
+  int a[n];
+  int m = 10;
+#pragma omp target teams distribute parallel for simd
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd dist_schedule(static, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(dynamic)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+#pragma omp target teams distribute parallel for simd schedule(dynamic, m)
+  for(int i = 0; i < n; i++) {
+    a[i] = 0;
+  }
+  return tmain<int, 10>(argc);
+}
+
+// CK2: define {{.*}}i32 @{{[^,]+}}(i{{.+}}{{.+}} %[[ARGC:.+]], {{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL3:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL4:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFL5:.+]]({{.+}})
+// CK2: {{%.+}} = call{{.*}} i32 @[[TMAIN:.+]]({{.+}})
+// CK2: ret
+
+// CK2:  define {{.*}}void @[[OFFL1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define {{.*}}void @[[OFFL2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 3, {{.+}} @[[OUTL2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[OUTL3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFL4]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTL4:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL4]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL4:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL4]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+
+// CK2: define {{.*}}void @[[OFFL5]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTL5:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTL5]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL5:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTL5]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+// CK2: define {{.*}}i32 @[[TMAIN]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT1:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT2:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT3:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT4:.+]]({{.+}})
+// CK2: call i32 @__tgt_target_teams(
+// CK2: call void @[[OFFLT5:.+]]({{.+}})
+// CK2:  ret
+// CK2-NEXT: }
+
+// CK2:  define {{.*}}void @[[OFFLT1]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT1:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT1:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT1]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT2]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}} @[[OUTLT2:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT2:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT2]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 34,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT3]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT3:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT3:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT3]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 33,
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT4]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT4:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT4]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT4:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT4]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+// CK2:  define {{.*}}void @[[OFFLT5]]({{.+}})
+// CK2: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 {{.+}}, {{.+}} @[[OUTLT5:.+]] to {{.+}},
+// CK2: ret void
+
+// CK2: define internal void @[[OUTLT5]]({{.+}})
+// CK2: call void @__kmpc_for_static_init_4(
+// CK2: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTLT5:.+]] to
+// CK2: call void @__kmpc_for_static_fini(
+// CK2: ret void
+
+// CK2: define internal void @[[PAR_OUTLT5]]({{.+}})
+// CK2: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, i32 35,
+// CK2: call {{.+}} @__kmpc_dispatch_next_4(
+// CK2: ret void
+
+#endif // CK2
+#endif // #ifndef HEADER
diff --git a/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 3ddc60440513..3553aa24e6ec 100644
--- a/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -220,13 +220,13 @@ int foo(int n) {
   // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
   // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
 
+  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
+  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+  // CHECK:       [[TRY]]
   // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
   // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
   // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8
 
-  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
-  // CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-  // CHECK:       [[TRY]]
   // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT4]], i32 0, i32 0), i32 0, i32 0)
   // CHECK-DAG:   [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0
   // CHECK-DAG:   [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0
@@ -345,7 +345,8 @@ int foo(int n) {
 //
 //
 // CHECK:       define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i[[SZ]] {{[^)]+}})
-// CHECK:       alloca i16,
+// CHECK:       alloca i[[SZ]],
+// CHECK:       bitcast i[[SZ]]* {{.+}} to i16*
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
@@ -555,13 +556,13 @@ int bar(int n){
 // CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
 // CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
 
+// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
+// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
+// CHECK:       [[TRY]]
 // We capture 2 VLA sizes in this target region
 // CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2
 
-// CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
-// CHECK:       br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]]
-// CHECK:       [[TRY]]
 // CHECK-DAG:   [[RET:%.+]] = call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPT5]], i32 0, i32 0), i32 0, i32 0)
 // CHECK-DAG:   [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0
 // CHECK-DAG:   [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0
diff --git a/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp b/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp
new file mode 100644
index 000000000000..a94da30344c0
--- /dev/null
+++ b/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp
@@ -0,0 +1,261 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// SIMD-ONLY1-NOT: {{__kmpc|__tgt}}
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
+// CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* }
+// CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* }
+
+// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 }
+
+// CHECK-DAG: $[[REGFN:\.omp_offloading\..+]] = comdat
+
+// CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 0, i[[SZ]] 4]
+// CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 32, i64 288]
+// CHECK-DAG: @{{.*}} = private constant i8 0
+
+// TCHECK: @{{.+}} = constant [[ENTTY]]
+// TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]]
+// TCHECK-NOT: @{{.+}} = constant [[ENTTY]]
+
+// Check if offloading descriptor is created.
+// CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]]
+// CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]]
+// CHECK: [[DEVBEGIN:@.+]] = external constant i8
+// CHECK: [[DEVEND:@.+]] = external constant i8
+// CHECK: [[IMAGES:@.+]] = internal unnamed_addr constant [1 x [[DEVTY]]] [{{.+}} { i8* [[DEVBEGIN]], i8* [[DEVEND]], [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }], comdat($[[REGFN]])
+// CHECK: [[DESC:@.+]] = internal constant [[DSCTY]] { i32 1, [[DEVTY]]* getelementptr inbounds ([1 x [[DEVTY]]], [1 x [[DEVTY]]]* [[IMAGES]], i32 0, i32 0), [[ENTTY]]* [[ENTBEGIN]], [[ENTTY]]* [[ENTEND]] }, comdat($[[REGFN]])
+
+// Check target registration is registered as a Ctor.
+// CHECK: appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* bitcast (void (i8*)* @[[REGFN]] to void ()*), i8* bitcast (void (i8*)* @[[REGFN]] to i8*) }]
+
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int global;
+extern int global;
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  static long *plocal;
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID:@.+]], i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 3
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute simd device(global + a) depend(in: global) depend(out: a, b, cn[4])
+  for (int i = 0; i < 10; ++i) {
+  }
+
+  // CHECK:       [[ADD:%.+]] = add nsw i32
+  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+
+  // CHECK:       [[BOOL:%.+]] = icmp ne i32 %{{.+}}, 0
+  // CHECK:       br i1 [[BOOL]], label %[[THEN:.+]], label %[[ELSE:.+]]
+  // CHECK:       [[THEN]]:
+  // CHECK-DAG:   [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%.+]], i32 0, i32 0
+  // CHECK-DAG:   [[CBPADDR0:%.+]] = bitcast i8** [[BPADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   [[CPADDR0:%.+]] = bitcast i8** [[PADDR0]] to i[[SZ]]**
+  // CHECK-DAG:   store i[[SZ]]* [[BP0:%[^,]+]], i[[SZ]]** [[CBPADDR0]]
+  // CHECK-DAG:   store i[[SZ]]* [[BP0]], i[[SZ]]** [[CPADDR0]]
+
+  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
+  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
+  // CHECK-DAG:   [[CBPADDR1:%.+]] = bitcast i8** [[BPADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   [[CPADDR1:%.+]] = bitcast i8** [[PADDR1]] to i[[SZ]]*
+  // CHECK-DAG:   store i[[SZ]] [[BP1:%[^,]+]], i[[SZ]]* [[CBPADDR1]]
+  // CHECK-DAG:   store i[[SZ]] [[BP1]], i[[SZ]]* [[CPADDR1]]
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0
+  // CHECK-DAG:   getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{104|52}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+
+  // CHECK:       [[ELSE]]:
+  // CHECK-NOT:   getelementptr inbounds [2 x i8*], [2 x i8*]*
+  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 1
+  // CHECK:       getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 2
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       br label %[[EXIT:.+]]
+  // CHECK:       [[EXIT]]:
+
+  #pragma omp target teams distribute simd device(global + a) nowait depend(inout: global, a, bn) if(a)
+  for (int i = 0; i < *plocal; ++i) {
+    static int local1;
+    *plocal = global;
+    local1 = global;
+  }
+
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[ID]], i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+  // CHECK:       getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+  // CHECK:       [[DEP_START:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* %{{.+}}, i32 0, i32 0
+  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+  // CHECK:       call void @__kmpc_omp_wait_deps(%ident_t* [[ID]], i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call void @__kmpc_omp_task_begin_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+  // CHECK:       call void @__kmpc_omp_task_complete_if0(%ident_t* [[ID]], i32 [[GTID]], i8* [[TASK]])
+  #pragma omp target teams distribute simd if(0) firstprivate(global) depend(out:global)
+  for (int i = 0; i < global; ++i) {
+    global += 1;
+  }
+
+  return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+// CHECK:       define internal void [[HVT0:@.+]]()
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY0]](i32{{.*}}, [[TASK_TY0]]* noalias)
+// CHECK:       store void (i8*, ...)* null, void (i8*, ...)** %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i64* null, i32 0, i32 0)
+// CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       call void [[HVT0]]()
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT1:@.+]](i[[SZ]]* %{{.+}}, i[[SZ]] %{{.+}})
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1_]](i32{{.*}}, [[TASK_TY1_]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[SZT:%.+]] = getelementptr inbounds [2 x i[[SZ]]], [2 x i[[SZ]]]* %{{.+}}, i[[SZ]] 0, i[[SZ]] 0
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       [[DEVICE:%.+]] = sext i32 [[DEV]] to i64
+// CHECK:       [[RET:%.+]] = call i32 @__tgt_target_teams_nowait(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SZT]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT]], i32 0, i32 0), i32 0, i32 0)
+
+// CHECK:       [[ERROR:%.+]] = icmp ne i32 [[RET]], 0
+// CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
+// CHECK:       [[FAIL]]
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK-NEXT:  br label %[[END]]
+// CHECK:       [[END]]
+// CHECK:       ret i32 0
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY1__]](i32{{.*}}, [[TASK_TY1__]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[DEVICE_CAP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 2
+// CHECK:       [[BP0:%.+]] = load i[[SZ]]*, i[[SZ]]** %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT1]](i[[SZ]]* [[BP0]], i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+// CHECK:       define internal void [[HVT2:@.+]](i[[SZ]] %{{.+}})
+// Create stack storage and store argument in there.
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align
+
+// CHECK:       define internal{{.*}} i32 [[TASK_ENTRY2]](i32{{.*}}, [[TASK_TY2]]* noalias)
+// CHECK:       call void (i8*, ...) %
+// CHECK:       [[BP1_I32:%.+]] = load i32, i32* %
+// CHECK-64:    [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64:    store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32:    store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+// CHECK:       [[BP1:%.+]] = load i[[SZ]], i[[SZ]]* [[BP1_PTR]],
+// CHECK:       call void [[HVT2]](i[[SZ]] [[BP1]])
+// CHECK:       ret i32 0
+
+
+#endif
diff --git a/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
index 8074fba7360e..34cf1f2f9761 100644
--- a/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
@@ -110,16 +110,7 @@ int main() {
     // LAMBDA: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: {{%.+}} = alloca i{{[0-9]+}},
-    // LAMBDA: [[G_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
-    // LAMBDA: [[SIVAR_PRIV_ADDR:%.+]] = alloca i{{[0-9]+}},
     // skip loop vars
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
@@ -127,15 +118,15 @@ int main() {
     // LAMBDA-DAG: [[G_CONV:%.+]] = bitcast {{.+}} [[G_ADDR]] to
     // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}} [[G1_ADDR]] to
     // LAMBDA-DAG: [[SIVAR_CONV:%.+]] = bitcast {{.+}} [[SIVAR_ADDR]] to
-    // LAMBDA-DAG: store{{.+}} [[G1_PRIV_ADDR]], {{.+}} [[G1_TMP]],
+    // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
     g = 1;
     g1 = 1;
     sivar = 2;
     // LAMBDA: call void @__kmpc_for_static_init_4(
-    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV_ADDR]],
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
     // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
-    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV_ADDR]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
     // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1_REF]],
     // LAMBDA: call void [[INNER_LAMBDA:@.+]](
@@ -236,7 +227,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -322,7 +313,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
index e6f7157d6f99..fdf6d2f3a7dc 100644
--- a/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
@@ -260,7 +260,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -274,7 +274,7 @@ int main() {
 // CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -285,7 +285,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -294,7 +294,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK-64: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK-32: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR]],
@@ -362,7 +362,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -376,7 +376,7 @@ int main() {
 // CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -387,7 +387,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -396,6 +396,6 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 #endif
diff --git a/test/OpenMP/target_teams_distribute_simd_messages.cpp b/test/OpenMP/target_teams_distribute_simd_messages.cpp
index adef1feb3b56..6ee8073357f4 100644
--- a/test/OpenMP/target_teams_distribute_simd_messages.cpp
+++ b/test/OpenMP/target_teams_distribute_simd_messages.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
 // expected-error@+1 {{unexpected OpenMP clause 'default' in directive '#pragma omp target teams distribute simd'}}
 #pragma omp target teams distribute simd default(none)
   for (int i = 0; i < 10; ++i)
-    ++argc; // expected-error {{ariable 'argc' must have explicitly specified data sharing attributes}}
+    ++argc;
 
   goto L2; // expected-error {{use of undeclared label 'L2'}}
 #pragma omp target teams distribute simd
diff --git a/test/OpenMP/target_update_depend_codegen.cpp b/test/OpenMP/target_update_depend_codegen.cpp
index 11a0c9c07e14..550d31f9de6c 100644
--- a/test/OpenMP/target_update_depend_codegen.cpp
+++ b/test/OpenMP/target_update_depend_codegen.cpp
@@ -68,19 +68,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -124,19 +124,19 @@ void foo(int arg) {
   // CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
   // CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
   // CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
   // CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
@@ -198,15 +198,15 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
   // CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
@@ -272,14 +272,14 @@ void foo(int arg) {
   // CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
   // CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
   // CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
   // CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
   // CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
   // CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
   // CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
-  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+  // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
   // CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
   // CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
   // CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
diff --git a/test/OpenMP/task_codegen.cpp b/test/OpenMP/task_codegen.cpp
index 03abbeb66308..fefe8b4329c2 100644
--- a/test/OpenMP/task_codegen.cpp
+++ b/test/OpenMP/task_codegen.cpp
@@ -38,7 +38,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[BITCAST]], i64 16, i1 false)
 // CHECK: [[PRIORITY_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 4
 // CHECK: [[PRIORITY:%.+]] = bitcast %union{{.+}}* [[PRIORITY_REF_PTR]] to i32*
 // CHECK: store i32 {{.+}}, i32* [[PRIORITY]]
@@ -55,7 +55,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS1]]* [[CAPTURES]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 8, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[BITCAST]], i64 8, i1 false)
 // CHECK: [[DEP:%.*]] = getelementptr inbounds [4 x [[KMP_DEPEND_INFO]]], [4 x [[KMP_DEPEND_INFO]]]* [[DEPENDENCIES:%.*]], i64 0, i64 0
 // CHECK: [[T0:%.*]] = getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* [[DEP]], i32 0, i32 0
 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64* [[T0]]
diff --git a/test/OpenMP/task_firstprivate_codegen.cpp b/test/OpenMP/task_firstprivate_codegen.cpp
index aedbb98dfdb6..09b1deabde40 100644
--- a/test/OpenMP/task_firstprivate_codegen.cpp
+++ b/test/OpenMP/task_firstprivate_codegen.cpp
@@ -207,7 +207,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // Also copy address of private copy to the corresponding shareds reference.
@@ -361,7 +361,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
diff --git a/test/OpenMP/taskgroup_task_reduction_codegen.cpp b/test/OpenMP/taskgroup_task_reduction_codegen.cpp
index 9885a01d97d5..29c1f46c8afc 100644
--- a/test/OpenMP/taskgroup_task_reduction_codegen.cpp
+++ b/test/OpenMP/taskgroup_task_reduction_codegen.cpp
@@ -57,14 +57,14 @@ int main(int argc, char **argv) {
 // CHECK-DAG:   [[TMP6:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 1
 // CHECK-DAG:   store i64 4, i64* [[TMP6]],
 // CHECK-DAG:   [[TMP7:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 2
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[AINIT:@.+]] to i8*), i8** [[TMP7]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[AINIT:.+]] to i8*), i8** [[TMP7]],
 // CHECK-DAG:   [[TMP8:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 3
 // CHECK-DAG:   store i8* null, i8** [[TMP8]],
 // CHECK-DAG:   [[TMP9:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 4
-// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* [[ACOMB:@.+]] to i8*), i8** [[TMP9]],
+// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* @[[ACOMB:.+]] to i8*), i8** [[TMP9]],
 // CHECK-DAG:   [[TMP10:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 5
 // CHECK-DAG:   [[TMP11:%.+]] = bitcast i32* [[TMP10]] to i8*
-// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* [[TMP11]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP11]], i8 0, i64 4, i1 false)
 // CHECK-DAG:   [[TMP13:%.+]] = bitcast float* [[B]] to i8*
 // CHECK-DAG:   store i8* [[TMP13]], i8** [[TMP12:%[^,]+]],
 // CHECK-DAG:   [[TMP12]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB:%[^,]+]], i32 0, i32 0
@@ -72,14 +72,14 @@ int main(int argc, char **argv) {
 // CHECK-DAG:   [[TMP14:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 1
 // CHECK-DAG:   store i64 4, i64* [[TMP14]],
 // CHECK-DAG:   [[TMP15:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 2
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[BINIT:@.+]] to i8*), i8** [[TMP15]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[BINIT:.+]] to i8*), i8** [[TMP15]],
 // CHECK-DAG:   [[TMP16:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 3
 // CHECK-DAG:   store i8* null, i8** [[TMP16]],
 // CHECK-DAG:   [[TMP17:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 4
-// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* [[BCOMB:@.+]] to i8*), i8** [[TMP17]],
+// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* @[[BCOMB:.+]] to i8*), i8** [[TMP17]],
 // CHECK-DAG:   [[TMP18:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 5
 // CHECK-DAG:   [[TMP19:%.+]] = bitcast i32* [[TMP18]] to i8*
-// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* [[TMP19]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP19]], i8 0, i64 4, i1 false)
 // CHECK-DAG:   [[TMP21:%.+]] = bitcast i32* [[ARGC_ADDR]] to i8*
 // CHECK-DAG:   store i8* [[TMP21]], i8** [[TMP20:%[^,]+]],
 // CHECK-DAG:   [[TMP20]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC:%[^,]+]], i32 0, i32 0
@@ -87,14 +87,14 @@ int main(int argc, char **argv) {
 // CHECK-DAG:   [[TMP22:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 1
 // CHECK-DAG:   store i64 4, i64* [[TMP22]],
 // CHECK-DAG:   [[TMP23:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 2
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[ARGCINIT:@.+]] to i8*), i8** [[TMP23]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[ARGCINIT:.+]] to i8*), i8** [[TMP23]],
 // CHECK-DAG:   [[TMP24:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 3
 // CHECK-DAG:   store i8* null, i8** [[TMP24]],
 // CHECK-DAG:   [[TMP25:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 4
-// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* [[ARGCCOMB:@.+]] to i8*), i8** [[TMP25]],
+// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* @[[ARGCCOMB:.+]] to i8*), i8** [[TMP25]],
 // CHECK-DAG:   [[TMP26:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 5
 // CHECK-DAG:   [[TMP27:%.+]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
 // CHECK-DAG:   [[TMP28:%.+]] = bitcast [3 x [[T1]]]* [[RD_IN1]] to i8*
 // CHECK-DAG:   [[TMP29:%.+]] = call i8* @__kmpc_task_reduction_init(i32 [[GTID]], i32 3, i8* [[TMP28]])
 // DEBUG-DAG:   call void @llvm.dbg.declare(metadata i8** [[TD1]],
@@ -107,14 +107,14 @@ int main(int argc, char **argv) {
 // CHECK-DAG:   [[TMP32:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 1
 // CHECK-DAG:   store i64 20, i64* [[TMP32]],
 // CHECK-DAG:   [[TMP33:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 2
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[CINIT:@.+]] to i8*), i8** [[TMP33]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[CINIT:.+]] to i8*), i8** [[TMP33]],
 // CHECK-DAG:   [[TMP34:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 3
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[CFINI:@.+]] to i8*), i8** [[TMP34]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[CFINI:.+]] to i8*), i8** [[TMP34]],
 // CHECK-DAG:   [[TMP35:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 4
-// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* [[CCOMB:@.+]] to i8*), i8** [[TMP35]],
+// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* @[[CCOMB:.+]] to i8*), i8** [[TMP35]],
 // CHECK-DAG:   [[TMP36:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 5
 // CHECK-DAG:   [[TMP37:%.+]] = bitcast i32* [[TMP36]] to i8*
-// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* [[TMP37]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:   call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP37]], i8 0, i64 4, i1 false)
 // CHECK-DAG:   [[TMP39:%.+]] = bitcast i16* [[VLA]] to i8*
 // CHECK-DAG:   store i8* [[TMP39]], i8** [[TMP38:%[^,]+]],
 // CHECK-DAG:   [[TMP38]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA:%[^,]+]], i32 0, i32 0
@@ -124,11 +124,11 @@ int main(int argc, char **argv) {
 // CHECK-DAG:   [[TMP42:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA]], i32 0, i32 1
 // CHECK-DAG:   store i64 [[TMP40]], i64* [[TMP42]],
 // CHECK-DAG:   [[TMP43:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA]], i32 0, i32 2
-// CHECK-DAG:   store i8* bitcast (void (i8*)* [[VLAINIT:@.+]] to i8*), i8** [[TMP43]],
+// CHECK-DAG:   store i8* bitcast (void (i8*)* @[[VLAINIT:.+]] to i8*), i8** [[TMP43]],
 // CHECK-DAG:   [[TMP44:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA]], i32 0, i32 3
 // CHECK-DAG:   store i8* null, i8** [[TMP44]],
 // CHECK-DAG:   [[TMP45:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA]], i32 0, i32 4
-// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* [[VLACOMB:@.+]] to i8*), i8** [[TMP45]],
+// CHECK-DAG:   store i8* bitcast (void (i8*, i8*)* @[[VLACOMB:.+]] to i8*), i8** [[TMP45]],
 // CHECK-DAG:   [[TMP46:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA]], i32 0, i32 5
 // CHECK-DAG:   store i32 1, i32* [[TMP46]],
 // CHECK:       [[TMP47:%.+]] = bitcast [2 x [[T2]]]* [[RD_IN2]] to i8*
@@ -137,54 +137,54 @@ int main(int argc, char **argv) {
 // CHECK:       call void @__kmpc_end_taskgroup(%ident_t* {{[^,]+}}, i32 [[GTID]])
 // CHECK:       call void @__kmpc_end_taskgroup(%ident_t* {{[^,]+}}, i32 [[GTID]])
 
-// CHECK-DAG: define internal void [[AINIT]](i8*)
+// CHECK-DAG: define internal void @[[AINIT]](i8*)
 // CHECK-DAG: store i32 0, i32* %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[ACOMB]](i8*, i8*)
+// CHECK-DAG: define internal void @[[ACOMB]](i8*, i8*)
 // CHECK-DAG: add nsw i32 %
 // CHECK-DAG: store i32 %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[BINIT]](i8*)
+// CHECK-DAG: define internal void @[[BINIT]](i8*)
 // CHECK-DAG: store float 0.000000e+00, float* %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[BCOMB]](i8*, i8*)
+// CHECK-DAG: define internal void @[[BCOMB]](i8*, i8*)
 // CHECK-DAG: fadd float %
 // CHECK-DAG: store float %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[ARGCINIT]](i8*)
+// CHECK-DAG: define internal void @[[ARGCINIT]](i8*)
 // CHECK-DAG: store i32 0, i32* %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[ARGCCOMB]](i8*, i8*)
+// CHECK-DAG: define internal void @[[ARGCCOMB]](i8*, i8*)
 // CHECK-DAG: add nsw i32 %
 // CHECK-DAG: store i32 %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[CINIT]](i8*)
+// CHECK-DAG: define internal void @[[CINIT]](i8*)
 // CHECK-DAG: phi %struct.S* [
 // CHECK-DAG: call {{.+}}(%struct.S* {{.+}})
 // CHECK-DAG: br i1 %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[CFINI]](i8*)
+// CHECK-DAG: define internal void @[[CFINI]](i8*)
 // CHECK-DAG: phi %struct.S* [
 // CHECK-DAG: call {{.+}}(%struct.S* {{.+}})
 // CHECK-DAG: br i1 %
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[CCOMB]](i8*, i8*)
+// CHECK-DAG: define internal void @[[CCOMB]](i8*, i8*)
 // CHECK-DAG: phi %struct.S* [
 // CHECK-DAG: phi %struct.S* [
 // CHECK-DAG: call {{.+}}(%struct.S* {{.+}}, %struct.S* {{.+}}, %struct.S* {{.+}})
@@ -194,7 +194,7 @@ int main(int argc, char **argv) {
 // CHECK-DAG: ret void
 // CHECK_DAG: }
 
-// CHECK-DAG: define internal void [[VLAINIT]](i8*)
+// CHECK-DAG: define internal void @[[VLAINIT]](i8*)
 // CHECK-DAG: call i32 @__kmpc_global_thread_num(%ident_t* {{[^,]+}})
 // CHECK-DAG: call i8* @__kmpc_threadprivate_cached(%ident_t*
 // CHECK-DAG: phi i16* [
@@ -203,7 +203,7 @@ int main(int argc, char **argv) {
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 
-// CHECK-DAG: define internal void [[VLACOMB]](i8*, i8*)
+// CHECK-DAG: define internal void @[[VLACOMB]](i8*, i8*)
 // CHECK-DAG: call i32 @__kmpc_global_thread_num(%ident_t* {{[^,]+}})
 // CHECK-DAG: call i8* @__kmpc_threadprivate_cached(%ident_t*
 // CHECK-DAG: phi i16* [
@@ -216,3 +216,16 @@ int main(int argc, char **argv) {
 // CHECK-DAG: ret void
 // CHECK-DAG: }
 #endif
+
+// DEBUG-LABEL: distinct !DICompileUnit
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[AINIT]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[ACOMB]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[BINIT]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[BCOMB]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[ARGCINIT]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[ARGCCOMB]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[CINIT]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[CFINI]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[CCOMB]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[VLAINIT]]",
+// DEBUG-DAG: distinct !DISubprogram(linkageName: "[[VLACOMB]]",
diff --git a/test/OpenMP/taskloop_firstprivate_codegen.cpp b/test/OpenMP/taskloop_firstprivate_codegen.cpp
index dec481a6da84..d617835299cc 100644
--- a/test/OpenMP/taskloop_firstprivate_codegen.cpp
+++ b/test/OpenMP/taskloop_firstprivate_codegen.cpp
@@ -206,7 +206,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // Also copy address of private copy to the corresponding shareds reference.
@@ -376,7 +376,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
diff --git a/test/OpenMP/taskloop_lastprivate_codegen.cpp b/test/OpenMP/taskloop_lastprivate_codegen.cpp
index 45463a91a68a..a6f7abcc338b 100644
--- a/test/OpenMP/taskloop_lastprivate_codegen.cpp
+++ b/test/OpenMP/taskloop_lastprivate_codegen.cpp
@@ -198,7 +198,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // Also copy address of private copy to the corresponding shareds reference.
@@ -287,18 +287,18 @@ int main() {
 // CHECK-NEXT: br i1
 // CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
 // CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: load i32, i32* %
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
 // CHECK: phi [[S_DOUBLE_TY]]*
 // CHECK: phi [[S_DOUBLE_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: icmp eq [[S_DOUBLE_TY]]* %
 // CHECK-NEXT: br i1
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: load i32, i32* %
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: br label
@@ -372,7 +372,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
@@ -453,16 +453,16 @@ int main() {
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
 // CHECK: phi [[S_INT_TY]]*
 // CHECK: phi [[S_INT_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: icmp eq [[S_INT_TY]]* %
 // CHECK-NEXT: br i1
 // CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: br label
 // CHECK: ret
 
diff --git a/test/OpenMP/taskloop_loop_messages.cpp b/test/OpenMP/taskloop_loop_messages.cpp
index 00c7d716b966..4ce83b4ae1ae 100644
--- a/test/OpenMP/taskloop_loop_messages.cpp
+++ b/test/OpenMP/taskloop_loop_messages.cpp
@@ -294,10 +294,8 @@ int test_iteration_spaces() {
     c[ii] = a[ii];
 
 #pragma omp parallel
-// expected-error@+2 {{unexpected OpenMP clause 'linear' in directive '#pragma omp taskloop'}}
-// expected-note@+1 {{defined as linear}}
+// expected-error@+1 {{unexpected OpenMP clause 'linear' in directive '#pragma omp taskloop'}}
 #pragma omp taskloop linear(ii)
-// expected-error@+1 {{loop iteration variable in the associated loop of 'omp taskloop' directive may not be linear, predetermined as private}}
   for (ii = 0; ii < 10; ii++)
     c[ii] = a[ii];
 
diff --git a/test/OpenMP/taskloop_reduction_codegen.cpp b/test/OpenMP/taskloop_reduction_codegen.cpp
index a7941ff36279..8abd0175b406 100644
--- a/test/OpenMP/taskloop_reduction_codegen.cpp
+++ b/test/OpenMP/taskloop_reduction_codegen.cpp
@@ -72,14 +72,14 @@ sum = 0.0;
 // CHECK-DAG:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 1
 // CHECK-DAG:    store i64 4, i64* [[TMP22]],
 // CHECK-DAG:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT1:@.+]] to i8*), i8** [[TMP23]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT1:.+]] to i8*), i8** [[TMP23]],
 // CHECK-DAG:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP24]],
 // CHECK-DAG:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB1:@.+]] to i8*), i8** [[TMP25]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB1:.+]] to i8*), i8** [[TMP25]],
 // CHECK-DAG:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
 // CHECK-DAG:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
 // CHECK-DAG:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
 // CHECK-DAG:    [[LB_ADD_LEN:%.*]] = add nsw i64 -1, %
 // CHECK-DAG:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
@@ -95,11 +95,11 @@ sum = 0.0;
 // CHECK-DAG:    store i64 [[TMP37]], i64* [[TMP38:%[^,]+]],
 // CHECK-DAG:    [[TMP38]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 1
 // CHECK-DAG:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT2:@.+]] to i8*), i8** [[TMP39]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT2:.+]] to i8*), i8** [[TMP39]],
 // CHECK-DAG:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 3
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_FINI2:@.+]] to i8*), i8** [[TMP40]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_FINI2:.+]] to i8*), i8** [[TMP40]],
 // CHECK-DAG:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB2:@.+]] to i8*), i8** [[TMP41]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB2:.+]] to i8*), i8** [[TMP41]],
 // CHECK-DAG:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 5
 // CHECK-DAG:    store i32 1, i32* [[TMP42]],
 // CHECK-DAG:    [[TMP44:%.*]] = load float*, float** [[D]],
@@ -109,14 +109,14 @@ sum = 0.0;
 // CHECK-DAG:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 1
 // CHECK-DAG:    store i64 4, i64* [[TMP46]],
 // CHECK-DAG:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT3:@.+]] to i8*), i8** [[TMP47]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT3:.+]] to i8*), i8** [[TMP47]],
 // CHECK-DAG:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP48]],
 // CHECK-DAG:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB3:@.+]] to i8*), i8** [[TMP49]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB3:.+]] to i8*), i8** [[TMP49]],
 // CHECK-DAG:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
 // CHECK-DAG:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
-// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP51]], i8 0, i64 4, i1 false)
 // CHECK-DAG:    [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
 // CHECK-DAG:    store i8* [[TMP53]], i8** [[TMP52:%[^,]+]],
 // CHECK-DAG:    [[TMP52]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8:%.+]], i32 0, i32 0
@@ -125,11 +125,11 @@ sum = 0.0;
 // CHECK-DAG:    store i64 [[TMP54]], i64* [[TMP56:%[^,]+]],
 // CHECK-DAG:    [[TMP56]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 1
 // CHECK-DAG:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT4:@.+]] to i8*), i8** [[TMP57]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT4:.+]] to i8*), i8** [[TMP57]],
 // CHECK-DAG:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP58]],
 // CHECK-DAG:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB4:@.+]] to i8*), i8** [[TMP59]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB4:.+]] to i8*), i8** [[TMP59]],
 // CHECK-DAG:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 5
 // CHECK-DAG:    store i32 1, i32* [[TMP60]],
 // CHECK-DAG:    [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64
@@ -153,50 +153,59 @@ sum = 0.0;
 
 // CHECK:    ret i32
 
-// CHECK: define internal void [[RED_INIT1]](i8*)
+// CHECK: define internal void @[[RED_INIT1]](i8*)
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB1]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB1]](i8*, i8*)
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT2]](i8*)
+// CHECK: define internal void @[[RED_INIT2]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_FINI2]](i8*)
+// CHECK: define internal void @[[RED_FINI2]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call void @
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB2]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB2]](i8*, i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT3]](i8*)
+// CHECK: define internal void @[[RED_INIT3]](i8*)
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB3]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB3]](i8*, i8*)
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT4]](i8*)
+// CHECK: define internal void @[[RED_INIT4]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB4]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB4]](i8*, i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: distinct !DISubprogram(linkageName: "[[TASK]]", scope: !
+// CHECK-DAG: distinct !DISubprogram(linkageName: "[[TASK]]", scope: !
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT1]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB1]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_FINI2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT3]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB3]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT4]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB4]]"
diff --git a/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
index 0f13ea1803a5..4f406b2a583a 100644
--- a/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
@@ -206,7 +206,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // Also copy address of private copy to the corresponding shareds reference.
@@ -376,7 +376,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
diff --git a/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
index e9c54853bb35..be484465e3ae 100644
--- a/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
@@ -198,7 +198,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // Also copy address of private copy to the corresponding shareds reference.
@@ -287,18 +287,18 @@ int main() {
 // CHECK-NEXT: br i1
 // CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
 // CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: load i32, i32* %
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
 // CHECK: phi [[S_DOUBLE_TY]]*
 // CHECK: phi [[S_DOUBLE_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: icmp eq [[S_DOUBLE_TY]]* %
 // CHECK-NEXT: br i1
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: load i32, i32* %
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: br label
@@ -372,7 +372,7 @@ int main() {
 // CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
 // CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
 
 // Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
 // CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
@@ -453,16 +453,16 @@ int main() {
 // CHECK: store i32 %{{.+}}, i32* %
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
 // CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
 // CHECK: phi [[S_INT_TY]]*
 // CHECK: phi [[S_INT_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: icmp eq [[S_INT_TY]]* %
 // CHECK-NEXT: br i1
 // CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
 // CHECK: br label
 // CHECK: ret
 
diff --git a/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/test/OpenMP/taskloop_simd_reduction_codegen.cpp
index 8fa364a63153..4bf460a98bfb 100644
--- a/test/OpenMP/taskloop_simd_reduction_codegen.cpp
+++ b/test/OpenMP/taskloop_simd_reduction_codegen.cpp
@@ -72,14 +72,14 @@ sum = 0.0;
 // CHECK-DAG:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 1
 // CHECK-DAG:    store i64 4, i64* [[TMP22]],
 // CHECK-DAG:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT1:@.+]] to i8*), i8** [[TMP23]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT1:.+]] to i8*), i8** [[TMP23]],
 // CHECK-DAG:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP24]],
 // CHECK-DAG:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB1:@.+]] to i8*), i8** [[TMP25]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB1:.+]] to i8*), i8** [[TMP25]],
 // CHECK-DAG:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
 // CHECK-DAG:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
 // CHECK-DAG:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
 // CHECK-DAG:    [[LB_ADD_LEN:%.*]] = add nsw i64 -1, %
 // CHECK-DAG:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
@@ -95,11 +95,11 @@ sum = 0.0;
 // CHECK-DAG:    store i64 [[TMP37]], i64* [[TMP38:%[^,]+]],
 // CHECK-DAG:    [[TMP38]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 1
 // CHECK-DAG:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT2:@.+]] to i8*), i8** [[TMP39]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT2:.+]] to i8*), i8** [[TMP39]],
 // CHECK-DAG:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 3
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_FINI2:@.+]] to i8*), i8** [[TMP40]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_FINI2:.+]] to i8*), i8** [[TMP40]],
 // CHECK-DAG:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB2:@.+]] to i8*), i8** [[TMP41]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB2:.+]] to i8*), i8** [[TMP41]],
 // CHECK-DAG:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 5
 // CHECK-DAG:    store i32 1, i32* [[TMP42]],
 // CHECK-DAG:    [[TMP44:%.*]] = load float*, float** [[D]],
@@ -109,14 +109,14 @@ sum = 0.0;
 // CHECK-DAG:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 1
 // CHECK-DAG:    store i64 4, i64* [[TMP46]],
 // CHECK-DAG:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT3:@.+]] to i8*), i8** [[TMP47]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT3:.+]] to i8*), i8** [[TMP47]],
 // CHECK-DAG:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP48]],
 // CHECK-DAG:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB3:@.+]] to i8*), i8** [[TMP49]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB3:.+]] to i8*), i8** [[TMP49]],
 // CHECK-DAG:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
 // CHECK-DAG:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
-// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP51]], i8 0, i64 4, i1 false)
 // CHECK-DAG:    [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
 // CHECK-DAG:    store i8* [[TMP53]], i8** [[TMP52:%[^,]+]],
 // CHECK-DAG:    [[TMP52]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8:%.+]], i32 0, i32 0
@@ -125,11 +125,11 @@ sum = 0.0;
 // CHECK-DAG:    store i64 [[TMP54]], i64* [[TMP56:%[^,]+]],
 // CHECK-DAG:    [[TMP56]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 1
 // CHECK-DAG:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 2
-// CHECK-DAG:    store i8* bitcast (void (i8*)* [[RED_INIT4:@.+]] to i8*), i8** [[TMP57]],
+// CHECK-DAG:    store i8* bitcast (void (i8*)* @[[RED_INIT4:.+]] to i8*), i8** [[TMP57]],
 // CHECK-DAG:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 3
 // CHECK-DAG:    store i8* null, i8** [[TMP58]],
 // CHECK-DAG:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 4
-// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB4:@.+]] to i8*), i8** [[TMP59]],
+// CHECK-DAG:    store i8* bitcast (void (i8*, i8*)* @[[RED_COMB4:.+]] to i8*), i8** [[TMP59]],
 // CHECK-DAG:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 5
 // CHECK-DAG:    store i32 1, i32* [[TMP60]],
 // CHECK-DAG:    [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64
@@ -153,49 +153,58 @@ sum = 0.0;
 
 // CHECK:    ret i32
 
-// CHECK: define internal void [[RED_INIT1]](i8*)
+// CHECK: define internal void @[[RED_INIT1]](i8*)
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB1]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB1]](i8*, i8*)
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT2]](i8*)
+// CHECK: define internal void @[[RED_INIT2]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_FINI2]](i8*)
+// CHECK: define internal void @[[RED_FINI2]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: call void @
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB2]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB2]](i8*, i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT3]](i8*)
+// CHECK: define internal void @[[RED_INIT3]](i8*)
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB3]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB3]](i8*, i8*)
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_INIT4]](i8*)
+// CHECK: define internal void @[[RED_INIT4]](i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: store float 0.000000e+00, float* %
 // CHECK: ret void
 
-// CHECK: define internal void [[RED_COMB4]](i8*, i8*)
+// CHECK: define internal void @[[RED_COMB4]](i8*, i8*)
 // CHECK: call i8* @__kmpc_threadprivate_cached(
 // CHECK: fadd float %
 // CHECK: store float %{{.+}}, float* %
 // CHECK: ret void
 
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT1]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB1]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_FINI2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB2]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT3]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB3]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_INIT4]]"
+// CHECK-DAG: !DISubprogram(linkageName: "[[RED_COMB4]]"
diff --git a/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
index e677894ac4c9..b603006dac91 100644
--- a/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
@@ -230,7 +230,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -316,7 +316,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
index b5febdf1aa40..33072d2683c9 100644
--- a/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
@@ -247,7 +247,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -260,7 +260,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -271,7 +271,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -280,7 +280,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -347,7 +347,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -360,7 +360,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -371,7 +371,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -380,6 +380,6 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 #endif
diff --git a/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
index 796642707dff..9042955a06c0 100644
--- a/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -140,16 +140,6 @@ int main() {
     // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
     // skip loop vars
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_TMP_PRIV:%.+]] = alloca i{{[0-9]+}}*,
-    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
@@ -159,10 +149,10 @@ int main() {
     // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
 
     // use of private vars
-    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV]],
-    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP_PRIV]]
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
+    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
-    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
     // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
     // LAMBDA: call void [[INNER_LAMBDA:@.+]](
     // LAMBDA: call void @__kmpc_for_static_fini(
@@ -264,7 +254,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -330,7 +320,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -415,7 +405,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
@@ -476,7 +466,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
index b40248dc6b30..e06d10c58a29 100644
--- a/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -304,7 +304,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -315,7 +315,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -324,7 +324,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -381,7 +381,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -395,7 +395,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -406,7 +406,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -415,7 +415,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -484,7 +484,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -495,7 +495,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -504,7 +504,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 // CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}*{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
@@ -560,7 +560,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -574,7 +574,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -585,7 +585,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -594,7 +594,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 #endif
diff --git a/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 1b996b38ffe0..69c9eebb4570 100644
--- a/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -140,16 +140,6 @@ int main() {
     // LAMBDA: [[SIVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[G1_TMP:%.+]] = alloca i32*,
     // skip loop vars
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: alloca i32,
-    // LAMBDA: [[G_PRIV:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_PRIV:%.+]] = alloca i{{[0-9]+}},
-    // LAMBDA: [[G1_TMP_PRIV:%.+]] = alloca i{{[0-9]+}}*,
-    // LAMBDA: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_ADDR]],
     // LAMBDA-DAG: store {{.+}}, {{.+}} [[SIVAR_ADDR]],
@@ -159,10 +149,10 @@ int main() {
     // LAMBDA-DAG: store{{.+}} [[G1_CONV]], {{.+}} [[G1_TMP]],
 
     // use of private vars
-    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_PRIV]],
-    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP_PRIV]]
+    // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G_CONV]],
+    // LAMBDA-DAG: [[G1:%.+]] = load{{.+}}, {{.+}}* [[G1_TMP]]
     // LAMBDA-DAG: store{{.+}} 1, {{.+}} [[G1]],
-    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_PRIV]],
+    // LAMBDA-DAG: store{{.+}} 2, {{.+}} [[SIVAR_CONV]],
     // LAMBDA-DAG: [[G1_REF:%.+]] = load{{.+}}, {{.+}} [[G1_TMP]],
     // LAMBDA: call void [[INNER_LAMBDA:@.+]](
     // LAMBDA: call void @__kmpc_for_static_fini(
@@ -267,7 +257,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -333,7 +323,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -418,7 +408,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
@@ -479,7 +469,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 8f1a644e4d7a..c524652d3e5f 100644
--- a/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -311,7 +311,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -322,7 +322,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -331,7 +331,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -388,7 +388,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -405,7 +405,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -416,7 +416,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -425,7 +425,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -494,7 +494,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -505,7 +505,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -514,7 +514,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 // CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}*{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
@@ -570,7 +570,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -587,7 +587,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -598,7 +598,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -607,7 +607,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 
 // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
index ade1ecf12007..a613f1edc917 100644
--- a/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
@@ -230,7 +230,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] =  bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
@@ -316,7 +316,7 @@ int main() {
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
 // CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
 // CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
 // CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
diff --git a/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
index 0e0d74614ec0..b337c0a48f36 100644
--- a/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
+++ b/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
@@ -248,7 +248,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -261,7 +261,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -272,7 +272,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -281,7 +281,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
 // CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
 // CHECK: ret void
@@ -348,7 +348,7 @@ int main() {
 // CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
 // CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
 // CHECK: call void @__kmpc_for_static_fini(
 
 // lastprivates
@@ -361,7 +361,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
 // CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
 // CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
 // CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
@@ -372,7 +372,7 @@ int main() {
 // CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
 // CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
 // CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
 // CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
 // CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
 // CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
@@ -381,7 +381,7 @@ int main() {
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
 // CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
 #endif
diff --git a/test/OpenMP/teams_firstprivate_codegen.cpp b/test/OpenMP/teams_firstprivate_codegen.cpp
index 0fb456b3f37f..44eed102c8dd 100644
--- a/test/OpenMP/teams_firstprivate_codegen.cpp
+++ b/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -172,7 +172,7 @@ int main() {
 // CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
 // CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
@@ -224,7 +224,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 128 [[VEC_DEST]], i8* align 128 [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}},
 // CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
 // CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
@@ -284,7 +284,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i{{[0-9]+}} [[SIZE]], i1 false)
   #pragma omp target
   #pragma omp teams firstprivate(a, s, vla1, vla2)
   s[0].St_func(s, n, vla1);
@@ -302,6 +302,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i{{[0-9]+}} [[SIZE]], i1 false)
 #endif
 #endif
diff --git a/test/PCH/modified-header-crash.c b/test/PCH/modified-header-crash.c
index 4c21a8c2b825..39b2530a32f2 100644
--- a/test/PCH/modified-header-crash.c
+++ b/test/PCH/modified-header-crash.c
@@ -1,6 +1,6 @@
 // Don't crash.
 
-// RUN: cp %S/modified-header-crash.h %t.h
+// RUN: cat %S/modified-header-crash.h > %t.h
 // RUN: %clang_cc1 -DCAKE -x c-header %t.h -emit-pch -o %t
 // RUN: echo 'int foobar;' >> %t.h
 // RUN: not %clang_cc1 %s -include-pch %t -fsyntax-only
diff --git a/test/PCH/verify_pch.m b/test/PCH/verify_pch.m
index e905f2537c7f..d50e23456e35 100644
--- a/test/PCH/verify_pch.m
+++ b/test/PCH/verify_pch.m
@@ -2,7 +2,7 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/usr/include
 // RUN: echo '// empty' > %t/usr/include/sys_header.h
-// RUN: cp %s %t.h
+// RUN: cat %s > %t.h
 //
 // Precompile
 // RUN: %clang_cc1 -isystem %t/usr/include -x objective-c-header -emit-pch -o %t.pch %t.h
diff --git a/test/Parser/pointer_promotion.c b/test/Parser/pointer_promotion.c
index 30589d01326f..8b718ad565ce 100644
--- a/test/Parser/pointer_promotion.c
+++ b/test/Parser/pointer_promotion.c
@@ -12,6 +12,6 @@ void test() {
   if (cp < fp) {} // expected-warning {{comparison of distinct pointer types ('char *' and 'struct foo *')}}
   if (fp < bp) {} // expected-warning {{comparison of distinct pointer types ('struct foo *' and 'struct bar *')}}
   if (ip < 7) {} // expected-warning {{comparison between pointer and integer ('int *' and 'int')}}
-  if (sint < ip) {} // expected-warning {{comparison between pointer and integer ('int' and 'int *')}}
+  if (sint < ip) {} // expected-warning {{comparison between pointer and integer ('short' and 'int *')}}
   if (ip == cp) {} // expected-warning {{comparison of distinct pointer types ('int *' and 'char *')}}
 }
diff --git a/test/Preprocessor/cuda-types.cu b/test/Preprocessor/cuda-types.cu
index 9e96f6a15e6e..4ad3e4d97aa2 100644
--- a/test/Preprocessor/cuda-types.cu
+++ b/test/Preprocessor/cuda-types.cu
@@ -9,40 +9,40 @@
 
 // RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/i386-host-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/i386-host-defines-filtered
 // RUN: %clang --cuda-device-only -nocudainc -nocudalib -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/i386-device-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/i386-device-defines-filtered
 // RUN: diff %t/i386-host-defines-filtered %t/i386-device-defines-filtered
 
 // RUN: %clang --cuda-host-only -nocudainc -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/x86_64-host-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/x86_64-host-defines-filtered
 // RUN: %clang --cuda-device-only -nocudainc -nocudalib -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/x86_64-device-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/x86_64-device-defines-filtered
 // RUN: diff %t/x86_64-host-defines-filtered %t/x86_64-device-defines-filtered
 
 // RUN: %clang --cuda-host-only -nocudainc -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/powerpc64-host-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/powerpc64-host-defines-filtered
 // RUN: %clang --cuda-device-only -nocudainc -nocudalib -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/powerpc64-device-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/powerpc64-device-defines-filtered
 // RUN: diff %t/powerpc64-host-defines-filtered %t/powerpc64-device-defines-filtered
 
 // RUN: %clang --cuda-host-only -nocudainc -target i386-windows-msvc -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/i386-msvc-host-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/i386-msvc-host-defines-filtered
 // RUN: %clang --cuda-device-only -nocudainc -nocudalib -target i386-windows-msvc -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/i386-msvc-device-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/i386-msvc-device-defines-filtered
 // RUN: diff %t/i386-msvc-host-defines-filtered %t/i386-msvc-device-defines-filtered
 
 // RUN: %clang --cuda-host-only -nocudainc -target x86_64-windows-msvc -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/x86_64-msvc-host-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/x86_64-msvc-host-defines-filtered
 // RUN: %clang --cuda-device-only -nocudainc -nocudalib -target x86_64-windows-msvc -x cuda -E -dM -o - /dev/null \
 // RUN:   | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \
-// RUN:   | grep -v '__FLT128\|__LDBL\|_LONG_DOUBLE' > %t/x86_64-msvc-device-defines-filtered
+// RUN:   | grep -v '__LDBL\|_LONG_DOUBLE' > %t/x86_64-msvc-device-defines-filtered
 // RUN: diff %t/x86_64-msvc-host-defines-filtered %t/x86_64-msvc-device-defines-filtered
diff --git a/test/Preprocessor/init.c b/test/Preprocessor/init.c
index 51e1233456b8..5f3fe7a3270c 100644
--- a/test/Preprocessor/init.c
+++ b/test/Preprocessor/init.c
@@ -10003,3 +10003,398 @@
 // ARM-DARWIN-BAREMETAL-64: #define __PTRDIFF_TYPE__ long int
 // ARM-DARWIN-BAREMETAL-64: #define __SIZE_TYPE__ long unsigned int
 
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=riscv32 < /dev/null \
+// RUN:   | FileCheck -match-full-lines -check-prefix=RISCV32 %s
+// RISCV32: #define _ILP32 1
+// RISCV32: #define __ATOMIC_ACQUIRE 2
+// RISCV32: #define __ATOMIC_ACQ_REL 4
+// RISCV32: #define __ATOMIC_CONSUME 1
+// RISCV32: #define __ATOMIC_RELAXED 0
+// RISCV32: #define __ATOMIC_RELEASE 3
+// RISCV32: #define __ATOMIC_SEQ_CST 5
+// RISCV32: #define __BIGGEST_ALIGNMENT__ 16
+// RISCV32: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// RISCV32: #define __CHAR16_TYPE__ unsigned short
+// RISCV32: #define __CHAR32_TYPE__ unsigned int
+// RISCV32: #define __CHAR_BIT__ 8
+// RISCV32: #define __DBL_DECIMAL_DIG__ 17
+// RISCV32: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// RISCV32: #define __DBL_DIG__ 15
+// RISCV32: #define __DBL_EPSILON__ 2.2204460492503131e-16
+// RISCV32: #define __DBL_HAS_DENORM__ 1
+// RISCV32: #define __DBL_HAS_INFINITY__ 1
+// RISCV32: #define __DBL_HAS_QUIET_NAN__ 1
+// RISCV32: #define __DBL_MANT_DIG__ 53
+// RISCV32: #define __DBL_MAX_10_EXP__ 308
+// RISCV32: #define __DBL_MAX_EXP__ 1024
+// RISCV32: #define __DBL_MAX__ 1.7976931348623157e+308
+// RISCV32: #define __DBL_MIN_10_EXP__ (-307)
+// RISCV32: #define __DBL_MIN_EXP__ (-1021)
+// RISCV32: #define __DBL_MIN__ 2.2250738585072014e-308
+// RISCV32: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// RISCV32: #define __ELF__ 1
+// RISCV32: #define __FINITE_MATH_ONLY__ 0
+// RISCV32: #define __FLT_DECIMAL_DIG__ 9
+// RISCV32: #define __FLT_DENORM_MIN__ 1.40129846e-45F
+// RISCV32: #define __FLT_DIG__ 6
+// RISCV32: #define __FLT_EPSILON__ 1.19209290e-7F
+// RISCV32: #define __FLT_EVAL_METHOD__ 0
+// RISCV32: #define __FLT_HAS_DENORM__ 1
+// RISCV32: #define __FLT_HAS_INFINITY__ 1
+// RISCV32: #define __FLT_HAS_QUIET_NAN__ 1
+// RISCV32: #define __FLT_MANT_DIG__ 24
+// RISCV32: #define __FLT_MAX_10_EXP__ 38
+// RISCV32: #define __FLT_MAX_EXP__ 128
+// RISCV32: #define __FLT_MAX__ 3.40282347e+38F
+// RISCV32: #define __FLT_MIN_10_EXP__ (-37)
+// RISCV32: #define __FLT_MIN_EXP__ (-125)
+// RISCV32: #define __FLT_MIN__ 1.17549435e-38F
+// RISCV32: #define __FLT_RADIX__ 2
+// RISCV32: #define __GCC_ATOMIC_BOOL_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_CHAR_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_INT_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_LLONG_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_LONG_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_POINTER_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_SHORT_LOCK_FREE 1
+// RISCV32: #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// RISCV32: #define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 1
+// RISCV32: #define __GNUC_MINOR__ {{.*}}
+// RISCV32: #define __GNUC_PATCHLEVEL__ {{.*}}
+// RISCV32: #define __GNUC_STDC_INLINE__ 1
+// RISCV32: #define __GNUC__ {{.*}}
+// RISCV32: #define __GXX_ABI_VERSION {{.*}}
+// RISCV32: #define __ILP32__ 1
+// RISCV32: #define __INT16_C_SUFFIX__
+// RISCV32: #define __INT16_MAX__ 32767
+// RISCV32: #define __INT16_TYPE__ short
+// RISCV32: #define __INT32_C_SUFFIX__
+// RISCV32: #define __INT32_MAX__ 2147483647
+// RISCV32: #define __INT32_TYPE__ int
+// RISCV32: #define __INT64_C_SUFFIX__ LL
+// RISCV32: #define __INT64_MAX__ 9223372036854775807LL
+// RISCV32: #define __INT64_TYPE__ long long int
+// RISCV32: #define __INT8_C_SUFFIX__
+// RISCV32: #define __INT8_MAX__ 127
+// RISCV32: #define __INT8_TYPE__ signed char
+// RISCV32: #define __INTMAX_C_SUFFIX__ LL
+// RISCV32: #define __INTMAX_MAX__ 9223372036854775807LL
+// RISCV32: #define __INTMAX_TYPE__ long long int
+// RISCV32: #define __INTMAX_WIDTH__ 64
+// RISCV32: #define __INTPTR_MAX__ 2147483647
+// RISCV32: #define __INTPTR_TYPE__ int
+// RISCV32: #define __INTPTR_WIDTH__ 32
+// TODO: RISC-V GCC defines INT_FAST16 as int
+// RISCV32: #define __INT_FAST16_MAX__ 32767
+// RISCV32: #define __INT_FAST16_TYPE__ short
+// RISCV32: #define __INT_FAST32_MAX__ 2147483647
+// RISCV32: #define __INT_FAST32_TYPE__ int
+// RISCV32: #define __INT_FAST64_MAX__ 9223372036854775807LL
+// RISCV32: #define __INT_FAST64_TYPE__ long long int
+// TODO: RISC-V GCC defines INT_FAST8 as int
+// RISCV32: #define __INT_FAST8_MAX__ 127
+// RISCV32: #define __INT_FAST8_TYPE__ signed char
+// RISCV32: #define __INT_LEAST16_MAX__ 32767
+// RISCV32: #define __INT_LEAST16_TYPE__ short
+// RISCV32: #define __INT_LEAST32_MAX__ 2147483647
+// RISCV32: #define __INT_LEAST32_TYPE__ int
+// RISCV32: #define __INT_LEAST64_MAX__ 9223372036854775807LL
+// RISCV32: #define __INT_LEAST64_TYPE__ long long int
+// RISCV32: #define __INT_LEAST8_MAX__ 127
+// RISCV32: #define __INT_LEAST8_TYPE__ signed char
+// RISCV32: #define __INT_MAX__ 2147483647
+// RISCV32: #define __LDBL_DECIMAL_DIG__ 36
+// RISCV32: #define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// RISCV32: #define __LDBL_DIG__ 33
+// RISCV32: #define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// RISCV32: #define __LDBL_HAS_DENORM__ 1
+// RISCV32: #define __LDBL_HAS_INFINITY__ 1
+// RISCV32: #define __LDBL_HAS_QUIET_NAN__ 1
+// RISCV32: #define __LDBL_MANT_DIG__ 113
+// RISCV32: #define __LDBL_MAX_10_EXP__ 4932
+// RISCV32: #define __LDBL_MAX_EXP__ 16384
+// RISCV32: #define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// RISCV32: #define __LDBL_MIN_10_EXP__ (-4931)
+// RISCV32: #define __LDBL_MIN_EXP__ (-16381)
+// RISCV32: #define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// RISCV32: #define __LITTLE_ENDIAN__ 1
+// RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL
+// RISCV32: #define __LONG_MAX__ 2147483647L
+// RISCV32: #define __NO_INLINE__ 1
+// RISCV32: #define __POINTER_WIDTH__ 32
+// RISCV32: #define __PRAGMA_REDEFINE_EXTNAME 1
+// RISCV32: #define __PTRDIFF_MAX__ 2147483647
+// RISCV32: #define __PTRDIFF_TYPE__ int
+// RISCV32: #define __PTRDIFF_WIDTH__ 32
+// RISCV32: #define __SCHAR_MAX__ 127
+// RISCV32: #define __SHRT_MAX__ 32767
+// RISCV32: #define __SIG_ATOMIC_MAX__ 2147483647
+// RISCV32: #define __SIG_ATOMIC_WIDTH__ 32
+// RISCV32: #define __SIZEOF_DOUBLE__ 8
+// RISCV32: #define __SIZEOF_FLOAT__ 4
+// RISCV32: #define __SIZEOF_INT__ 4
+// RISCV32: #define __SIZEOF_LONG_DOUBLE__ 16
+// RISCV32: #define __SIZEOF_LONG_LONG__ 8
+// RISCV32: #define __SIZEOF_LONG__ 4
+// RISCV32: #define __SIZEOF_POINTER__ 4
+// RISCV32: #define __SIZEOF_PTRDIFF_T__ 4
+// RISCV32: #define __SIZEOF_SHORT__ 2
+// RISCV32: #define __SIZEOF_SIZE_T__ 4
+// RISCV32: #define __SIZEOF_WCHAR_T__ 4
+// RISCV32: #define __SIZEOF_WINT_T__ 4
+// RISCV32: #define __SIZE_MAX__ 4294967295U
+// RISCV32: #define __SIZE_TYPE__ unsigned int
+// RISCV32: #define __SIZE_WIDTH__ 32
+// RISCV32: #define __STDC_HOSTED__ 0
+// RISCV32: #define __STDC_UTF_16__ 1
+// RISCV32: #define __STDC_UTF_32__ 1
+// RISCV32: #define __STDC_VERSION__ 201112L
+// RISCV32: #define __STDC__ 1
+// RISCV32: #define __UINT16_C_SUFFIX__
+// RISCV32: #define __UINT16_MAX__ 65535
+// RISCV32: #define __UINT16_TYPE__ unsigned short
+// RISCV32: #define __UINT32_C_SUFFIX__ U
+// RISCV32: #define __UINT32_MAX__ 4294967295U
+// RISCV32: #define __UINT32_TYPE__ unsigned int
+// RISCV32: #define __UINT64_C_SUFFIX__ ULL
+// RISCV32: #define __UINT64_MAX__ 18446744073709551615ULL
+// RISCV32: #define __UINT64_TYPE__ long long unsigned int
+// RISCV32: #define __UINT8_C_SUFFIX__
+// RISCV32: #define __UINT8_MAX__ 255
+// RISCV32: #define __UINT8_TYPE__ unsigned char
+// RISCV32: #define __UINTMAX_C_SUFFIX__ ULL
+// RISCV32: #define __UINTMAX_MAX__ 18446744073709551615ULL
+// RISCV32: #define __UINTMAX_TYPE__ long long unsigned int
+// RISCV32: #define __UINTMAX_WIDTH__ 64
+// RISCV32: #define __UINTPTR_MAX__ 4294967295U
+// RISCV32: #define __UINTPTR_TYPE__ unsigned int
+// RISCV32: #define __UINTPTR_WIDTH__ 32
+// TODO: RISC-V GCC defines UINT_FAST16 to be unsigned int
+// RISCV32: #define __UINT_FAST16_MAX__ 65535
+// RISCV32: #define __UINT_FAST16_TYPE__ unsigned short
+// RISCV32: #define __UINT_FAST32_MAX__ 4294967295U
+// RISCV32: #define __UINT_FAST32_TYPE__ unsigned int
+// RISCV32: #define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// RISCV32: #define __UINT_FAST64_TYPE__ long long unsigned int
+// TODO: RISC-V GCC defines UINT_FAST8 to be unsigned int
+// RISCV32: #define __UINT_FAST8_MAX__ 255
+// RISCV32: #define __UINT_FAST8_TYPE__ unsigned char
+// RISCV32: #define __UINT_LEAST16_MAX__ 65535
+// RISCV32: #define __UINT_LEAST16_TYPE__ unsigned short
+// RISCV32: #define __UINT_LEAST32_MAX__ 4294967295U
+// RISCV32: #define __UINT_LEAST32_TYPE__ unsigned int
+// RISCV32: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// RISCV32: #define __UINT_LEAST64_TYPE__ long long unsigned int
+// RISCV32: #define __UINT_LEAST8_MAX__ 255
+// RISCV32: #define __UINT_LEAST8_TYPE__ unsigned char
+// RISCV32: #define __USER_LABEL_PREFIX__
+// RISCV32: #define __WCHAR_MAX__ 2147483647
+// RISCV32: #define __WCHAR_TYPE__ int
+// RISCV32: #define __WCHAR_WIDTH__ 32
+// RISCV32: #define __WINT_TYPE__ unsigned int
+// RISCV32: #define __WINT_UNSIGNED__ 1
+// RISCV32: #define __WINT_WIDTH__ 32
+// RISCV32: #define __riscv 1
+// RISCV32: #define __riscv_cmodel_medlow 1
+// RISCV32: #define __riscv_float_abi_soft 1
+// RISCV32: #define __riscv_xlen 32
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=riscv64 < /dev/null \
+// RUN:   | FileCheck -match-full-lines -check-prefix=RISCV64 %s
+// RISCV64: #define _LP64 1
+// RISCV64: #define __ATOMIC_ACQUIRE 2
+// RISCV64: #define __ATOMIC_ACQ_REL 4
+// RISCV64: #define __ATOMIC_CONSUME 1
+// RISCV64: #define __ATOMIC_RELAXED 0
+// RISCV64: #define __ATOMIC_RELEASE 3
+// RISCV64: #define __ATOMIC_SEQ_CST 5
+// RISCV64: #define __BIGGEST_ALIGNMENT__ 16
+// RISCV64: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// RISCV64: #define __CHAR16_TYPE__ unsigned short
+// RISCV64: #define __CHAR32_TYPE__ unsigned int
+// RISCV64: #define __CHAR_BIT__ 8
+// RISCV64: #define __DBL_DECIMAL_DIG__ 17
+// RISCV64: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// RISCV64: #define __DBL_DIG__ 15
+// RISCV64: #define __DBL_EPSILON__ 2.2204460492503131e-16
+// RISCV64: #define __DBL_HAS_DENORM__ 1
+// RISCV64: #define __DBL_HAS_INFINITY__ 1
+// RISCV64: #define __DBL_HAS_QUIET_NAN__ 1
+// RISCV64: #define __DBL_MANT_DIG__ 53
+// RISCV64: #define __DBL_MAX_10_EXP__ 308
+// RISCV64: #define __DBL_MAX_EXP__ 1024
+// RISCV64: #define __DBL_MAX__ 1.7976931348623157e+308
+// RISCV64: #define __DBL_MIN_10_EXP__ (-307)
+// RISCV64: #define __DBL_MIN_EXP__ (-1021)
+// RISCV64: #define __DBL_MIN__ 2.2250738585072014e-308
+// RISCV64: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// RISCV64: #define __ELF__ 1
+// RISCV64: #define __FINITE_MATH_ONLY__ 0
+// RISCV64: #define __FLT_DECIMAL_DIG__ 9
+// RISCV64: #define __FLT_DENORM_MIN__ 1.40129846e-45F
+// RISCV64: #define __FLT_DIG__ 6
+// RISCV64: #define __FLT_EPSILON__ 1.19209290e-7F
+// RISCV64: #define __FLT_EVAL_METHOD__ 0
+// RISCV64: #define __FLT_HAS_DENORM__ 1
+// RISCV64: #define __FLT_HAS_INFINITY__ 1
+// RISCV64: #define __FLT_HAS_QUIET_NAN__ 1
+// RISCV64: #define __FLT_MANT_DIG__ 24
+// RISCV64: #define __FLT_MAX_10_EXP__ 38
+// RISCV64: #define __FLT_MAX_EXP__ 128
+// RISCV64: #define __FLT_MAX__ 3.40282347e+38F
+// RISCV64: #define __FLT_MIN_10_EXP__ (-37)
+// RISCV64: #define __FLT_MIN_EXP__ (-125)
+// RISCV64: #define __FLT_MIN__ 1.17549435e-38F
+// RISCV64: #define __FLT_RADIX__ 2
+// RISCV64: #define __GCC_ATOMIC_BOOL_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_CHAR_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_INT_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_LLONG_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_LONG_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_POINTER_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_SHORT_LOCK_FREE 1
+// RISCV64: #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// RISCV64: #define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 1
+// RISCV64: #define __GNUC_MINOR__ {{.*}}
+// RISCV64: #define __GNUC_PATCHLEVEL__ {{.*}}
+// RISCV64: #define __GNUC_STDC_INLINE__ 1
+// RISCV64: #define __GNUC__ {{.*}}
+// RISCV64: #define __GXX_ABI_VERSION {{.*}}
+// RISCV64: #define __INT16_C_SUFFIX__
+// RISCV64: #define __INT16_MAX__ 32767
+// RISCV64: #define __INT16_TYPE__ short
+// RISCV64: #define __INT32_C_SUFFIX__
+// RISCV64: #define __INT32_MAX__ 2147483647
+// RISCV64: #define __INT32_TYPE__ int
+// RISCV64: #define __INT64_C_SUFFIX__ L
+// RISCV64: #define __INT64_MAX__ 9223372036854775807L
+// RISCV64: #define __INT64_TYPE__ long int
+// RISCV64: #define __INT8_C_SUFFIX__
+// RISCV64: #define __INT8_MAX__ 127
+// RISCV64: #define __INT8_TYPE__ signed char
+// RISCV64: #define __INTMAX_C_SUFFIX__ L
+// RISCV64: #define __INTMAX_MAX__ 9223372036854775807L
+// RISCV64: #define __INTMAX_TYPE__ long int
+// RISCV64: #define __INTMAX_WIDTH__ 64
+// RISCV64: #define __INTPTR_MAX__ 9223372036854775807L
+// RISCV64: #define __INTPTR_TYPE__ long int
+// RISCV64: #define __INTPTR_WIDTH__ 64
+// TODO: RISC-V GCC defines INT_FAST16 as int
+// RISCV64: #define __INT_FAST16_MAX__ 32767
+// RISCV64: #define __INT_FAST16_TYPE__ short
+// RISCV64: #define __INT_FAST32_MAX__ 2147483647
+// RISCV64: #define __INT_FAST32_TYPE__ int
+// RISCV64: #define __INT_FAST64_MAX__ 9223372036854775807L
+// RISCV64: #define __INT_FAST64_TYPE__ long int
+// TODO: RISC-V GCC defines INT_FAST8 as int
+// RISCV64: #define __INT_FAST8_MAX__ 127
+// RISCV64: #define __INT_FAST8_TYPE__ signed char
+// RISCV64: #define __INT_LEAST16_MAX__ 32767
+// RISCV64: #define __INT_LEAST16_TYPE__ short
+// RISCV64: #define __INT_LEAST32_MAX__ 2147483647
+// RISCV64: #define __INT_LEAST32_TYPE__ int
+// RISCV64: #define __INT_LEAST64_MAX__ 9223372036854775807L
+// RISCV64: #define __INT_LEAST64_TYPE__ long int
+// RISCV64: #define __INT_LEAST8_MAX__ 127
+// RISCV64: #define __INT_LEAST8_TYPE__ signed char
+// RISCV64: #define __INT_MAX__ 2147483647
+// RISCV64: #define __LDBL_DECIMAL_DIG__ 36
+// RISCV64: #define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// RISCV64: #define __LDBL_DIG__ 33
+// RISCV64: #define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// RISCV64: #define __LDBL_HAS_DENORM__ 1
+// RISCV64: #define __LDBL_HAS_INFINITY__ 1
+// RISCV64: #define __LDBL_HAS_QUIET_NAN__ 1
+// RISCV64: #define __LDBL_MANT_DIG__ 113
+// RISCV64: #define __LDBL_MAX_10_EXP__ 4932
+// RISCV64: #define __LDBL_MAX_EXP__ 16384
+// RISCV64: #define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// RISCV64: #define __LDBL_MIN_10_EXP__ (-4931)
+// RISCV64: #define __LDBL_MIN_EXP__ (-16381)
+// RISCV64: #define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// RISCV64: #define __LITTLE_ENDIAN__ 1
+// RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL
+// RISCV64: #define __LONG_MAX__ 9223372036854775807L
+// RISCV64: #define __LP64__ 1
+// RISCV64: #define __NO_INLINE__ 1
+// RISCV64: #define __POINTER_WIDTH__ 64
+// RISCV64: #define __PRAGMA_REDEFINE_EXTNAME 1
+// RISCV64: #define __PTRDIFF_MAX__ 9223372036854775807L
+// RISCV64: #define __PTRDIFF_TYPE__ long int
+// RISCV64: #define __PTRDIFF_WIDTH__ 64
+// RISCV64: #define __SCHAR_MAX__ 127
+// RISCV64: #define __SHRT_MAX__ 32767
+// RISCV64: #define __SIG_ATOMIC_MAX__ 2147483647
+// RISCV64: #define __SIG_ATOMIC_WIDTH__ 32
+// RISCV64: #define __SIZEOF_DOUBLE__ 8
+// RISCV64: #define __SIZEOF_FLOAT__ 4
+// RISCV64: #define __SIZEOF_INT__ 4
+// RISCV64: #define __SIZEOF_LONG_DOUBLE__ 16
+// RISCV64: #define __SIZEOF_LONG_LONG__ 8
+// RISCV64: #define __SIZEOF_LONG__ 8
+// RISCV64: #define __SIZEOF_POINTER__ 8
+// RISCV64: #define __SIZEOF_PTRDIFF_T__ 8
+// RISCV64: #define __SIZEOF_SHORT__ 2
+// RISCV64: #define __SIZEOF_SIZE_T__ 8
+// RISCV64: #define __SIZEOF_WCHAR_T__ 4
+// RISCV64: #define __SIZEOF_WINT_T__ 4
+// RISCV64: #define __SIZE_MAX__ 18446744073709551615UL
+// RISCV64: #define __SIZE_TYPE__ long unsigned int
+// RISCV64: #define __SIZE_WIDTH__ 64
+// RISCV64: #define __STDC_HOSTED__ 0
+// RISCV64: #define __STDC_UTF_16__ 1
+// RISCV64: #define __STDC_UTF_32__ 1
+// RISCV64: #define __STDC_VERSION__ 201112L
+// RISCV64: #define __STDC__ 1
+// RISCV64: #define __UINT16_C_SUFFIX__
+// RISCV64: #define __UINT16_MAX__ 65535
+// RISCV64: #define __UINT16_TYPE__ unsigned short
+// RISCV64: #define __UINT32_C_SUFFIX__ U
+// RISCV64: #define __UINT32_MAX__ 4294967295U
+// RISCV64: #define __UINT32_TYPE__ unsigned int
+// RISCV64: #define __UINT64_C_SUFFIX__ UL
+// RISCV64: #define __UINT64_MAX__ 18446744073709551615UL
+// RISCV64: #define __UINT64_TYPE__ long unsigned int
+// RISCV64: #define __UINT8_C_SUFFIX__
+// RISCV64: #define __UINT8_MAX__ 255
+// RISCV64: #define __UINT8_TYPE__ unsigned char
+// RISCV64: #define __UINTMAX_C_SUFFIX__ UL
+// RISCV64: #define __UINTMAX_MAX__ 18446744073709551615UL
+// RISCV64: #define __UINTMAX_TYPE__ long unsigned int
+// RISCV64: #define __UINTMAX_WIDTH__ 64
+// RISCV64: #define __UINTPTR_MAX__ 18446744073709551615UL
+// RISCV64: #define __UINTPTR_TYPE__ long unsigned int
+// RISCV64: #define __UINTPTR_WIDTH__ 64
+// TODO: RISC-V GCC defines UINT_FAST16 to be unsigned int
+// RISCV64: #define __UINT_FAST16_MAX__ 65535
+// RISCV64: #define __UINT_FAST16_TYPE__ unsigned short
+// RISCV64: #define __UINT_FAST32_MAX__ 4294967295U
+// RISCV64: #define __UINT_FAST32_TYPE__ unsigned int
+// RISCV64: #define __UINT_FAST64_MAX__ 18446744073709551615UL
+// RISCV64: #define __UINT_FAST64_TYPE__ long unsigned int
+// TODO: RISC-V GCC defines UINT_FAST8 to be unsigned int
+// RISCV64: #define __UINT_FAST8_MAX__ 255
+// RISCV64: #define __UINT_FAST8_TYPE__ unsigned char
+// RISCV64: #define __UINT_LEAST16_MAX__ 65535
+// RISCV64: #define __UINT_LEAST16_TYPE__ unsigned short
+// RISCV64: #define __UINT_LEAST32_MAX__ 4294967295U
+// RISCV64: #define __UINT_LEAST32_TYPE__ unsigned int
+// RISCV64: #define __UINT_LEAST64_MAX__ 18446744073709551615UL
+// RISCV64: #define __UINT_LEAST64_TYPE__ long unsigned int
+// RISCV64: #define __UINT_LEAST8_MAX__ 255
+// RISCV64: #define __UINT_LEAST8_TYPE__ unsigned char
+// RISCV64: #define __USER_LABEL_PREFIX__
+// RISCV64: #define __WCHAR_MAX__ 2147483647
+// RISCV64: #define __WCHAR_TYPE__ int
+// RISCV64: #define __WCHAR_WIDTH__ 32
+// RISCV64: #define __WINT_TYPE__ unsigned int
+// RISCV64: #define __WINT_UNSIGNED__ 1
+// RISCV64: #define __WINT_WIDTH__ 32
+// RISCV64: #define __riscv 1
+// RISCV64: #define __riscv_cmodel_medlow 1
+// RISCV64: #define __riscv_float_abi_soft 1
+// RISCV64: #define __riscv_xlen 64
diff --git a/test/Preprocessor/macro-multiline.c b/test/Preprocessor/macro-multiline.c
index 72a5d20e523f..664c37cd02a4 100644
--- a/test/Preprocessor/macro-multiline.c
+++ b/test/Preprocessor/macro-multiline.c
@@ -1,4 +1,4 @@
-// RUN: printf -- "-DX=A\nTHIS_SHOULD_NOT_EXIST_IN_THE_OUTPUT" | xargs -0 %clang -E %s | FileCheck -strict-whitespace %s
+// RUN: printf -- "-DX=A\\\\\nTHIS_SHOULD_NOT_EXIST_IN_THE_OUTPUT\n" | xargs %clang -E %s | FileCheck -strict-whitespace %s
 
 // Per GCC -D semantics, \n and anything that follows is ignored.
 
diff --git a/test/Preprocessor/pragma_unknown.c b/test/Preprocessor/pragma_unknown.c
index 5578ce5b7502..81fe88cd3855 100644
--- a/test/Preprocessor/pragma_unknown.c
+++ b/test/Preprocessor/pragma_unknown.c
@@ -1,29 +1,45 @@
 // RUN: %clang_cc1 -fsyntax-only -Wunknown-pragmas -verify %s
-// RUN: %clang_cc1 -E %s | FileCheck --strict-whitespace %s
+// RUN: %clang_cc1 -E %s 2>&1 | FileCheck --strict-whitespace %s
 
 // GCC doesn't expand macro args for unrecognized pragmas.
 #define bar xX
 #pragma foo bar   // expected-warning {{unknown pragma ignored}}
+// CHECK-NOT: unknown pragma in STDC namespace
 // CHECK: {{^}}#pragma foo bar{{$}}
 
 #pragma STDC FP_CONTRACT ON
 #pragma STDC FP_CONTRACT OFF
 #pragma STDC FP_CONTRACT DEFAULT
 #pragma STDC FP_CONTRACT IN_BETWEEN  // expected-warning {{expected 'ON' or 'OFF' or 'DEFAULT' in pragma}}
+// CHECK: {{^}}#pragma STDC FP_CONTRACT ON{{$}}
+// CHECK: {{^}}#pragma STDC FP_CONTRACT OFF{{$}}
+// CHECK: {{^}}#pragma STDC FP_CONTRACT DEFAULT{{$}}
+// CHECK: {{^}}#pragma STDC FP_CONTRACT IN_BETWEEN{{$}}
 
 #pragma STDC FENV_ACCESS ON          // expected-warning {{pragma STDC FENV_ACCESS ON is not supported, ignoring pragma}}
 #pragma STDC FENV_ACCESS OFF
 #pragma STDC FENV_ACCESS DEFAULT
 #pragma STDC FENV_ACCESS IN_BETWEEN   // expected-warning {{expected 'ON' or 'OFF' or 'DEFAULT' in pragma}}
+// CHECK: {{^}}#pragma STDC FENV_ACCESS ON{{$}}
+// CHECK: {{^}}#pragma STDC FENV_ACCESS OFF{{$}}
+// CHECK: {{^}}#pragma STDC FENV_ACCESS DEFAULT{{$}}
+// CHECK: {{^}}#pragma STDC FENV_ACCESS IN_BETWEEN{{$}}
 
 #pragma STDC CX_LIMITED_RANGE ON
 #pragma STDC CX_LIMITED_RANGE OFF
 #pragma STDC CX_LIMITED_RANGE DEFAULT 
 #pragma STDC CX_LIMITED_RANGE IN_BETWEEN   // expected-warning {{expected 'ON' or 'OFF' or 'DEFAULT' in pragma}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE ON{{$}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE OFF{{$}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE DEFAULT{{$}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE IN_BETWEEN{{$}}
 
 #pragma STDC CX_LIMITED_RANGE    // expected-warning {{expected 'ON' or 'OFF' or 'DEFAULT' in pragma}}
 #pragma STDC CX_LIMITED_RANGE ON FULL POWER  // expected-warning {{expected end of directive in pragma}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE{{$}}
+// CHECK: {{^}}#pragma STDC CX_LIMITED_RANGE ON FULL POWER{{$}}
 
 #pragma STDC SO_GREAT  // expected-warning {{unknown pragma in STDC namespace}}
 #pragma STDC   // expected-warning {{unknown pragma in STDC namespace}}
-
+// CHECK: {{^}}#pragma STDC SO_GREAT{{$}}
+// CHECK: {{^}}#pragma STDC{{$}}
diff --git a/test/Preprocessor/predefined-arch-macros.c b/test/Preprocessor/predefined-arch-macros.c
index 6021f9c60e34..ef0c022ac0e2 100644
--- a/test/Preprocessor/predefined-arch-macros.c
+++ b/test/Preprocessor/predefined-arch-macros.c
@@ -890,6 +890,7 @@
 // CHECK_SKX_M32: #define __POPCNT__ 1
 // CHECK_SKX_M32: #define __PRFCHW__ 1
 // CHECK_SKX_M32: #define __RDRND__ 1
+// CHECK_SKX_M32: #define __RDSEED__ 1
 // CHECK_SKX_M32: #define __RTM__ 1
 // CHECK_SKX_M32: #define __SGX__ 1
 // CHECK_SKX_M32: #define __SSE2__ 1
@@ -934,6 +935,7 @@
 // CHECK_SKX_M64: #define __POPCNT__ 1
 // CHECK_SKX_M64: #define __PRFCHW__ 1
 // CHECK_SKX_M64: #define __RDRND__ 1
+// CHECK_SKX_M64: #define __RDSEED__ 1
 // CHECK_SKX_M64: #define __RTM__ 1
 // CHECK_SKX_M64: #define __SGX__ 1
 // CHECK_SKX_M64: #define __SSE2_MATH__ 1
@@ -983,6 +985,7 @@
 // CHECK_CNL_M32: #define __POPCNT__ 1
 // CHECK_CNL_M32: #define __PRFCHW__ 1
 // CHECK_CNL_M32: #define __RDRND__ 1
+// CHECK_CNL_M32: #define __RDSEED__ 1
 // CHECK_CNL_M32: #define __RTM__ 1
 // CHECK_CNL_M32: #define __SGX__ 1
 // CHECK_CNL_M32: #define __SHA__ 1
@@ -1030,6 +1033,7 @@
 // CHECK_CNL_M64: #define __POPCNT__ 1
 // CHECK_CNL_M64: #define __PRFCHW__ 1
 // CHECK_CNL_M64: #define __RDRND__ 1
+// CHECK_CNL_M64: #define __RDSEED__ 1
 // CHECK_CNL_M64: #define __RTM__ 1
 // CHECK_CNL_M64: #define __SGX__ 1
 // CHECK_CNL_M64: #define __SHA__ 1
@@ -1082,7 +1086,9 @@
 // CHECK_ICL_M32: #define __PKU__ 1
 // CHECK_ICL_M32: #define __POPCNT__ 1
 // CHECK_ICL_M32: #define __PRFCHW__ 1
+// CHECK_ICL_M32: #define __RDPID__ 1
 // CHECK_ICL_M32: #define __RDRND__ 1
+// CHECK_ICL_M32: #define __RDSEED__ 1
 // CHECK_ICL_M32: #define __RTM__ 1
 // CHECK_ICL_M32: #define __SGX__ 1
 // CHECK_ICL_M32: #define __SHA__ 1
@@ -1136,7 +1142,9 @@
 // CHECK_ICL_M64: #define __PKU__ 1
 // CHECK_ICL_M64: #define __POPCNT__ 1
 // CHECK_ICL_M64: #define __PRFCHW__ 1
+// CHECK_ICL_M64: #define __RDPID__ 1
 // CHECK_ICL_M64: #define __RDRND__ 1
+// CHECK_ICL_M64: #define __RDSEED__ 1
 // CHECK_ICL_M64: #define __RTM__ 1
 // CHECK_ICL_M64: #define __SGX__ 1
 // CHECK_ICL_M64: #define __SHA__ 1
@@ -1264,6 +1272,7 @@
 // CHECK_SLM_M32: #define __PCLMUL__ 1
 // CHECK_SLM_M32: #define __POPCNT__ 1
 // CHECK_SLM_M32: #define __PRFCHW__ 1
+// CHECK_SLM_M32: #define __RDRND__ 1
 // CHECK_SLM_M32: #define __SSE2__ 1
 // CHECK_SLM_M32: #define __SSE3__ 1
 // CHECK_SLM_M32: #define __SSE4_1__ 1
@@ -1285,6 +1294,7 @@
 // CHECK_SLM_M64: #define __PCLMUL__ 1
 // CHECK_SLM_M64: #define __POPCNT__ 1
 // CHECK_SLM_M64: #define __PRFCHW__ 1
+// CHECK_SLM_M64: #define __RDRND__ 1
 // CHECK_SLM_M64: #define __SSE2_MATH__ 1
 // CHECK_SLM_M64: #define __SSE2__ 1
 // CHECK_SLM_M64: #define __SSE3__ 1
diff --git a/test/Preprocessor/x86_target_features.c b/test/Preprocessor/x86_target_features.c
index 542328aca047..37a4ffd51e19 100644
--- a/test/Preprocessor/x86_target_features.c
+++ b/test/Preprocessor/x86_target_features.c
@@ -376,9 +376,13 @@
 
 // ADX: #define __ADX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -mshstk -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=CETSS %s
+// RUN: %clang -target i386-unknown-unknown -mshstk -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHSTK %s
 
-// CETSS: #define __SHSTK__ 1
+// SHSTK: #define __SHSTK__ 1
+
+// RUN: %clang -target i386-unknown-unknown -mibt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=IBT %s
+
+// IBT: #define __IBT__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s
 
@@ -436,3 +440,6 @@
 // VPCLMULQDQNOPCLMUL-NOT: #define __PCLMUL__ 1
 // VPCLMULQDQNOPCLMUL-NOT: #define __VPCLMULQDQ__ 1
 
+// RUN: %clang -target i386-unknown-unknown -march=atom -mrdpid -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDPID %s
+
+// RDPID: #define __RDPID__ 1
diff --git a/test/Sema/_Float128.c b/test/Sema/_Float128.c
deleted file mode 100644
index f0c3c6d555ef..000000000000
--- a/test/Sema/_Float128.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 -verify %s
-// RUN: %clang_cc1 -triple powerpc64-linux -verify %s
-// RUN: %clang_cc1 -triple i686-windows-gnu -verify %s
-// RUN: %clang_cc1 -triple x86_64-windows-gnu -verify %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -verify %s
-
-#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
-_Float128 f;
-_Float128 tiny = __FLT128_EPSILON__;
-int g(int x, _Float128 *y) {
-  return x + *y;
-}
-
-// expected-no-diagnostics
-#else
-_Float128 f;  // expected-error {{__float128 is not supported on this target}}
-float tiny = __FLT128_EPSILON__; // expected-error{{use of undeclared identifier}}
-int g(int x, _Float128 *y) {  // expected-error {{__float128 is not supported on this target}}
-  return x + *y;
-}
-
-#endif  // defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
diff --git a/test/Sema/annotate.c b/test/Sema/annotate.c
index 4a786d0a0163..0d1a1c29a310 100644
--- a/test/Sema/annotate.c
+++ b/test/Sema/annotate.c
@@ -1,9 +1,13 @@
-// RUN: %clang_cc1 %s -fsyntax-only -verify
+// RUN: %clang_cc1 %s -fsyntax-only -fdouble-square-bracket-attributes -verify
 
 void __attribute__((annotate("foo"))) foo(float *a) {
   __attribute__((annotate("bar"))) int x;
+  [[clang::annotate("bar")]] int x2;
   __attribute__((annotate(1))) int y; // expected-error {{'annotate' attribute requires a string}}
+  [[clang::annotate(1)]] int y2; // expected-error {{'annotate' attribute requires a string}}
   __attribute__((annotate("bar", 1))) int z; // expected-error {{'annotate' attribute takes one argument}}
+  [[clang::annotate("bar", 1)]] int z2; // expected-error {{'annotate' attribute takes one argument}}
+
   int u = __builtin_annotation(z, (char*) 0); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}}
   int v = __builtin_annotation(z, (char*) L"bar"); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}}
   int w = __builtin_annotation(z, "foo");
diff --git a/test/Sema/ast-print.c b/test/Sema/ast-print.c
index f701b1220953..83a08bf24583 100644
--- a/test/Sema/ast-print.c
+++ b/test/Sema/ast-print.c
@@ -15,6 +15,10 @@ struct blah {
   };
 };
 
+// This used to crash clang.
+struct {
+}(s1);
+
 int foo(const struct blah *b) {
   // CHECK: return b->b;
   return b->b;
diff --git a/test/Sema/attr-target-mv-bad-target.c b/test/Sema/attr-target-mv-bad-target.c
new file mode 100644
index 000000000000..9cf3c5e10a6b
--- /dev/null
+++ b/test/Sema/attr-target-mv-bad-target.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple x86_64-windows-pc  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple arm-none-eabi  -fsyntax-only -verify %s
+
+int __attribute__((target("sse4.2"))) redecl1(void) { return 1; }
+//expected-error@+2 {{function multiversioning is not supported on the current target}}
+//expected-note@-2 {{previous declaration is here}}
+int __attribute__((target("avx")))  redecl1(void) { return 2; }
+
+//expected-error@+1 {{function multiversioning is not supported on the current target}}
+int __attribute__((target("default"))) with_def(void) { return 1;}
diff --git a/test/Sema/attr-target-mv.c b/test/Sema/attr-target-mv.c
new file mode 100644
index 000000000000..671adff5b042
--- /dev/null
+++ b/test/Sema/attr-target-mv.c
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu  -fsyntax-only -verify %s
+
+void __attribute__((target("sse4.2"))) no_default(void);
+void __attribute__((target("arch=sandybridge")))  no_default(void);
+
+void use1(void){
+  // expected-error@+1 {{no matching function for call to 'no_default'}}
+  no_default();
+}
+
+void __attribute__((target("sse4.2"))) has_def(void);
+void __attribute__((target("default")))  has_def(void);
+
+void use2(void){
+  // expected-error@+2 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+  // expected-note@-4 {{possible target for call}}
+  +has_def;
+}
+
+int __attribute__((target("sse4.2"))) no_proto();
+// expected-error@-1 {{multiversioned function must have a prototype}}
+// expected-note@+1 {{function multiversioning caused by this declaration}}
+int __attribute__((target("arch=sandybridge"))) no_proto();
+
+// The following should all be legal, since they are just redeclarations.
+int __attribute__((target("sse4.2"))) redecl1(void);
+int __attribute__((target("sse4.2"))) redecl1(void) { return 1; }
+int __attribute__((target("arch=sandybridge")))  redecl1(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) redecl2(void) { return 1; }
+int __attribute__((target("sse4.2"))) redecl2(void);
+int __attribute__((target("arch=sandybridge")))  redecl2(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) redecl3(void) { return 0; }
+int __attribute__((target("arch=ivybridge"))) redecl3(void) { return 1; }
+int __attribute__((target("arch=sandybridge")))  redecl3(void);
+int __attribute__((target("arch=sandybridge")))  redecl3(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) redecl4(void) { return 1; }
+int __attribute__((target("arch=sandybridge")))  redecl4(void) { return 2; }
+int __attribute__((target("arch=sandybridge")))  redecl4(void);
+
+int __attribute__((target("sse4.2"))) redef(void) { return 1; }
+int __attribute__((target("arch=ivybridge"))) redef(void) { return 1; }
+int __attribute__((target("arch=sandybridge")))  redef(void) { return 2; }
+// expected-error@+2 {{redefinition of 'redef'}}
+// expected-note@-2 {{previous definition is here}}
+int __attribute__((target("arch=sandybridge")))  redef(void) { return 2; }
+
+int __attribute__((target("default"))) redef2(void) { return 1;}
+// expected-error@+2 {{redefinition of 'redef2'}}
+// expected-note@-2 {{previous definition is here}}
+int __attribute__((target("default"))) redef2(void) { return 1;}
+
+int __attribute__((target("sse4.2"))) mv_after_use(void) { return 1; }
+int use3(void) {
+  return mv_after_use();
+}
+
+// expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}}
+int __attribute__((target("arch=sandybridge")))  mv_after_use(void) { return 2; }
+
+int __attribute__((target("sse4.2,arch=sandybridge"))) mangle(void) { return 1; }
+//expected-error@+2 {{multiversioned function redeclarations require identical target attributes}}
+//expected-note@-2 {{previous declaration is here}}
+int __attribute__((target("arch=sandybridge,sse4.2")))  mangle(void) { return 2; }
+
+int prev_no_target(void);
+int __attribute__((target("arch=sandybridge")))  prev_no_target(void) { return 2; }
+// expected-error@-2 {{function declaration is missing 'target' attribute in a multiversioned function}}
+// expected-note@+1 {{function multiversioning caused by this declaration}}
+int __attribute__((target("arch=ivybridge")))  prev_no_target(void) { return 2; }
+
+int __attribute__((target("arch=sandybridge")))  prev_no_target2(void);
+int prev_no_target2(void);
+// expected-error@-1 {{function declaration is missing 'target' attribute in a multiversioned function}}
+// expected-note@+1 {{function multiversioning caused by this declaration}}
+int __attribute__((target("arch=ivybridge")))  prev_no_target2(void);
+
+void __attribute__((target("sse4.2"))) addtl_attrs(void);
+//expected-error@+1 {{attribute 'target' multiversioning cannot be combined}}
+void __attribute__((used,target("arch=sandybridge")))  addtl_attrs(void);
+
+//expected-error@+1 {{attribute 'target' multiversioning cannot be combined}}
+void __attribute__((target("default"), used)) addtl_attrs2(void);
+
+//expected-error@+2 {{attribute 'target' multiversioning cannot be combined}}
+//expected-note@+2 {{function multiversioning caused by this declaration}}
+void __attribute__((used,target("sse4.2"))) addtl_attrs3(void);
+void __attribute__((target("arch=sandybridge")))  addtl_attrs3(void);
+
+void __attribute__((target("sse4.2"))) addtl_attrs4(void);
+void __attribute__((target("arch=sandybridge")))  addtl_attrs4(void);
+//expected-error@+1 {{attribute 'target' multiversioning cannot be combined}}
+void __attribute__((used,target("arch=ivybridge")))  addtl_attrs4(void);
+
+int __attribute__((target("sse4.2"))) diff_cc(void);
+// expected-error@+1 {{multiversioned function declaration has a different calling convention}}
+__vectorcall int __attribute__((target("arch=sandybridge")))  diff_cc(void);
+
+int __attribute__((target("sse4.2"))) diff_ret(void);
+// expected-error@+1 {{multiversioned function declaration has a different return type}}
+short __attribute__((target("arch=sandybridge")))  diff_ret(void);
diff --git a/test/Sema/builtins-x86.c b/test/Sema/builtins-x86.c
index 074efe16ade6..472deeb9210b 100644
--- a/test/Sema/builtins-x86.c
+++ b/test/Sema/builtins-x86.c
@@ -4,12 +4,17 @@ typedef long long __m128i __attribute__((__vector_size__(16)));
 typedef float __m128 __attribute__((__vector_size__(16)));
 typedef double __m128d __attribute__((__vector_size__(16)));
 
+typedef long long __m256i __attribute__((__vector_size__(32)));
+typedef float __m256 __attribute__((__vector_size__(32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+
 typedef long long __m512i __attribute__((__vector_size__(64)));
 typedef float __m512 __attribute__((__vector_size__(64)));
 typedef double __m512d __attribute__((__vector_size__(64)));
 
 typedef unsigned char __mmask8;
 typedef unsigned short __mmask16;
+typedef unsigned int __mmask32;
 
 __m128 test__builtin_ia32_cmpps(__m128 __a, __m128 __b) {
   __builtin_ia32_cmpps(__a, __b, 32); // expected-error {{argument should be a value from 0 to 31}}
@@ -83,3 +88,74 @@ __m512 _mm512_mask_prefetch_i32gather_ps_2(__m512i index, __mmask16 mask, int co
   return __builtin_ia32_gatherpfdps(mask, index, addr, 1, 1); // expected-error {{argument should be a value from 2 to 3}}
 }
 
+__m512i test_mm512_mask_shldi_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshldq512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m512i test_mm512_mask_shldi_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshldd512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m512i test_mm512_mask_shldi_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshldw512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m512i test_mm512_mask_shrdi_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshrdq512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m512i test_mm512_mask_shrdi_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshrdd512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m512i test_mm512_mask_shrdi_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) {
+  return __builtin_ia32_vpshrdw512_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shldi_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshldq256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shldi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshldq128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shldi_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshldd256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shldi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshldd128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shldi_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshldw256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shldi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshldw128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shrdi_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshrdq256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shrdi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshrdq128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shrdi_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshrdd256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shrdi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshrdd128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m256i test_mm256_mask_shrdi_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
+  return __builtin_ia32_vpshrdw256_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
+
+__m128i test_mm128_mask_shrdi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+  return __builtin_ia32_vpshrdw128_mask(__A, __B, 1024, __S, __U); // expected-error {{argument should be a value from 0 to 255}}
+}
diff --git a/test/Sema/compare.c b/test/Sema/compare.c
index 97586a7cc05e..7cd8adab8921 100644
--- a/test/Sema/compare.c
+++ b/test/Sema/compare.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -pedantic -verify -Wsign-compare %s -Wno-unreachable-code
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -pedantic -verify -Wsign-compare -Wtautological-constant-in-range-compare %s -Wno-unreachable-code
 
 int test(char *C) { // nothing here should warn.
   return C != ((void*)0);
diff --git a/test/Sema/constant-builtins-2.c b/test/Sema/constant-builtins-2.c
index a4baecb61558..40cfce1f4a00 100644
--- a/test/Sema/constant-builtins-2.c
+++ b/test/Sema/constant-builtins-2.c
@@ -5,24 +5,35 @@
 double       g0  = __builtin_huge_val();
 float        g1  = __builtin_huge_valf();
 long double  g2  = __builtin_huge_vall();
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g2_2 = __builtin_huge_valf128();
+#endif
 
 double       g3  = __builtin_inf();
 float        g4  = __builtin_inff();
 long double  g5  = __builtin_infl();
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g5_2 = __builtin_inff128();
+#endif
 
 double       g6  = __builtin_nan("");
 float        g7  = __builtin_nanf("");
 long double  g8  = __builtin_nanl("");
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g8_2 = __builtin_nanf128("");
+#endif
 
 // GCC constant folds these too (via native strtol):
 //double       g6_1  = __builtin_nan("1");
 //float        g7_1  = __builtin_nanf("1");
 //long double  g8_1  = __builtin_nanl("1");
 
-// APFloat doesn't have signalling NaN functions.
-//double       g9  = __builtin_nans("");
-//float        g10 = __builtin_nansf("");
-//long double  g11 = __builtin_nansl("");
+double       g9  = __builtin_nans("");
+float        g10 = __builtin_nansf("");
+long double  g11 = __builtin_nansl("");
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g11_2 = __builtin_nansf128("");
+#endif
 
 //int          g12 = __builtin_abs(-12);
 
@@ -32,10 +43,16 @@ double       g13_1 = __builtin_fabs(-__builtin_inf());
 float        g14 = __builtin_fabsf(-12.f);
 // GCC doesn't eat this one.
 //long double  g15 = __builtin_fabsfl(-12.0L);
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g15_2 = __builtin_fabsf128(-12.q);
+#endif
 
 float        g16 = __builtin_copysign(1.0, -1.0);
 double       g17 = __builtin_copysignf(1.0f, -1.0f);
 long double  g18 = __builtin_copysignl(1.0L, -1.0L);
+#if defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
+__float128   g18_2 = __builtin_copysignf128(1.0q, -1.0q);
+#endif
 
 char classify_nan     [__builtin_fpclassify(+1, -1, -1, -1, -1, __builtin_nan(""))];
 char classify_snan    [__builtin_fpclassify(+1, -1, -1, -1, -1, __builtin_nans(""))];
diff --git a/test/Sema/darwin-tls.c b/test/Sema/darwin-tls.c
index 0fcf096d92ba..6279e0b7e412 100644
--- a/test/Sema/darwin-tls.c
+++ b/test/Sema/darwin-tls.c
@@ -1,12 +1,18 @@
 // RUN: not %clang_cc1 -fsyntax-only -triple x86_64-apple-macosx10.6 %s 2>&1 | FileCheck %s --check-prefix NO-TLS
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-macosx10.7 %s 2>&1 | FileCheck %s --check-prefix TLS
+
 // RUN: not %clang_cc1 -fsyntax-only -triple arm64-apple-ios7.1 %s 2>&1 | FileCheck %s --check-prefix NO-TLS
 // RUN: %clang_cc1 -fsyntax-only -triple arm64-apple-ios8.0 %s 2>&1 | FileCheck %s --check-prefix TLS
 // RUN: not %clang_cc1 -fsyntax-only -triple thumbv7s-apple-ios8.3 %s 2>&1 | FileCheck %s --check-prefix NO-TLS
 // RUN: %clang_cc1 -fsyntax-only -triple thumbv7s-apple-ios9.0 %s 2>&1 | FileCheck %s --check-prefix TLS
 // RUN: %clang_cc1 -fsyntax-only -triple armv7-apple-ios9.0 %s 2>&1 | FileCheck %s --check-prefix TLS
+// RUN: not %clang_cc1 -fsyntax-only -triple i386-apple-ios9.0-simulator %s 2>&1 | FileCheck %s --check-prefix NO-TLS
+// RUN: %clang_cc1 -fsyntax-only -triple i386-apple-ios10.0-simulator %s 2>&1 | FileCheck %s --check-prefix TLS
+
 // RUN: not %clang_cc1 -fsyntax-only -triple thumbv7k-apple-watchos1.0 %s 2>&1 | FileCheck %s --check-prefix NO-TLS
 // RUN: %clang_cc1 -fsyntax-only -triple thumbv7k-apple-watchos2.0 %s 2>&1 | FileCheck %s --check-prefix TLS
+// RUN: not %clang_cc1 -fsyntax-only -triple i386-apple-watchos2.0-simulator %s 2>&1 | FileCheck %s --check-prefix NO-TLS
+// RUN: %clang_cc1 -fsyntax-only -triple i386-apple-watchos3.0-simulator %s 2>&1 | FileCheck %s --check-prefix TLS
 
 
 __thread int a;
diff --git a/test/Sema/ext_vector_comparisons.c b/test/Sema/ext_vector_comparisons.c
index 605ba6c55e33..4c632a412f44 100644
--- a/test/Sema/ext_vector_comparisons.c
+++ b/test/Sema/ext_vector_comparisons.c
@@ -6,12 +6,12 @@ static int4 test1() {
   int4 vec, rv;
 
   // comparisons to self...
-  return vec == vec; // expected-warning{{self-comparison always evaluates to a constant}}
-  return vec != vec; // expected-warning{{self-comparison always evaluates to a constant}}
-  return vec < vec; // expected-warning{{self-comparison always evaluates to a constant}}
-  return vec <= vec; // expected-warning{{self-comparison always evaluates to a constant}}
-  return vec > vec; // expected-warning{{self-comparison always evaluates to a constant}}
-  return vec >= vec; // expected-warning{{self-comparison always evaluates to a constant}}
+  return vec == vec; // expected-warning{{self-comparison always evaluates to true}}
+  return vec != vec; // expected-warning{{self-comparison always evaluates to false}}
+  return vec < vec; // expected-warning{{self-comparison always evaluates to false}}
+  return vec <= vec; // expected-warning{{self-comparison always evaluates to true}}
+  return vec > vec; // expected-warning{{self-comparison always evaluates to false}}
+  return vec >= vec; // expected-warning{{self-comparison always evaluates to true}}
 }
 
 
diff --git a/test/Sema/float128-ld-incompatibility.cpp b/test/Sema/float128-ld-incompatibility.cpp
index d993ed7b081c..13f5b6d5d10e 100644
--- a/test/Sema/float128-ld-incompatibility.cpp
+++ b/test/Sema/float128-ld-incompatibility.cpp
@@ -1,10 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 \
 // RUN: -triple powerpc64le-unknown-linux-gnu -target-cpu pwr8 \
 // RUN: -target-feature +float128 %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -Wno-parentheses %s
 
 __float128 qf();
 long double ldf();
 
+#ifdef __PPC__
 // FIXME: once operations between long double and __float128 are implemented for
 //        targets where the types are different, these next two will change
 long double ld{qf()}; // expected-error {{cannot initialize a variable of type 'long double' with an rvalue of type '__float128'}}
@@ -17,6 +19,7 @@ auto test1(__float128 q, long double ld) -> decltype(q + ld) { // expected-error
 auto test2(long double a, __float128 b) -> decltype(a + b) { // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
   return a + b;      // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
 }
+#endif
 
 void test3(bool b) {
   long double ld;
diff --git a/test/Sema/mms-bitfields.c b/test/Sema/mms-bitfields.c
index d238a7a10d0d..cee5b0669d25 100644
--- a/test/Sema/mms-bitfields.c
+++ b/test/Sema/mms-bitfields.c
@@ -11,3 +11,18 @@ struct
 
 // MS pads out bitfields between different types.
 static int arr[(sizeof(t) == 8) ? 1 : -1];
+
+#pragma pack (push,1)
+
+typedef unsigned int UINT32;
+
+struct Inner {
+  UINT32    A    :  1;
+  UINT32    B    :  1;
+  UINT32    C    :  1;
+  UINT32    D    : 30;
+} Inner;
+
+#pragma pack (pop)
+
+static int arr2[(sizeof(Inner) == 8) ? 1 : -1];
diff --git a/test/Sema/self-comparison.c b/test/Sema/self-comparison.c
index edb3a6a4c8a5..bd7adcd9cdaa 100644
--- a/test/Sema/self-comparison.c
+++ b/test/Sema/self-comparison.c
@@ -86,3 +86,8 @@ int R8435950(int i) {
   return 1;
 }
 
+__attribute__((weak)) int weak_1[3];
+__attribute__((weak)) int weak_2[3];
+_Bool compare_weak() {
+  return weak_1 == weak_2;
+}
diff --git a/test/Sema/tautological-constant-compare.c b/test/Sema/tautological-constant-compare.c
index c48aa9944453..b242f35dc6cf 100644
--- a/test/Sema/tautological-constant-compare.c
+++ b/test/Sema/tautological-constant-compare.c
@@ -1,7 +1,13 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -DTEST -verify %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wno-tautological-constant-compare -verify %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -DTEST -verify -x c++ %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wno-tautological-constant-compare -verify -x c++ %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wtautological-constant-in-range-compare -DTEST -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wtautological-constant-in-range-compare -DTEST -verify -x c++ %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wtautological-type-limit-compare -DTEST -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wtautological-type-limit-compare -DTEST -verify -x c++ %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wextra -Wno-sign-compare -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wextra -Wno-sign-compare -verify -x c++ %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wall -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -Wall -verify -x c++ %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -x c++ %s
 
 int value(void);
 
@@ -122,32 +128,6 @@ int main()
   if (32767UL >= s)
       return 0;
 
-  if (s == 0UL)
-      return 0;
-  if (s != 0UL)
-      return 0;
-  if (s < 0UL) // expected-warning {{comparison of unsigned expression < 0 is always false}}
-      return 0;
-  if (s <= 0UL)
-      return 0;
-  if (s > 0UL)
-      return 0;
-  if (s >= 0UL) // expected-warning {{comparison of unsigned expression >= 0 is always true}}
-      return 0;
-
-  if (0UL == s)
-      return 0;
-  if (0UL != s)
-      return 0;
-  if (0UL < s)
-      return 0;
-  if (0UL <= s) // expected-warning {{comparison of 0 <= unsigned expression is always true}}
-      return 0;
-  if (0UL > s) // expected-warning {{comparison of 0 > unsigned expression is always false}}
-      return 0;
-  if (0UL >= s)
-      return 0;
-
   enum { ULONG_MAX = (2UL * (unsigned long)__LONG_MAX__ + 1UL) };
   if (s == 2UL * (unsigned long)__LONG_MAX__ + 1UL)
       return 0;
@@ -498,7 +478,7 @@ int main()
     return 0;
 
 #if __SIZEOF_INT128__
-  __int128 i128;
+  __int128 i128 = value();
   if (i128 == -1) // used to crash
       return 0;
 #endif
@@ -509,7 +489,7 @@ int main()
   no,
   maybe
   };
-  enum E e;
+  enum E e = (enum E)value();
 
   if (e == yes)
       return 0;
diff --git a/test/Sema/tautological-constant-enum-compare.c b/test/Sema/tautological-constant-enum-compare.c
index 539439b817dc..99481c7adb33 100644
--- a/test/Sema/tautological-constant-enum-compare.c
+++ b/test/Sema/tautological-constant-enum-compare.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only -DUNSIGNED -verify %s
-// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fsyntax-only -DSIGNED -verify %s
+// RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only -DUNSIGNED -Wtautological-constant-in-range-compare -verify %s
+// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fsyntax-only -DSIGNED -Wtautological-constant-in-range-compare -verify %s
 // RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only -DUNSIGNED -DSILENCE -Wno-tautological-constant-compare -verify %s
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fsyntax-only -DSIGNED -DSILENCE -Wno-tautological-constant-compare -verify %s
 
diff --git a/test/Sema/tautological-unsigned-enum-zero-compare.c b/test/Sema/tautological-unsigned-enum-zero-compare.c
index a32cfcd8329f..87a56aa40bc3 100644
--- a/test/Sema/tautological-unsigned-enum-zero-compare.c
+++ b/test/Sema/tautological-unsigned-enum-zero-compare.c
@@ -1,9 +1,10 @@
 // RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only \
+// RUN:            -Wtautological-unsigned-enum-zero-compare \
 // RUN:            -verify=unsigned,unsigned-signed %s
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fsyntax-only \
+// RUN:            -Wtautological-unsigned-enum-zero-compare \
 // RUN:            -verify=unsigned-signed %s
 // RUN: %clang_cc1 -triple=x86_64-pc-win32 -fsyntax-only \
-// RUN:            -Wno-tautological-unsigned-enum-zero-compare \
 // RUN:            -verify=silence %s
 
 // Okay, this is where it gets complicated.
diff --git a/test/Sema/tautological-unsigned-enum-zero-compare.cpp b/test/Sema/tautological-unsigned-enum-zero-compare.cpp
index a733b6edfc03..0b66c68197d9 100644
--- a/test/Sema/tautological-unsigned-enum-zero-compare.cpp
+++ b/test/Sema/tautological-unsigned-enum-zero-compare.cpp
@@ -1,9 +1,10 @@
 // RUN: %clang_cc1 -std=c++11 -triple=x86_64-pc-linux-gnu -fsyntax-only \
+// RUN:            -Wtautological-unsigned-enum-zero-compare \
 // RUN:            -verify=unsigned,unsigned-signed %s
 // RUN: %clang_cc1 -std=c++11 -triple=x86_64-pc-win32 -fsyntax-only \
+// RUN:            -Wtautological-unsigned-enum-zero-compare \
 // RUN:            -verify=unsigned-signed %s
 // RUN: %clang_cc1 -std=c++11 -triple=x86_64-pc-win32 -fsyntax-only \
-// RUN:            -Wno-tautological-unsigned-enum-zero-compare \
 // RUN:            -verify=silence %s
 
 // silence-no-diagnostics
diff --git a/test/Sema/tautological-unsigned-zero-compare.c b/test/Sema/tautological-unsigned-zero-compare.c
index b9ea02a731a3..4c9394e213c4 100644
--- a/test/Sema/tautological-unsigned-zero-compare.c
+++ b/test/Sema/tautological-unsigned-zero-compare.c
@@ -1,7 +1,13 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-tautological-unsigned-zero-compare -verify=silence %s
-// RUN: %clang_cc1 -fsyntax-only -verify -x c++ %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-tautological-unsigned-zero-compare -verify=silence -x c++ %s
+// RUN: %clang_cc1 -fsyntax-only \
+// RUN:            -Wtautological-unsigned-zero-compare \
+// RUN:            -verify %s
+// RUN: %clang_cc1 -fsyntax-only \
+// RUN:            -verify=silence %s
+// RUN: %clang_cc1 -fsyntax-only \
+// RUN:            -Wtautological-unsigned-zero-compare \
+// RUN:            -verify -x c++ %s
+// RUN: %clang_cc1 -fsyntax-only \
+// RUN:            -verify=silence -x c++ %s
 
 unsigned uvalue(void);
 signed int svalue(void);
@@ -32,10 +38,39 @@ int main()
   TFunc<unsigned short>();
 #endif
 
+  short s = svalue();
+
   unsigned un = uvalue();
 
   // silence-no-diagnostics
 
+  // Note: both sides are promoted to unsigned long prior to the comparison.
+  if (s == 0UL)
+      return 0;
+  if (s != 0UL)
+      return 0;
+  if (s < 0UL) // expected-warning {{comparison of unsigned expression < 0 is always false}}
+      return 0;
+  if (s <= 0UL)
+      return 0;
+  if (s > 0UL)
+      return 0;
+  if (s >= 0UL) // expected-warning {{comparison of unsigned expression >= 0 is always true}}
+      return 0;
+
+  if (0UL == s)
+      return 0;
+  if (0UL != s)
+      return 0;
+  if (0UL < s)
+      return 0;
+  if (0UL <= s) // expected-warning {{comparison of 0 <= unsigned expression is always true}}
+      return 0;
+  if (0UL > s) // expected-warning {{comparison of 0 > unsigned expression is always false}}
+      return 0;
+  if (0UL >= s)
+      return 0;
+
   if (un == 0)
       return 0;
   if (un != 0)
diff --git a/test/Sema/transparent-union.c b/test/Sema/transparent-union.c
index 048112767480..7967535db8c1 100644
--- a/test/Sema/transparent-union.c
+++ b/test/Sema/transparent-union.c
@@ -43,6 +43,35 @@ void fi(int i) {} // expected-error{{conflicting types}}
 void fvpp(TU); // expected-note{{previous declaration is here}}
 void fvpp(void **v) {} // expected-error{{conflicting types}}
 
+/* Test redeclaring a function taking a transparent_union arg more than twice.
+   Merging different declarations depends on their order, vary order too. */
+
+void f_triple0(TU tu) {}
+void f_triple0(int *); // expected-note{{previous declaration is here}}
+void f_triple0(float *f); // expected-error{{conflicting types}}
+
+void f_triple1(int *);
+void f_triple1(TU tu) {} // expected-note{{previous definition is here}}
+void f_triple1(float *f); // expected-error{{conflicting types}}
+
+void f_triple2(int *); // expected-note{{previous declaration is here}}
+void f_triple2(float *f); // expected-error{{conflicting types}}
+void f_triple2(TU tu) {}
+
+/* Test calling redeclared function taking a transparent_union arg. */
+
+void f_callee(TU);
+void f_callee(int *i) {} // expected-note{{passing argument to parameter 'i' here}}
+
+void caller(void) {
+  TU tu;
+  f_callee(tu); // expected-error{{passing 'TU' to parameter of incompatible type 'int *'}}
+
+  int *i;
+  f_callee(i);
+}
+
+
 /* FIXME: we'd like to just use an "int" here and align it differently
    from the normal "int", but if we do so we lose the alignment
    information from the typedef within the compiler. */
diff --git a/test/Sema/warn-strict-prototypes.c b/test/Sema/warn-strict-prototypes.c
index a28f57d48c84..0c23b3b2c245 100644
--- a/test/Sema/warn-strict-prototypes.c
+++ b/test/Sema/warn-strict-prototypes.c
@@ -65,3 +65,9 @@ void foo11(p, p2) int p; int p2; {}
 void __attribute__((cdecl)) foo12(d) // expected-warning {{this old-style function definition is not preceded by a prototype}}
   short d;
 {}
+
+// No warnings for variadic functions. Overloadable attribute is required
+// to avoid err_ellipsis_first_param error.
+// rdar://problem/33251668
+void foo13(...) __attribute__((overloadable));
+void foo13(...) __attribute__((overloadable)) {}
diff --git a/test/SemaCXX/ast-print-crash.cpp b/test/SemaCXX/ast-print-crash.cpp
new file mode 100644
index 000000000000..c108f666d7e3
--- /dev/null
+++ b/test/SemaCXX/ast-print-crash.cpp
@@ -0,0 +1,12 @@
+// RUN: not %clang_cc1 -triple %ms_abi_triple -ast-print %s -std=gnu++11 \
+// RUN:     | FileCheck %s
+
+// The test compiles a file with a syntax error which used to cause a crash with
+// -ast-print. Compilation fails due to the syntax error, but compiler should
+// not crash and print out whatever it manager to parse.
+
+// CHECK:      struct {
+// CHECK-NEXT: } dont_crash_on_syntax_error;
+// CHECK-NEXT: decltype(nullptr) p;
+struct {
+} dont_crash_on_syntax_error /* missing ; */ decltype(nullptr) p;
diff --git a/test/SemaCXX/attr-target-mv.cpp b/test/SemaCXX/attr-target-mv.cpp
new file mode 100644
index 000000000000..cc2d4f16c3ff
--- /dev/null
+++ b/test/SemaCXX/attr-target-mv.cpp
@@ -0,0 +1,185 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu  -fsyntax-only -verify -fexceptions -fcxx-exceptions %s -std=c++14
+void __attribute__((target("default"))) invalid_features(void);
+//expected-error@+2 {{function multiversioning doesn't support feature 'hello_world'}}
+//expected-warning@+1 {{ignoring unsupported 'hello_world' in the target attribute string}}
+void __attribute__((target("hello_world"))) invalid_features(void);
+//expected-error@+1 {{function multiversioning doesn't support feature 'no-sse4.2'}}
+void __attribute__((target("no-sse4.2"))) invalid_features(void);
+
+void __attribute__((target("sse4.2"))) no_default(void);
+void __attribute__((target("arch=sandybridge")))  no_default(void);
+
+void use1(void){
+  // expected-error@+1 {{no matching function for call to 'no_default'}}
+  no_default();
+}
+constexpr int __attribute__((target("sse4.2"))) foo(void) { return 0; }
+constexpr int __attribute__((target("arch=sandybridge"))) foo(void);
+//expected-error@+1 {{multiversioned function declaration has a different constexpr specification}}
+int __attribute__((target("arch=ivybridge"))) foo(void) {return 1;}
+constexpr int __attribute__((target("default"))) foo(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) foo2(void) { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different constexpr specification}}
+constexpr int __attribute__((target("arch=sandybridge"))) foo2(void);
+int __attribute__((target("arch=ivybridge"))) foo2(void) {return 1;}
+int __attribute__((target("default"))) foo2(void) { return 2; }
+
+static int __attribute__((target("sse4.2"))) bar(void) { return 0; }
+static int __attribute__((target("arch=sandybridge"))) bar(void);
+//expected-error@+1 {{multiversioned function declaration has a different storage class}}
+int __attribute__((target("arch=ivybridge"))) bar(void) {return 1;}
+static int __attribute__((target("default"))) bar(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) bar2(void) { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different storage class}}
+static int __attribute__((target("arch=sandybridge"))) bar2(void);
+int __attribute__((target("arch=ivybridge"))) bar2(void) {return 1;}
+int __attribute__((target("default"))) bar2(void) { return 2; }
+
+
+inline int __attribute__((target("sse4.2"))) baz(void) { return 0; }
+inline int __attribute__((target("arch=sandybridge"))) baz(void);
+//expected-error@+1 {{multiversioned function declaration has a different inline specification}}
+int __attribute__((target("arch=ivybridge"))) baz(void) {return 1;}
+inline int __attribute__((target("default"))) baz(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) baz2(void) { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different inline specification}}
+inline int __attribute__((target("arch=sandybridge"))) baz2(void);
+int __attribute__((target("arch=ivybridge"))) baz2(void) {return 1;}
+int __attribute__((target("default"))) baz2(void) { return 2; }
+
+float __attribute__((target("sse4.2"))) bock(void) { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different return type}}
+int __attribute__((target("arch=sandybridge"))) bock(void);
+//expected-error@+1 {{multiversioned function declaration has a different return type}}
+int __attribute__((target("arch=ivybridge"))) bock(void) {return 1;}
+//expected-error@+1 {{multiversioned function declaration has a different return type}}
+int __attribute__((target("default"))) bock(void) { return 2; }
+
+int __attribute__((target("sse4.2"))) bock2(void) { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different return type}}
+float __attribute__((target("arch=sandybridge"))) bock2(void);
+int __attribute__((target("arch=ivybridge"))) bock2(void) {return 1;}
+int __attribute__((target("default"))) bock2(void) { return 2; }
+
+auto __attribute__((target("sse4.2"))) bock3(void) -> int { return 0; }
+//expected-error@+1 {{multiversioned function declaration has a different return type}}
+auto __attribute__((target("arch=sandybridge"))) bock3(void) -> short { return (short)0;}
+
+int __attribute__((target("sse4.2"))) bock4(void) noexcept(false) { return 0; }
+//expected-error@+2 {{exception specification in declaration does not match previous declaration}}
+//expected-note@-2 {{previous declaration is here}}
+int __attribute__((target("arch=sandybridge"))) bock4(void) noexcept(true) { return 1;}
+
+// FIXME: Add support for templates and virtual functions!
+template<typename T>
+int __attribute__((target("sse4.2"))) foo(T) { return 0; }
+// expected-error@+2 {{multiversioned functions do not yet support function templates}}
+template<typename T>
+int __attribute__((target("arch=sandybridge"))) foo(T);
+
+// expected-error@+2 {{multiversioned functions do not yet support function templates}}
+template<typename T>
+int __attribute__((target("default"))) foo(T) { return 2; }
+
+struct S {
+  template<typename T>
+  int __attribute__((target("sse4.2"))) foo(T) { return 0; }
+  // expected-error@+2 {{multiversioned functions do not yet support function templates}}
+  template<typename T>
+  int __attribute__((target("arch=sandybridge"))) foo(T);
+
+  // expected-error@+2 {{multiversioned functions do not yet support function templates}}
+  template<typename T>
+  int __attribute__((target("default"))) foo(T) { return 2; }
+
+  // expected-error@+1 {{multiversioned functions do not yet support virtual functions}}
+  virtual void __attribute__((target("default"))) virt();
+};
+
+extern "C" {
+int __attribute__((target("sse4.2"))) diff_mangle(void) { return 0; }
+}
+//expected-error@+1 {{multiversioned function declaration has a different linkage}}
+int __attribute__((target("arch=sandybridge"))) diff_mangle(void) { return 0; }
+
+// expected-error@+1 {{multiversioned functions do not yet support deduced return types}}
+auto __attribute__((target("default"))) deduced_return(void) { return 0; }
+// expected-error@-1 {{cannot initialize return object of type 'auto' with an rvalue of type 'int'}}
+
+auto __attribute__((target("default"))) trailing_return(void)-> int { return 0; }
+
+__attribute__((target("default"))) void DiffDecl();
+namespace N {
+using ::DiffDecl;
+// expected-error@+3 {{declaration conflicts with target of using declaration already in scope}}
+// expected-note@-4 {{target of using declaration}}
+// expected-note@-3 {{using declaration}}
+__attribute__((target("arch=sandybridge"))) void DiffDecl();
+} // namespace N
+
+struct SpecialFuncs {
+  // expected-error@+1 {{multiversioned functions do not yet support constructors}}
+  __attribute__((target("default"))) SpecialFuncs();
+  // expected-error@+1 {{multiversioned functions do not yet support destructors}}
+  __attribute__((target("default"))) ~SpecialFuncs();
+
+  // expected-error@+1 {{multiversioned functions do not yet support defaulted functions}}
+  SpecialFuncs& __attribute__((target("default"))) operator=(const SpecialFuncs&) = default;
+  // expected-error@+1 {{multiversioned functions do not yet support deleted functions}}
+  SpecialFuncs& __attribute__((target("default"))) operator=(SpecialFuncs&&) = delete;
+};
+
+class Secret {
+  int i = 0;
+  __attribute__((target("default")))
+  friend int SecretAccessor(Secret &s);
+  __attribute__((target("arch=sandybridge")))
+  friend int SecretAccessor(Secret &s);
+};
+
+__attribute__((target("default")))
+int SecretAccessor(Secret &s) {
+  return s.i;
+}
+
+__attribute__((target("arch=sandybridge")))
+int SecretAccessor(Secret &s) {
+  return s.i + 2;
+}
+
+__attribute__((target("arch=ivybridge")))
+int SecretAccessor(Secret &s) {
+  //expected-error@+2{{'i' is a private member of 'Secret'}}
+  //expected-note@-20{{implicitly declared private here}}
+  return s.i + 3;
+}
+
+constexpr int __attribute__((target("sse4.2"))) constexpr_foo(void) {
+  return 0;
+}
+constexpr int __attribute__((target("arch=sandybridge"))) constexpr_foo(void);
+constexpr int __attribute__((target("arch=ivybridge"))) constexpr_foo(void) {
+  return 1;
+}
+constexpr int __attribute__((target("default"))) constexpr_foo(void) {
+  return 2;
+}
+
+void constexpr_test() {
+  static_assert(foo() == 2, "Should call 'default' in a constexpr context");
+}
+
+struct BadOutOfLine {
+  int __attribute__((target("sse4.2"))) foo(int);
+  int __attribute__((target("default"))) foo(int);
+};
+
+int __attribute__((target("sse4.2"))) BadOutOfLine::foo(int) { return 0; }
+int __attribute__((target("default"))) BadOutOfLine::foo(int) { return 1; }
+// expected-error@+3 {{out-of-line definition of 'foo' does not match any declaration in 'BadOutOfLine'}}
+// expected-note@-3 {{member declaration nearly matches}}
+// expected-note@-3 {{member declaration nearly matches}}
+int __attribute__((target("arch=atom"))) BadOutOfLine::foo(int) { return 1; }
diff --git a/test/SemaCXX/base-class-ambiguity-check.cpp b/test/SemaCXX/base-class-ambiguity-check.cpp
new file mode 100644
index 000000000000..fc1c4c2cea5e
--- /dev/null
+++ b/test/SemaCXX/base-class-ambiguity-check.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template <typename T> class Foo {
+  struct Base : T {};
+
+  // Test that this code no longer causes a crash in Sema. rdar://23291875
+  struct Derived : Base, T {};
+};
diff --git a/test/SemaCXX/compare-cxx2a.cpp b/test/SemaCXX/compare-cxx2a.cpp
index 61d35021cc13..d88e3bae415e 100644
--- a/test/SemaCXX/compare-cxx2a.cpp
+++ b/test/SemaCXX/compare-cxx2a.cpp
@@ -3,6 +3,14 @@
 
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -pedantic -verify -Wsign-compare -std=c++2a %s
 
+void self_compare() {
+  int a;
+  int b[3], c[3];
+  (void)(a <=> a); // expected-warning {{self-comparison always evaluates to 'std::strong_ordering::equal'}}
+  (void)(b <=> b); // expected-warning {{self-comparison always evaluates to 'std::strong_ordering::equal'}}
+  (void)(b <=> c); // expected-warning {{array comparison always evaluates to a constant}}
+}
+
 void test0(long a, unsigned long b) {
   enum EnumA {A};
   enum EnumB {B};
diff --git a/test/SemaCXX/compare.cpp b/test/SemaCXX/compare.cpp
index d852180c0663..ee0fe01fd1d1 100644
--- a/test/SemaCXX/compare.cpp
+++ b/test/SemaCXX/compare.cpp
@@ -1,7 +1,7 @@
 // Force x86-64 because some of our heuristics are actually based
 // on integer sizes.
 
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -pedantic -verify -Wsign-compare -std=c++11 %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fsyntax-only -pedantic -verify -Wsign-compare -Wtautological-constant-in-range-compare -std=c++11 %s
 
 int test0(long a, unsigned long b) {
   enum EnumA {A};
diff --git a/test/SemaCXX/copy-initialization.cpp b/test/SemaCXX/copy-initialization.cpp
index 4f6c65cf550c..cd7e5f07e140 100644
--- a/test/SemaCXX/copy-initialization.cpp
+++ b/test/SemaCXX/copy-initialization.cpp
@@ -26,7 +26,7 @@ struct foo {
 };
 
 // PR3600
-void test(const foo *P) { P->bar(); } // expected-error{{'bar' not viable: 'this' argument has type 'const foo', but function is not marked const}}
+void test(const foo *P) { P->bar(); } // expected-error{{'this' argument to member function 'bar' has type 'const foo', but function is not marked const}}
 
 namespace PR6757 {
   struct Foo {
diff --git a/test/SemaCXX/coroutines.cpp b/test/SemaCXX/coroutines.cpp
index 65e17abb11fe..4a92403f10a5 100644
--- a/test/SemaCXX/coroutines.cpp
+++ b/test/SemaCXX/coroutines.cpp
@@ -1171,4 +1171,73 @@ template CoroMemberTag DepTestType<int>::test_member_template(long, const char *
 
 template CoroMemberTag DepTestType<int>::test_static_template<void>(const char *volatile &, unsigned);
 
+struct bad_promise_deleted_constructor {
+  // expected-note@+1 {{'bad_promise_deleted_constructor' has been explicitly marked deleted here}}
+  bad_promise_deleted_constructor() = delete;
+  coro<bad_promise_deleted_constructor> get_return_object();
+  suspend_always initial_suspend();
+  suspend_always final_suspend();
+  void return_void();
+  void unhandled_exception();
+};
+
+coro<bad_promise_deleted_constructor>
+bad_coroutine_calls_deleted_promise_constructor() {
+  // expected-error@-1 {{call to deleted constructor of 'std::experimental::coroutine_traits<coro<CoroHandleMemberFunctionTest::bad_promise_deleted_constructor>>::promise_type' (aka 'CoroHandleMemberFunctionTest::bad_promise_deleted_constructor')}}
+  co_return;
+}
+
+// Test that, when the promise type has a constructor whose signature matches
+// that of the coroutine function, that constructor is used. If no matching
+// constructor exists, the default constructor is used as a fallback. If no
+// matching constructors exist at all, an error is emitted. This is an
+// experimental feature that will be proposed for the Coroutines TS.
+
+struct good_promise_default_constructor {
+  good_promise_default_constructor(double, float, int);
+  good_promise_default_constructor() = default;
+  coro<good_promise_default_constructor> get_return_object();
+  suspend_always initial_suspend();
+  suspend_always final_suspend();
+  void return_void();
+  void unhandled_exception();
+};
+
+coro<good_promise_default_constructor>
+good_coroutine_calls_default_constructor() {
+  co_return;
+}
+
+struct good_promise_custom_constructor {
+  good_promise_custom_constructor(double, float, int);
+  good_promise_custom_constructor() = delete;
+  coro<good_promise_custom_constructor> get_return_object();
+  suspend_always initial_suspend();
+  suspend_always final_suspend();
+  void return_void();
+  void unhandled_exception();
+};
+
+coro<good_promise_custom_constructor>
+good_coroutine_calls_custom_constructor(double, float, int) {
+  co_return;
+}
+
+struct bad_promise_no_matching_constructor {
+  bad_promise_no_matching_constructor(int, int, int);
+  // expected-note@+1 {{'bad_promise_no_matching_constructor' has been explicitly marked deleted here}}
+  bad_promise_no_matching_constructor() = delete;
+  coro<bad_promise_no_matching_constructor> get_return_object();
+  suspend_always initial_suspend();
+  suspend_always final_suspend();
+  void return_void();
+  void unhandled_exception();
+};
+
+coro<bad_promise_no_matching_constructor>
+bad_coroutine_calls_with_no_matching_constructor(int, int) {
+  // expected-error@-1 {{call to deleted constructor of 'std::experimental::coroutine_traits<coro<CoroHandleMemberFunctionTest::bad_promise_no_matching_constructor>, int, int>::promise_type' (aka 'CoroHandleMemberFunctionTest::bad_promise_no_matching_constructor')}}
+  co_return;
+}
+
 } // namespace CoroHandleMemberFunctionTest
diff --git a/test/SemaCXX/cxx1y-contextual-conversion-tweaks.cpp b/test/SemaCXX/cxx1y-contextual-conversion-tweaks.cpp
index c7e6074a4437..7cb83c6f8876 100644
--- a/test/SemaCXX/cxx1y-contextual-conversion-tweaks.cpp
+++ b/test/SemaCXX/cxx1y-contextual-conversion-tweaks.cpp
@@ -96,8 +96,8 @@ namespace extended_examples {
 //expected-error@81 {{statement requires expression of integer type ('extended_examples::A1' invalid)}}
 //expected-error@85 {{statement requires expression of integer type ('extended_examples::B2' invalid)}}
 #else
-//expected-error@81 {{cannot initialize object parameter of type 'extended_examples::A1' with an expression of type 'extended_examples::A1'}}
-//expected-error@85 {{cannot initialize object parameter of type 'extended_examples::B2' with an expression of type 'extended_examples::B2'}}
+//expected-error@81 {{'this' argument to member function 'operator int' is an lvalue, but function has rvalue ref-qualifier}} expected-note@54 {{'operator int' declared here}}
+//expected-error@85 {{'this' argument to member function 'operator int' is an lvalue, but function has rvalue ref-qualifier}} expected-note@75 {{'operator int' declared here}}
 #endif
 
 namespace extended_examples_cxx1y {
@@ -149,9 +149,9 @@ namespace extended_examples_cxx1y {
 #ifdef CXX1Y
 //expected-error@139 {{statement requires expression of integer type ('extended_examples_cxx1y::A2' invalid)}}
 #else
-//expected-error@138 {{cannot initialize object parameter of type 'extended_examples_cxx1y::A1' with an expression of type 'extended_examples_cxx1y::A1'}}
-//expected-error@139 {{cannot initialize object parameter of type 'extended_examples_cxx1y::A2' with an expression of type 'extended_examples_cxx1y::A2'}}
-//expected-error@143 {{cannot initialize object parameter of type 'extended_examples_cxx1y::D' with an expression of type 'extended_examples_cxx1y::D'}}
+//expected-error@138 {{'this' argument to member function 'operator int' is an lvalue, but function has rvalue ref-qualifier}} expected-note@106 {{'operator int' declared here}}
+//expected-error@139 {{'this' argument to member function 'operator int' is an lvalue, but function has rvalue ref-qualifier}} expected-note@111 {{'operator int' declared here}}
+//expected-error@143 {{'this' argument to member function 'operator int' is an lvalue, but function has rvalue ref-qualifier}} expected-note@131 {{'operator int' declared here}}
 #endif
 
 namespace extended_examples_array_bounds {
diff --git a/test/SemaCXX/cxx1y-generic-lambdas.cpp b/test/SemaCXX/cxx1y-generic-lambdas.cpp
index 9f3c77591a86..49386e7f43d9 100644
--- a/test/SemaCXX/cxx1y-generic-lambdas.cpp
+++ b/test/SemaCXX/cxx1y-generic-lambdas.cpp
@@ -181,7 +181,7 @@ int test() {
     int (*fp2)(int) = [](auto b) -> int {  return b; };
     int (*fp3)(char) = [](auto c) -> int { return c; };
     char (*fp4)(int) = [](auto d) { return d; }; //expected-error{{no viable conversion}}\
-                                                 //expected-note{{candidate template ignored}}
+                                                 //expected-note{{candidate function[with $0 = int]}}
     char (*fp5)(char) = [](auto e) -> int { return e; }; //expected-error{{no viable conversion}}\
                                                  //expected-note{{candidate template ignored}}
 
@@ -207,12 +207,22 @@ int variadic_test() {
 } // end ns
 
 namespace conversion_operator {
-void test() {
-    auto L = [](auto a) -> int { return a; };
+  void test() {
+    auto L = [](auto a) -> int { return a; }; // expected-error {{cannot initialize}}
     int (*fp)(int) = L; 
     int (&fp2)(int) = [](auto a) { return a; };  // expected-error{{non-const lvalue}}
     int (&&fp3)(int) = [](auto a) { return a; };  // expected-error{{no viable conversion}}\
                                                   //expected-note{{candidate}}
+
+    using F = int(int);
+    using G = int(void*);
+    L.operator F*();
+    L.operator G*(); // expected-note-re {{instantiation of function template specialization '{{.*}}::operator()<void *>'}}
+
+    // Here, the conversion function is named 'operator auto (*)(int)', and
+    // there is no way to write that name in valid C++.
+    auto M = [](auto a) -> auto { return a; };
+    M.operator F*(); // expected-error {{no member named 'operator int (*)(int)'}}
   }
 }
 }
@@ -992,4 +1002,4 @@ namespace PR32638 {
  void test() {
     [](auto x) noexcept(noexcept(x)) { } (0);
  }
-}
\ No newline at end of file
+}
diff --git a/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
index 9080f67fe0e1..d21fbf289289 100644
--- a/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
+++ b/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
@@ -309,6 +309,17 @@ namespace dependent {
   template int New(int);
 }
 
+namespace injected_class_name {
+  template<typename T = void> struct A {
+    A();
+    template<typename U> A(A<U>);
+  };
+  A<int> a;
+  A b = a;
+  using T = decltype(a);
+  using T = decltype(b);
+}
+
 #else
 
 // expected-no-diagnostics
diff --git a/test/SemaCXX/cxx2a-pointer-to-const-ref-member.cpp b/test/SemaCXX/cxx2a-pointer-to-const-ref-member.cpp
index e9cbb1a18532..0bc40c817983 100644
--- a/test/SemaCXX/cxx2a-pointer-to-const-ref-member.cpp
+++ b/test/SemaCXX/cxx2a-pointer-to-const-ref-member.cpp
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -std=c++2a %s -verify
 
 struct X {
-  void ref() & {}
+  void ref() & {} // expected-note{{'ref' declared here}}
   void cref() const& {}
 };
 
 void test() {
-  X{}.ref(); // expected-error{{cannot initialize object parameter of type 'X' with an expression of type 'X'}}
+  X{}.ref(); // expected-error{{'this' argument to member function 'ref' is an rvalue, but function has non-const lvalue ref-qualifier}}
   X{}.cref(); // expected-no-error
 
   (X{}.*&X::ref)(); // expected-error-re{{pointer-to-member function type 'void (X::*)() {{.*}}&' can only be called on an lvalue}}
diff --git a/test/SemaCXX/extern-c.cpp b/test/SemaCXX/extern-c.cpp
index fa6c2b1990c0..1a32beec7baa 100644
--- a/test/SemaCXX/extern-c.cpp
+++ b/test/SemaCXX/extern-c.cpp
@@ -242,3 +242,20 @@ namespace tag_hiding {
   ExternCStruct3 *q3 = p3;
   ExternCStruct4 *q4 = p4; // expected-error {{ambiguous}}
 }
+
+namespace PR35697 {
+  typedef struct {} *ReturnStruct;
+  extern "C" ReturnStruct PR35697_f();
+  extern "C" ReturnStruct PR35697_v;
+  ReturnStruct PR35697_f();
+  ReturnStruct PR35697_v;
+
+  namespace {
+    extern "C" ReturnStruct PR35697_f();
+    extern "C" ReturnStruct PR35697_v;
+    void q() {
+      extern ReturnStruct PR35697_f();
+      extern ReturnStruct PR35697_v;
+    }
+  }
+}
diff --git a/test/SemaCXX/lambda-expressions.cpp b/test/SemaCXX/lambda-expressions.cpp
index efbb47681ade..de77467b6d5a 100644
--- a/test/SemaCXX/lambda-expressions.cpp
+++ b/test/SemaCXX/lambda-expressions.cpp
@@ -583,3 +583,28 @@ namespace PR25627_dont_odr_use_local_consts {
     (void) [] { X<N> x; };
   }
 }
+
+namespace ConversionOperatorDoesNotHaveDeducedReturnType {
+  auto x = [](int){};
+  auto y = [](auto) -> void {};
+  using T = decltype(x);
+  using U = decltype(y);
+  using ExpectedTypeT = void (*)(int);
+  template<typename T>
+    using ExpectedTypeU = void (*)(T);
+
+  struct X {
+    friend T::operator ExpectedTypeT() const;
+
+    // Formally, this is invalid, because the return type of the conversion
+    // function for a generic lambda expression is an unspecified decltype
+    // type, which this should not match. However, this declaration is
+    // functionally equivalent to that one, so we're permitted to choose to
+    // accept this.
+    template<typename T>
+      friend U::operator ExpectedTypeU<T>() const;
+  };
+
+  // This used to crash in return type deduction for the conversion opreator.
+  struct A { int n; void f() { +[](decltype(n)) {}; } };
+}
diff --git a/test/SemaCXX/self-comparison.cpp b/test/SemaCXX/self-comparison.cpp
index fb15ec843061..ac129b68a67a 100644
--- a/test/SemaCXX/self-comparison.cpp
+++ b/test/SemaCXX/self-comparison.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++2a
 
 int foo(int x) {
   return x == x; // expected-warning {{self-comparison always evaluates to true}}
@@ -21,3 +21,22 @@ struct A {
     return a == c; // expected-warning {{array comparison always evaluates to false}}
   }
 };
+
+namespace NA { extern "C" int x[3]; }
+namespace NB { extern "C" int x[3]; }
+bool k = NA::x == NB::x; // expected-warning {{self-comparison always evaluates to true}}
+
+template<typename T> struct Y { static inline int n; };
+bool f() {
+  return
+    Y<int>::n == Y<int>::n || // expected-warning {{self-comparison always evaluates to true}}
+    Y<void>::n == Y<int>::n;
+}
+template<typename T, typename U>
+bool g() {
+  // FIXME: Ideally we'd produce a self-comparison warning on the first of these.
+  return
+    Y<T>::n == Y<T>::n ||
+    Y<T>::n == Y<U>::n;
+}
+template bool g<int, int>(); // should not produce any warnings
diff --git a/test/SemaCXX/sourceranges.cpp b/test/SemaCXX/sourceranges.cpp
index 287925f7ffd7..58772a063915 100644
--- a/test/SemaCXX/sourceranges.cpp
+++ b/test/SemaCXX/sourceranges.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple i686-mingw32 -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-mingw32 -std=c++1z -ast-dump %s | FileCheck %s -check-prefix=CHECK-1Z
 
 template<class T>
 class P {
@@ -12,7 +13,7 @@ enum B {};
 typedef int C;
 }
 
-// CHECK: VarDecl {{0x[0-9a-fA-F]+}} <line:16:1, col:36> col:15 ImplicitConstrArray 'foo::A [2]'
+// CHECK: VarDecl {{0x[0-9a-fA-F]+}} <line:[[@LINE+1]]:1, col:36> col:15 ImplicitConstrArray 'foo::A [2]'
 static foo::A ImplicitConstrArray[2];
 
 int main() {
@@ -50,3 +51,89 @@ void construct() {
   D d = D(12);
   // CHECK: CXXConstructExpr {{0x[0-9a-fA-F]+}} <col:9, col:13> 'D' 'void (int){{( __attribute__\(\(thiscall\)\))?}}'
 }
+
+void abort() __attribute__((noreturn));
+
+namespace std {
+typedef decltype(sizeof(int)) size_t;
+
+template <typename E> struct initializer_list {
+  const E *p;
+  size_t n;
+  initializer_list(const E *p, size_t n) : p(p), n(n) {}
+};
+
+template <typename F, typename S> struct pair {
+  F f;
+  S s;
+  pair(const F &f, const S &s) : f(f), s(s) {}
+};
+
+struct string {
+  const char *str;
+  string() { abort(); }
+  string(const char *S) : str(S) {}
+  ~string() { abort(); }
+};
+
+template<typename K, typename V>
+struct map {
+  using T = pair<K, V>;
+  map(initializer_list<T> i, const string &s = string()) {}
+  ~map() { abort(); }
+};
+
+}; // namespace std
+
+#if __cplusplus >= 201703L
+// CHECK-1Z: FunctionDecl {{.*}} construct_with_init_list
+std::map<int, int> construct_with_init_list() {
+  // CHECK-1Z-NEXT: CompoundStmt
+  // CHECK-1Z-NEXT: ReturnStmt {{.*}} <line:[[@LINE+5]]:3, col:35
+  // CHECK-1Z-NEXT: ExprWithCleanups {{.*}} <col:10, col:35
+  // CHECK-1Z-NEXT: CXXBindTemporaryExpr {{.*}} <col:10, col:35
+  // CHECK-1Z-NEXT: CXXTemporaryObjectExpr {{.*}} <col:10, col:35
+  // CHECK-1Z-NEXT: CXXStdInitializerListExpr {{.*}} <col:28, col:35
+  return std::map<int, int>{{0, 0}};
+}
+
+// CHECK-1Z: NamespaceDecl {{.*}} in_class_init
+namespace in_class_init {
+  struct A {};
+
+  // CHECK-1Z: CXXRecordDecl {{.*}} struct B definition
+  struct B {
+    // CHECK-1Z: FieldDecl {{.*}} a 'in_class_init::A'
+    // CHECK-1Z-NEXT: InitListExpr {{.*}} <col:11, col:12
+    A a = {};
+  };
+}
+
+// CHECK-1Z: NamespaceDecl {{.*}} delegating_constructor_init
+namespace delegating_constructor_init {
+  struct A {};
+
+  struct B : A {
+    A a;
+    B(A a) : a(a) {}
+  };
+
+  // CHECK-1Z: CXXRecordDecl {{.*}} struct C definition
+  struct C : B {
+    // CHECK-1Z: CXXConstructorDecl {{.*}} C
+    // CHECK-1Z-NEXT: CXXCtorInitializer 'delegating_constructor_init::B'
+    // CHECK-1Z-NEXT: CXXConstructExpr {{.*}} <col:11, col:15
+    // CHECK-1Z-NEXT: InitListExpr {{.*}} <col:13, col:14
+    C() : B({}) {};
+  };
+}
+
+// CHECK-1Z: NamespaceDecl {{.*}} new_init
+namespace new_init {
+  void A() {
+    // CHECK-1Z: CXXNewExpr {{.*}} <line:[[@LINE+2]]:5, col:14
+    // CHECK-1Z-NEXT: InitListExpr {{.*}} <col:12, col:14
+    new int{0};
+  }
+}
+#endif
diff --git a/test/SemaCXX/type-traits.cpp b/test/SemaCXX/type-traits.cpp
index b334e507554f..8cf688014c45 100644
--- a/test/SemaCXX/type-traits.cpp
+++ b/test/SemaCXX/type-traits.cpp
@@ -2225,6 +2225,7 @@ void constructible_checks() {
 
   // PR25513
   { int arr[F(__is_constructible(int(int)))]; }
+  { int arr[T(__is_constructible(int const &, long))]; }
 
   { int arr[T(__is_constructible(ACompleteType))]; }
   { int arr[T(__is_nothrow_constructible(ACompleteType))]; }
@@ -2275,6 +2276,47 @@ void is_trivially_constructible_test() {
   { int arr[F(__is_trivially_constructible(const volatile void))]; }
 }
 
+template <class T, class RefType = T &>
+struct ConvertsToRef {
+  operator RefType() const { return static_cast<RefType>(obj); }
+  mutable T obj = 42;
+};
+
+void reference_binds_to_temporary_checks() {
+  { int arr[F((__reference_binds_to_temporary(int &, int &)))]; }
+  { int arr[F((__reference_binds_to_temporary(int &, int &&)))]; }
+
+  { int arr[F((__reference_binds_to_temporary(int const &, int &)))]; }
+  { int arr[F((__reference_binds_to_temporary(int const &, int const &)))]; }
+  { int arr[F((__reference_binds_to_temporary(int const &, int &&)))]; }
+
+  { int arr[F((__reference_binds_to_temporary(int &, long &)))]; } // doesn't construct
+  { int arr[T((__reference_binds_to_temporary(int const &, long &)))]; }
+  { int arr[T((__reference_binds_to_temporary(int const &, long &&)))]; }
+  { int arr[T((__reference_binds_to_temporary(int &&, long &)))]; }
+
+  using LRef = ConvertsToRef<int, int &>;
+  using RRef = ConvertsToRef<int, int &&>;
+  using CLRef = ConvertsToRef<int, const int &>;
+  using LongRef = ConvertsToRef<long, long &>;
+  { int arr[T((__is_constructible(int &, LRef)))]; }
+  { int arr[F((__reference_binds_to_temporary(int &, LRef)))]; }
+
+  { int arr[T((__is_constructible(int &&, RRef)))]; }
+  { int arr[F((__reference_binds_to_temporary(int &&, RRef)))]; }
+
+  { int arr[T((__is_constructible(int const &, CLRef)))]; }
+  { int arr[F((__reference_binds_to_temporary(int &&, CLRef)))]; }
+
+  { int arr[T((__is_constructible(int const &, LongRef)))]; }
+  { int arr[T((__reference_binds_to_temporary(int const &, LongRef)))]; }
+
+  // Test that it doesn't accept non-reference types as input.
+  { int arr[F((__reference_binds_to_temporary(int, long)))]; }
+
+  { int arr[T((__reference_binds_to_temporary(const int &, long)))]; }
+}
+
 void array_rank() {
   int t01[T(__array_rank(IntAr) == 1)];
   int t02[T(__array_rank(ConstIntArAr) == 2)];
diff --git a/test/SemaCXX/warn-missing-variable-declarations.cpp b/test/SemaCXX/warn-missing-variable-declarations.cpp
index 5b882845f3c6..5b71f38c811f 100644
--- a/test/SemaCXX/warn-missing-variable-declarations.cpp
+++ b/test/SemaCXX/warn-missing-variable-declarations.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang -Wmissing-variable-declarations -fsyntax-only -Xclang -verify %s
+// RUN: %clang -Wmissing-variable-declarations -fsyntax-only -Xclang -verify -std=c++17 %s
 
 // Variable declarations that should trigger a warning.
 int vbad1; // expected-warning{{no previous extern declaration for non-static variable 'vbad1'}}
@@ -52,3 +52,24 @@ class C {
 namespace {
   int vgood4;
 }
+
+inline int inline_var = 0;
+const int const_var = 0;
+constexpr int constexpr_var = 0;
+inline constexpr int inline_constexpr_var = 0;
+extern const int extern_const_var = 0; // expected-warning {{no previous extern declaration}}
+extern constexpr int extern_constexpr_var = 0; // expected-warning {{no previous extern declaration}}
+
+template<typename> int var_template = 0;
+template<typename> constexpr int const_var_template = 0;
+template<typename> static int static_var_template = 0;
+
+template int var_template<int[1]>;
+int use_var_template() { return var_template<int[2]>; }
+template int var_template<int[3]>;
+extern template int var_template<int[4]>;
+template<> int var_template<int[5]>; // expected-warning {{no previous extern declaration}}
+
+// FIXME: We give this specialization internal linkage rather than inheriting
+// the linkage from the template! We should not warn here.
+template<> int static_var_template<int[5]>; // expected-warning {{no previous extern declaration}}
diff --git a/test/SemaCXX/warn-thread-safety-analysis.cpp b/test/SemaCXX/warn-thread-safety-analysis.cpp
index fc1ea5e9e996..829e2e491627 100644
--- a/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wthread-safety -Wthread-safety-beta -Wno-thread-safety-negative -fcxx-exceptions -DUSE_ASSERT_CAPABILITY=0 %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wthread-safety -Wthread-safety-beta -Wno-thread-safety-negative -fcxx-exceptions -DUSE_ASSERT_CAPABILITY=1 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++17 -Wthread-safety -Wthread-safety-beta -Wno-thread-safety-negative -fcxx-exceptions -DUSE_ASSERT_CAPABILITY=0 %s
 
 // FIXME: should also run  %clang_cc1 -fsyntax-only -verify -Wthread-safety -std=c++11 -Wc++98-compat %s
 // FIXME: should also run  %clang_cc1 -fsyntax-only -verify -Wthread-safety %s
@@ -5248,3 +5249,28 @@ struct C {
 C c;
 void f() { c[A()]->g(); }
 } // namespace PR34800
+
+namespace ReturnScopedLockable {
+  template<typename Object> class SCOPED_LOCKABLE ReadLockedPtr {
+  public:
+    ReadLockedPtr(Object *ptr) SHARED_LOCK_FUNCTION((*this)->mutex);
+    ReadLockedPtr(ReadLockedPtr &&) SHARED_LOCK_FUNCTION((*this)->mutex);
+    ~ReadLockedPtr() UNLOCK_FUNCTION();
+
+    Object *operator->() const { return object; }
+
+  private:
+    Object *object;
+  };
+
+  struct Object {
+    int f() SHARED_LOCKS_REQUIRED(mutex);
+    Mutex mutex;
+  };
+
+  ReadLockedPtr<Object> get();
+  int use() {
+    auto ptr = get();
+    return ptr->f();
+  }
+}
diff --git a/test/SemaCXX/warn-throw-out-noexcept-func.cpp b/test/SemaCXX/warn-throw-out-noexcept-func.cpp
index a6c23ddc6c8f..4ecc3a94ebdc 100644
--- a/test/SemaCXX/warn-throw-out-noexcept-func.cpp
+++ b/test/SemaCXX/warn-throw-out-noexcept-func.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s  -fdelayed-template-parsing -fcxx-exceptions -fsyntax-only -Wexceptions -verify -fdeclspec -std=c++11
+// RUN: %clang_cc1 %s  -fdelayed-template-parsing -fcxx-exceptions -fsyntax-only -Wexceptions -verify -fdeclspec -std=c++17
 struct A_ShouldDiag {
   ~A_ShouldDiag(); // implicitly noexcept(true)
 };
@@ -325,3 +325,84 @@ int main() {
   ShouldDiagnose obj;
   ShouldNotDiagnose obj1;
 }
+
+namespace ExceptionInNamespace {
+  namespace N {
+    struct E {};
+  }
+  void run() throw() {
+    try {
+      throw N::E();
+    } catch (const N::E &e) {
+    }
+  }
+}
+
+namespace HandlerSpecialCases {
+  struct A {};
+  using CA = const A;
+
+  struct B : A {};
+  using CB = const B;
+
+  struct AmbigBase {};
+  struct AmbigMiddle : AmbigBase {};
+  struct AmbigDerived : AmbigBase, AmbigMiddle {}; // expected-warning {{inaccessible}}
+
+  struct PrivateBase {};
+  struct PrivateDerived : private PrivateBase { friend void bad3() throw(); };
+
+  void good() throw() {
+    try { throw CA(); } catch (volatile A&) {}
+    try { throw B(); } catch (A&) {}
+    try { throw B(); } catch (const volatile A&) {}
+    try { throw CB(); } catch (A&) {}
+    try { throw (int*)0; } catch (void* const volatile) {}
+    try { throw (int*)0; } catch (void* const &) {}
+    try { throw (B*)0; } catch (A*) {}
+    try { throw (B*)0; } catch (A* const &) {}
+    try { throw (void(*)() noexcept)0; } catch (void (*)()) {}
+    try { throw (void(*)() noexcept)0; } catch (void (*const &)()) {}
+    try { throw (int**)0; } catch (const int * const*) {}
+    try { throw (int**)0; } catch (const int * const* const&) {}
+    try { throw nullptr; } catch (int*) {}
+    try { throw nullptr; } catch (int* const&) {}
+  }
+
+  void bad1() throw() { // expected-note {{here}}
+    try { throw A(); } catch (const B&) {} // expected-warning {{still throw}}
+  }
+  void bad2() throw() { // expected-note {{here}}
+    try { throw AmbigDerived(); } catch (const AmbigBase&) {} // expected-warning {{still throw}}
+  }
+  void bad3() throw() { // expected-note {{here}}
+    try { throw PrivateDerived(); } catch (const PrivateBase&) {} // expected-warning {{still throw}}
+  }
+  void bad4() throw() { // expected-note {{here}}
+    try { throw (int*)0; } catch (void* &) {} // expected-warning {{still throw}}
+  }
+  void bad5() throw() { // expected-note {{here}}
+    try { throw (int*)0; } catch (void* const volatile &) {} // expected-warning {{still throw}}
+  }
+  void bad6() throw() { // expected-note {{here}}
+    try { throw (int* volatile)0; } catch (void* const volatile &) {} // expected-warning {{still throw}}
+  }
+  void bad7() throw() { // expected-note {{here}}
+    try { throw (AmbigDerived*)0; } catch (AmbigBase*) {} // expected-warning {{still throw}}
+  }
+  void bad8() throw() { // expected-note {{here}}
+    try { throw (PrivateDerived*)0; } catch (PrivateBase*) {} // expected-warning {{still throw}}
+  }
+  void bad9() throw() { // expected-note {{here}}
+    try { throw (B*)0; } catch (A* &) {} // expected-warning {{still throw}}
+  }
+  void bad10() throw() { // expected-note {{here}}
+    try { throw (void(*)())0; } catch (void (*)() noexcept) {} // expected-warning {{still throw}}
+  }
+  void bad11() throw() { // expected-note {{here}}
+    try { throw (int**)0; } catch (const int **) {} // expected-warning {{still throw}}
+  }
+  void bad12() throw() { // expected-note {{here}}
+    try { throw nullptr; } catch (int) {} // expected-warning {{still throw}}
+  }
+}
diff --git a/test/SemaObjC/Inputs/module.map b/test/SemaObjC/Inputs/module.map
new file mode 100644
index 000000000000..492d4c368eb8
--- /dev/null
+++ b/test/SemaObjC/Inputs/module.map
@@ -0,0 +1,3 @@
+module empty {
+  header "empty.h"
+}
diff --git a/test/SemaObjC/ns-consumed-error-not-warning.m b/test/SemaObjC/ns-consumed-error-not-warning.m
new file mode 100644
index 000000000000..f44cc8f0f60b
--- /dev/null
+++ b/test/SemaObjC/ns-consumed-error-not-warning.m
@@ -0,0 +1,13 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -fobjc-arc -verify -fblocks -triple x86_64-apple-darwin10.0.0 -DOBJCARC %s
+// rdar://36265651
+
+@interface A
+-(void) m:(id)p; // expected-note {{parameter declared here}}
+@end
+
+@interface B : A
+-(void) m:(__attribute__((ns_consumed)) id)p; // expected-error {{overriding method has mismatched ns_consumed attribute on its parameter}}
+@end
+
+@import empty;
diff --git a/test/SemaOpenCL/extension-version.cl b/test/SemaOpenCL/extension-version.cl
index e56027604651..714e4c210861 100644
--- a/test/SemaOpenCL/extension-version.cl
+++ b/test/SemaOpenCL/extension-version.cl
@@ -131,6 +131,15 @@
 #endif
 #pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
 
+#if (__OPENCL_C_VERSION__ >= 110)
+#ifndef cles_khr_int64
+#error "Missing cles_khr_int64 define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cles_khr_int64' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cles_khr_int64: enable
+
 #if (__OPENCL_C_VERSION__ >= 120)
 #ifndef cl_khr_context_abort
 #error "Missing cl_context_abort define"
diff --git a/test/SemaTemplate/alignas.cpp b/test/SemaTemplate/alignas.cpp
index 8a1f96e5bdec..680f07b32998 100644
--- a/test/SemaTemplate/alignas.cpp
+++ b/test/SemaTemplate/alignas.cpp
@@ -21,3 +21,14 @@ struct C { char a[16]; };
 
 static_assert(sizeof(my_union<A, B, C>) == 16, "");
 static_assert(alignof(my_union<A, B, C>) == 8, "");
+
+namespace PR35028 {
+  template<class X, int Alignment> struct alignas(X) alignas(long long) alignas(long double) alignas(Alignment) Aligned {
+    union {
+      long long align1;
+      long double align2;
+      char data[sizeof(X)];
+    };
+  };
+  Aligned<int, 1> a;
+}
diff --git a/test/SemaTemplate/attributes.cpp b/test/SemaTemplate/attributes.cpp
index 1d46058b0191..7634b937c906 100644
--- a/test/SemaTemplate/attributes.cpp
+++ b/test/SemaTemplate/attributes.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=gnu++11 -fsyntax-only -verify %s
+// RUN: not %clang_cc1 -std=gnu++11 -ast-dump %s | FileCheck %s
 
 namespace attribute_aligned {
   template<int N>
@@ -52,3 +53,13 @@ namespace PR9049 {
   template<typename T>
   inline void WBCFRelease(__attribute__((cf_consumed)) T aValue) { if(aValue) CFRelease(aValue); }
 }
+
+// CHECK: FunctionTemplateDecl {{.*}} HasAnnotations
+// CHECK:   AnnotateAttr {{.*}} "ANNOTATE_BAR"
+// CHECK:   AnnotateAttr {{.*}} "ANNOTATE_FOO"
+// CHECK: FunctionDecl {{.*}} HasAnnotations
+// CHECK:   TemplateArgument type 'int'
+// CHECK:   AnnotateAttr {{.*}} "ANNOTATE_BAR"
+// CHECK:   AnnotateAttr {{.*}} "ANNOTATE_FOO"
+template<typename T> [[clang::annotate("ANNOTATE_FOO"), clang::annotate("ANNOTATE_BAR")]] void HasAnnotations();
+void UseAnnotations() { HasAnnotations<int>(); }
diff --git a/test/SemaTemplate/cxx17-inline-variables.cpp b/test/SemaTemplate/cxx17-inline-variables.cpp
index 9e6761ee57aa..7fc0aa8eeeb0 100644
--- a/test/SemaTemplate/cxx17-inline-variables.cpp
+++ b/test/SemaTemplate/cxx17-inline-variables.cpp
@@ -16,3 +16,14 @@ namespace CompleteType {
 
   constexpr int n = X<true>::value;
 }
+
+template <typename T> struct A {
+  static const int n;
+  static const int m;
+  constexpr int f() { return n; }
+  constexpr int g() { return n; }
+};
+template <typename T> constexpr int A<T>::n = sizeof(A) + sizeof(T);
+template <typename T> inline constexpr int A<T>::m = sizeof(A) + sizeof(T);
+static_assert(A<int>().f() == 5);
+static_assert(A<int>().g() == 5);
diff --git a/test/SemaTemplate/undefined-template.cpp b/test/SemaTemplate/undefined-template.cpp
index 7dfe2fde94b0..52530e2e3909 100644
--- a/test/SemaTemplate/undefined-template.cpp
+++ b/test/SemaTemplate/undefined-template.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 -Wundefined-func-template %s
 
+#if !defined(INCLUDE)
 template <class T> struct C1 {
   static char s_var_1;       // expected-note{{forward declaration of template entity is here}}
   static char s_var_2;       // expected-note{{forward declaration of template entity is here}}
@@ -142,6 +143,16 @@ namespace test_24 {
   void h(X<int> x) { g(x); } // no warning for use of 'g' despite the declaration having been instantiated from a template
 }
 
+#define INCLUDE
+#include "undefined-template.cpp"
+void func_25(SystemHeader<char> *x) {
+  x->meth();
+}
+
 int main() {
   return 0;
 }
+#else
+#pragma clang system_header
+template <typename T> struct SystemHeader { T meth(); };
+#endif
diff --git a/test/SemaTemplate/warn-thread-safety-analysis.cpp b/test/SemaTemplate/warn-thread-safety-analysis.cpp
new file mode 100644
index 000000000000..03bae7a29258
--- /dev/null
+++ b/test/SemaTemplate/warn-thread-safety-analysis.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -std=c++11 %s -verify -Wthread-safety-analysis
+
+class Mutex {
+public:
+  void Lock() __attribute__((exclusive_lock_function()));
+  void Unlock() __attribute__((unlock_function()));
+};
+
+class A {
+public:
+  Mutex mu1, mu2;
+
+  void foo() __attribute__((exclusive_locks_required(mu1))) __attribute__((exclusive_locks_required(mu2))) {}
+
+  template <class T> void bar() __attribute__((exclusive_locks_required(mu1))) __attribute__((exclusive_locks_required(mu2))) {
+    foo();
+  }
+};
+
+void f() {
+  A a;
+  a.mu1.Lock();
+  a.mu2.Lock();
+  a.bar<int>();
+  a.mu2.Unlock();
+  a.bar<int>(); // expected-warning {{calling function 'bar' requires holding mutex 'a.mu2' exclusively}}
+  a.mu1.Unlock();
+  a.bar<int>(); // expected-warning {{calling function 'bar' requires holding mutex 'a.mu1' exclusively}} \
+                   expected-warning {{calling function 'bar' requires holding mutex 'a.mu2' exclusively}}
+}
diff --git a/test/VFS/real-path-found-first.m b/test/VFS/real-path-found-first.m
index 5838aa36c452..cc5a0024ea80 100644
--- a/test/VFS/real-path-found-first.m
+++ b/test/VFS/real-path-found-first.m
@@ -7,7 +7,7 @@
 // REQUIRES: shell
 // RUN: rm -rf %t %t-cache %t.pch
 // RUN: mkdir -p %t/SomeFramework.framework/Modules
-// RUN: cp %S/Inputs/some_frame_module.map %t/SomeFramework.framework/Modules/module.modulemap
+// RUN: cat %S/Inputs/some_frame_module.map > %t/SomeFramework.framework/Modules/module.modulemap
 // RUN: sed -e "s:INPUT_DIR:%S/Inputs:g" -e "s:OUT_DIR:%t:g" %S/Inputs/vfsoverlay.yaml > %t.yaml
 
 // Build
diff --git a/test/VFS/umbrella-framework-import-skipnonexist.m b/test/VFS/umbrella-framework-import-skipnonexist.m
index 5c7cd6d9db13..129f475fe774 100644
--- a/test/VFS/umbrella-framework-import-skipnonexist.m
+++ b/test/VFS/umbrella-framework-import-skipnonexist.m
@@ -5,7 +5,7 @@
 
 // RUN: rm -rf %t
 // RUN: mkdir -p %t/vdir %t/outdir %t/cache
-// RUN: cp -a %S/Inputs/Bar.framework %t/outdir/
+// RUN: cp -R %S/Inputs/Bar.framework %t/outdir/
 //
 // RUN: sed -e "s:VDIR:%t/vdir:g" -e "s:OUT_DIR:%t/outdir:g" %S/Inputs/bar-headers.yaml > %t/vdir/bar-headers.yaml
 // RUN: rm -f %t/outdir/Bar.framework/Headers/B.h
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index 99f05669b64c..61e42b26ee41 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -86,6 +86,69 @@ static unsigned getDefaultParsingOptions() {
   return options;
 }
 
+static void ModifyPrintingPolicyAccordingToEnv(CXPrintingPolicy Policy) {
+  struct Mapping {
+    const char *name;
+    enum CXPrintingPolicyProperty property;
+  };
+  struct Mapping mappings[] = {
+      {"CINDEXTEST_PRINTINGPOLICY_INDENTATION", CXPrintingPolicy_Indentation},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSSPECIFIERS",
+       CXPrintingPolicy_SuppressSpecifiers},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSTAGKEYWORD",
+       CXPrintingPolicy_SuppressTagKeyword},
+      {"CINDEXTEST_PRINTINGPOLICY_INCLUDETAGDEFINITION",
+       CXPrintingPolicy_IncludeTagDefinition},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSSCOPE",
+       CXPrintingPolicy_SuppressScope},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSUNWRITTENSCOPE",
+       CXPrintingPolicy_SuppressUnwrittenScope},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSINITIALIZERS",
+       CXPrintingPolicy_SuppressInitializers},
+      {"CINDEXTEST_PRINTINGPOLICY_CONSTANTARRAYSIZEASWRITTEN",
+       CXPrintingPolicy_ConstantArraySizeAsWritten},
+      {"CINDEXTEST_PRINTINGPOLICY_ANONYMOUSTAGLOCATIONS",
+       CXPrintingPolicy_AnonymousTagLocations},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSSTRONGLIFETIME",
+       CXPrintingPolicy_SuppressStrongLifetime},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSLIFETIMEQUALIFIERS",
+       CXPrintingPolicy_SuppressLifetimeQualifiers},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSTEMPLATEARGSINCXXCONSTRUCTORS",
+       CXPrintingPolicy_SuppressTemplateArgsInCXXConstructors},
+      {"CINDEXTEST_PRINTINGPOLICY_BOOL", CXPrintingPolicy_Bool},
+      {"CINDEXTEST_PRINTINGPOLICY_RESTRICT", CXPrintingPolicy_Restrict},
+      {"CINDEXTEST_PRINTINGPOLICY_ALIGNOF", CXPrintingPolicy_Alignof},
+      {"CINDEXTEST_PRINTINGPOLICY_UNDERSCOREALIGNOF",
+       CXPrintingPolicy_UnderscoreAlignof},
+      {"CINDEXTEST_PRINTINGPOLICY_USEVOIDFORZEROPARAMS",
+       CXPrintingPolicy_UseVoidForZeroParams},
+      {"CINDEXTEST_PRINTINGPOLICY_TERSEOUTPUT", CXPrintingPolicy_TerseOutput},
+      {"CINDEXTEST_PRINTINGPOLICY_POLISHFORDECLARATION",
+       CXPrintingPolicy_PolishForDeclaration},
+      {"CINDEXTEST_PRINTINGPOLICY_HALF", CXPrintingPolicy_Half},
+      {"CINDEXTEST_PRINTINGPOLICY_MSWCHAR", CXPrintingPolicy_MSWChar},
+      {"CINDEXTEST_PRINTINGPOLICY_INCLUDENEWLINES",
+       CXPrintingPolicy_IncludeNewlines},
+      {"CINDEXTEST_PRINTINGPOLICY_MSVCFORMATTING",
+       CXPrintingPolicy_MSVCFormatting},
+      {"CINDEXTEST_PRINTINGPOLICY_CONSTANTSASWRITTEN",
+       CXPrintingPolicy_ConstantsAsWritten},
+      {"CINDEXTEST_PRINTINGPOLICY_SUPPRESSIMPLICITBASE",
+       CXPrintingPolicy_SuppressImplicitBase},
+      {"CINDEXTEST_PRINTINGPOLICY_FULLYQUALIFIEDNAME",
+       CXPrintingPolicy_FullyQualifiedName},
+  };
+
+  unsigned i;
+  for (i = 0; i < sizeof(mappings) / sizeof(struct Mapping); i++) {
+    char *value = getenv(mappings[i].name);
+    if (value) {
+      clang_PrintingPolicy_setProperty(Policy, mappings[i].property,
+                                       (unsigned)strtoul(value, 0L, 10));
+    }
+  }
+}
+
 /** \brief Returns 0 in case of success, non-zero in case of a failure. */
 static int checkForErrors(CXTranslationUnit TU);
 
@@ -356,7 +419,11 @@ static void PrintRange(CXSourceRange R, const char *str) {
   PrintExtent(stdout, begin_line, begin_column, end_line, end_column);
 }
 
-int want_display_name = 0;
+static enum DisplayType {
+    DisplayType_Spelling,
+    DisplayType_DisplayName,
+    DisplayType_Pretty
+} wanted_display_type = DisplayType_Spelling;
 
 static void printVersion(const char *Prefix, CXVersion Version) {
   if (Version.Major < 0)
@@ -656,6 +723,28 @@ static int lineCol_cmp(const void *p1, const void *p2) {
   return (int)lhs->col - (int)rhs->col;
 }
 
+static CXString CursorToText(CXCursor Cursor) {
+  CXString text;
+  switch (wanted_display_type) {
+  case DisplayType_Spelling:
+    return clang_getCursorSpelling(Cursor);
+  case DisplayType_DisplayName:
+    return clang_getCursorDisplayName(Cursor);
+  case DisplayType_Pretty: {
+    CXPrintingPolicy Policy = clang_getCursorPrintingPolicy(Cursor);
+    ModifyPrintingPolicyAccordingToEnv(Policy);
+    text = clang_getCursorPrettyPrinted(Cursor, Policy);
+    clang_PrintingPolicy_dispose(Policy);
+    return text;
+  }
+  }
+  assert(0 && "unknown display type"); /* no llvm_unreachable in C. */
+  /* Set to NULL to prevent uninitialized variable warnings. */
+  text.data = NULL;
+  text.private_flags = 0;
+  return text;
+}
+
 static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) {
   CXTranslationUnit TU = clang_Cursor_getTranslationUnit(Cursor);
   if (clang_isInvalid(Cursor.kind)) {
@@ -682,8 +771,7 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) {
     int I;
 
     ks = clang_getCursorKindSpelling(Cursor.kind);
-    string = want_display_name? clang_getCursorDisplayName(Cursor) 
-                              : clang_getCursorSpelling(Cursor);
+    string = CursorToText(Cursor);
     printf("%s=%s", clang_getCString(ks),
                     clang_getCString(string));
     clang_disposeString(ks);
@@ -812,6 +900,8 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) {
       printf(" (variadic)");
     if (clang_Cursor_isObjCOptional(Cursor))
       printf(" (@optional)");
+    if (clang_isInvalidDeclaration(Cursor))
+      printf(" (invalid)");
 
     switch (clang_getCursorExceptionSpecificationType(Cursor))
     {
@@ -1038,6 +1128,13 @@ static const char* GetCursorSource(CXCursor Cursor) {
   }
 }
 
+static CXString createCXString(const char *CS) {
+  CXString Str;
+  Str.data = (const void *) CS;
+  Str.private_flags = 0;
+  return Str;
+}
+
 /******************************************************************************/
 /* Callbacks.                                                                 */
 /******************************************************************************/
@@ -1684,7 +1781,12 @@ static int perform_test_load(CXIndex Idx, CXTranslationUnit TU,
     else if (!strcmp(filter, "all-display") || 
              !strcmp(filter, "local-display")) {
       ck = NULL;
-      want_display_name = 1;
+      wanted_display_type = DisplayType_DisplayName;
+    }
+    else if (!strcmp(filter, "all-pretty") ||
+             !strcmp(filter, "local-pretty")) {
+      ck = NULL;
+      wanted_display_type = DisplayType_Pretty;
     }
     else if (!strcmp(filter, "none")) K = (enum CXCursorKind) ~0;
     else if (!strcmp(filter, "category")) K = CXCursor_ObjCCategoryDecl;
@@ -1752,8 +1854,11 @@ int perform_test_load_source(int argc, const char **argv,
   const char *InvocationPath;
 
   Idx = clang_createIndex(/* excludeDeclsFromPCH */
-                          (!strcmp(filter, "local") || 
-                           !strcmp(filter, "local-display"))? 1 : 0,
+                          (!strcmp(filter, "local") ||
+                           !strcmp(filter, "local-display") ||
+                           !strcmp(filter, "local-pretty"))
+                              ? 1
+                              : 0,
                           /* displayDiagnostics=*/1);
   InvocationPath = getenv("CINDEXTEST_INVOCATION_EMISSION_PATH");
   if (InvocationPath)
@@ -2324,6 +2429,8 @@ int perform_code_completion(int argc, const char **argv, int timing_only) {
     completionOptions |= CXCodeComplete_IncludeCodePatterns;
   if (getenv("CINDEXTEST_COMPLETION_BRIEF_COMMENTS"))
     completionOptions |= CXCodeComplete_IncludeBriefComments;
+  if (getenv("CINDEXTEST_COMPLETION_SKIP_PREAMBLE"))
+    completionOptions |= CXCodeComplete_SkipPreamble;
   
   if (timing_only)
     input += strlen("-code-completion-timing=");
@@ -2990,7 +3097,7 @@ typedef struct {
   int first_check_printed;
   int fail_for_error;
   int abort;
-  const char *main_filename;
+  CXString main_filename;
   ImportedASTFilesData *importedASTs;
   IndexDataStringList *strings;
   CXTranslationUnit TU;
@@ -3029,6 +3136,7 @@ static void printCXIndexLoc(CXIdxLoc loc, CXClientData client_data) {
   const char *cname;
   CXIdxClientFile file;
   unsigned line, column;
+  const char *main_filename;
   int isMainFile;
   
   index_data = (IndexData *)client_data;
@@ -3043,7 +3151,8 @@ static void printCXIndexLoc(CXIdxLoc loc, CXClientData client_data) {
   }
   filename = clang_getFileName((CXFile)file);
   cname = clang_getCString(filename);
-  if (strcmp(cname, index_data->main_filename) == 0)
+  main_filename = clang_getCString(index_data->main_filename);
+  if (strcmp(cname, main_filename) == 0)
     isMainFile = 1;
   else
     isMainFile = 0;
@@ -3245,14 +3354,11 @@ static void index_diagnostic(CXClientData client_data,
 static CXIdxClientFile index_enteredMainFile(CXClientData client_data,
                                        CXFile file, void *reserved) {
   IndexData *index_data;
-  CXString filename;
 
   index_data = (IndexData *)client_data;
   printCheck(index_data);
 
-  filename = clang_getFileName(file);
-  index_data->main_filename = clang_getCString(filename);
-  clang_disposeString(filename);
+  index_data->main_filename = clang_getFileName(file);
 
   printf("[enteredMainFile]: ");
   printCXIndexFile((CXIdxClientFile)file);
@@ -3491,7 +3597,7 @@ static int index_compile_args(int num_args, const char **args,
   index_data.first_check_printed = 0;
   index_data.fail_for_error = 0;
   index_data.abort = 0;
-  index_data.main_filename = "";
+  index_data.main_filename = createCXString("");
   index_data.importedASTs = importedASTs;
   index_data.strings = NULL;
   index_data.TU = NULL;
@@ -3507,6 +3613,7 @@ static int index_compile_args(int num_args, const char **args,
   if (index_data.fail_for_error)
     result = -1;
 
+  clang_disposeString(index_data.main_filename);
   free_client_data(&index_data);
   return result;
 }
@@ -3528,7 +3635,7 @@ static int index_ast_file(const char *ast_file,
   index_data.first_check_printed = 0;
   index_data.fail_for_error = 0;
   index_data.abort = 0;
-  index_data.main_filename = "";
+  index_data.main_filename = createCXString("");
   index_data.importedASTs = importedASTs;
   index_data.strings = NULL;
   index_data.TU = TU;
@@ -3541,6 +3648,7 @@ static int index_ast_file(const char *ast_file,
     result = -1;
 
   clang_disposeTranslationUnit(TU);
+  clang_disposeString(index_data.main_filename);
   free_client_data(&index_data);
   return result;
 }
@@ -4033,9 +4141,7 @@ int print_usrs(const char **I, const char **E) {
           if (!isUSR(I[2]))
             return not_usr("<class USR>", I[2]);
           else {
-            CXString x;
-            x.data = (void*) I[2];
-            x.private_flags = 0;
+            CXString x = createCXString(I[2]);
             print_usr(clang_constructUSR_ObjCIvar(I[1], x));
           }
 
@@ -4060,9 +4166,7 @@ int print_usrs(const char **I, const char **E) {
           if (!isUSR(I[3]))
             return not_usr("<class USR>", I[3]);
           else {
-            CXString x;
-            x.data = (void*) I[3];
-            x.private_flags = 0;
+            CXString x = createCXString(I[3]);
             print_usr(clang_constructUSR_ObjCMethod(I[1], atoi(I[2]), x));
           }
           I += 4;
@@ -4090,9 +4194,7 @@ int print_usrs(const char **I, const char **E) {
           if (!isUSR(I[2]))
             return not_usr("<class USR>", I[2]);
           else {
-            CXString x;
-            x.data = (void*) I[2];
-            x.private_flags = 0;
+            CXString x = createCXString(I[2]);
             print_usr(clang_constructUSR_ObjCProperty(I[1], x));
           }
           I += 3;
diff --git a/tools/clang-format/ClangFormat.cpp b/tools/clang-format/ClangFormat.cpp
index b7179ffd6416..2718a72f43c5 100644
--- a/tools/clang-format/ClangFormat.cpp
+++ b/tools/clang-format/ClangFormat.cpp
@@ -357,10 +357,27 @@ int main(int argc, const char **argv) {
   }
 
   if (DumpConfig) {
+    StringRef FileName;
+    std::unique_ptr<llvm::MemoryBuffer> Code;
+    if (FileNames.empty()) {
+      // We can't read the code to detect the language if there's no
+      // file name, so leave Code empty here.
+      FileName = AssumeFileName;
+    } else {
+      // Read in the code in case the filename alone isn't enough to
+      // detect the language.
+      ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
+          MemoryBuffer::getFileOrSTDIN(FileNames[0]);
+      if (std::error_code EC = CodeOrErr.getError()) {
+        llvm::errs() << EC.message() << "\n";
+        return 1;
+      }
+      FileName = (FileNames[0] == "-") ? AssumeFileName : FileNames[0];
+      Code = std::move(CodeOrErr.get());
+    }
     llvm::Expected<clang::format::FormatStyle> FormatStyle =
-        clang::format::getStyle(
-            Style, FileNames.empty() ? AssumeFileName : FileNames[0],
-            FallbackStyle);
+        clang::format::getStyle(Style, FileName, FallbackStyle,
+                                Code ? Code->getBuffer() : "");
     if (!FormatStyle) {
       llvm::errs() << llvm::toString(FormatStyle.takeError()) << "\n";
       return 1;
diff --git a/tools/clang-format/git-clang-format b/tools/clang-format/git-clang-format
index 60cd4fb25b63..0b2103962a31 100755
--- a/tools/clang-format/git-clang-format
+++ b/tools/clang-format/git-clang-format
@@ -79,6 +79,7 @@ def main():
       'm',  # ObjC
       'mm',  # ObjC++
       'cc', 'cp', 'cpp', 'c++', 'cxx', 'hpp',  # C++
+      'cu',  # CUDA
       # Other languages that clang-format supports
       'proto', 'protodevel',  # Protocol Buffers
       'java',  # Java
diff --git a/tools/clang-import-test/clang-import-test.cpp b/tools/clang-import-test/clang-import-test.cpp
index 1ea7ee3611df..4871acdf3614 100644
--- a/tools/clang-import-test/clang-import-test.cpp
+++ b/tools/clang-import-test/clang-import-test.cpp
@@ -327,7 +327,7 @@ llvm::Expected<CIAndOrigins> Parse(const std::string &Path,
     CG.GetModule()->print(llvm::outs(), nullptr);
   if (CI.getDiagnosticClient().getNumErrors())
     return llvm::make_error<llvm::StringError>(
-        "Errors occured while parsing the expression.", std::error_code());
+        "Errors occurred while parsing the expression.", std::error_code());
   return std::move(CI);
 }
 
diff --git a/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index 6ff4becb5030..1df165624459 100644
--- a/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -412,7 +412,7 @@ class ObjectFileHandler final : public FileHandler {
   /// read from the buffers.
   unsigned NumberOfProcessedInputs = 0;
 
-  /// LLVM context used to to create the auxiliary modules.
+  /// LLVM context used to create the auxiliary modules.
   LLVMContext VMContext;
 
   /// LLVM module used to create an object with all the bundle
diff --git a/tools/clang-refactor/CMakeLists.txt b/tools/clang-refactor/CMakeLists.txt
index d2029066b9b7..b435744ca4bd 100644
--- a/tools/clang-refactor/CMakeLists.txt
+++ b/tools/clang-refactor/CMakeLists.txt
@@ -20,5 +20,3 @@ target_link_libraries(clang-refactor
   clangToolingCore
   clangToolingRefactor
   )
-
-install(TARGETS clang-refactor RUNTIME DESTINATION bin)
diff --git a/tools/driver/CMakeLists.txt b/tools/driver/CMakeLists.txt
index 22a498422aef..fd9d5b78b473 100644
--- a/tools/driver/CMakeLists.txt
+++ b/tools/driver/CMakeLists.txt
@@ -60,7 +60,7 @@ endif()
 add_dependencies(clang clang-headers)
 
 if(NOT CLANG_LINKS_TO_CREATE)
-  set(CLANG_LINKS_TO_CREATE clang++ clang-cl clang-cpp)
+  set(CLANG_LINKS_TO_CREATE clang++ clang-cl clang-cpp flang)
 
   if (MSVC)
     list(APPEND CLANG_LINKS_TO_CREATE ../msbuild-bin/cl)
diff --git a/tools/driver/cc1as_main.cpp b/tools/driver/cc1as_main.cpp
index 9b90562af903..630a0bcffaa7 100644
--- a/tools/driver/cc1as_main.cpp
+++ b/tools/driver/cc1as_main.cpp
@@ -181,7 +181,13 @@ bool AssemblerInvocation::CreateFromArgs(AssemblerInvocation &Opts,
 
   // Issue errors on unknown arguments.
   for (const Arg *A : Args.filtered(OPT_UNKNOWN)) {
-    Diags.Report(diag::err_drv_unknown_argument) << A->getAsString(Args);
+    auto ArgString = A->getAsString(Args);
+    std::string Nearest;
+    if (OptTbl->findNearest(ArgString, Nearest, IncludedFlagsBitmask) > 1)
+      Diags.Report(diag::err_drv_unknown_argument) << ArgString;
+    else
+      Diags.Report(diag::err_drv_unknown_argument_with_suggestion)
+          << ArgString << Nearest;
     Success = false;
   }
 
@@ -397,7 +403,7 @@ static bool ExecuteAssembler(AssemblerInvocation &Opts,
     if (Opts.ShowEncoding) {
       CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
       MCTargetOptions Options;
-      MAB = TheTarget->createMCAsmBackend(*MRI, Opts.Triple, Opts.CPU, Options);
+      MAB = TheTarget->createMCAsmBackend(*STI, *MRI, Options);
     }
     auto FOut = llvm::make_unique<formatted_raw_ostream>(*Out);
     Str.reset(TheTarget->createAsmStreamer(
@@ -415,8 +421,7 @@ static bool ExecuteAssembler(AssemblerInvocation &Opts,
 
     MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
     MCTargetOptions Options;
-    MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, Opts.Triple,
-                                                      Opts.CPU, Options);
+    MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, Options);
     Triple T(Opts.Triple);
     Str.reset(TheTarget->createMCObjectStreamer(
                 T, Ctx, std::unique_ptr<MCAsmBackend>(MAB), *Out, std::unique_ptr<MCCodeEmitter>(CE), *STI,
diff --git a/tools/driver/driver.cpp b/tools/driver/driver.cpp
index fa757da9535c..611bff233e41 100644
--- a/tools/driver/driver.cpp
+++ b/tools/driver/driver.cpp
@@ -311,7 +311,8 @@ static int ExecuteCC1Tool(ArrayRef<const char *> argv, StringRef Tool) {
     return cc1as_main(argv.slice(2), argv[0], GetExecutablePathVP);
 
   // Reject unknown tools.
-  llvm::errs() << "error: unknown integrated tool '" << Tool << "'\n";
+  llvm::errs() << "error: unknown integrated tool '" << Tool << "'. "
+               << "Valid tools include '-cc1' and '-cc1as'.\n";
   return 1;
 }
 
diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp
index f4d347108c9f..7a8df9d7f88b 100644
--- a/tools/libclang/CIndex.cpp
+++ b/tools/libclang/CIndex.cpp
@@ -785,6 +785,16 @@ bool CursorVisitor::VisitDeclaratorDecl(DeclaratorDecl *DD) {
   return false;
 }
 
+static bool HasTrailingReturnType(FunctionDecl *ND) {
+  const QualType Ty = ND->getType();
+  if (const FunctionType *AFT = Ty->getAs<FunctionType>()) {
+    if (const FunctionProtoType *FT = dyn_cast<FunctionProtoType>(AFT))
+      return FT->hasTrailingReturn();
+  }
+
+  return false;
+}
+
 /// \brief Compare two base or member initializers based on their source order.
 static int CompareCXXCtorInitializers(CXXCtorInitializer *const *X,
                                       CXXCtorInitializer *const *Y) {
@@ -804,14 +814,16 @@ bool CursorVisitor::VisitFunctionDecl(FunctionDecl *ND) {
     // written. This requires a bit of work.
     TypeLoc TL = TSInfo->getTypeLoc().IgnoreParens();
     FunctionTypeLoc FTL = TL.getAs<FunctionTypeLoc>();
+    const bool HasTrailingRT = HasTrailingReturnType(ND);
     
     // If we have a function declared directly (without the use of a typedef),
     // visit just the return type. Otherwise, just visit the function's type
     // now.
-    if ((FTL && !isa<CXXConversionDecl>(ND) && Visit(FTL.getReturnLoc())) ||
+    if ((FTL && !isa<CXXConversionDecl>(ND) && !HasTrailingRT &&
+         Visit(FTL.getReturnLoc())) ||
         (!FTL && Visit(TL)))
       return true;
-    
+
     // Visit the nested-name-specifier, if present.
     if (NestedNameSpecifierLoc QualifierLoc = ND->getQualifierLoc())
       if (VisitNestedNameSpecifierLoc(QualifierLoc))
@@ -827,7 +839,11 @@ bool CursorVisitor::VisitFunctionDecl(FunctionDecl *ND) {
     // Visit the function parameters, if we have a function type.
     if (FTL && VisitFunctionTypeLoc(FTL, true))
       return true;
-    
+
+    // Visit the function's trailing return type.
+    if (FTL && HasTrailingRT && Visit(FTL.getReturnLoc()))
+      return true;
+
     // FIXME: Attributes?
   }
   
@@ -4690,6 +4706,195 @@ CXStringSet *clang_Cursor_getObjCManglings(CXCursor C) {
   return cxstring::createSet(Manglings);
 }
 
+CXPrintingPolicy clang_getCursorPrintingPolicy(CXCursor C) {
+  if (clang_Cursor_isNull(C))
+    return 0;
+  return new PrintingPolicy(getCursorContext(C).getPrintingPolicy());
+}
+
+void clang_PrintingPolicy_dispose(CXPrintingPolicy Policy) {
+  if (Policy)
+    delete static_cast<PrintingPolicy *>(Policy);
+}
+
+unsigned
+clang_PrintingPolicy_getProperty(CXPrintingPolicy Policy,
+                                 enum CXPrintingPolicyProperty Property) {
+  if (!Policy)
+    return 0;
+
+  PrintingPolicy *P = static_cast<PrintingPolicy *>(Policy);
+  switch (Property) {
+  case CXPrintingPolicy_Indentation:
+    return P->Indentation;
+  case CXPrintingPolicy_SuppressSpecifiers:
+    return P->SuppressSpecifiers;
+  case CXPrintingPolicy_SuppressTagKeyword:
+    return P->SuppressTagKeyword;
+  case CXPrintingPolicy_IncludeTagDefinition:
+    return P->IncludeTagDefinition;
+  case CXPrintingPolicy_SuppressScope:
+    return P->SuppressScope;
+  case CXPrintingPolicy_SuppressUnwrittenScope:
+    return P->SuppressUnwrittenScope;
+  case CXPrintingPolicy_SuppressInitializers:
+    return P->SuppressInitializers;
+  case CXPrintingPolicy_ConstantArraySizeAsWritten:
+    return P->ConstantArraySizeAsWritten;
+  case CXPrintingPolicy_AnonymousTagLocations:
+    return P->AnonymousTagLocations;
+  case CXPrintingPolicy_SuppressStrongLifetime:
+    return P->SuppressStrongLifetime;
+  case CXPrintingPolicy_SuppressLifetimeQualifiers:
+    return P->SuppressLifetimeQualifiers;
+  case CXPrintingPolicy_SuppressTemplateArgsInCXXConstructors:
+    return P->SuppressTemplateArgsInCXXConstructors;
+  case CXPrintingPolicy_Bool:
+    return P->Bool;
+  case CXPrintingPolicy_Restrict:
+    return P->Restrict;
+  case CXPrintingPolicy_Alignof:
+    return P->Alignof;
+  case CXPrintingPolicy_UnderscoreAlignof:
+    return P->UnderscoreAlignof;
+  case CXPrintingPolicy_UseVoidForZeroParams:
+    return P->UseVoidForZeroParams;
+  case CXPrintingPolicy_TerseOutput:
+    return P->TerseOutput;
+  case CXPrintingPolicy_PolishForDeclaration:
+    return P->PolishForDeclaration;
+  case CXPrintingPolicy_Half:
+    return P->Half;
+  case CXPrintingPolicy_MSWChar:
+    return P->MSWChar;
+  case CXPrintingPolicy_IncludeNewlines:
+    return P->IncludeNewlines;
+  case CXPrintingPolicy_MSVCFormatting:
+    return P->MSVCFormatting;
+  case CXPrintingPolicy_ConstantsAsWritten:
+    return P->ConstantsAsWritten;
+  case CXPrintingPolicy_SuppressImplicitBase:
+    return P->SuppressImplicitBase;
+  case CXPrintingPolicy_FullyQualifiedName:
+    return P->FullyQualifiedName;
+  }
+
+  assert(false && "Invalid CXPrintingPolicyProperty");
+  return 0;
+}
+
+void clang_PrintingPolicy_setProperty(CXPrintingPolicy Policy,
+                                      enum CXPrintingPolicyProperty Property,
+                                      unsigned Value) {
+  if (!Policy)
+    return;
+
+  PrintingPolicy *P = static_cast<PrintingPolicy *>(Policy);
+  switch (Property) {
+  case CXPrintingPolicy_Indentation:
+    P->Indentation = Value;
+    return;
+  case CXPrintingPolicy_SuppressSpecifiers:
+    P->SuppressSpecifiers = Value;
+    return;
+  case CXPrintingPolicy_SuppressTagKeyword:
+    P->SuppressTagKeyword = Value;
+    return;
+  case CXPrintingPolicy_IncludeTagDefinition:
+    P->IncludeTagDefinition = Value;
+    return;
+  case CXPrintingPolicy_SuppressScope:
+    P->SuppressScope = Value;
+    return;
+  case CXPrintingPolicy_SuppressUnwrittenScope:
+    P->SuppressUnwrittenScope = Value;
+    return;
+  case CXPrintingPolicy_SuppressInitializers:
+    P->SuppressInitializers = Value;
+    return;
+  case CXPrintingPolicy_ConstantArraySizeAsWritten:
+    P->ConstantArraySizeAsWritten = Value;
+    return;
+  case CXPrintingPolicy_AnonymousTagLocations:
+    P->AnonymousTagLocations = Value;
+    return;
+  case CXPrintingPolicy_SuppressStrongLifetime:
+    P->SuppressStrongLifetime = Value;
+    return;
+  case CXPrintingPolicy_SuppressLifetimeQualifiers:
+    P->SuppressLifetimeQualifiers = Value;
+    return;
+  case CXPrintingPolicy_SuppressTemplateArgsInCXXConstructors:
+    P->SuppressTemplateArgsInCXXConstructors = Value;
+    return;
+  case CXPrintingPolicy_Bool:
+    P->Bool = Value;
+    return;
+  case CXPrintingPolicy_Restrict:
+    P->Restrict = Value;
+    return;
+  case CXPrintingPolicy_Alignof:
+    P->Alignof = Value;
+    return;
+  case CXPrintingPolicy_UnderscoreAlignof:
+    P->UnderscoreAlignof = Value;
+    return;
+  case CXPrintingPolicy_UseVoidForZeroParams:
+    P->UseVoidForZeroParams = Value;
+    return;
+  case CXPrintingPolicy_TerseOutput:
+    P->TerseOutput = Value;
+    return;
+  case CXPrintingPolicy_PolishForDeclaration:
+    P->PolishForDeclaration = Value;
+    return;
+  case CXPrintingPolicy_Half:
+    P->Half = Value;
+    return;
+  case CXPrintingPolicy_MSWChar:
+    P->MSWChar = Value;
+    return;
+  case CXPrintingPolicy_IncludeNewlines:
+    P->IncludeNewlines = Value;
+    return;
+  case CXPrintingPolicy_MSVCFormatting:
+    P->MSVCFormatting = Value;
+    return;
+  case CXPrintingPolicy_ConstantsAsWritten:
+    P->ConstantsAsWritten = Value;
+    return;
+  case CXPrintingPolicy_SuppressImplicitBase:
+    P->SuppressImplicitBase = Value;
+    return;
+  case CXPrintingPolicy_FullyQualifiedName:
+    P->FullyQualifiedName = Value;
+    return;
+  }
+
+  assert(false && "Invalid CXPrintingPolicyProperty");
+}
+
+CXString clang_getCursorPrettyPrinted(CXCursor C, CXPrintingPolicy cxPolicy) {
+  if (clang_Cursor_isNull(C))
+    return cxstring::createEmpty();
+
+  if (clang_isDeclaration(C.kind)) {
+    const Decl *D = getCursorDecl(C);
+    if (!D)
+      return cxstring::createEmpty();
+
+    SmallString<128> Str;
+    llvm::raw_svector_ostream OS(Str);
+    PrintingPolicy *UserPolicy = static_cast<PrintingPolicy *>(cxPolicy);
+    D->print(OS, UserPolicy ? *UserPolicy
+                            : getCursorContext(C).getPrintingPolicy());
+
+    return cxstring::createDup(OS.str());
+  }
+
+  return cxstring::createEmpty();
+}
+
 CXString clang_getCursorDisplayName(CXCursor C) {
   if (!clang_isDeclaration(C.kind))
     return clang_getCursorSpelling(C);
@@ -5421,6 +5626,15 @@ unsigned clang_isDeclaration(enum CXCursorKind K) {
          (K >= CXCursor_FirstExtraDecl && K <= CXCursor_LastExtraDecl);
 }
 
+unsigned clang_isInvalidDeclaration(CXCursor C) {
+  if (clang_isDeclaration(C.kind)) {
+    if (const Decl *D = getCursorDecl(C))
+      return D->isInvalidDecl();
+  }
+
+  return 0;
+}
+
 unsigned clang_isReference(enum CXCursorKind K) {
   return K >= CXCursor_FirstRef && K <= CXCursor_LastRef;
 }
@@ -8135,6 +8349,7 @@ CXSourceRangeList *clang_getSkippedRanges(CXTranslationUnit TU, CXFile file) {
   SourceManager &sm = Ctx.getSourceManager();
   FileEntry *fileEntry = static_cast<FileEntry *>(file);
   FileID wantedFileID = sm.translateFile(fileEntry);
+  bool isMainFile = wantedFileID == sm.getMainFileID();
 
   const std::vector<SourceRange> &SkippedRanges = ppRec->getSkippedRanges();
   std::vector<SourceRange> wantedRanges;
@@ -8142,6 +8357,8 @@ CXSourceRangeList *clang_getSkippedRanges(CXTranslationUnit TU, CXFile file) {
        i != ei; ++i) {
     if (sm.getFileID(i->getBegin()) == wantedFileID || sm.getFileID(i->getEnd()) == wantedFileID)
       wantedRanges.push_back(*i);
+    else if (isMainFile && (astUnit->isInPreambleFileID(i->getBegin()) || astUnit->isInPreambleFileID(i->getEnd())))
+      wantedRanges.push_back(*i);
   }
 
   skipped->count = wantedRanges.size();
diff --git a/tools/libclang/CIndexCodeCompletion.cpp b/tools/libclang/CIndexCodeCompletion.cpp
index d4af0870c0b6..ba2c818cca30 100644
--- a/tools/libclang/CIndexCodeCompletion.cpp
+++ b/tools/libclang/CIndexCodeCompletion.cpp
@@ -643,6 +643,7 @@ clang_codeCompleteAt_Impl(CXTranslationUnit TU, const char *complete_filename,
                           ArrayRef<CXUnsavedFile> unsaved_files,
                           unsigned options) {
   bool IncludeBriefComments = options & CXCodeComplete_IncludeBriefComments;
+  bool SkipPreamble = options & CXCodeComplete_SkipPreamble;
 
 #ifdef UDP_CODE_COMPLETION_LOGGER
 #ifdef UDP_CODE_COMPLETION_LOGGER_PORT
@@ -689,6 +690,7 @@ clang_codeCompleteAt_Impl(CXTranslationUnit TU, const char *complete_filename,
   // Create a code-completion consumer to capture the results.
   CodeCompleteOptions Opts;
   Opts.IncludeBriefComments = IncludeBriefComments;
+  Opts.LoadExternal = !SkipPreamble;
   CaptureCompletionResults Capture(Opts, *Results, &TU);
 
   // Perform completion.
diff --git a/tools/libclang/CXIndexDataConsumer.cpp b/tools/libclang/CXIndexDataConsumer.cpp
index 89ac23be7344..0db6f0c1f078 100644
--- a/tools/libclang/CXIndexDataConsumer.cpp
+++ b/tools/libclang/CXIndexDataConsumer.cpp
@@ -890,7 +890,7 @@ bool CXIndexDataConsumer::handleReference(const NamedDecl *D, SourceLocation Loc
                                       const DeclContext *DC,
                                       const Expr *E,
                                       CXIdxEntityRefKind Kind) {
-  if (!D)
+  if (!D || !DC)
     return false;
 
   CXCursor Cursor = E ? MakeCXCursor(E, cast<Decl>(DC), CXTU)
@@ -907,7 +907,7 @@ bool CXIndexDataConsumer::handleReference(const NamedDecl *D, SourceLocation Loc
   if (!CB.indexEntityReference)
     return false;
 
-  if (!D)
+  if (!D || !DC)
     return false;
   if (Loc.isInvalid())
     return false;
diff --git a/tools/libclang/libclang.exports b/tools/libclang/libclang.exports
index 4d3a029567d4..67e4b4334b7e 100644
--- a/tools/libclang/libclang.exports
+++ b/tools/libclang/libclang.exports
@@ -178,6 +178,8 @@ clang_getCursorAvailability
 clang_getCursorCompletionString
 clang_getCursorDefinition
 clang_getCursorDisplayName
+clang_getCursorPrintingPolicy
+clang_getCursorPrettyPrinted
 clang_getCursorExtent
 clang_getCursorExceptionSpecificationType
 clang_getCursorKind
@@ -220,6 +222,7 @@ clang_getExceptionSpecificationType
 clang_getFieldDeclBitWidth
 clang_getExpansionLocation
 clang_getFile
+clang_getFileContents
 clang_getFileLocation
 clang_getFileName
 clang_getFileTime
@@ -290,6 +293,7 @@ clang_isAttribute
 clang_isConstQualifiedType
 clang_isCursorDefinition
 clang_isDeclaration
+clang_isInvalidDeclaration
 clang_isExpression
 clang_isFileMultipleIncludeGuarded
 clang_isFunctionTypeVariadic
@@ -357,3 +361,6 @@ clang_EvalResult_isUnsignedInt
 clang_EvalResult_getAsDouble
 clang_EvalResult_getAsStr
 clang_EvalResult_dispose
+clang_PrintingPolicy_getProperty
+clang_PrintingPolicy_setProperty
+clang_PrintingPolicy_dispose
diff --git a/tools/scan-build-py/libscanbuild/analyze.py b/tools/scan-build-py/libscanbuild/analyze.py
index b5614b5b6da0..6c129bd7c6d0 100644
--- a/tools/scan-build-py/libscanbuild/analyze.py
+++ b/tools/scan-build-py/libscanbuild/analyze.py
@@ -317,7 +317,7 @@ def run(opts):
 
         return arch_check(opts)
     except Exception:
-        logging.error("Problem occured during analyzis.", exc_info=1)
+        logging.error("Problem occurred during analyzis.", exc_info=1)
         return None
 
 
diff --git a/tools/scan-build/bin/scan-build b/tools/scan-build/bin/scan-build
index cbf3bf3d6f0b..49018ebf0261 100755
--- a/tools/scan-build/bin/scan-build
+++ b/tools/scan-build/bin/scan-build
@@ -51,6 +51,7 @@ my %Options = (
   OutputDir => undef,        # Parent directory to store HTML files.
   HtmlTitle => basename($CurrentDir)." - scan-build results",
   IgnoreErrors => 0,         # Ignore build errors.
+  KeepCC => 0,               # Do not override CC and CXX make variables
   ViewResults => 0,          # View results when the build terminates.
   ExitStatusFoundBugs => 0,  # Exit status reflects whether bugs were found
   ShowDescription => 0,      # Display the description of the defect in the list
@@ -1062,6 +1063,7 @@ sub RunXcodebuild {
 sub RunBuildCommand {
   my $Args = shift;
   my $IgnoreErrors = shift;
+  my $KeepCC = shift;
   my $Cmd = $Args->[0];
   my $CCAnalyzer = shift;
   my $CXXAnalyzer = shift;
@@ -1099,8 +1101,10 @@ sub RunBuildCommand {
     unshift @$Args, $CXXAnalyzer;
   }
   elsif ($Cmd eq "make" or $Cmd eq "gmake" or $Cmd eq "mingw32-make") {
-    AddIfNotPresent($Args, "CC=$CCAnalyzer");
-    AddIfNotPresent($Args, "CXX=$CXXAnalyzer");
+    if (!$KeepCC) {
+      AddIfNotPresent($Args, "CC=$CCAnalyzer");
+      AddIfNotPresent($Args, "CXX=$CXXAnalyzer");
+    }
     if ($IgnoreErrors) {
       AddIfNotPresent($Args,"-k");
       AddIfNotPresent($Args,"-i");
@@ -1158,6 +1162,12 @@ OPTIONS:
    currently supports make and xcodebuild. This is a convenience option; one
    can specify this behavior directly using build options.
 
+ --keep-cc
+
+   Do not override CC and CXX make variables. Useful when running make in
+   autoconf-based (and similar) projects where configure can add extra flags
+   to those variables.
+
  --html-title [title]
  --html-title=[title]
 
@@ -1532,6 +1542,12 @@ sub ProcessArgs {
       next;
     }
 
+    if ($arg eq "--keep-cc") {
+      shift @$Args;
+      $Options{KeepCC} = 1;
+      next;
+    }
+
     if ($arg =~ /^--use-cc(=(.+))?$/) {
       shift @$Args;
       my $cc;
@@ -1838,8 +1854,8 @@ my %EnvVars = (
 );
 
 # Run the build.
-my $ExitStatus = RunBuildCommand(\@ARGV, $Options{IgnoreErrors}, $Cmd, $CmdCXX,
-                                \%EnvVars);
+my $ExitStatus = RunBuildCommand(\@ARGV, $Options{IgnoreErrors}, $Options{KeepCC},
+	                        $Cmd, $CmdCXX, \%EnvVars);
 
 if (defined $Options{OutputFormat}) {
   if ($Options{OutputFormat} =~ /plist/) {
diff --git a/tools/scan-build/libexec/ccc-analyzer b/tools/scan-build/libexec/ccc-analyzer
index b0ec7e7e7487..73cd2ff3d915 100755
--- a/tools/scan-build/libexec/ccc-analyzer
+++ b/tools/scan-build/libexec/ccc-analyzer
@@ -419,7 +419,7 @@ my %LangMap = (
   'cc'  => 'c++',
   'C'   => 'c++',
   'ii'  => 'c++-cpp-output',
-  'i'   => $IsCXX ? 'c++-cpp-output' : 'c-cpp-output',
+  'i'   => $IsCXX ? 'c++-cpp-output' : 'cpp-output',
   'm'   => 'objective-c',
   'mi'  => 'objective-c-cpp-output',
   'mm'  => 'objective-c++',
@@ -439,7 +439,7 @@ my %LangsAccepted = (
   "c" => 1,
   "c++" => 1,
   "objective-c++" => 1,
-  "c-cpp-output" => 1,
+  "cpp-output" => 1,
   "objective-c-cpp-output" => 1,
   "c++-cpp-output" => 1
 );
diff --git a/unittests/AST/ASTImporterTest.cpp b/unittests/AST/ASTImporterTest.cpp
index a6ec304ff1a1..41134f96efe8 100644
--- a/unittests/AST/ASTImporterTest.cpp
+++ b/unittests/AST/ASTImporterTest.cpp
@@ -11,9 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MatchVerifier.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTImporter.h"
-#include "MatchVerifier.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Tooling/Tooling.h"
@@ -40,10 +40,10 @@ static RunOptions getRunOptionsForLanguage(Language Lang) {
     BasicArgs = {"-x", "c", "-std=c89"};
     break;
   case Lang_CXX:
-    BasicArgs = {"-std=c++98"};
+    BasicArgs = {"-std=c++98", "-frtti"};
     break;
   case Lang_CXX11:
-    BasicArgs = {"-std=c++11"};
+    BasicArgs = {"-std=c++11", "-frtti"};
     break;
   case Lang_OpenCL:
   case Lang_OBJCXX:
@@ -99,7 +99,11 @@ testImport(const std::string &FromCode, const ArgVector &FromArgs,
   if (FoundDecls.size() != 1)
     return testing::AssertionFailure() << "Multiple declarations were found!";
 
-  auto Imported = Importer.Import(*FoundDecls.begin());
+  // Sanity check: the node being imported should match in the same way as
+  // the result node.
+  EXPECT_TRUE(Verifier.match(FoundDecls.front(), AMatcher));
+
+  auto Imported = Importer.Import(FoundDecls.front());
   if (!Imported)
     return testing::AssertionFailure() << "Import failed, nullptr returned!";
 
@@ -480,6 +484,16 @@ TEST(ImportExpr, ImportVAArgExpr) {
                          vaArgExpr())))))));
 }
 
+TEST(ImportExpr, CXXTemporaryObjectExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("struct C {};"
+             "void declToImport() { C c = C(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             functionDecl(hasBody(compoundStmt(has(
+                 declStmt(has(varDecl(has(exprWithCleanups(has(cxxConstructExpr(
+                     has(materializeTemporaryExpr(has(implicitCastExpr(
+                         has(cxxTemporaryObjectExpr())))))))))))))))));
+}
 
 TEST(ImportType, ImportAtomicType) {
   MatchVerifier<Decl> Verifier;
@@ -564,6 +578,50 @@ TEST(ImportType, ImportPackExpansion) {
                              declRefExpr())))))))));
 }
 
+const internal::VariadicDynCastAllOfMatcher<Type,
+                                            DependentTemplateSpecializationType>
+    dependentTemplateSpecializationType;
+
+TEST(ImportType, ImportDependentTemplateSpecialization) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template<typename T>"
+             "struct A;"
+             "template<typename T>"
+             "struct declToImport {"
+             "  typename A<T>::template B<T> a;"
+             "};",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             classTemplateDecl(has(cxxRecordDecl(has(
+                 fieldDecl(hasType(dependentTemplateSpecializationType())))))));
+}
+
+const internal::VariadicDynCastAllOfMatcher<Stmt, SizeOfPackExpr>
+    sizeOfPackExpr;
+
+TEST(ImportExpr, ImportSizeOfPackExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template <typename... Ts>"
+             "void declToImport() {"
+             "  const int i = sizeof...(Ts);"
+             "};"
+             "void g() { declToImport<int>(); }",
+             Lang_CXX11, "", Lang_CXX11, Verifier,
+             functionTemplateDecl(has(functionDecl(
+                 hasBody(compoundStmt(has(declStmt(has(varDecl(hasInitializer(
+                     implicitCastExpr(has(sizeOfPackExpr())))))))))))));
+  testImport(
+      "template <typename... Ts>"
+      "using X = int[sizeof...(Ts)];"
+      "template <typename... Us>"
+      "struct Y {"
+      "  X<Us..., int, double, int, Us...> f;"
+      "};"
+      "Y<float, int> declToImport;",
+      Lang_CXX11, "", Lang_CXX11, Verifier,
+      varDecl(hasType(classTemplateSpecializationDecl(has(fieldDecl(hasType(
+          hasUnqualifiedDesugaredType(constantArrayType(hasSize(7))))))))));
+}
+
 /// \brief Matches __builtin_types_compatible_p:
 /// GNU extension to check equivalent types
 /// Given
@@ -586,6 +644,24 @@ TEST(ImportExpr, ImportTypeTraitExpr) {
                      typeTraitExpr(hasType(asString("int"))))))));
 }
 
+const internal::VariadicDynCastAllOfMatcher<Stmt, CXXTypeidExpr> cxxTypeidExpr;
+
+TEST(ImportExpr, ImportCXXTypeidExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport(
+      "namespace std { class type_info {}; }"
+      "void declToImport() {"
+      "  int x;"
+      "  auto a = typeid(int); auto b = typeid(x);"
+      "}",
+      Lang_CXX11, "", Lang_CXX11, Verifier,
+      functionDecl(
+          hasDescendant(varDecl(
+              hasName("a"), hasInitializer(hasDescendant(cxxTypeidExpr())))),
+          hasDescendant(varDecl(
+              hasName("b"), hasInitializer(hasDescendant(cxxTypeidExpr()))))));
+}
+
 TEST(ImportExpr, ImportTypeTraitExprValDep) {
   MatchVerifier<Decl> Verifier;
   testImport("template<typename T> struct declToImport {"
@@ -624,7 +700,7 @@ TEST(ImportExpr, ImportCXXPseudoDestructorExpr) {
 TEST(ImportDecl, ImportUsingDecl) {
   MatchVerifier<Decl> Verifier;
   testImport("namespace foo { int bar; }"
-             "int declToImport(){ using foo::bar; }",
+             "void declToImport() { using foo::bar; }",
              Lang_CXX, "", Lang_CXX, Verifier,
              functionDecl(
                has(
@@ -656,5 +732,77 @@ TEST(ImportDecl, ImportUsingShadowDecl) {
              namespaceDecl(has(usingShadowDecl())));
 }
 
+TEST(ImportExpr, ImportUnresolvedLookupExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template<typename T> int foo();"
+             "template <typename T> void declToImport() {"
+             "  ::foo<T>;"
+             "  ::template foo<T>;"
+             "}"
+             "void instantiate() { declToImport<int>(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             functionTemplateDecl(has(functionDecl(
+                 has(compoundStmt(has(unresolvedLookupExpr())))))));
+}
+
+TEST(ImportExpr, ImportCXXUnresolvedConstructExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template <typename T> struct C { T t; };"
+             "template <typename T> void declToImport() {"
+             "  C<T> d;"
+             "  d.t = T();"
+             "}"
+             "void instantiate() { declToImport<int>(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             functionTemplateDecl(has(functionDecl(has(compoundStmt(has(
+                 binaryOperator(has(cxxUnresolvedConstructExpr())))))))));
+  testImport("template <typename T> struct C { T t; };"
+             "template <typename T> void declToImport() {"
+             "  C<T> d;"
+             "  (&d)->t = T();"
+             "}"
+             "void instantiate() { declToImport<int>(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             functionTemplateDecl(has(functionDecl(has(compoundStmt(has(
+                 binaryOperator(has(cxxUnresolvedConstructExpr())))))))));
+}
+
+/// Check that function "declToImport()" (which is the templated function
+/// for corresponding FunctionTemplateDecl) is not added into DeclContext.
+/// Same for class template declarations.
+TEST(ImportDecl, ImportTemplatedDeclForTemplate) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template <typename T> void declToImport() { T a = 1; }"
+             "void instantiate() { declToImport<int>(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             functionTemplateDecl(hasAncestor(translationUnitDecl(
+                 unless(has(functionDecl(hasName("declToImport"))))))));
+  testImport("template <typename T> struct declToImport { T t; };"
+             "void instantiate() { declToImport<int>(); }",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             classTemplateDecl(hasAncestor(translationUnitDecl(
+                 unless(has(cxxRecordDecl(hasName("declToImport"))))))));
+}
+
+TEST(ImportExpr, CXXOperatorCallExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("class declToImport {"
+             "  void f() { *this = declToImport(); }"
+             "};",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             cxxRecordDecl(has(cxxMethodDecl(hasBody(compoundStmt(
+                 has(exprWithCleanups(has(cxxOperatorCallExpr())))))))));
+}
+
+TEST(ImportExpr, DependentSizedArrayType) {
+  MatchVerifier<Decl> Verifier;
+  testImport("template<typename T, int Size> class declToImport {"
+             "  T data[Size];"
+             "};",
+             Lang_CXX, "", Lang_CXX, Verifier,
+             classTemplateDecl(has(cxxRecordDecl(
+                 has(fieldDecl(hasType(dependentSizedArrayType())))))));
+}
+
 } // end namespace ast_matchers
 } // end namespace clang
diff --git a/unittests/AST/CMakeLists.txt b/unittests/AST/CMakeLists.txt
index 9839cdb1f2ec..89590c110668 100644
--- a/unittests/AST/CMakeLists.txt
+++ b/unittests/AST/CMakeLists.txt
@@ -15,7 +15,6 @@ add_clang_unittest(ASTTests
   EvaluateAsRValueTest.cpp
   ExternalASTSourceTest.cpp
   NamedDeclPrinterTest.cpp
-  PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
   )
diff --git a/unittests/AST/PostOrderASTVisitor.cpp b/unittests/AST/PostOrderASTVisitor.cpp
deleted file mode 100644
index 2921f3ead50e..000000000000
--- a/unittests/AST/PostOrderASTVisitor.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//===- unittests/AST/PostOrderASTVisitor.cpp - Declaration printer tests --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains tests for the post-order traversing functionality
-// of RecursiveASTVisitor.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/AST/RecursiveASTVisitor.h"
-#include "clang/Tooling/Tooling.h"
-#include "gtest/gtest.h"
-
-using namespace clang;
-
-namespace {
-
-  class RecordingVisitor
-    : public RecursiveASTVisitor<RecordingVisitor> {
-
-    bool VisitPostOrder;
-  public:
-    explicit RecordingVisitor(bool VisitPostOrder)
-      : VisitPostOrder(VisitPostOrder) {
-    }
-
-    // List of visited nodes during traversal.
-    std::vector<std::string> VisitedNodes;
-
-    bool shouldTraversePostOrder() const { return VisitPostOrder; }
-
-    bool VisitUnaryOperator(UnaryOperator *Op) {
-      VisitedNodes.push_back(Op->getOpcodeStr(Op->getOpcode()));
-      return true;
-    }
-
-    bool VisitBinaryOperator(BinaryOperator *Op) {
-      VisitedNodes.push_back(Op->getOpcodeStr());
-      return true;
-    }
-
-    bool VisitIntegerLiteral(IntegerLiteral *Lit) {
-      VisitedNodes.push_back(Lit->getValue().toString(10, false));
-      return true;
-    }
-
-    bool VisitVarDecl(VarDecl* D) {
-      VisitedNodes.push_back(D->getNameAsString());
-      return true;
-    }
-
-    bool VisitCXXMethodDecl(CXXMethodDecl *D) {
-      VisitedNodes.push_back(D->getQualifiedNameAsString());
-      return true;
-    }
-
-    bool VisitReturnStmt(ReturnStmt *S) {
-      VisitedNodes.push_back("return");
-      return true;
-    }
-
-    bool VisitCXXRecordDecl(CXXRecordDecl *Declaration) {
-      VisitedNodes.push_back(Declaration->getQualifiedNameAsString());
-      return true;
-    }
-
-    bool VisitTemplateTypeParmType(TemplateTypeParmType *T) {
-      VisitedNodes.push_back(T->getDecl()->getQualifiedNameAsString());
-      return true;
-    }
-  };
-
-}
-
-TEST(RecursiveASTVisitor, PostOrderTraversal) {
-  auto ASTUnit = tooling::buildASTFromCode(
-    "class A {"
-    "  class B {"
-    "    int foo() { while(4) { int i = 9; int j = -5; } return (1 + 3) + 2; }"
-    "  };"
-    "};"
-  );
-  auto TU = ASTUnit->getASTContext().getTranslationUnitDecl();
-  // We traverse the translation unit and store all
-  // visited nodes.
-  RecordingVisitor Visitor(true);
-  Visitor.TraverseTranslationUnitDecl(TU);
-
-  std::vector<std::string> expected = {"4", "9",      "i",         "5",    "-",
-                                       "j", "1",      "3",         "+",    "2",
-                                       "+", "return", "A::B::foo", "A::B", "A"};
-  // Compare the list of actually visited nodes
-  // with the expected list of visited nodes.
-  ASSERT_EQ(expected.size(), Visitor.VisitedNodes.size());
-  for (std::size_t I = 0; I < expected.size(); I++) {
-    ASSERT_EQ(expected[I], Visitor.VisitedNodes[I]);
-  }
-}
-
-TEST(RecursiveASTVisitor, NoPostOrderTraversal) {
-  auto ASTUnit = tooling::buildASTFromCode(
-    "class A {"
-    "  class B {"
-    "    int foo() { return 1 + 2; }"
-    "  };"
-    "};"
-  );
-  auto TU = ASTUnit->getASTContext().getTranslationUnitDecl();
-  // We traverse the translation unit and store all
-  // visited nodes.
-  RecordingVisitor Visitor(false);
-  Visitor.TraverseTranslationUnitDecl(TU);
-
-  std::vector<std::string> expected = {
-    "A", "A::B", "A::B::foo", "return", "+", "1", "2"
-  };
-  // Compare the list of actually visited nodes
-  // with the expected list of visited nodes.
-  ASSERT_EQ(expected.size(), Visitor.VisitedNodes.size());
-  for (std::size_t I = 0; I < expected.size(); I++) {
-    ASSERT_EQ(expected[I], Visitor.VisitedNodes[I]);
-  }
-}
diff --git a/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index 7e7d02707112..55c53e66ecd6 100644
--- a/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -1770,6 +1770,84 @@ TEST(IsExplicitTemplateSpecialization,
     functionDecl(isExplicitTemplateSpecialization())));
 }
 
+TEST(TypeMatching, MatchesNoReturn) {
+  EXPECT_TRUE(notMatches("void func();", functionDecl(isNoReturn())));
+  EXPECT_TRUE(notMatches("void func() {}", functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(notMatchesC("void func();", functionDecl(isNoReturn())));
+  EXPECT_TRUE(notMatchesC("void func() {}", functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(
+      notMatches("struct S { void func(); };", functionDecl(isNoReturn())));
+  EXPECT_TRUE(
+      notMatches("struct S { void func() {} };", functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(notMatches("struct S { static void func(); };",
+                         functionDecl(isNoReturn())));
+  EXPECT_TRUE(notMatches("struct S { static void func() {} };",
+                         functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(notMatches("struct S { S(); };", functionDecl(isNoReturn())));
+  EXPECT_TRUE(notMatches("struct S { S() {} };", functionDecl(isNoReturn())));
+
+  // ---
+
+  EXPECT_TRUE(matches("[[noreturn]] void func();", functionDecl(isNoReturn())));
+  EXPECT_TRUE(
+      matches("[[noreturn]] void func() {}", functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(matches("struct S { [[noreturn]] void func(); };",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("struct S { [[noreturn]] void func() {} };",
+                      functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(matches("struct S { [[noreturn]] static void func(); };",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("struct S { [[noreturn]] static void func() {} };",
+                      functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(
+      matches("struct S { [[noreturn]] S(); };", functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("struct S { [[noreturn]] S() {} };",
+                      functionDecl(isNoReturn())));
+
+  // ---
+
+  EXPECT_TRUE(matches("__attribute__((noreturn)) void func();",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("__attribute__((noreturn)) void func() {}",
+                      functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(matches("struct S { __attribute__((noreturn)) void func(); };",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("struct S { __attribute__((noreturn)) void func() {} };",
+                      functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(
+      matches("struct S { __attribute__((noreturn)) static void func(); };",
+              functionDecl(isNoReturn())));
+  EXPECT_TRUE(
+      matches("struct S { __attribute__((noreturn)) static void func() {} };",
+              functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(matches("struct S { __attribute__((noreturn)) S(); };",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matches("struct S { __attribute__((noreturn)) S() {} };",
+                      functionDecl(isNoReturn())));
+
+  // ---
+
+  EXPECT_TRUE(matchesC("__attribute__((noreturn)) void func();",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matchesC("__attribute__((noreturn)) void func() {}",
+                      functionDecl(isNoReturn())));
+
+  EXPECT_TRUE(matchesC("_Noreturn void func();",
+                      functionDecl(isNoReturn())));
+  EXPECT_TRUE(matchesC("_Noreturn void func() {}",
+                      functionDecl(isNoReturn())));
+}
+
 TEST(TypeMatching, MatchesBool) {
   EXPECT_TRUE(matches("struct S { bool func(); };",
                       cxxMethodDecl(returns(booleanType()))));
@@ -2029,5 +2107,29 @@ TEST(HasDefinition, MatchesUnionDefinition) {
                       cxxRecordDecl(hasDefinition())));
 }
 
+TEST(IsScopedEnum, MatchesScopedEnum) {
+  EXPECT_TRUE(matches("enum class X {};", enumDecl(isScoped())));
+  EXPECT_TRUE(notMatches("enum X {};", enumDecl(isScoped())));
+}
+
+TEST(HasTrailingReturn, MatchesTrailingReturn) {
+  EXPECT_TRUE(matches("auto Y() -> int { return 0; }",
+                      functionDecl(hasTrailingReturn())));
+  EXPECT_TRUE(matches("auto X() -> int;", functionDecl(hasTrailingReturn())));
+  EXPECT_TRUE(notMatches("int X() { return 0; }", 
+                      functionDecl(hasTrailingReturn())));
+  EXPECT_TRUE(notMatches("int X();", functionDecl(hasTrailingReturn())));
+  EXPECT_TRUE(notMatchesC("void X();", functionDecl(hasTrailingReturn())));
+}
+
+TEST(HasTrailingReturn, MatchesLambdaTrailingReturn) {
+  EXPECT_TRUE(matches(
+          "auto lambda2 = [](double x, double y) -> double {return x + y;};",
+          functionDecl(hasTrailingReturn())));
+  EXPECT_TRUE(notMatches(
+          "auto lambda2 = [](double x, double y) {return x + y;};",
+          functionDecl(hasTrailingReturn())));
+}
+
 } // namespace ast_matchers
 } // namespace clang
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index 2cae9dd0c547..cbabc459c4dc 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -588,6 +588,23 @@ TEST_F(FormatTest, FormatShortBracedStatements) {
                AllowSimpleBracedStatements);
 }
 
+TEST_F(FormatTest, ShortBlocksInMacrosDontMergeWithCodeAfterMacro) {
+  FormatStyle Style = getLLVMStyleWithColumns(60);
+  Style.AllowShortBlocksOnASingleLine = true;
+  Style.AllowShortIfStatementsOnASingleLine = true;
+  Style.BreakBeforeBraces = FormatStyle::BS_Allman;
+  EXPECT_EQ("#define A                                                  \\\n"
+            "  if (HANDLEwernufrnuLwrmviferuvnierv)                     \\\n"
+            "  { RET_ERR1_ANUIREUINERUIFNIOAerwfwrvnuier; }\n"
+            "X;",
+            format("#define A \\\n"
+                   "   if (HANDLEwernufrnuLwrmviferuvnierv) { \\\n"
+                   "      RET_ERR1_ANUIREUINERUIFNIOAerwfwrvnuier; \\\n"
+                   "   }\n"
+                   "X;",
+                   Style));
+}
+
 TEST_F(FormatTest, ParseIfElse) {
   verifyFormat("if (true)\n"
                "  if (true)\n"
@@ -2484,8 +2501,6 @@ TEST_F(FormatTest, IndentPreprocessorDirectives) {
                "#endif",
                Style);
   // Test with include guards.
-  // EXPECT_EQ is used because verifyFormat() calls messUp() which incorrectly
-  // merges lines.
   verifyFormat("#ifndef HEADER_H\n"
                "#define HEADER_H\n"
                "code();\n"
@@ -2580,21 +2595,85 @@ TEST_F(FormatTest, IndentPreprocessorDirectives) {
                    "code();\n"
                    "#endif",
                    Style));
-  // FIXME: The comment indent corrector in TokenAnnotator gets thrown off by
-  // preprocessor indentation.
-  EXPECT_EQ("#if 1\n"
-            "  // comment\n"
-            "#  define A 0\n"
-            "// comment\n"
-            "#  define B 0\n"
-            "#endif",
-            format("#if 1\n"
-                   "// comment\n"
-                   "#  define A 0\n"
-                   "   // comment\n"
-                   "#  define B 0\n"
-                   "#endif",
-                   Style));
+  // Keep comments aligned with #, otherwise indent comments normally. These
+  // tests cannot use verifyFormat because messUp manipulates leading
+  // whitespace.
+  {
+    const char *Expected = ""
+                           "void f() {\n"
+                           "#if 1\n"
+                           "// Preprocessor aligned.\n"
+                           "#  define A 0\n"
+                           "  // Code. Separated by blank line.\n"
+                           "\n"
+                           "#  define B 0\n"
+                           "  // Code. Not aligned with #\n"
+                           "#  define C 0\n"
+                           "#endif";
+    const char *ToFormat = ""
+                           "void f() {\n"
+                           "#if 1\n"
+                           "// Preprocessor aligned.\n"
+                           "#  define A 0\n"
+                           "// Code. Separated by blank line.\n"
+                           "\n"
+                           "#  define B 0\n"
+                           "   // Code. Not aligned with #\n"
+                           "#  define C 0\n"
+                           "#endif";
+    EXPECT_EQ(Expected, format(ToFormat, Style));
+    EXPECT_EQ(Expected, format(Expected, Style));
+  }
+  // Keep block quotes aligned.
+  {
+    const char *Expected = ""
+                           "void f() {\n"
+                           "#if 1\n"
+                           "/* Preprocessor aligned. */\n"
+                           "#  define A 0\n"
+                           "  /* Code. Separated by blank line. */\n"
+                           "\n"
+                           "#  define B 0\n"
+                           "  /* Code. Not aligned with # */\n"
+                           "#  define C 0\n"
+                           "#endif";
+    const char *ToFormat = ""
+                           "void f() {\n"
+                           "#if 1\n"
+                           "/* Preprocessor aligned. */\n"
+                           "#  define A 0\n"
+                           "/* Code. Separated by blank line. */\n"
+                           "\n"
+                           "#  define B 0\n"
+                           "   /* Code. Not aligned with # */\n"
+                           "#  define C 0\n"
+                           "#endif";
+    EXPECT_EQ(Expected, format(ToFormat, Style));
+    EXPECT_EQ(Expected, format(Expected, Style));
+  }
+  // Keep comments aligned with un-indented directives.
+  {
+    const char *Expected = ""
+                           "void f() {\n"
+                           "// Preprocessor aligned.\n"
+                           "#define A 0\n"
+                           "  // Code. Separated by blank line.\n"
+                           "\n"
+                           "#define B 0\n"
+                           "  // Code. Not aligned with #\n"
+                           "#define C 0\n";
+    const char *ToFormat = ""
+                           "void f() {\n"
+                           "// Preprocessor aligned.\n"
+                           "#define A 0\n"
+                           "// Code. Separated by blank line.\n"
+                           "\n"
+                           "#define B 0\n"
+                           "   // Code. Not aligned with #\n"
+                           "#define C 0\n";
+    EXPECT_EQ(Expected, format(ToFormat, Style));
+    EXPECT_EQ(Expected, format(Expected, Style));
+  }
   // Test with tabs.
   Style.UseTab = FormatStyle::UT_Always;
   Style.IndentWidth = 8;
@@ -10410,16 +10489,38 @@ TEST_F(FormatTest, ParsesConfiguration) {
 
   Style.RawStringFormats.clear();
   std::vector<FormatStyle::RawStringFormat> ExpectedRawStringFormats = {
-      {"pb", FormatStyle::LK_TextProto, "llvm"},
-      {"cpp", FormatStyle::LK_Cpp, "google"}};
+      {
+          FormatStyle::LK_TextProto,
+          {"pb", "proto"},
+          {"PARSE_TEXT_PROTO"},
+          /*CanonicalDelimiter=*/"",
+          "llvm",
+      },
+      {
+          FormatStyle::LK_Cpp,
+          {"cc", "cpp"},
+          {"C_CODEBLOCK", "CPPEVAL"},
+          /*CanonicalDelimiter=*/"cc",
+          /*BasedOnStyle=*/"",
+      },
+  };
 
   CHECK_PARSE("RawStringFormats:\n"
-              "  - Delimiter: 'pb'\n"
-              "    Language: TextProto\n"
+              "  - Language: TextProto\n"
+              "    Delimiters:\n"
+              "      - 'pb'\n"
+              "      - 'proto'\n"
+              "    EnclosingFunctions:\n"
+              "      - 'PARSE_TEXT_PROTO'\n"
               "    BasedOnStyle: llvm\n"
-              "  - Delimiter: 'cpp'\n"
-              "    Language: Cpp\n"
-              "    BasedOnStyle: google",
+              "  - Language: Cpp\n"
+              "    Delimiters:\n"
+              "      - 'cc'\n"
+              "      - 'cpp'\n"
+              "    EnclosingFunctions:\n"
+              "      - 'C_CODEBLOCK'\n"
+              "      - 'CPPEVAL'\n"
+              "    CanonicalDelimiter: 'cc'",
               RawStringFormats, ExpectedRawStringFormats);
 }
 
diff --git a/unittests/Format/FormatTestJS.cpp b/unittests/Format/FormatTestJS.cpp
index 2a929563f754..47886521008e 100644
--- a/unittests/Format/FormatTestJS.cpp
+++ b/unittests/Format/FormatTestJS.cpp
@@ -1157,6 +1157,9 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) {
                "foo() {}",
                getGoogleJSStyleWithColumns(10));
   verifyFormat("await theReckoning;", getGoogleJSStyleWithColumns(10));
+  verifyFormat("some['a']['b']", getGoogleJSStyleWithColumns(10));
+  verifyFormat("x = (a['a']\n"
+               "      ['b']);", getGoogleJSStyleWithColumns(10));
 }
 
 TEST_F(FormatTestJS, AutomaticSemicolonInsertionHeuristic) {
diff --git a/unittests/Format/FormatTestJava.cpp b/unittests/Format/FormatTestJava.cpp
index 2f376f765d66..1d471b73ee64 100644
--- a/unittests/Format/FormatTestJava.cpp
+++ b/unittests/Format/FormatTestJava.cpp
@@ -144,6 +144,7 @@ TEST_F(FormatTestJava, ClassDeclarations) {
   verifyFormat("public interface SomeInterface {\n"
                "  void doStuff(int theStuff);\n"
                "  void doMoreStuff(int moreStuff);\n"
+               "  default void doStuffWithDefault() {}\n"
                "}");
   verifyFormat("@interface SomeInterface {\n"
                "  void doStuff(int theStuff);\n"
diff --git a/unittests/Format/FormatTestObjC.cpp b/unittests/Format/FormatTestObjC.cpp
index 4220b44b4c47..23e90663717b 100644
--- a/unittests/Format/FormatTestObjC.cpp
+++ b/unittests/Format/FormatTestObjC.cpp
@@ -94,6 +94,66 @@ TEST(FormatTestObjCStyle, DetectsObjCInHeaders) {
   Style = getStyle("LLVM", "a.h", "none", "void f() {}");
   ASSERT_TRUE((bool)Style);
   EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none", "@interface Foo\n@end\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none",
+                   "const int interface = 1;\nconst int end = 2;\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none", "@protocol Foo\n@end\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none",
+                   "const int protocol = 1;\nconst int end = 2;\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none", "extern NSString *kFoo;\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style =
+      getStyle("{}", "a.h", "none", "typedef NS_ENUM(NSInteger, Foo) {};\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none", "enum Foo {};");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none", "extern NSInteger Foo();\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style =
+      getStyle("{}", "a.h", "none", "inline void Foo() { Log(@\"Foo\"); }\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style =
+      getStyle("{}", "a.h", "none", "inline void Foo() { Log(\"Foo\"); }\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
+
+  Style =
+      getStyle("{}", "a.h", "none", "inline void Foo() { id = @[1, 2, 3]; }\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none",
+                   "inline void Foo() { id foo = @{1: 2, 3: 4, 5: 6}; }\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language);
+
+  Style = getStyle("{}", "a.h", "none",
+                   "inline void Foo() { int foo[] = {1, 2, 3}; }\n");
+  ASSERT_TRUE((bool)Style);
+  EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language);
 }
 
 TEST_F(FormatTestObjC, FormatObjCTryCatch) {
@@ -155,6 +215,12 @@ TEST_F(FormatTestObjC, FormatObjCInterface) {
                "@end");
 
   verifyFormat("@interface Foo : Bar\n"
+               "@property(assign, readwrite) NSInteger bar;\n"
+               "+ (id)init;\n"
+               "@end");
+
+  verifyFormat("FOUNDATION_EXPORT NS_AVAILABLE_IOS(10.0) @interface Foo : Bar\n"
+               "@property(assign, readwrite) NSInteger bar;\n"
                "+ (id)init;\n"
                "@end");
 
@@ -210,8 +276,22 @@ TEST_F(FormatTestObjC, FormatObjCInterface) {
                "+ (id)init;\n"
                "@end");
 
+  Style.ColumnLimit = 40;
+  verifyFormat("@interface ccccccccccccc () <\n"
+               "    ccccccccccccc, ccccccccccccc,\n"
+               "    ccccccccccccc, ccccccccccccc> {\n"
+               "}");
+
+  Style.BinPackParameters = false;
+  verifyFormat("@interface ddddddddddddd () <\n"
+               "    ddddddddddddd,\n"
+               "    ddddddddddddd,\n"
+               "    ddddddddddddd,\n"
+               "    ddddddddddddd> {\n"
+               "}");
+
   Style = getGoogleStyle(FormatStyle::LK_ObjC);
-  verifyFormat("@interface Foo : NSObject<NSSomeDelegate> {\n"
+  verifyFormat("@interface Foo : NSObject <NSSomeDelegate> {\n"
                " @public\n"
                "  int field1;\n"
                " @protected\n"
@@ -223,15 +303,15 @@ TEST_F(FormatTestObjC, FormatObjCInterface) {
                "}\n"
                "+ (id)init;\n"
                "@end");
-  verifyFormat("@interface Foo : Bar<Baz, Quux>\n"
+  verifyFormat("@interface Foo : Bar <Baz, Quux>\n"
                "+ (id)init;\n"
                "@end");
-  verifyFormat("@interface Foo (HackStuff)<MyProtocol>\n"
+  verifyFormat("@interface Foo (HackStuff) <MyProtocol>\n"
                "+ (id)init;\n"
                "@end");
   Style.BinPackParameters = false;
   Style.ColumnLimit = 80;
-  verifyFormat("@interface aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ()<\n"
+  verifyFormat("@interface aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa () <\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
@@ -334,6 +414,10 @@ TEST_F(FormatTestObjC, FormatObjCProtocol) {
                "@protocol Bar\n"
                "@end");
 
+  verifyFormat("FOUNDATION_EXPORT NS_AVAILABLE_IOS(10.0) @protocol Foo\n"
+               "@property(assign, readwrite) NSInteger bar;\n"
+               "@end");
+
   verifyFormat("@protocol myProtocol\n"
                "- (void)mandatoryWithInt:(int)i;\n"
                "@optional\n"
@@ -354,7 +438,7 @@ TEST_F(FormatTestObjC, FormatObjCProtocol) {
                "@end");
 
   Style = getGoogleStyle(FormatStyle::LK_ObjC);
-  verifyFormat("@protocol MyProtocol<NSObject>\n"
+  verifyFormat("@protocol MyProtocol <NSObject>\n"
                "- (NSUInteger)numberOfThings;\n"
                "@end");
 }
diff --git a/unittests/Format/FormatTestProto.cpp b/unittests/Format/FormatTestProto.cpp
index df94d8172730..4d52a61dc3fb 100644
--- a/unittests/Format/FormatTestProto.cpp
+++ b/unittests/Format/FormatTestProto.cpp
@@ -97,7 +97,7 @@ TEST_F(FormatTestProto, FormatsEnums) {
                "  TYPE_B = 2;\n"
                "};");
   verifyFormat("enum Type {\n"
-               "  UNKNOWN = 0 [(some_options) = {a: aa, b: bb}];\n"
+               "  UNKNOWN = 0 [(some_options) = { a: aa, b: bb }];\n"
                "};");
   verifyFormat("enum Type {\n"
                "  UNKNOWN = 0 [(some_options) = {\n"
@@ -121,7 +121,7 @@ TEST_F(FormatTestProto, MessageFieldAttributes) {
   verifyFormat("optional LongMessageType long_proto_field = 1\n"
                "    [default = REALLY_REALLY_LONG_CONSTANT_VALUE];");
   verifyFormat("repeated double value = 1\n"
-               "    [(aaaaaaa.aaaaaaaaa) = {aaaaaaaaaaaaaaaaa: AAAAAAAA}];");
+               "    [(aaaaaaa.aaaaaaaaa) = { aaaaaaaaaaaaaaaaa: AAAAAAAA }];");
   verifyFormat("repeated double value = 1 [(aaaaaaa.aaaaaaaaa) = {\n"
                "  aaaaaaaaaaaaaaaa: AAAAAAAAAA,\n"
                "  bbbbbbbbbbbbbbbb: BBBBBBBBBB\n"
@@ -183,27 +183,27 @@ TEST_F(FormatTestProto, FormatsOptions) {
                "  field_a: OK\n"
                "  field_b: \"OK\"\n"
                "  field_c: \"OK\"\n"
-               "  msg_field: {field_d: 123}\n"
+               "  msg_field: { field_d: 123 }\n"
                "};");
   verifyFormat("option (MyProto.options) = {\n"
                "  field_a: OK\n"
                "  field_b: \"OK\"\n"
                "  field_c: \"OK\"\n"
-               "  msg_field: {field_d: 123 field_e: OK}\n"
+               "  msg_field: { field_d: 123 field_e: OK }\n"
                "};");
   verifyFormat("option (MyProto.options) = {\n"
                "  field_a: OK  // Comment\n"
                "  field_b: \"OK\"\n"
                "  field_c: \"OK\"\n"
-               "  msg_field: {field_d: 123}\n"
+               "  msg_field: { field_d: 123 }\n"
                "};");
   verifyFormat("option (MyProto.options) = {\n"
                "  field_c: \"OK\"\n"
-               "  msg_field {field_d: 123}\n"
+               "  msg_field { field_d: 123 }\n"
                "};");
   verifyFormat("option (MyProto.options) = {\n"
                "  field_a: OK\n"
-               "  field_b {field_c: OK}\n"
+               "  field_b { field_c: OK }\n"
                "  field_d: OKOKOK\n"
                "  field_e: OK\n"
                "}");
@@ -247,7 +247,7 @@ TEST_F(FormatTestProto, FormatsOptions) {
 
   verifyFormat("option (MyProto.options) = <\n"
                "  field_a: \"OK\"\n"
-               "  msg_field: {field_b: OK}\n"
+               "  msg_field: { field_b: OK }\n"
                "  field_g: OK\n"
                "  field_g: OK\n"
                "  field_g: OK\n"
@@ -359,7 +359,7 @@ TEST_F(FormatTestProto, FormatsOptions) {
 
   verifyFormat("option (MyProto.options) = <\n"
                "  data1 <key1: value1>\n"
-               "  data2 {key2: value2}\n"
+               "  data2 { key2: value2 }\n"
                ">;");
 
   verifyFormat("option (MyProto.options) = <\n"
@@ -372,7 +372,7 @@ TEST_F(FormatTestProto, FormatsOptions) {
                "  app_id: 'com.javax.swing.salsa.latino'\n"
                "  head_id: 1\n"
                "  headheadheadheadheadhead_id: 1\n"
-               "  product_data {product {1}}\n"
+               "  product_data { product { 1 } }\n"
                "};");
 }
 
diff --git a/unittests/Format/FormatTestRawStrings.cpp b/unittests/Format/FormatTestRawStrings.cpp
index 6e7b7065875d..4d1da9775951 100644
--- a/unittests/Format/FormatTestRawStrings.cpp
+++ b/unittests/Format/FormatTestRawStrings.cpp
@@ -65,23 +65,43 @@ class FormatTestRawStrings : public ::testing::Test {
   FormatStyle getRawStringPbStyleWithColumns(unsigned ColumnLimit) {
     FormatStyle Style = getLLVMStyle();
     Style.ColumnLimit = ColumnLimit;
-    Style.RawStringFormats = {{/*Delimiter=*/"pb",
-                               /*Kind=*/FormatStyle::LK_TextProto,
-                               /*BasedOnStyle=*/"google"}};
+    Style.RawStringFormats = {
+        {
+            /*Language=*/FormatStyle::LK_TextProto,
+            /*Delimiters=*/{"pb"},
+            /*EnclosingFunctions=*/{},
+            /*CanonicalDelimiter=*/"",
+            /*BasedOnStyle=*/"google",
+        },
+    };
     return Style;
   }
 
   FormatStyle getRawStringLLVMCppStyleBasedOn(std::string BasedOnStyle) {
     FormatStyle Style = getLLVMStyle();
-    Style.RawStringFormats = {{/*Delimiter=*/"cpp",
-                               /*Kind=*/FormatStyle::LK_Cpp, BasedOnStyle}};
+    Style.RawStringFormats = {
+        {
+            /*Language=*/FormatStyle::LK_Cpp,
+            /*Delimiters=*/{"cpp"},
+            /*EnclosingFunctions=*/{},
+            /*CanonicalDelimiter=*/"",
+            BasedOnStyle,
+        },
+    };
     return Style;
   }
 
   FormatStyle getRawStringGoogleCppStyleBasedOn(std::string BasedOnStyle) {
     FormatStyle Style = getGoogleStyle(FormatStyle::LK_Cpp);
-    Style.RawStringFormats = {{/*Delimiter=*/"cpp",
-                               /*Kind=*/FormatStyle::LK_Cpp, BasedOnStyle}};
+    Style.RawStringFormats = {
+        {
+            /*Language=*/FormatStyle::LK_Cpp,
+            /*Delimiters=*/{"cpp"},
+            /*EnclosingFunctions=*/{},
+            /*CanonicalDelimiter=*/"",
+            BasedOnStyle,
+        },
+    };
     return Style;
   }
 
@@ -112,6 +132,27 @@ TEST_F(FormatTestRawStrings, ReformatsAccordingToBaseStyle) {
                    getRawStringGoogleCppStyleBasedOn("llvm")));
 }
 
+TEST_F(FormatTestRawStrings, UsesConfigurationOverBaseStyle) {
+  // llvm style puts '*' on the right.
+  // google style puts '*' on the left.
+
+  // Uses the configured google style inside raw strings even if BasedOnStyle in
+  // the raw string format is llvm.
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_Cpp);
+  EXPECT_EQ(0, parseConfiguration("---\n"
+                                  "Language: Cpp\n"
+                                  "BasedOnStyle: Google", &Style).value());
+  Style.RawStringFormats = {{
+      FormatStyle::LK_Cpp,
+      {"cpp"},
+      {},
+      /*CanonicalDelimiter=*/"",
+      /*BasedOnStyle=*/"llvm",
+  }};
+  expect_eq(R"test(int* i = R"cpp(int* j = 0;)cpp";)test",
+            format(R"test(int * i = R"cpp(int * j = 0;)cpp";)test", Style));
+}
+
 TEST_F(FormatTestRawStrings, MatchesDelimitersCaseSensitively) {
   // Don't touch the 'PB' raw string, format the 'pb' raw string.
   expect_eq(R"test(
@@ -121,29 +162,6 @@ t = R"pb(item: 1)pb";)test",
 s = R"PB(item:1)PB";
 t = R"pb(item:1)pb";)test",
                    getRawStringPbStyleWithColumns(40)));
-
-  FormatStyle MixedStyle = getLLVMStyle();
-  MixedStyle.RawStringFormats = {
-      {/*Delimiter=*/"cpp", /*Kind=*/FormatStyle::LK_Cpp,
-       /*BasedOnStyle=*/"llvm"},
-      {/*Delimiter=*/"CPP", /*Kind=*/FormatStyle::LK_Cpp,
-       /*BasedOnStyle=*/"google"}};
-
-  // Format the 'cpp' raw string with '*' on the right.
-  // Format the 'CPP' raw string with '*' on the left.
-  // Do not format the 'Cpp' raw string.
-  // Do not format non-raw strings.
-  expect_eq(R"test(
-a = R"cpp(int *i = 0;)cpp";
-b = R"CPP(int* j = 0;)CPP";
-c = R"Cpp(int * k = 0;)Cpp";
-d = R"cpp(int * k = 0;)Cpp";)test",
-            format(R"test(
-a = R"cpp(int * i = 0;)cpp";
-b = R"CPP(int * j = 0;)CPP";
-c = R"Cpp(int * k = 0;)Cpp";
-d = R"cpp(int * k = 0;)Cpp";)test",
-                   MixedStyle));
 }
 
 TEST_F(FormatTestRawStrings, ReformatsShortRawStringsOnSingleLine) {
@@ -168,7 +186,7 @@ TEST_F(FormatTestRawStrings, ReformatsShortRawStringsOnSingleLine) {
           R"test(P p = TP(R"pb(item_1:1 item_2:2)pb");)test",
           getRawStringPbStyleWithColumns(40)));
   expect_eq(
-      R"test(P p = TP(R"pb(item_1 <1> item_2: {2})pb");)test",
+      R"test(P p = TP(R"pb(item_1 <1> item_2: { 2 })pb");)test",
       format(
           R"test(P p = TP(R"pb(item_1<1> item_2:{2})pb");)test",
           getRawStringPbStyleWithColumns(40)));
@@ -228,9 +246,9 @@ P p = TP(R"pb(item_1: 1, item_2: 2, item_3: 3, item_4: 4)pb");)test",
   expect_eq(R"test(
 P p = TPPPPPPPPPPPPPPP(
     R"pb(item_1 <1>,
-         item_2: {2},
+         item_2: { 2 },
          item_3: <3>,
-         item_4: {4})pb");)test",
+         item_4: { 4 })pb");)test",
             format(R"test(
 P p = TPPPPPPPPPPPPPPP(R"pb(item_1<1>, item_2: {2}, item_3: <3>, item_4:{4})pb");)test",
                    getRawStringPbStyleWithColumns(40)));
@@ -277,7 +295,7 @@ int f(string s) {
 
 TEST_F(FormatTestRawStrings, FormatsRawStringArguments) {
   expect_eq(R"test(
-P p = TP(R"pb(key {1})pb", param_2);)test",
+P p = TP(R"pb(key { 1 })pb", param_2);)test",
             format(R"test(
 P p = TP(R"pb(key{1})pb",param_2);)test",
                    getRawStringPbStyleWithColumns(40)));
@@ -290,9 +308,9 @@ PPPPPPPPPPPPP(R"pb(keykeyk)pb", param_2);)test",
                    getRawStringPbStyleWithColumns(40)));
 
   expect_eq(R"test(
-P p =
-    TP(R"pb(item: {i: 1, s: 's'}
-            item: {i: 2, s: 't'})pb");)test",
+P p = TP(
+    R"pb(item: { i: 1, s: 's' }
+         item: { i: 2, s: 't' })pb");)test",
             format(R"test(
 P p = TP(R"pb(item: {i: 1, s: 's'} item: {i: 2, s: 't'})pb");)test",
                    getRawStringPbStyleWithColumns(40)));
@@ -728,6 +746,41 @@ TEST_F(FormatTestRawStrings, DontFormatNonRawStrings) {
                    getRawStringPbStyleWithColumns(20)));
 }
 
+TEST_F(FormatTestRawStrings, FormatsRawStringsWithEnclosingFunctionName) {
+  FormatStyle Style = getRawStringPbStyleWithColumns(40);
+  Style.RawStringFormats[0].EnclosingFunctions.push_back(
+      "PARSE_TEXT_PROTO");
+  Style.RawStringFormats[0].EnclosingFunctions.push_back("ParseTextProto");
+  expect_eq(R"test(a = PARSE_TEXT_PROTO(R"(key: value)");)test",
+            format(R"test(a = PARSE_TEXT_PROTO(R"(key:value)");)test", Style));
+
+  expect_eq(R"test(
+a = PARSE_TEXT_PROTO /**/ (
+    /**/ R"(key: value)");)test",
+            format(R"test(
+a = PARSE_TEXT_PROTO/**/(/**/R"(key:value)");)test",
+                   Style));
+
+  expect_eq(R"test(
+a = ParseTextProto<ProtoType>(
+    R"(key: value)");)test",
+            format(R"test(
+a = ParseTextProto<ProtoType>(R"(key:value)");)test",
+                   Style));
+}
+
+TEST_F(FormatTestRawStrings, UpdatesToCanonicalDelimiters) {
+  FormatStyle Style = getRawStringPbStyleWithColumns(25);
+  Style.RawStringFormats[0].CanonicalDelimiter = "proto";
+  expect_eq(R"test(a = R"proto(key: value)proto";)test",
+            format(R"test(a = R"pb(key:value)pb";)test", Style));
+
+  // Don't update to canonical delimiter if it occurs as a raw string suffix in
+  // the raw string content.
+  expect_eq(R"test(a = R"pb(key: ")proto")pb";)test",
+            format(R"test(a = R"pb(key:")proto")pb";)test", Style));
+}
+
 } // end namespace
 } // end namespace format
 } // end namespace clang
diff --git a/unittests/Format/FormatTestTextProto.cpp b/unittests/Format/FormatTestTextProto.cpp
index 0a7bcdd82362..65f73066fe0b 100644
--- a/unittests/Format/FormatTestTextProto.cpp
+++ b/unittests/Format/FormatTestTextProto.cpp
@@ -49,9 +49,9 @@ TEST_F(FormatTestTextProto, KeepsTopLevelEntriesFittingALine) {
 TEST_F(FormatTestTextProto, SupportsMessageFields) {
   verifyFormat("msg_field: {}");
 
-  verifyFormat("msg_field: {field_a: A}");
+  verifyFormat("msg_field: { field_a: A }");
 
-  verifyFormat("msg_field: {field_a: \"OK\" field_b: 123}");
+  verifyFormat("msg_field: { field_a: \"OK\" field_b: 123 }");
 
   verifyFormat("msg_field: {\n"
                "  field_a: 1\n"
@@ -63,9 +63,9 @@ TEST_F(FormatTestTextProto, SupportsMessageFields) {
 
   verifyFormat("msg_field {}");
 
-  verifyFormat("msg_field {field_a: A}");
+  verifyFormat("msg_field { field_a: A }");
 
-  verifyFormat("msg_field {field_a: \"OK\" field_b: 123}");
+  verifyFormat("msg_field { field_a: \"OK\" field_b: 123 }");
 
   verifyFormat("msg_field {\n"
                "  field_a: 1\n"
@@ -78,11 +78,11 @@ TEST_F(FormatTestTextProto, SupportsMessageFields) {
                "  field_h: 1234.567e-89\n"
                "}");
 
-  verifyFormat("msg_field: {msg_field {field_a: 1}}");
+  verifyFormat("msg_field: { msg_field { field_a: 1 } }");
 
   verifyFormat("id: \"ala.bala\"\n"
-               "item {type: ITEM_A rank: 1 score: 90.0}\n"
-               "item {type: ITEM_B rank: 2 score: 70.5}\n"
+               "item { type: ITEM_A rank: 1 score: 90.0 }\n"
+               "item { type: ITEM_B rank: 2 score: 70.5 }\n"
                "item {\n"
                "  type: ITEM_A\n"
                "  rank: 3\n"
@@ -102,24 +102,24 @@ TEST_F(FormatTestTextProto, AvoidsTopLevelBinPacking) {
   verifyFormat("field_a: OK\n"
                "field_b: \"OK\"\n"
                "field_c: \"OK\"\n"
-               "msg_field: {field_d: 123}\n"
+               "msg_field: { field_d: 123 }\n"
                "field_e: OK\n"
                "field_f: OK");
 
   verifyFormat("field_a: OK\n"
                "field_b: \"OK\"\n"
                "field_c: \"OK\"\n"
-               "msg_field: {field_d: 123 field_e: OK}");
+               "msg_field: { field_d: 123 field_e: OK }");
 
   verifyFormat("a: {\n"
                "  field_a: OK\n"
-               "  field_b {field_c: OK}\n"
+               "  field_b { field_c: OK }\n"
                "  field_d: OKOKOK\n"
                "  field_e: OK\n"
                "}");
 
   verifyFormat("field_a: OK,\n"
-               "field_b {field_c: OK},\n"
+               "field_b { field_c: OK },\n"
                "field_d: OKOKOK,\n"
                "field_e: OK");
 }
@@ -150,10 +150,10 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
   verifyFormat("msg_field: <field_a <field_b: <>>>");
   verifyFormat("msg_field <field_a: OK, field_b: \"OK\">");
   verifyFormat("msg_field <field_a: OK field_b: <>, field_c: OK>");
-  verifyFormat("msg_field <field_a {field_b: 1}, field_c: <field_d: 2>>");
+  verifyFormat("msg_field <field_a { field_b: 1 }, field_c: <field_d: 2>>");
   verifyFormat("msg_field: <field_a: OK, field_b: \"OK\">");
   verifyFormat("msg_field: <field_a: OK field_b: <>, field_c: OK>");
-  verifyFormat("msg_field: <field_a {field_b: 1}, field_c: <field_d: 2>>");
+  verifyFormat("msg_field: <field_a { field_b: 1 }, field_c: <field_d: 2>>");
   verifyFormat("field_a: \"OK\", msg_field: <field_b: 123>, field_c: {}");
   verifyFormat("field_a <field_b: 1>, msg_field: <field_b: 123>, field_c <>");
   verifyFormat("field_a <field_b: 1> msg_field: <field_b: 123> field_c <>");
@@ -200,7 +200,7 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
 
   verifyFormat("msg_field: <\n"
                "  field_a: \"OK\"\n"
-               "  msg_field: {field_b: OK}\n"
+               "  msg_field: { field_b: OK }\n"
                "  field_g: OK\n"
                "  field_g: OK\n"
                "  field_g: OK\n"
@@ -226,7 +226,7 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
                "  field_b2: ok,\n"
                "  field_b3: <\n"
                "    field_x {}  // Comment\n"
-               "    field_y: {field_z: 1}\n"
+               "    field_y: { field_z: 1 }\n"
                "    field_w: ok\n"
                "  >\n"
                "  field {\n"
@@ -258,7 +258,7 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
   verifyFormat("app_id: 'com.javax.swing.salsa.latino'\n"
                "head_id: 1\n"
                "data <key: value>\n"
-               "data {key: value}");
+               "data { key: value }");
 
   verifyFormat("app {\n"
                "  app_id: 'com.javax.swing.salsa.latino'\n"
@@ -274,11 +274,11 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
 
   verifyFormat("app_id: 'com.javax.swing.salsa.latino'\n"
                "headheadheadheadheadhead_id: 1\n"
-               "product_data {product {1}}");
+               "product_data { product { 1 } }");
 
   verifyFormat("app_id: 'com.javax.swing.salsa.latino'\n"
                "headheadheadheadheadhead_id: 1\n"
-               "product_data <product {1}>");
+               "product_data <product { 1 }>");
 
   verifyFormat("app_id: 'com.javax.swing.salsa.latino'\n"
                "headheadheadheadheadhead_id: 1\n"
@@ -287,8 +287,24 @@ TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) {
   verifyFormat("app <\n"
                "  app_id: 'com.javax.swing.salsa.latino'\n"
                "  headheadheadheadheadhead_id: 1\n"
-               "  product_data <product {1}>\n"
+               "  product_data <product { 1 }>\n"
                ">");
+
+  verifyFormat("dcccwrnfioeruvginerurneitinfo {\n"
+               "  exte3nsionrnfvui { key: value }\n"
+               "}");
+}
+
+TEST_F(FormatTestTextProto, DiscardsUnbreakableTailIfCanBreakAfter) {
+  // The two closing braces count towards the string UnbreakableTailLength, but
+  // since we have broken after the corresponding opening braces, we don't
+  // consider that length for string breaking.
+  verifyFormat(
+      "foo: {\n"
+      "  bar: {\n"
+      "    text: \"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"\n"
+      "  }\n"
+      "}");
 }
 } // end namespace tooling
 } // end namespace clang
diff --git a/unittests/Lex/CMakeLists.txt b/unittests/Lex/CMakeLists.txt
index ea6f9fd23402..bb0f66d86073 100644
--- a/unittests/Lex/CMakeLists.txt
+++ b/unittests/Lex/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_unittest(LexTests
   HeaderMapTest.cpp
+  HeaderSearchTest.cpp
   LexerTest.cpp
   PPCallbacksTest.cpp
   PPConditionalDirectiveRecordTest.cpp
diff --git a/unittests/Lex/HeaderSearchTest.cpp b/unittests/Lex/HeaderSearchTest.cpp
new file mode 100644
index 000000000000..c2794f5e9593
--- /dev/null
+++ b/unittests/Lex/HeaderSearchTest.cpp
@@ -0,0 +1,96 @@
+//===- unittests/Lex/HeaderSearchTest.cpp ------ HeaderSearch tests -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/MemoryBufferCache.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HeaderSearchOptions.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace {
+
+// The test fixture.
+class HeaderSearchTest : public ::testing::Test {
+protected:
+  HeaderSearchTest()
+      : VFS(new vfs::InMemoryFileSystem), FileMgr(FileMgrOpts, VFS),
+        DiagID(new DiagnosticIDs()),
+        Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()),
+        SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions),
+        Search(std::make_shared<HeaderSearchOptions>(), SourceMgr, Diags,
+               LangOpts, Target.get()) {
+    TargetOpts->Triple = "x86_64-apple-darwin11.1.0";
+    Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts);
+  }
+
+  void addSearchDir(llvm::StringRef Dir) {
+    VFS->addFile(Dir, 0, llvm::MemoryBuffer::getMemBuffer(""), /*User=*/None,
+                 /*Group=*/None, llvm::sys::fs::file_type::directory_file);
+    const DirectoryEntry *DE = FileMgr.getDirectory(Dir);
+    assert(DE);
+    auto DL = DirectoryLookup(DE, SrcMgr::C_User, /*isFramework=*/false);
+    Search.AddSearchPath(DL, /*isAngled=*/false);
+  }
+
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> VFS;
+  FileSystemOptions FileMgrOpts;
+  FileManager FileMgr;
+  IntrusiveRefCntPtr<DiagnosticIDs> DiagID;
+  DiagnosticsEngine Diags;
+  SourceManager SourceMgr;
+  LangOptions LangOpts;
+  std::shared_ptr<TargetOptions> TargetOpts;
+  IntrusiveRefCntPtr<TargetInfo> Target;
+  HeaderSearch Search;
+};
+
+TEST_F(HeaderSearchTest, NoSearchDir) {
+  EXPECT_EQ(Search.search_dir_size(), 0u);
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/x/y/z", /*WorkingDir=*/""),
+            "/x/y/z");
+}
+
+TEST_F(HeaderSearchTest, SimpleShorten) {
+  addSearchDir("/x");
+  addSearchDir("/x/y");
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/x/y/z", /*WorkingDir=*/""),
+            "z");
+  addSearchDir("/a/b/");
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/a/b/c", /*WorkingDir=*/""),
+            "c");
+}
+
+TEST_F(HeaderSearchTest, ShortenWithWorkingDir) {
+  addSearchDir("x/y");
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/a/b/c/x/y/z",
+                                                   /*WorkingDir=*/"/a/b/c"),
+            "z");
+}
+
+TEST_F(HeaderSearchTest, Dots) {
+  addSearchDir("/x/./y/");
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/x/y/./z",
+                                                   /*WorkingDir=*/""),
+            "z");
+  addSearchDir("a/.././c/");
+  EXPECT_EQ(Search.suggestPathToFileForDiagnostics("/m/n/./c/z",
+                                                   /*WorkingDir=*/"/m/n/"),
+            "z");
+}
+
+} // namespace
+} // namespace clang
diff --git a/unittests/Lex/LexerTest.cpp b/unittests/Lex/LexerTest.cpp
index d699a44b13fd..216672a90d5e 100644
--- a/unittests/Lex/LexerTest.cpp
+++ b/unittests/Lex/LexerTest.cpp
@@ -474,8 +474,9 @@ TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
 }
 
 TEST_F(LexerTest, AvoidPastEndOfStringDereference) {
-  std::vector<Token> LexedTokens = Lex("  //  \\\n");
-  EXPECT_TRUE(LexedTokens.empty());
+  EXPECT_TRUE(Lex("  //  \\\n").empty());
+  EXPECT_TRUE(Lex("#include <\\\\").empty());
+  EXPECT_TRUE(Lex("#include <\\\\\n").empty());
 }
 
 TEST_F(LexerTest, StringizingRasString) {
diff --git a/unittests/Sema/CMakeLists.txt b/unittests/Sema/CMakeLists.txt
index 16fae820dfe4..45460f1e0f6c 100644
--- a/unittests/Sema/CMakeLists.txt
+++ b/unittests/Sema/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_unittest(SemaTests
   ExternalSemaSourceTest.cpp
+  CodeCompleteTest.cpp
   )
 
 target_link_libraries(SemaTests
diff --git a/unittests/Sema/CodeCompleteTest.cpp b/unittests/Sema/CodeCompleteTest.cpp
new file mode 100644
index 000000000000..8e888cbe528a
--- /dev/null
+++ b/unittests/Sema/CodeCompleteTest.cpp
@@ -0,0 +1,134 @@
+//=== unittests/Sema/CodeCompleteTest.cpp - Code Complete tests ==============//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Parse/ParseAST.h"
+#include "clang/Sema/Sema.h"
+#include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+namespace {
+
+using namespace clang;
+using namespace clang::tooling;
+using ::testing::UnorderedElementsAre;
+
+const char TestCCName[] = "test.cc";
+using VisitedContextResults = std::vector<std::string>;
+
+class VisitedContextFinder: public CodeCompleteConsumer {
+public:
+  VisitedContextFinder(VisitedContextResults &Results)
+      : CodeCompleteConsumer(/*CodeCompleteOpts=*/{},
+                             /*CodeCompleteConsumer*/ false),
+        VCResults(Results),
+        CCTUInfo(std::make_shared<GlobalCodeCompletionAllocator>()) {}
+
+  void ProcessCodeCompleteResults(Sema &S, CodeCompletionContext Context,
+                                  CodeCompletionResult *Results,
+                                  unsigned NumResults) override {
+    VisitedContexts = Context.getVisitedContexts();
+    VCResults = getVisitedNamespace();
+  }
+
+  CodeCompletionAllocator &getAllocator() override {
+    return CCTUInfo.getAllocator();
+  }
+
+  CodeCompletionTUInfo &getCodeCompletionTUInfo() override { return CCTUInfo; }
+
+  std::vector<std::string> getVisitedNamespace() const {
+    std::vector<std::string> NSNames;
+    for (const auto *Context : VisitedContexts)
+      if (const auto *NS = llvm::dyn_cast<NamespaceDecl>(Context))
+        NSNames.push_back(NS->getQualifiedNameAsString());
+    return NSNames;
+  }
+
+private:
+  VisitedContextResults& VCResults;
+  CodeCompletionTUInfo CCTUInfo;
+  CodeCompletionContext::VisitedContextSet VisitedContexts;
+};
+
+class CodeCompleteAction : public SyntaxOnlyAction {
+public:
+  CodeCompleteAction(ParsedSourceLocation P, VisitedContextResults &Results)
+      : CompletePosition(std::move(P)), VCResults(Results) {}
+
+  bool BeginInvocation(CompilerInstance &CI) override {
+    CI.getFrontendOpts().CodeCompletionAt = CompletePosition;
+    CI.setCodeCompletionConsumer(new VisitedContextFinder(VCResults));
+    return true;
+  }
+
+private:
+  // 1-based code complete position <Line, Col>;
+  ParsedSourceLocation CompletePosition;
+  VisitedContextResults& VCResults;
+};
+
+ParsedSourceLocation offsetToPosition(llvm::StringRef Code, size_t Offset) {
+  Offset = std::min(Code.size(), Offset);
+  StringRef Before = Code.substr(0, Offset);
+  int Lines = Before.count('\n');
+  size_t PrevNL = Before.rfind('\n');
+  size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1);
+  return {TestCCName, static_cast<unsigned>(Lines + 1),
+          static_cast<unsigned>(Offset - StartOfLine + 1)};
+}
+
+VisitedContextResults runCodeCompleteOnCode(StringRef Code) {
+  VisitedContextResults Results;
+  auto TokenOffset = Code.find('^');
+  assert(TokenOffset != StringRef::npos &&
+         "Completion token ^ wasn't found in Code.");
+  std::string WithoutToken = Code.take_front(TokenOffset);
+  WithoutToken += Code.drop_front(WithoutToken.size() + 1);
+  assert(StringRef(WithoutToken).find('^') == StringRef::npos &&
+         "expected exactly one completion token ^ inside the code");
+
+  auto Action = llvm::make_unique<CodeCompleteAction>(
+      offsetToPosition(WithoutToken, TokenOffset), Results);
+  clang::tooling::runToolOnCodeWithArgs(Action.release(), Code, {"-std=c++11"},
+                                        TestCCName);
+  return Results;
+}
+
+TEST(SemaCodeCompleteTest, VisitedNSForValidQualifiedId) {
+  auto VisitedNS = runCodeCompleteOnCode(R"cpp(
+     namespace ns1 {}
+     namespace ns2 {}
+     namespace ns3 {}
+     namespace ns3 { namespace nns3 {} }
+
+     namespace foo {
+     using namespace ns1;
+     namespace ns4 {} // not visited
+     namespace { using namespace ns2; }
+     inline namespace bar { using namespace ns3::nns3; }
+     } // foo
+     namespace ns { foo::^ }
+  )cpp");
+  EXPECT_THAT(VisitedNS, UnorderedElementsAre("foo", "ns1", "ns2", "ns3::nns3",
+                                              "foo::(anonymous)"));
+}
+
+TEST(SemaCodeCompleteTest, VisitedNSForInvalideQualifiedId) {
+  auto VisitedNS = runCodeCompleteOnCode(R"cpp(
+     namespace ns { foo::^ }
+  )cpp");
+  EXPECT_TRUE(VisitedNS.empty());
+}
+
+} // namespace
diff --git a/unittests/Tooling/CMakeLists.txt b/unittests/Tooling/CMakeLists.txt
index 557d1007ae2c..59ab01bd99fa 100644
--- a/unittests/Tooling/CMakeLists.txt
+++ b/unittests/Tooling/CMakeLists.txt
@@ -25,6 +25,7 @@ add_clang_unittest(ToolingTests
   RecursiveASTVisitorTestCallVisitor.cpp
   RecursiveASTVisitorTestDeclVisitor.cpp
   RecursiveASTVisitorTestExprVisitor.cpp
+  RecursiveASTVisitorTestPostOrderVisitor.cpp
   RecursiveASTVisitorTestTypeLocVisitor.cpp
   RefactoringActionRulesTest.cpp
   RefactoringCallbacksTest.cpp
diff --git a/unittests/Tooling/ExecutionTest.cpp b/unittests/Tooling/ExecutionTest.cpp
index b0c16d6cc5d2..26db8c6d0ea8 100644
--- a/unittests/Tooling/ExecutionTest.cpp
+++ b/unittests/Tooling/ExecutionTest.cpp
@@ -7,17 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Tooling/Execution.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
+#include "clang/Tooling/AllTUsExecution.h"
 #include "clang/Tooling/CompilationDatabase.h"
-#include "clang/Tooling/Execution.h"
 #include "clang/Tooling/StandaloneExecution.h"
 #include "clang/Tooling/ToolExecutorPluginRegistry.h"
 #include "clang/Tooling/Tooling.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <string>
@@ -44,7 +46,10 @@ class ASTConsumerWithResult
   }
 
   bool TraverseFunctionDecl(clang::FunctionDecl *Decl) {
-    Context->reportResult(Decl->getNameAsString(), "1");
+    Context->reportResult(Decl->getNameAsString(),
+                          Context->getRevision() + ":" + Context->getCorpus() +
+                              ":" + Context->getCurrentCompilationUnit() +
+                              "/1");
     return ASTVisitor::TraverseFunctionDecl(Decl);
   }
 
@@ -211,10 +216,77 @@ TEST(StandaloneToolTest, SimpleActionWithResult) {
   auto KVs = Executor.getToolResults()->AllKVResults();
   ASSERT_EQ(KVs.size(), 1u);
   EXPECT_EQ("f", KVs[0].first);
-  EXPECT_EQ("1", KVs[0].second);
+  // Currently the standlone executor returns empty corpus, revision, and
+  // compilation unit.
+  EXPECT_EQ("::/1", KVs[0].second);
+
+  Executor.getToolResults()->forEachResult(
+      [](StringRef, StringRef Value) { EXPECT_EQ("::/1", Value); });
+}
+
+class FixedCompilationDatabaseWithFiles : public CompilationDatabase {
+public:
+  FixedCompilationDatabaseWithFiles(Twine Directory,
+                                    ArrayRef<std::string> Files,
+                                    ArrayRef<std::string> CommandLine)
+      : FixedCompilations(Directory, CommandLine), Files(Files) {}
+
+  std::vector<CompileCommand>
+  getCompileCommands(StringRef FilePath) const override {
+    return FixedCompilations.getCompileCommands(FilePath);
+  }
+
+  std::vector<std::string> getAllFiles() const override { return Files; }
+
+private:
+  FixedCompilationDatabase FixedCompilations;
+  std::vector<std::string> Files;
+};
+
+MATCHER_P(Named, Name, "") { return arg.first == Name; }
 
+TEST(AllTUsToolTest, AFewFiles) {
+  FixedCompilationDatabaseWithFiles Compilations(".", {"a.cc", "b.cc", "c.cc"},
+                                                 std::vector<std::string>());
+  AllTUsToolExecutor Executor(Compilations, /*ThreadCount=*/0);
+  Executor.mapVirtualFile("a.cc", "void x() {}");
+  Executor.mapVirtualFile("b.cc", "void y() {}");
+  Executor.mapVirtualFile("c.cc", "void z() {}");
+
+  auto Err = Executor.execute(std::unique_ptr<FrontendActionFactory>(
+      new ReportResultActionFactory(Executor.getExecutionContext())));
+  ASSERT_TRUE(!Err);
+  EXPECT_THAT(
+      Executor.getToolResults()->AllKVResults(),
+      ::testing::UnorderedElementsAre(Named("x"), Named("y"), Named("z")));
+}
+
+TEST(AllTUsToolTest, ManyFiles) {
+  unsigned NumFiles = 100;
+  std::vector<std::string> Files;
+  std::map<std::string, std::string> FileToContent;
+  std::vector<std::string> ExpectedSymbols;
+  for (unsigned i = 1; i <= NumFiles; ++i) {
+    std::string File = "f" + std::to_string(i) + ".cc";
+    std::string Symbol = "looong_function_name_" + std::to_string(i);
+    Files.push_back(File);
+    FileToContent[File] = "void " + Symbol + "() {}";
+    ExpectedSymbols.push_back(Symbol);
+  }
+  FixedCompilationDatabaseWithFiles Compilations(".", Files,
+                                                 std::vector<std::string>());
+  AllTUsToolExecutor Executor(Compilations, /*ThreadCount=*/0);
+  for (const auto &FileAndContent : FileToContent) {
+    Executor.mapVirtualFile(FileAndContent.first, FileAndContent.second);
+  }
+
+  auto Err = Executor.execute(std::unique_ptr<FrontendActionFactory>(
+      new ReportResultActionFactory(Executor.getExecutionContext())));
+  ASSERT_TRUE(!Err);
+  std::vector<std::string> Results;
   Executor.getToolResults()->forEachResult(
-      [](StringRef, StringRef Value) { EXPECT_EQ("1", Value); });
+      [&](StringRef Name, StringRef) { Results.push_back(Name); });
+  EXPECT_THAT(ExpectedSymbols, ::testing::UnorderedElementsAreArray(Results));
 }
 
 } // end namespace tooling
diff --git a/unittests/Tooling/RecursiveASTVisitorTestPostOrderVisitor.cpp b/unittests/Tooling/RecursiveASTVisitorTestPostOrderVisitor.cpp
new file mode 100644
index 000000000000..2e7b398c3d1f
--- /dev/null
+++ b/unittests/Tooling/RecursiveASTVisitorTestPostOrderVisitor.cpp
@@ -0,0 +1,116 @@
+//===- unittests/Tooling/RecursiveASTVisitorPostOrderASTVisitor.cpp -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains tests for the post-order traversing functionality
+// of RecursiveASTVisitor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestVisitor.h"
+
+using namespace clang;
+
+namespace {
+
+class RecordingVisitor : public TestVisitor<RecordingVisitor> {
+
+  bool VisitPostOrder;
+
+public:
+  explicit RecordingVisitor(bool VisitPostOrder)
+      : VisitPostOrder(VisitPostOrder) {}
+
+  // List of visited nodes during traversal.
+  std::vector<std::string> VisitedNodes;
+
+  bool shouldTraversePostOrder() const { return VisitPostOrder; }
+
+  bool VisitUnaryOperator(UnaryOperator *Op) {
+    VisitedNodes.push_back(Op->getOpcodeStr(Op->getOpcode()));
+    return true;
+  }
+
+  bool VisitBinaryOperator(BinaryOperator *Op) {
+    VisitedNodes.push_back(Op->getOpcodeStr());
+    return true;
+  }
+
+  bool VisitIntegerLiteral(IntegerLiteral *Lit) {
+    VisitedNodes.push_back(Lit->getValue().toString(10, false));
+    return true;
+  }
+
+  bool VisitVarDecl(VarDecl *D) {
+    VisitedNodes.push_back(D->getNameAsString());
+    return true;
+  }
+
+  bool VisitCXXMethodDecl(CXXMethodDecl *D) {
+    VisitedNodes.push_back(D->getQualifiedNameAsString());
+    return true;
+  }
+
+  bool VisitReturnStmt(ReturnStmt *S) {
+    VisitedNodes.push_back("return");
+    return true;
+  }
+
+  bool VisitCXXRecordDecl(CXXRecordDecl *D) {
+    if (!D->isImplicit())
+      VisitedNodes.push_back(D->getQualifiedNameAsString());
+    return true;
+  }
+
+  bool VisitTemplateTypeParmType(TemplateTypeParmType *T) {
+    VisitedNodes.push_back(T->getDecl()->getQualifiedNameAsString());
+    return true;
+  }
+};
+} // namespace
+
+TEST(RecursiveASTVisitor, PostOrderTraversal) {
+  // We traverse the translation unit and store all visited nodes.
+  RecordingVisitor Visitor(true);
+  Visitor.runOver("class A {\n"
+                  "  class B {\n"
+                  "    int foo() {\n"
+                  "      while(4) { int i = 9; int j = -5; }\n"
+                  "      return (1 + 3) + 2; }\n"
+                  "  };\n"
+                  "};\n");
+
+  std::vector<std::string> expected = {"4", "9",      "i",         "5",    "-",
+                                       "j", "1",      "3",         "+",    "2",
+                                       "+", "return", "A::B::foo", "A::B", "A"};
+  // Compare the list of actually visited nodes with the expected list of
+  // visited nodes.
+  ASSERT_EQ(expected.size(), Visitor.VisitedNodes.size());
+  for (std::size_t I = 0; I < expected.size(); I++) {
+    ASSERT_EQ(expected[I], Visitor.VisitedNodes[I]);
+  }
+}
+
+TEST(RecursiveASTVisitor, NoPostOrderTraversal) {
+  // We traverse the translation unit and store all visited nodes.
+  RecordingVisitor Visitor(false);
+  Visitor.runOver("class A {\n"
+                  "  class B {\n"
+                  "    int foo() { return 1 + 2; }\n"
+                  "  };\n"
+                  "};\n");
+
+  std::vector<std::string> expected = {"A", "A::B", "A::B::foo", "return",
+                                       "+", "1",    "2"};
+  // Compare the list of actually visited nodes with the expected list of
+  // visited nodes.
+  ASSERT_EQ(expected.size(), Visitor.VisitedNodes.size());
+  for (std::size_t I = 0; I < expected.size(); I++) {
+    ASSERT_EQ(expected[I], Visitor.VisitedNodes[I]);
+  }
+}
diff --git a/unittests/Tooling/ToolingTest.cpp b/unittests/Tooling/ToolingTest.cpp
index 891907a4d081..6dd0e53c4ddc 100644
--- a/unittests/Tooling/ToolingTest.cpp
+++ b/unittests/Tooling/ToolingTest.cpp
@@ -402,6 +402,24 @@ TEST(ClangToolTest, ArgumentAdjusters) {
   EXPECT_FALSE(Found);
 }
 
+TEST(ClangToolTest, BaseVirtualFileSystemUsage) {
+  FixedCompilationDatabase Compilations("/", std::vector<std::string>());
+  llvm::IntrusiveRefCntPtr<vfs::OverlayFileSystem> OverlayFileSystem(
+      new vfs::OverlayFileSystem(vfs::getRealFileSystem()));
+  llvm::IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new vfs::InMemoryFileSystem);
+  OverlayFileSystem->pushOverlay(InMemoryFileSystem);
+
+  InMemoryFileSystem->addFile(
+      "a.cpp", 0, llvm::MemoryBuffer::getMemBuffer("int main() {}"));
+
+  ClangTool Tool(Compilations, std::vector<std::string>(1, "a.cpp"),
+                 std::make_shared<PCHContainerOperations>(), OverlayFileSystem);
+  std::unique_ptr<FrontendActionFactory> Action(
+      newFrontendActionFactory<SyntaxOnlyAction>());
+  EXPECT_EQ(0, Tool.run(Action.get()));
+}
+
 // Check getClangStripDependencyFileAdjuster doesn't strip args after -MD/-MMD.
 TEST(ClangToolTest, StripDependencyFileAdjuster) {
   FixedCompilationDatabase Compilations("/", {"-MD", "-c", "-MMD", "-w"});
diff --git a/unittests/libclang/LibclangTest.cpp b/unittests/libclang/LibclangTest.cpp
index f2a96d6be6c1..c44095d54283 100644
--- a/unittests/libclang/LibclangTest.cpp
+++ b/unittests/libclang/LibclangTest.cpp
@@ -572,3 +572,114 @@ TEST_F(LibclangReparseTest, clang_parseTranslationUnit2FullArgv) {
   EXPECT_EQ(0U, clang_getNumDiagnostics(ClangTU));
   DisplayDiagnostics();
 }
+
+class LibclangPrintingPolicyTest : public LibclangParseTest {
+public:
+  CXPrintingPolicy Policy = nullptr;
+
+  void SetUp() override {
+    LibclangParseTest::SetUp();
+    std::string File = "file.cpp";
+    WriteFile(File, "int i;\n");
+    ClangTU = clang_parseTranslationUnit(Index, File.c_str(), nullptr, 0,
+                                         nullptr, 0, TUFlags);
+    CXCursor TUCursor = clang_getTranslationUnitCursor(ClangTU);
+    Policy = clang_getCursorPrintingPolicy(TUCursor);
+  }
+  void TearDown() override {
+    clang_PrintingPolicy_dispose(Policy);
+    LibclangParseTest::TearDown();
+  }
+};
+
+TEST_F(LibclangPrintingPolicyTest, SetAndGetProperties) {
+  for (unsigned Value = 0; Value < 2; ++Value) {
+    for (int I = 0; I < CXPrintingPolicy_LastProperty; ++I) {
+      auto Property = static_cast<enum CXPrintingPolicyProperty>(I);
+
+      clang_PrintingPolicy_setProperty(Policy, Property, Value);
+      EXPECT_EQ(Value, clang_PrintingPolicy_getProperty(Policy, Property));
+    }
+  }
+}
+
+TEST_F(LibclangReparseTest, PreprocessorSkippedRanges) {
+  std::string Header = "header.h", Main = "main.cpp";
+  WriteFile(Header,
+    "#ifdef MANGOS\n"
+    "printf(\"mmm\");\n"
+    "#endif");
+  WriteFile(Main,
+    "#include \"header.h\"\n"
+    "#ifdef GUAVA\n"
+    "#endif\n"
+    "#ifdef KIWIS\n"
+    "printf(\"mmm!!\");\n"
+    "#endif");
+
+  for (int i = 0; i != 3; ++i) {
+    unsigned flags = TUFlags | CXTranslationUnit_PrecompiledPreamble;
+    if (i == 2)
+      flags |= CXTranslationUnit_CreatePreambleOnFirstParse;
+
+    if (i != 0)
+       clang_disposeTranslationUnit(ClangTU);  // dispose from previous iter
+
+    // parse once
+    ClangTU = clang_parseTranslationUnit(Index, Main.c_str(), nullptr, 0,
+                                         nullptr, 0, flags);
+    if (i != 0) {
+      // reparse
+      ASSERT_TRUE(ReparseTU(0, nullptr /* No unsaved files. */));
+    }
+
+    // Check all ranges are there
+    CXSourceRangeList *Ranges = clang_getAllSkippedRanges(ClangTU);
+    EXPECT_EQ(3U, Ranges->count);
+
+    CXSourceLocation cxl;
+    unsigned line;
+    cxl = clang_getRangeStart(Ranges->ranges[0]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(1U, line);
+    cxl = clang_getRangeEnd(Ranges->ranges[0]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(3U, line);
+
+    cxl = clang_getRangeStart(Ranges->ranges[1]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(2U, line);
+    cxl = clang_getRangeEnd(Ranges->ranges[1]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(3U, line);
+
+    cxl = clang_getRangeStart(Ranges->ranges[2]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(4U, line);
+    cxl = clang_getRangeEnd(Ranges->ranges[2]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(6U, line);
+
+    clang_disposeSourceRangeList(Ranges);
+
+    // Check obtaining ranges by each file works
+    CXFile cxf = clang_getFile(ClangTU, Header.c_str());
+    Ranges = clang_getSkippedRanges(ClangTU, cxf);
+    EXPECT_EQ(1U, Ranges->count);
+    cxl = clang_getRangeStart(Ranges->ranges[0]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(1U, line);
+    clang_disposeSourceRangeList(Ranges);
+
+    cxf = clang_getFile(ClangTU, Main.c_str());
+    Ranges = clang_getSkippedRanges(ClangTU, cxf);
+    EXPECT_EQ(2U, Ranges->count);
+    cxl = clang_getRangeStart(Ranges->ranges[0]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(2U, line);
+    cxl = clang_getRangeStart(Ranges->ranges[1]);
+    clang_getSpellingLocation(cxl, nullptr, &line, nullptr, nullptr);
+    EXPECT_EQ(4U, line);
+    clang_disposeSourceRangeList(Ranges);
+  }
+}
diff --git a/utils/TableGen/ClangAttrEmitter.cpp b/utils/TableGen/ClangAttrEmitter.cpp
index b0e2ddd91362..c7e3d31001bf 100644
--- a/utils/TableGen/ClangAttrEmitter.cpp
+++ b/utils/TableGen/ClangAttrEmitter.cpp
@@ -87,6 +87,8 @@ GetFlattenedSpellings(const Record &Attr) {
     } else if (Variety == "Clang") {
       Ret.emplace_back("GNU", Name, "", false);
       Ret.emplace_back("CXX11", Name, "clang", false);
+      if (Spelling->getValueAsBit("AllowInC"))
+        Ret.emplace_back("C2x", Name, "clang", false);
     } else
       Ret.push_back(FlattenedSpelling(*Spelling));
   }
@@ -2063,10 +2065,13 @@ void EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) {
     ArrayRef<std::pair<Record *, SMRange>> Supers = R.getSuperClasses();
     assert(!Supers.empty() && "Forgot to specify a superclass for the attr");
     std::string SuperName;
+    bool Inheritable = false;
     for (const auto &Super : llvm::reverse(Supers)) {
       const Record *R = Super.first;
       if (R->getName() != "TargetSpecificAttr" && SuperName.empty())
         SuperName = R->getName();
+      if (R->getName() == "InheritableAttr")
+        Inheritable = true;
     }
 
     OS << "class " << R.getName() << "Attr : public " << SuperName << " {\n";
@@ -2160,8 +2165,13 @@ void EmitClangAttrClass(RecordKeeper &Records, raw_ostream &OS) {
 
       OS << "             )\n";
       OS << "    : " << SuperName << "(attr::" << R.getName() << ", R, SI, "
-         << ( R.getValueAsBit("LateParsed") ? "true" : "false" ) << ", "
-         << ( R.getValueAsBit("DuplicatesAllowedWhileMerging") ? "true" : "false" ) << ")\n";
+         << ( R.getValueAsBit("LateParsed") ? "true" : "false" );
+      if (Inheritable) {
+        OS << ", "
+           << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true"
+                                                              : "false");
+      }
+      OS << ")\n";
 
       for (auto const &ai : Args) {
         OS << "              , ";
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index 8117d2f4a232..d4ce6c7c6e27 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -552,7 +552,11 @@ class NeonEmitter {
   // run - Emit arm_neon.h.inc
   void run(raw_ostream &o);
 
+  // runFP16 - Emit arm_fp16.h.inc
+  void runFP16(raw_ostream &o);
+
   // runHeader - Emit all the __builtin prototypes used in arm_neon.h
+	// and arm_fp16.h
   void runHeader(raw_ostream &o);
 
   // runTests - Emit tests for all the Neon intrinsics.
@@ -852,6 +856,35 @@ void Type::applyModifier(char Mod) {
     NumVectors = 0;
     Float = true;
     break;
+  case 'Y':
+    Bitwidth = ElementBitwidth = 16;
+    NumVectors = 0;
+    Float = true;
+    break;
+  case 'I':
+    Bitwidth = ElementBitwidth = 32;
+    NumVectors = 0;
+    Float = false;
+    Signed = true;
+    break;
+  case 'L':
+    Bitwidth = ElementBitwidth = 64;
+    NumVectors = 0;
+    Float = false;
+    Signed = true;
+    break;
+  case 'U':
+    Bitwidth = ElementBitwidth = 32;
+    NumVectors = 0;
+    Float = false;
+    Signed = false;
+    break;
+  case 'O':
+    Bitwidth = ElementBitwidth = 64;
+    NumVectors = 0;
+    Float = false;
+    Signed = false;
+    break;
   case 'f':
     Float = true;
     ElementBitwidth = 32;
@@ -1010,7 +1043,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
 }
 
 static bool isFloatingPointProtoModifier(char Mod) {
-  return Mod == 'F' || Mod == 'f' || Mod == 'H';
+  return Mod == 'F' || Mod == 'f' || Mod == 'H' || Mod == 'Y' || Mod == 'I';
 }
 
 std::string Intrinsic::getBuiltinTypeStr() {
@@ -2420,12 +2453,125 @@ void NeonEmitter::run(raw_ostream &OS) {
   OS << "#endif /* __ARM_NEON_H */\n";
 }
 
+/// run - Read the records in arm_fp16.td and output arm_fp16.h.  arm_fp16.h
+/// is comprised of type definitions and function declarations.
+void NeonEmitter::runFP16(raw_ostream &OS) {
+  OS << "/*===---- arm_fp16.h - ARM FP16 intrinsics "
+        "------------------------------"
+        "---===\n"
+        " *\n"
+        " * Permission is hereby granted, free of charge, to any person "
+        "obtaining a copy\n"
+        " * of this software and associated documentation files (the "
+				"\"Software\"), to deal\n"
+        " * in the Software without restriction, including without limitation "
+				"the rights\n"
+        " * to use, copy, modify, merge, publish, distribute, sublicense, "
+				"and/or sell\n"
+        " * copies of the Software, and to permit persons to whom the Software "
+				"is\n"
+        " * furnished to do so, subject to the following conditions:\n"
+        " *\n"
+        " * The above copyright notice and this permission notice shall be "
+        "included in\n"
+        " * all copies or substantial portions of the Software.\n"
+        " *\n"
+        " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, "
+        "EXPRESS OR\n"
+        " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF "
+        "MERCHANTABILITY,\n"
+        " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT "
+        "SHALL THE\n"
+        " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR "
+        "OTHER\n"
+        " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, "
+        "ARISING FROM,\n"
+        " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER "
+        "DEALINGS IN\n"
+        " * THE SOFTWARE.\n"
+        " *\n"
+        " *===-----------------------------------------------------------------"
+        "---"
+        "---===\n"
+        " */\n\n";
+
+  OS << "#ifndef __ARM_FP16_H\n";
+  OS << "#define __ARM_FP16_H\n\n";
+
+  OS << "#include <stdint.h>\n\n";
+
+  OS << "typedef __fp16 float16_t;\n";
+
+  OS << "#define __ai static inline __attribute__((__always_inline__, "
+        "__nodebug__))\n\n";
+
+  SmallVector<Intrinsic *, 128> Defs;
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
+  for (auto *I : Defs)
+    I->indexBody();
+
+  std::stable_sort(
+      Defs.begin(), Defs.end(),
+      [](const Intrinsic *A, const Intrinsic *B) { return *A < *B; });
+
+  // Only emit a def when its requirements have been met.
+  // FIXME: This loop could be made faster, but it's fast enough for now.
+  bool MadeProgress = true;
+  std::string InGuard;
+  while (!Defs.empty() && MadeProgress) {
+    MadeProgress = false;
+
+    for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin();
+         I != Defs.end(); /*No step*/) {
+      bool DependenciesSatisfied = true;
+      for (auto *II : (*I)->getDependencies()) {
+        if (std::find(Defs.begin(), Defs.end(), II) != Defs.end())
+          DependenciesSatisfied = false;
+      }
+      if (!DependenciesSatisfied) {
+        // Try the next one.
+        ++I;
+        continue;
+      }
+
+      // Emit #endif/#if pair if needed.
+      if ((*I)->getGuard() != InGuard) {
+        if (!InGuard.empty())
+          OS << "#endif\n";
+        InGuard = (*I)->getGuard();
+        if (!InGuard.empty())
+          OS << "#if " << InGuard << "\n";
+      }
+
+      // Actually generate the intrinsic code.
+      OS << (*I)->generate();
+
+      MadeProgress = true;
+      I = Defs.erase(I);
+    }
+  }
+  assert(Defs.empty() && "Some requirements were not satisfied!");
+  if (!InGuard.empty())
+    OS << "#endif\n";
+
+  OS << "\n";
+  OS << "#undef __ai\n\n";
+  OS << "#endif /* __ARM_FP16_H */\n";
+}
+
 namespace clang {
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS) {
   NeonEmitter(Records).run(OS);
 }
 
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS) {
+  NeonEmitter(Records).runFP16(OS);
+}
+
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) {
   NeonEmitter(Records).runHeader(OS);
 }
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 840b330a732c..a2ba131628f1 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -52,6 +52,7 @@ enum ActionType {
   GenClangCommentCommandInfo,
   GenClangCommentCommandList,
   GenArmNeon,
+  GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
   GenAttrDocs,
@@ -139,6 +140,7 @@ cl::opt<ActionType> Action(
                    "Generate list of commands that are used in "
                    "documentation comments"),
         clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
+        clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
         clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
                    "Generate ARM NEON sema support for clang"),
         clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -250,6 +252,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenArmNeon:
     EmitNeon(Records, OS);
     break;
+  case GenArmFP16:
+    EmitFP16(Records, OS);
+    break;
   case GenArmNeonSema:
     EmitNeonSema(Records, OS);
     break;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 342c889ca47a..706e812ae874 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -65,6 +65,7 @@ void EmitClangCommentCommandInfo(RecordKeeper &Records, raw_ostream &OS);
 void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
diff --git a/utils/analyzer/SATestAdd.py b/utils/analyzer/SATestAdd.py
index 4c3e35cdcb5d..041b24409fe3 100755
--- a/utils/analyzer/SATestAdd.py
+++ b/utils/analyzer/SATestAdd.py
@@ -32,11 +32,11 @@
                                      (e.g., to adapt to newer version of clang)
                                      that should be applied to CachedSource
                                      before analysis. To construct this patch,
-                                     run the the download script to download
+                                     run the download script to download
                                      the project to CachedSource, copy the
                                      CachedSource to another directory (for
                                      example, PatchedSource) and make any
-                                     needed modifications to the the copied
+                                     needed modifications to the copied
                                      source.
                                      Then run:
                                           diff -ur CachedSource PatchedSource \
diff --git a/utils/analyzer/SATestBuild.py b/utils/analyzer/SATestBuild.py
index 60c8796e338f..cc5ef4802995 100755
--- a/utils/analyzer/SATestBuild.py
+++ b/utils/analyzer/SATestBuild.py
@@ -247,6 +247,7 @@ def runScanBuild(Dir, SBOutputDir, PBuildLogFile):
     SBOptions += "-plist-html -o '%s' " % SBOutputDir
     SBOptions += "-enable-checker " + AllCheckers + " "
     SBOptions += "--keep-empty "
+    SBOptions += "-analyzer-config 'stable-report-filename=true' "
     # Always use ccc-analyze to ensure that we can locate the failures
     # directory.
     SBOptions += "--override-compiler "
diff --git a/utils/find-unused-diagnostics.sh b/utils/find-unused-diagnostics.sh
index cd48e6920fb8..783f58ba1d30 100755
--- a/utils/find-unused-diagnostics.sh
+++ b/utils/find-unused-diagnostics.sh
@@ -8,7 +8,7 @@
 ALL_DIAGS=$(grep -E --only-matching --no-filename '(err_|warn_|ext_|note_)[a-z_]+' ./include/clang/Basic/Diagnostic*.td)
 
 # Now look for all potential identifiers in the source files.
-ALL_SOURCES=$(find lib include tools -name \*.cpp -or -name \*.h)
+ALL_SOURCES=$(find lib include tools utils -name \*.cpp -or -name \*.h)
 DIAGS_IN_SOURCES=$(grep -E --only-matching --no-filename '(err_|warn_|ext_|note_)[a-z_]+' $ALL_SOURCES)
 
 # Print all diags that occur in the .td files but not in the source.
diff --git a/www/analyzer/scan-build.html b/www/analyzer/scan-build.html
index b16f6bb53fa7..83efea90151d 100644
--- a/www/analyzer/scan-build.html
+++ b/www/analyzer/scan-build.html
@@ -248,7 +248,7 @@ <h3 id="recommended_autoconf">Run './configure' through scan-build</h3>
 
 <pre class="code_example">
 $ scan-build ./configure
-$ scan-build make
+$ scan-build --keep-cc make
 </pre>
 
 <p>The reason <tt>configure</tt> also needs to be run through
diff --git a/www/cxx_dr_status.html b/www/cxx_dr_status.html
index 5bea91aa8234..6a63ef7bcc13 100644
--- a/www/cxx_dr_status.html
+++ b/www/cxx_dr_status.html
@@ -795,7 +795,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#126">126</a></td>
     <td>TC1</td>
     <td>Exception specifications and <TT>const</TT></td>
-    <td class="none" align="center">No</td>
+    <td class="partial" align="center">Partial</td>
   </tr>
   <tr id="127">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#127">127</a></td>
@@ -1292,7 +1292,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   <tr id="209">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#209">209</a></td>
     <td>NAD</td>
-    <td>Must friend declaration names be
+    <td>Must friend declaration names be
 accessible?</td>
     <td class="full" align="center">Yes</td>
   </tr>
@@ -1653,7 +1653,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   <tr id="269">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#269">269</a></td>
     <td>NAD</td>
-    <td>Order of initialization of multiply-defined static data members
+    <td>Order of initialization of multiply-defined static data members
 of class templates</td>
     <td class="na" align="center">N/A</td>
   </tr>
@@ -3268,8 +3268,8 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
   <tr id="538">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#538">538</a></td>
     <td>CD1</td>
-    <td>Definition and usage
-of <I>structure</I>, <I>POD-struct</I>, <I>POD-union</I>,
+    <td>Definition and usage
+of <I>structure</I>, <I>POD-struct</I>, <I>POD-union</I>,
 and <I>POD class</I></td>
     <td class="na" align="center">N/A</td>
   </tr>
@@ -6493,7 +6493,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1113">1113</a></td>
     <td>C++11</td>
     <td>Linkage of namespace member of unnamed namespace</td>
-    <td class="none" align="center">Unknown</td>
+    <td class="partial" align="center">Partial</td>
   </tr>
   <tr id="1114">
     <td><a href="http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1114">1114</a></td>
diff --git a/www/cxx_status.html b/www/cxx_status.html
index 265e22636296..ed9a339c178a 100644
--- a/www/cxx_status.html
+++ b/www/cxx_status.html
@@ -766,7 +766,7 @@ <h2 id="cxx17">C++17 implementation status</h2>
 functions using expression syntax are no longer guaranteed to be destroyed in
 reverse construction order in that ABI.
 </span><br>
-<span id="p0522">(12): Despite being the the resolution to a Defect Report, this
+<span id="p0522">(12): Despite being the resolution to a Defect Report, this
 feature is disabled by default in all language versions, and can be enabled
 explicitly with the flag <tt>-frelaxed-template-template-args</tt> in Clang 4
 onwards.