diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 9e670c447fbadd..b347c443da677f 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -18,7 +18,7 @@ set -o pipefail MONOREPO_ROOT="${MONOREPO_ROOT:="$(git rev-parse --show-toplevel)"}" BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" -rm -rf ${BUILD_DIR} +rm -rf "${BUILD_DIR}" ccache --zero-stats @@ -37,8 +37,8 @@ projects="${1}" targets="${2}" echo "--- cmake" -pip install -q -r ${MONOREPO_ROOT}/mlir/python/requirements.txt -cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \ +pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt +cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ -D CMAKE_BUILD_TYPE=Release \ diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 52ba13036f9159..4fd88ea81c84a8 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -19,7 +19,7 @@ set -o pipefail MONOREPO_ROOT="${MONOREPO_ROOT:="$(git rev-parse --show-toplevel)"}" BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" -rm -rf ${BUILD_DIR} +rm -rf "${BUILD_DIR}" if [[ -n "${CLEAR_CACHE:-}" ]]; then echo "clearing sccache" @@ -37,14 +37,14 @@ projects="${1}" targets="${2}" echo "--- cmake" -pip install -q -r ${MONOREPO_ROOT}/mlir/python/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt # The CMAKE_*_LINKER_FLAGS to disable the manifest come from research # on fixing a build reliability issue on the build server, please # see https://github.com/llvm/llvm-project/pull/82393 and # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40 # for further information. -cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \ +cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ -D CMAKE_BUILD_TYPE=Release \ diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index fdadef9dcd3848..c9e037c225dd41 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3547,7 +3547,7 @@ MCSymbol *BinaryFunction::getSymbolForEntryID(uint64_t EntryID) { if (!isMultiEntry()) return nullptr; - uint64_t NumEntries = 0; + uint64_t NumEntries = 1; if (hasCFG()) { for (BinaryBasicBlock *BB : BasicBlocks) { MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB); @@ -3580,7 +3580,7 @@ uint64_t BinaryFunction::getEntryIDForSymbol(const MCSymbol *Symbol) const { return 0; // Check all secondary entries available as either basic blocks or labels. - uint64_t NumEntries = 0; + uint64_t NumEntries = 1; for (const BinaryBasicBlock *BB : BasicBlocks) { MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB); if (!EntrySymbol) @@ -3589,7 +3589,7 @@ uint64_t BinaryFunction::getEntryIDForSymbol(const MCSymbol *Symbol) const { return NumEntries; ++NumEntries; } - NumEntries = 0; + NumEntries = 1; for (const std::pair<const uint32_t, MCSymbol *> &KV : Labels) { MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(KV.second); if (!EntrySymbol) diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 6fcc4a956fa1a1..0f082086c1fc24 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -25,6 +25,25 @@ extern llvm::cl::opt<bool> ProfileUseDFS; namespace llvm { namespace bolt { +/// Set CallSiteInfo destination fields from \p Symbol and return a target +/// BinaryFunction for that symbol. 
+static const BinaryFunction *setCSIDestination(const BinaryContext &BC, + yaml::bolt::CallSiteInfo &CSI, + const MCSymbol *Symbol) { + CSI.DestId = 0; // designated for unknown functions + CSI.EntryDiscriminator = 0; + if (Symbol) { + uint64_t EntryID = 0; + if (const BinaryFunction *const Callee = + BC.getFunctionForSymbol(Symbol, &EntryID)) { + CSI.DestId = Callee->getFunctionNumber(); + CSI.EntryDiscriminator = EntryID; + return Callee; + } + } + return nullptr; +} + yaml::bolt::BinaryFunctionProfile YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS) { yaml::bolt::BinaryFunctionProfile YamlBF; @@ -79,31 +98,20 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS) { continue; for (const IndirectCallProfile &CSP : ICSP.get()) { StringRef TargetName = ""; - CSI.DestId = 0; // designated for unknown functions - CSI.EntryDiscriminator = 0; - if (CSP.Symbol) { - const BinaryFunction *Callee = BC.getFunctionForSymbol(CSP.Symbol); - if (Callee) { - CSI.DestId = Callee->getFunctionNumber(); - TargetName = Callee->getOneName(); - } - } + const BinaryFunction *Callee = setCSIDestination(BC, CSI, CSP.Symbol); + if (Callee) + TargetName = Callee->getOneName(); CSI.Count = CSP.Count; CSI.Mispreds = CSP.Mispreds; CSTargets.emplace_back(TargetName, CSI); } } else { // direct call or a tail call - uint64_t EntryID = 0; - CSI.DestId = 0; StringRef TargetName = ""; const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Instr); const BinaryFunction *const Callee = - BC.getFunctionForSymbol(CalleeSymbol, &EntryID); - if (Callee) { - CSI.DestId = Callee->getFunctionNumber(); - CSI.EntryDiscriminator = EntryID; + setCSIDestination(BC, CSI, CalleeSymbol); + if (Callee) TargetName = Callee->getOneName(); - } auto getAnnotationWithDefault = [&](const MCInst &Inst, StringRef Ann) { return BC.MIB->getAnnotationWithDefault(Instr, Ann, 0ull); diff --git a/bolt/test/X86/yaml-secondary-entry-discriminator.s b/bolt/test/X86/yaml-secondary-entry-discriminator.s new file mode 100644 index 00000000000000..43c2e2a7f05549 --- /dev/null +++ b/bolt/test/X86/yaml-secondary-entry-discriminator.s @@ -0,0 +1,74 @@ +# This reproduces a bug with BOLT setting incorrect discriminator for +# secondary entry points in YAML profile. 
+ +# REQUIRES: system-linux +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata -w %t.yaml --print-profile \ +# RUN: --print-only=main | FileCheck %s --check-prefix=CHECK-CFG +# RUN: FileCheck %s -input-file %t.yaml +# CHECK: - name: main +# CHECK-NEXT: fid: 2 +# CHECK-NEXT: hash: 0xADF270D550151185 +# CHECK-NEXT: exec: 0 +# CHECK-NEXT: nblocks: 4 +# CHECK-NEXT: blocks: +# CHECK: - bid: 1 +# CHECK-NEXT: insns: 1 +# CHECK-NEXT: hash: 0x36A303CBA4360014 +# CHECK-NEXT: calls: [ { off: 0x0, fid: 1, disc: 1, cnt: 1 } ] +# CHECK: - bid: 2 +# CHECK-NEXT: insns: 5 +# CHECK-NEXT: hash: 0x8B2F5747CD0019 +# CHECK-NEXT: calls: [ { off: 0x0, fid: 1, disc: 1, cnt: 1, mis: 1 } ] + +# Make sure that the profile is attached correctly +# RUN: llvm-bolt %t.exe -o %t.out --data %t.yaml --print-profile \ +# RUN: --print-only=main | FileCheck %s --check-prefix=CHECK-CFG + +# CHECK-CFG: Binary Function "main" after attaching profile { +# CHECK-CFG: callq secondary_entry # Offset: [[#]] # Count: 1 +# CHECK-CFG: callq *%rax # Offset: [[#]] # CallProfile: 1 (1 misses) : +# CHECK-CFG-NEXT: { secondary_entry: 1 (1 misses) } + +.globl func +.type func, @function +func: +# FDATA: 0 [unknown] 0 1 func 0 1 0 + .cfi_startproc + pushq %rbp + movq %rsp, %rbp +.globl secondary_entry +secondary_entry: + popq %rbp + retq + nopl (%rax) + .cfi_endproc + .size func, .-func + +.globl main +.type main, @function +main: + .cfi_startproc + pushq %rbp + movq %rsp, %rbp + subq $16, %rsp + movl $0, -4(%rbp) + testq %rax, %rax + jne Lindcall +Lcall: + call secondary_entry +# FDATA: 1 main #Lcall# 1 secondary_entry 0 1 1 +Lindcall: + callq *%rax +# FDATA: 1 main #Lindcall# 1 secondary_entry 0 1 1 + xorl %eax, %eax + addq $16, %rsp + popq %rbp + retq +# For relocations against .text + call exit + .cfi_endproc + .size main, .-main diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 128cf45f3179bc..7d72fb06320db7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -289,6 +289,9 @@ Improvements to Clang's diagnostics annotated with the ``clang::always_destroy`` attribute. Fixes #GH68686, #GH86486 +- ``-Wmicrosoft``, ``-Wgnu``, or ``-pedantic`` is now required to diagnose C99 + flexible array members in a union or alone in a struct. Fixes GH#84565. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f4ed3e892bc076..5c6e62e59721d9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6464,9 +6464,6 @@ def ext_c99_flexible_array_member : Extension< def err_flexible_array_virtual_base : Error< "flexible array member %0 not allowed in " "%select{struct|interface|union|class|enum}1 which has a virtual base class">; -def err_flexible_array_empty_aggregate : Error< - "flexible array member %0 not allowed in otherwise empty " - "%select{struct|interface|union|class|enum}1">; def err_flexible_array_has_nontrivial_dtor : Error< "flexible array member %0 of type %1 with non-trivial destruction">; def ext_flexible_array_in_struct : Extension< @@ -6481,8 +6478,6 @@ def ext_flexible_array_empty_aggregate_ms : Extension< "flexible array member %0 in otherwise empty " "%select{struct|interface|union|class|enum}1 is a Microsoft extension">, InGroup<MicrosoftFlexibleArray>; -def err_flexible_array_union : Error< - "flexible array member %0 in a union is not allowed">; def ext_flexible_array_union_ms : Extension< "flexible array member %0 in a union is a Microsoft extension">, InGroup<MicrosoftFlexibleArray>; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index ef8842a45ae4a1..d17e53847ebe7b 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -9601,11 +9601,11 @@ static uint64_t getRVVTypeSize(ASTContext &Context, const BuiltinType *Ty) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(Ty); - unsigned EltSize = Context.getTypeSize(Info.ElementType); + uint64_t EltSize = Context.getTypeSize(Info.ElementType); if (Info.ElementType == Context.BoolTy) EltSize = 1; - unsigned MinElts = Info.EC.getKnownMinValue(); + uint64_t MinElts = Info.EC.getKnownMinValue(); return VScale->first * MinElts * EltSize; } diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index b873bc6737bb0a..c3774d0cb75edc 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -289,7 +289,7 @@ class ComplexExprEmitter const BinOpInfo &Op); QualType GetHigherPrecisionFPType(QualType ElementType) { - const auto *CurrentBT = dyn_cast<BuiltinType>(ElementType); + const auto *CurrentBT = cast<BuiltinType>(ElementType); switch (CurrentBT->getKind()) { case BuiltinType::Kind::Float16: return CGF.getContext().FloatTy; diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp index 00b04723f17dd2..3eadb19bd2058f 100644 --- a/clang/lib/CodeGen/Targets/PPC.cpp +++ b/clang/lib/CodeGen/Targets/PPC.cpp @@ -274,7 +274,7 @@ void AIXTargetCodeGenInfo::setTargetAttributes( if (!isa<llvm::GlobalVariable>(GV)) return; - auto *GVar = dyn_cast<llvm::GlobalVariable>(GV); + auto *GVar = cast<llvm::GlobalVariable>(GV); auto GVId = GV->getName(); // Is this a global variable specified by the user as toc-data? 
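Reviewer note on the casting cleanups above (CGExprComplex.cpp, PPC.cpp; the same change recurs in SemaTemplate.cpp below): when a prior check or an invariant already guarantees the dynamic type, LLVM style prefers `cast<>` (which asserts on mismatch) over `dyn_cast<>` (which returns null and implies an unhandled failure path). A minimal sketch of the idiom; `llvm/Support/Casting.h` and its `isa`/`cast`/`dyn_cast` are LLVM's real utilities, while the `Shape`/`Circle` hierarchy is made up for illustration:

```cpp
#include "llvm/Support/Casting.h"

struct Shape {
  enum Kind { SK_Circle, SK_Square } TheKind;
  Shape(Kind K) : TheKind(K) {}
};
struct Circle : Shape {
  Circle() : Shape(SK_Circle) {}
  // LLVM-style RTTI hook used by isa<>/cast<>/dyn_cast<>.
  static bool classof(const Shape *S) { return S->TheKind == SK_Circle; }
};

void visit(Shape *S) {
  if (!llvm::isa<Circle>(S))
    return;
  // The isa<> guard guarantees the type, so cast<> is correct here: it
  // asserts on a mismatch instead of silently returning null the way
  // dyn_cast<> would, turning a latent null dereference into a loud assert.
  Circle *C = llvm::cast<Circle>(S);
  (void)C;
}
```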
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8d2baa4eb763df..e2b06f3c1492a9 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -770,8 +770,9 @@ bool tools::isTLSDESCEnabled(const ToolChain &TC, void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfo &Input, bool IsThinLTO) { - const bool IsOSAIX = ToolChain.getTriple().isOSAIX(); - const bool IsAMDGCN = ToolChain.getTriple().isAMDGCN(); + const llvm::Triple &Triple = ToolChain.getTriple(); + const bool IsOSAIX = Triple.isOSAIX(); + const bool IsAMDGCN = Triple.isAMDGCN(); const char *Linker = Args.MakeArgString(ToolChain.GetLinkerPath()); const Driver &D = ToolChain.getDriver(); bool ClosedNeeded = @@ -782,7 +783,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, const bool IsUnifiedLTO = Args.hasArg(options::OPT_funified_lto); if (llvm::sys::path::filename(Linker) != "ld.lld" && llvm::sys::path::stem(Linker) != "ld.lld" && !ClosedNeeded && - !ToolChain.getTriple().isOSOpenBSD()) { + !Triple.isOSOpenBSD()) { // Tell the linker to load the plugin. This has to come before // AddLinkerInputs as gold requires -plugin and AIX ld requires -bplugin to // come before any -plugin-opt/-bplugin_opt that -Wl might forward. @@ -851,7 +852,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, // the plugin. // Handle flags for selecting CPU variants. - std::string CPU = getCPUName(D, Args, ToolChain.getTriple()); + std::string CPU = getCPUName(D, Args, Triple); if (!CPU.empty()) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + ExtraDash + "mcpu=" + CPU)); @@ -982,10 +983,9 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, bool HasRoptr = Args.hasFlag(options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr, false); StringRef OptStr = HasRoptr ? "-mxcoff-roptr" : "-mno-xcoff-roptr"; - if (!IsOSAIX) D.Diag(diag::err_drv_unsupported_opt_for_target) - << OptStr << ToolChain.getTriple().str(); + << OptStr << Triple.str(); if (HasRoptr) { // The data sections option is on by default on AIX. We only need to error @@ -1048,7 +1048,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, } if (Args.hasFlag(options::OPT_femulated_tls, options::OPT_fno_emulated_tls, - ToolChain.getTriple().hasDefaultEmulatedTLS())) { + Triple.hasDefaultEmulatedTLS())) { CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-emulated-tls")); } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 4c83a7a3a323be..b9144cf55452e2 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4827,6 +4827,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, Right.is(TT_TemplateOpener)) { return true; } + if (Left.is(tok::identifier) && Right.is(tok::numeric_constant) && + Right.TokenText[0] == '.') { + return false; + } } else if (Style.isProto()) { if (Right.is(tok::period) && Left.isOneOf(Keywords.kw_optional, Keywords.kw_required, diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp index f8f5d8d53d5691..6476c5107cb5cc 100644 --- a/clang/lib/InstallAPI/Visitor.cpp +++ b/clang/lib/InstallAPI/Visitor.cpp @@ -255,7 +255,7 @@ bool InstallAPIVisitor::VisitFunctionDecl(const FunctionDecl *D) { return true; // Skip methods in CXX RecordDecls. 
- for (auto P : D->getASTContext().getParents(*M)) { + for (const DynTypedNode &P : D->getASTContext().getParents(*M)) { if (P.get<CXXRecordDecl>()) return true; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 8b44d24f5273aa..0bd88ece2aa544 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -19429,15 +19429,11 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, } else if (Record->isUnion()) DiagID = getLangOpts().MicrosoftExt ? diag::ext_flexible_array_union_ms - : getLangOpts().CPlusPlus - ? diag::ext_flexible_array_union_gnu - : diag::err_flexible_array_union; + : diag::ext_flexible_array_union_gnu; else if (NumNamedMembers < 1) DiagID = getLangOpts().MicrosoftExt ? diag::ext_flexible_array_empty_aggregate_ms - : getLangOpts().CPlusPlus - ? diag::ext_flexible_array_empty_aggregate_gnu - : diag::err_flexible_array_empty_aggregate; + : diag::ext_flexible_array_empty_aggregate_gnu; if (DiagID) Diag(FD->getLocation(), DiagID) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 2b4805d62d07d0..dce225a7204da8 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -2329,11 +2329,11 @@ void InitListChecker::CheckStructUnionTypes( break; } - // We've already initialized a member of a union. We're done. + // We've already initialized a member of a union. We can stop entirely. if (InitializedSomething && RD->isUnion()) - break; + return; - // If we've hit the flexible array member at the end, we're done. + // Stop if we've hit a flexible array member. if (Field->getType()->isIncompleteArrayType()) break; @@ -2456,6 +2456,11 @@ void InitListChecker::CheckStructUnionTypes( else CheckImplicitInitList(MemberEntity, IList, Field->getType(), Index, StructuredList, StructuredIndex); + + if (RD->isUnion() && StructuredList) { + // Initialize the first field within the union. + StructuredList->setInitializedFieldInUnion(*Field); + } } /// Expand a field designator that refers to a member of an diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 005529a53270c3..aab72dbaf48c46 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2974,7 +2974,7 @@ void DeclareImplicitDeductionGuidesForTypeAlias( if (auto *FPrime = SemaRef.InstantiateFunctionDeclaration( F, TemplateArgListForBuildingFPrime, AliasTemplate->getLocation(), Sema::CodeSynthesisContext::BuildingDeductionGuides)) { - auto *GG = dyn_cast<CXXDeductionGuideDecl>(FPrime); + auto *GG = cast<CXXDeductionGuideDecl>(FPrime); buildDeductionGuide(SemaRef, AliasTemplate, FPrimeTemplateParamList, GG->getCorrespondingConstructor(), GG->getExplicitSpecifier(), GG->getTypeSourceInfo(), diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 97f8445bf819c8..9a55881f644254 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -5514,9 +5514,9 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( QualType Obj2Ty; if (TPOC == TPOC_Call) { const FunctionProtoType *Proto1 = - FD1->getType()->getAs<FunctionProtoType>(); + FD1->getType()->castAs<FunctionProtoType>(); const FunctionProtoType *Proto2 = - FD2->getType()->getAs<FunctionProtoType>(); + FD2->getType()->castAs<FunctionProtoType>(); // - In the context of a function call, the function parameter types are // used. 
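To make the SemaDecl/SemaInit changes above concrete before the tests that follow: the patch downgrades two hard errors to extension diagnostics, and records the initialized union field instead of walking past it. A hedged sketch (not taken from the patch; type names are invented) of the constructs that now compile silently by default and are diagnosed only under -Wgnu, -Wmicrosoft (in -fms-compatibility mode), or -pedantic:

```cpp
// Flexible array member in a union: previously a hard error in C,
// now ext_flexible_array_union_gnu / ext_flexible_array_union_ms.
union U {
  int i;
  short tail[];
};

// Flexible array member alone in an otherwise empty struct: previously
// err_flexible_array_empty_aggregate, now likewise an extension warning.
struct S {
  char tail[];
};

// Per the SemaInit change, initializing one member of a union records that
// field via setInitializedFieldInUnion() and stops, rather than continuing
// to walk the remaining members (which could reach the flexible array).
U u = {42};
```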
diff --git a/clang/test/C/drs/dr5xx.c b/clang/test/C/drs/dr5xx.c index 68bcef78baccd7..13464f78b6a654 100644 --- a/clang/test/C/drs/dr5xx.c +++ b/clang/test/C/drs/dr5xx.c @@ -29,7 +29,7 @@ void dr502(void) { */ struct t { int i; - struct { int a[]; }; /* expected-error {{flexible array member 'a' not allowed in otherwise empty struct}} + struct { int a[]; }; /* expected-warning {{flexible array member 'a' in otherwise empty struct is a GNU extension}} c89only-warning {{flexible array members are a C99 feature}} expected-warning {{'' may not be nested in a struct due to flexible array member}} */ diff --git a/clang/test/CodeGen/flexible-array-init.c b/clang/test/CodeGen/flexible-array-init.c index bae926da5feb07..15a30c15ac966e 100644 --- a/clang/test/CodeGen/flexible-array-init.c +++ b/clang/test/CodeGen/flexible-array-init.c @@ -3,9 +3,15 @@ struct { int x; int y[]; } a = { 1, 7, 11 }; // CHECK: @a ={{.*}} global { i32, [2 x i32] } { i32 1, [2 x i32] [i32 7, i32 11] } +struct { int y[]; } a1 = { 8, 12 }; +// CHECK: @a1 ={{.*}} global { [2 x i32] } { [2 x i32] [i32 8, i32 12] } + struct { int x; int y[]; } b = { 1, { 13, 15 } }; // CHECK: @b ={{.*}} global { i32, [2 x i32] } { i32 1, [2 x i32] [i32 13, i32 15] } +struct { int y[]; } b1 = { { 14, 16 } }; +// CHECK: @b1 ={{.*}} global { [2 x i32] } { [2 x i32] [i32 14, i32 16] } + // sizeof(c) == 8, so this global should be at least 8 bytes. struct { int x; char c; char y[]; } c = { 1, 2, { 13, 15 } }; // CHECK: @c ={{.*}} global { i32, i8, [2 x i8] } { i32 1, i8 2, [2 x i8] c"\0D\0F" } @@ -21,10 +27,79 @@ struct __attribute((packed, aligned(4))) { char a; int x; char z[]; } e = { 1, 2 struct { int x; char y[]; } f = { 1, { 13, 15 } }; // CHECK: @f ={{.*}} global <{ i32, [2 x i8] }> <{ i32 1, [2 x i8] c"\0D\0F" }> -union { - struct { - int a; - char b[]; - } x; -} in_union = {}; -// CHECK: @in_union ={{.*}} global %union.anon zeroinitializer +struct __attribute((packed)) { short a; char z[]; } g = { 2, { 11, 13, 15 } }; +// CHECK: @g ={{.*}} <{ i16, [3 x i8] }> <{ i16 2, [3 x i8] c"\0B\0D\0F" }>, + +// Last member is the potential flexible array, unnamed initializer skips it. +struct { int a; union { int b; short x; }; int c; int d; } h = {1, 2, {}, 3}; +// CHECK: @h = global %struct.anon{{.*}} { i32 1, %union.anon{{.*}} { i32 2 }, i32 0, i32 3 } +struct { int a; union { int b; short x[0]; }; int c; int d; } h0 = {1, 2, {}, 3}; +// CHECK: @h0 = global %struct.anon{{.*}} { i32 1, %union.anon{{.*}} { i32 2 }, i32 0, i32 3 } +struct { int a; union { int b; short x[1]; }; int c; int d; } h1 = {1, 2, {}, 3}; +// CHECK: @h1 = global %struct.anon{{.*}} { i32 1, %union.anon{{.*}} { i32 2 }, i32 0, i32 3 } +struct { + int a; + union { + int b; + struct { + struct { } __ununsed; + short x[]; + }; + }; + int c; + int d; +} hiding = {1, 2, {}, 3}; +// CHECK: @hiding = global %struct.anon{{.*}} { i32 1, %union.anon{{.*}} { i32 2 }, i32 0, i32 3 } +struct { int a; union { int b; short x[]; }; int c; int d; } hf = {1, 2, {}, 3}; +// CHECK: @hf = global %struct.anon{{.*}} { i32 1, %union.anon{{.*}} { i32 2 }, i32 0, i32 3 } + +// First member is the potential flexible array, initialization requires braces. 
+struct { int a; union { short x; int b; }; int c; int d; } i = {1, 2, {}, 3}; +// CHECK: @i = global { i32, { i16, [2 x i8] }, i32, i32 } { i32 1, { i16, [2 x i8] } { i16 2, [2 x i8] undef }, i32 0, i32 3 } +struct { int a; union { short x[0]; int b; }; int c; int d; } i0 = {1, {}, 2, 3}; +// CHECK: @i0 = global { i32, { [0 x i16], [4 x i8] }, i32, i32 } { i32 1, { [0 x i16], [4 x i8] } { [0 x i16] zeroinitializer, [4 x i8] undef }, i32 2, i32 3 } +struct { int a; union { short x[1]; int b; }; int c; int d; } i1 = {1, {2}, {}, 3}; +// CHECK: @i1 = global { i32, { [1 x i16], [2 x i8] }, i32, i32 } { i32 1, { [1 x i16], [2 x i8] } { [1 x i16] [i16 2], [2 x i8] undef }, i32 0, i32 3 } +struct { int a; union { short x[]; int b; }; int c; int d; } i_f = {4, {}, {}, 6}; +// CHECK: @i_f = global { i32, { [0 x i16], [4 x i8] }, i32, i32 } { i32 4, { [0 x i16], [4 x i8] } { [0 x i16] zeroinitializer, [4 x i8] undef }, i32 0, i32 6 } + +// Named initializers; order doesn't matter. +struct { int a; union { int b; short x; }; int c; int d; } hn = {.a = 1, .x = 2, .c = 3}; +// CHECK: @hn = global { i32, { i16, [2 x i8] }, i32, i32 } { i32 1, { i16, [2 x i8] } { i16 2, [2 x i8] undef }, i32 3, i32 0 } +struct { int a; union { int b; short x[0]; }; int c; int d; } hn0 = {.a = 1, .x = {2}, .c = 3}; +// CHECK: @hn0 = global { i32, { [0 x i16], [4 x i8] }, i32, i32 } { i32 1, { [0 x i16], [4 x i8] } { [0 x i16] zeroinitializer, [4 x i8] undef }, i32 3, i32 0 } +struct { int a; union { int b; short x[1]; }; int c; int d; } hn1 = {.a = 1, .x = {2}, .c = 3}; +// CHECK: @hn1 = global { i32, { [1 x i16], [2 x i8] }, i32, i32 } { i32 1, { [1 x i16], [2 x i8] } { [1 x i16] [i16 2], [2 x i8] undef }, i32 3, i32 0 } + +struct { char a[]; } empty_struct = {}; +// CHECK: @empty_struct ={{.*}} global %struct.anon{{.*}} zeroinitializer, align 1 + +struct { char a[]; } empty_struct0 = {0}; +// CHECK: @empty_struct0 = global { [1 x i8] } zeroinitializer, align 1 + +union { struct { int a; char b[]; }; } struct_in_union = {}; +// CHECK: @struct_in_union = global %union.anon{{.*}} zeroinitializer, align 4 + +union { struct { int a; char b[]; }; } struct_in_union0 = {0}; +// CHECK: @struct_in_union0 = global %union.anon{{.*}} zeroinitializer, align 4 + +union { int a; char b[]; } trailing_in_union = {}; +// CHECK: @trailing_in_union = global %union.anon{{.*}} zeroinitializer, align 4 + +union { int a; char b[]; } trailing_in_union0 = {0}; +// CHECK: @trailing_in_union0 = global %union.anon{{.*}} zeroinitializer, align 4 + +union { char a[]; } only_in_union = {}; +// CHECK: @only_in_union = global %union.anon{{.*}} zeroinitializer, align 1 + +union { char a[]; } only_in_union0 = {0}; +// CHECK: @only_in_union0 = global { [1 x i8] } zeroinitializer, align 1 + +union { char a[]; int b; } first_in_union = {}; +// CHECK: @first_in_union = global { [0 x i8], [4 x i8] } { [0 x i8] zeroinitializer, [4 x i8] undef }, align 4 + +union { char a[]; int b; } first_in_union0 = {0}; +// CHECK: @first_in_union0 = global { [1 x i8], [3 x i8] } { [1 x i8] zeroinitializer, [3 x i8] undef }, align 4 + +union { char a[]; int b; } first_in_union123 = { {1, 2, 3} }; +// CHECK: @first_in_union123 = global { [3 x i8], i8 } { [3 x i8] c"\01\02\03", i8 undef }, align 4 diff --git a/clang/test/CodeGen/flexible-array-init.cpp b/clang/test/CodeGen/flexible-array-init.cpp new file mode 100644 index 00000000000000..d067a614e1afe5 --- /dev/null +++ b/clang/test/CodeGen/flexible-array-init.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple 
i386-unknown-unknown -x c++ -emit-llvm -o - %s | FileCheck %s + +union _u { char a[]; } u = {}; +union _u0 { char a[]; } u0 = {0}; + +// CHECK: %union._u = type { [0 x i8] } + +// CHECK: @u = global %union._u zeroinitializer, align 1 +// CHECK: @u0 = global { [1 x i8] } zeroinitializer, align 1 + +union { char a[]; } z = {}; +// CHECK: @z = internal global %union.{{.*}} zeroinitializer, align 1 +union { char a[]; } z0 = {0}; +// CHECK: @z0 = internal global { [1 x i8] } zeroinitializer, align 1 + +/* C++ requires global anonymous unions have static storage, so we have to + reference them to keep them in the IR output. */ +char keep(int pick) +{ + if (pick) + return z.a[0]; + else + return z0.a[0]; +} diff --git a/clang/test/Sema/flexible-array-in-union.c b/clang/test/Sema/flexible-array-in-union.c index 5fabfbe0b1eaab..dd5e8069665fea 100644 --- a/clang/test/Sema/flexible-array-in-union.c +++ b/clang/test/Sema/flexible-array-in-union.c @@ -1,13 +1,188 @@ -// RUN: %clang_cc1 %s -verify=c -fsyntax-only -// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ -// RUN: %clang_cc1 %s -verify -fsyntax-only -fms-compatibility -// RUN: %clang_cc1 %s -verify -fsyntax-only -fms-compatibility -x c++ +// RUN: %clang_cc1 %s -verify=stock,c -fsyntax-only +// RUN: %clang_cc1 %s -verify=stock,cpp -fsyntax-only -x c++ +// RUN: %clang_cc1 %s -verify=stock,cpp -fsyntax-only -fms-compatibility -x c++ +// RUN: %clang_cc1 %s -verify=stock,c,gnu -fsyntax-only -Wgnu-flexible-array-union-member -Wgnu-empty-struct +// RUN: %clang_cc1 %s -verify=stock,c,microsoft -fsyntax-only -fms-compatibility -Wmicrosoft // The test checks that an attempt to initialize union with flexible array // member with an initializer list doesn't crash clang. -union { char x[]; } r = {0}; // c-error {{flexible array member 'x' in a union is not allowed}} +union { char x[]; } r = {0}; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ +struct _name1 { + int a; + union { + int b; + char x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + }; +} name1 = { + 10, + 42, /* initializes "b" */ +}; -// expected-no-diagnostics +struct _name1i { + int a; + union { + int b; + char x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + }; +} name1i = { + .a = 10, + .b = 42, +}; + +/* Initialization of flexible array in a union is never allowed. */ +struct _name2 { + int a; + union { + int b; + char x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + }; +} name2 = { + 12, + 13, + { 'c' }, /* c-warning {{excess elements in struct initializer}} + cpp-error {{excess elements in struct initializer}} + */ +}; + +/* Initialization of flexible array in a union is never allowed. 
*/ +struct _name2i { + int a; + union { + int b; + char x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + stock-note {{initialized flexible array member 'x' is here}} + */ + }; +} name2i = { + .a = 12, + .b = 13, /* stock-note {{previous initialization is here}} */ + .x = { 'c' }, /* stock-error {{initialization of flexible array member is not allowed}} + c-warning {{initializer overrides prior initialization of this subobject}} + cpp-error {{initializer partially overrides prior initialization of this subobject}} + */ +}; + +/* Flexible array initialization always allowed when not in a union, + and when struct has another member. + */ +struct _okay { + int a; + char x[]; +} okay = { + 22, + { 'x', 'y', 'z' }, +}; + +struct _okayi { + int a; + char x[]; +} okayi = { + .a = 22, + .x = { 'x', 'y', 'z' }, +}; + +struct _okay0 { + int a; + char x[]; +} okay0 = { }; + +struct _flex_extension { + char x[]; /* gnu-warning {{flexible array member 'x' in otherwise empty struct is a GNU extension}} + microsoft-warning {{flexible array member 'x' in otherwise empty struct is a Microsoft extension}} + */ +} flex_extension = { + { 'x', 'y', 'z' }, +}; + +struct _flex_extensioni { + char x[]; /* gnu-warning {{flexible array member 'x' in otherwise empty struct is a GNU extension}} + microsoft-warning {{flexible array member 'x' in otherwise empty struct is a Microsoft extension}} + */ +} flex_extensioni = { + .x = { 'x', 'y', 'z' }, +}; + +struct already_hidden { + int a; + union { + int b; + struct { + struct { } __empty; // gnu-warning {{empty struct is a GNU extension}} + char x[]; + }; + }; +}; +struct still_zero_sized { + struct { } __unused; // gnu-warning {{empty struct is a GNU extension}} + int x[]; +}; + +struct warn1 { + int a; + union { + int b; + char x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + }; +}; + +struct warn2 { + int x[]; /* gnu-warning {{flexible array member 'x' in otherwise empty struct is a GNU extension}} + microsoft-warning {{flexible array member 'x' in otherwise empty struct is a Microsoft extension}} + */ +}; + +union warn3 { + short x[]; /* gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ +}; + +struct quiet1 { + int a; + short x[]; +}; + +struct _not_at_end { + union { short x[]; }; /* stock-warning-re {{field '' with variable sized type '{{.*}}' not at the end of a struct or class is a GNU extension}} + gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + int y; +} not_at_end = {{}, 3}; + +struct _not_at_end_s { + struct { int a; short x[]; }; /* stock-warning-re {{field '' with variable sized type '{{.*}}' not at the end of a struct or class is a GNU extension}} */ + int y; +} not_at_end_s = {{}, 3}; + +struct { + int a; + union { /* stock-warning-re {{field '' with variable sized type '{{.*}}' not at the end of a struct or class is a GNU extension}} */ + short x[]; /* stock-note {{initialized flexible array member 'x' is here}} + gnu-warning {{flexible array member 'x' in a union is a GNU extension}} + microsoft-warning {{flexible array member 'x' in a union is a Microsoft extension}} + */ + int b; + }; + 
int c; + int d; +} i_f = { 4, + {5}, /* stock-error {{initialization of flexible array member is not allowed}} */ + {}, + 6}; + +// expected-no-diagnostics diff --git a/clang/test/Sema/transparent-union.c b/clang/test/Sema/transparent-union.c index c134a7a9b1c4d0..f02c2298b51ce1 100644 --- a/clang/test/Sema/transparent-union.c +++ b/clang/test/Sema/transparent-union.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wgnu-flexible-array-union-member %s typedef union { int *ip; float *fp; @@ -131,7 +131,7 @@ union pr15134v2 { union pr30520v { void b; } __attribute__((transparent_union)); // expected-error {{field has incomplete type 'void'}} -union pr30520a { int b[]; } __attribute__((transparent_union)); // expected-error {{flexible array member 'b' in a union is not allowed}} +union pr30520a { int b[]; } __attribute__((transparent_union)); // expected-warning {{flexible array member 'b' in a union is a GNU extension}} // expected-note@+1 2 {{forward declaration of 'struct stb'}} union pr30520s { struct stb b; } __attribute__((transparent_union)); // expected-error {{field has incomplete type 'struct stb'}} diff --git a/clang/tools/clang-format/clang-format-diff.py b/clang/tools/clang-format/clang-format-diff.py index 0a2c24743678d0..3a74b90e731578 100755 --- a/clang/tools/clang-format/clang-format-diff.py +++ b/clang/tools/clang-format/clang-format-diff.py @@ -138,6 +138,7 @@ def main(): ) # Reformat files containing changes in place. + has_diff = False for filename, lines in lines_by_file.items(): if args.i and args.verbose: print("Formatting {}".format(filename)) @@ -169,7 +170,7 @@ def main(): stdout, stderr = p.communicate() if p.returncode != 0: - sys.exit(p.returncode) + return p.returncode if not args.i: with open(filename) as f: @@ -185,9 +186,12 @@ def main(): ) diff_string = "".join(diff) if len(diff_string) > 0: + has_diff = True sys.stdout.write(diff_string) - sys.exit(1) + + if has_diff: + return 1 if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index d1e977dfa66af5..33dec7dae319f0 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -12075,6 +12075,7 @@ TEST_F(FormatTest, UnderstandsSquareAttributes) { verifyFormat("SomeType s [[gnu::unused]] (InitValue);"); verifyFormat("SomeType s [[using gnu: unused]] (InitValue);"); verifyFormat("[[gsl::suppress(\"clang-tidy-check-name\")]] void f() {}"); + verifyFormat("[[suppress(type.5)]] int uninitialized_on_purpose;"); verifyFormat("void f() [[deprecated(\"so sorry\")]];"); verifyFormat("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" " [[unused]] aaaaaaaaaaaaaaaaaaaaaaa(int i);"); diff --git a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp index 3e41f67ba922b7..17a596d712d0ca 100644 --- a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp @@ -136,7 +136,7 @@ TEST(ScudoStringsTest, CapacityIncreaseFails) { rlimit Limit = {}; EXPECT_EQ(0, getrlimit(RLIMIT_AS, &Limit)); - rlimit EmptyLimit = {.rlim_max = Limit.rlim_max}; + rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); // Test requires that the default length is at least 6 characters. 
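Both scudo test fixes (strings_test.cpp above and vector_test.cpp below) make the same designated-initializer change. A hedged sketch of why it is behavior-preserving: members omitted from a designated initializer are value-initialized, so `.rlim_cur` was already zero; naming it simply makes the intent explicit. The helper name below is hypothetical, only `getrlimit`/`setrlimit` and `RLIMIT_AS` are real:

```cpp
#include <sys/resource.h>

// Hypothetical helper mirroring the test setup: drop the address-space soft
// limit to 0 so further allocations fail, while keeping the hard limit so
// the limit can be restored later.
int dropAddressSpaceLimit() {
  rlimit Limit = {};
  if (getrlimit(RLIMIT_AS, &Limit) != 0)
    return -1;
  // Equivalent to {.rlim_max = Limit.rlim_max}: the omitted .rlim_cur was
  // value-initialized to 0 anyway; spelling it out documents the intent.
  rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max};
  return setrlimit(RLIMIT_AS, &EmptyLimit);
}
```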
diff --git a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp index b7678678d8a294..add62c5a42a3e4 100644 --- a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp @@ -55,7 +55,7 @@ TEST(ScudoVectorTest, ReallocateFails) { rlimit Limit = {}; EXPECT_EQ(0, getrlimit(RLIMIT_AS, &Limit)); - rlimit EmptyLimit = {.rlim_max = Limit.rlim_max}; + rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); V.resize(capacity); diff --git a/compiler-rt/test/tsan/signal_in_futex_wait.cpp b/compiler-rt/test/tsan/Linux/signal_in_futex_wait.cpp similarity index 94% rename from compiler-rt/test/tsan/signal_in_futex_wait.cpp rename to compiler-rt/test/tsan/Linux/signal_in_futex_wait.cpp index cf31e5467486ad..3c8804aae3d09c 100644 --- a/compiler-rt/test/tsan/signal_in_futex_wait.cpp +++ b/compiler-rt/test/tsan/Linux/signal_in_futex_wait.cpp @@ -1,6 +1,6 @@ // RUN: %clang_tsan %s -lstdc++ -o %t && %run %t 2>&1 | FileCheck %s -#include "test.h" +#include "../test.h" #include #include #include @@ -57,16 +57,13 @@ class Mutex { Mutex mutex; void *Thread(void *x) { - // fprintf(stderr, "canova here thread 0\n"); // Waiting for the futex. mutex.lock(); - // fprintf(stderr, "canova here thread 1\n"); // Finished waiting. return nullptr; } static void SigprofHandler(int signal, siginfo_t *info, void *context) { - // fprintf(stderr, "canova here sigprof handler\n"); // Unlock the futex. mutex.unlock(); } diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 03c7a823e6e96b..1497e32044e975 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -31,10 +31,15 @@ #define NAN __builtin_nanf("") #define FP_ILOGB0 (-INT_MAX - 1) -#define FP_ILOGBNAN INT_MAX - #define FP_LLOGB0 (-LONG_MAX - 1) + +#ifdef __FP_LOGBNAN_MIN +#define FP_ILOGBNAN (-INT_MAX - 1) +#define FP_LLOGBNAN (-LONG_MAX - 1) +#else +#define FP_ILOGBNAN INT_MAX #define FP_LLOGBNAN LONG_MAX +#endif #ifdef __FAST_MATH__ #define math_errhandling 0 diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h index f746d7ac6ad41f..a47931bb33900a 100644 --- a/libc/src/__support/FPUtil/BasicOperations.h +++ b/libc/src/__support/FPUtil/BasicOperations.h @@ -30,36 +30,32 @@ template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0> LIBC_INLINE T fmin(T x, T y) { const FPBits<T> bitx(x), bity(y); - if (bitx.is_nan()) { + if (bitx.is_nan()) return y; - } else if (bity.is_nan()) { + if (bity.is_nan()) return x; - } else if (bitx.sign() != bity.sign()) { + if (bitx.sign() != bity.sign()) // To make sure that fmin(+0, -0) == -0 == fmin(-0, +0), whenever x and // y has different signs and both are not NaNs, we return the number // with negative sign. - return (bitx.is_neg()) ? x : y; - } else { - return (x < y ? x : y); - } + return bitx.is_neg() ? x : y; + return x < y ? x : y; } template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0> LIBC_INLINE T fmax(T x, T y) { FPBits<T> bitx(x), bity(y); - if (bitx.is_nan()) { + if (bitx.is_nan()) return y; - } else if (bity.is_nan()) { + if (bity.is_nan()) return x; - } else if (bitx.sign() != bity.sign()) { + if (bitx.sign() != bity.sign()) // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and // y has different signs and both are not NaNs, we return the number // with positive sign. - return (bitx.is_neg() ? y : x); - } else { - return (x > y ? 
x : y); - } + return bitx.is_neg() ? y : x; + return x > y ? x : y; } template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0> diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 54de6d1603cf41..8d7ae630246fb8 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2724,14 +2724,8 @@ void LinkerDriver::link(opt::InputArgList &args) { parseFiles(files, armCmseImpLib); - // Now that we have every file, we can decide if we will need a - // dynamic symbol table. - // We need one if we were asked to export dynamic symbols or if we are - // producing a shared library. - // We also need one if any shared libraries are used and for pie executables - // (probably because the dynamic linker needs it). - config->hasDynSymTab = - !ctx.sharedFiles.empty() || config->isPic || config->exportDynamic; + // Create dynamic sections for dynamic linking and static PIE. + config->hasDynSymTab = !ctx.sharedFiles.empty() || config->isPic; script->addScriptReferencedSymbolsToSymTable(); diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index cd2b9e22ab3224..93653def328f82 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -539,8 +539,8 @@ void elf::reportDuplicate(const Symbol &sym, const InputFile *newFile, if (!d->section && !errSec && errOffset && d->value == errOffset) return; if (!d->section || !errSec) { - error("duplicate symbol: " + toString(sym) + "\n>>> defined in " + - toString(sym.file) + "\n>>> defined in " + toString(newFile)); + errorOrWarn("duplicate symbol: " + toString(sym) + "\n>>> defined in " + + toString(sym.file) + "\n>>> defined in " + toString(newFile)); return; } @@ -564,7 +564,7 @@ void elf::reportDuplicate(const Symbol &sym, const InputFile *newFile, if (!src2.empty()) msg += src2 + "\n>>> "; msg += obj2; - error(msg); + errorOrWarn(msg); } void Symbol::checkDuplicate(const Defined &other) const { diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp index c5c0c8a89e2879..279423720be9d5 100644 --- a/lld/MachO/ConcatOutputSection.cpp +++ b/lld/MachO/ConcatOutputSection.cpp @@ -323,11 +323,7 @@ void TextOutputSection::finalize() { thunkInfo.isec = makeSyntheticInputSection(isec->getSegName(), isec->getName()); thunkInfo.isec->parent = this; - - // This code runs after dead code removal. Need to set the `live` bit - // on the thunk isec so that asserts that check that only live sections - // get written are happy. - thunkInfo.isec->live = true; + assert(thunkInfo.isec->live); StringRef thunkName = saver().save(funcSym->getName() + ".thunk." 
+ std::to_string(thunkInfo.sequence++)); diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index f820513a111ea3..7b45f7f4c39a1b 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -135,6 +135,7 @@ struct Configuration { bool emitEncryptionInfo = false; bool emitInitOffsets = false; bool emitChainedFixups = false; + bool emitRelativeMethodLists = false; bool thinLTOEmitImportsFiles; bool thinLTOEmitIndexFiles; bool thinLTOIndexOnly; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 14c111ce9685c9..65de531db04b75 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1086,6 +1086,22 @@ static bool shouldEmitChainedFixups(const InputArgList &args) { return isRequested; } +static bool shouldEmitRelativeMethodLists(const InputArgList &args) { + const Arg *arg = args.getLastArg(OPT_objc_relative_method_lists, + OPT_no_objc_relative_method_lists); + if (arg && arg->getOption().getID() == OPT_objc_relative_method_lists) + return true; + if (arg && arg->getOption().getID() == OPT_no_objc_relative_method_lists) + return false; + + // TODO: If no flag is specified, don't default to false, but instead: + // - default false on < ios14 + // - default true on >= ios14 + // For now, until this feature is confirmed stable, default to false if no + // flag is explicitly specified + return false; +} + void SymbolPatterns::clear() { literals.clear(); globs.clear(); @@ -1630,6 +1646,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS, config->emitChainedFixups = shouldEmitChainedFixups(args); config->emitInitOffsets = config->emitChainedFixups || args.hasArg(OPT_init_offsets); + config->emitRelativeMethodLists = shouldEmitRelativeMethodLists(args); config->icfLevel = getICFLevel(args); config->dedupStrings = args.hasFlag(OPT_deduplicate_strings, OPT_no_deduplicate_strings, true); diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index 22930d52dd1db2..5c1e07cd21b1fb 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -46,6 +46,14 @@ void lld::macho::addInputSection(InputSection *inputSection) { if (auto *isec = dyn_cast<ConcatInputSection>(inputSection)) { if (isec->isCoalescedWeak()) return; + if (config->emitRelativeMethodLists && + ObjCMethListSection::isMethodList(isec)) { + if (in.objcMethList->inputOrder == UnspecifiedInputOrder) + in.objcMethList->inputOrder = inputSectionsOrder++; + in.objcMethList->addInput(isec); + isec->parent = in.objcMethList; + return; + } if (config->emitInitOffsets && sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS) { in.initOffsets->addInput(isec); @@ -273,6 +281,9 @@ ConcatInputSection *macho::makeSyntheticInputSection(StringRef segName, Section &section = *make<Section>
(/*file=*/nullptr, segName, sectName, flags, /*addr=*/0); auto isec = make<ConcatInputSection>(section, data, align); + // Since this is an explicitly created 'fake' input section, + // it should not be dead stripped. + isec->live = true; section.subsections.push_back({0, isec}); return isec; } diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h index 694bdf734907ba..0f389e50425a32 100644 --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -149,6 +149,7 @@ class ConcatInputSection final : public InputSection { }; // Initialize a fake InputSection that does not belong to any InputFile. +// The created ConcatInputSection will always have 'live=true' ConcatInputSection *makeSyntheticInputSection(StringRef segName, StringRef sectName, uint32_t flags = 0, @@ -342,6 +343,7 @@ constexpr const char moduleTermFunc[] = "__mod_term_func"; constexpr const char nonLazySymbolPtr[] = "__nl_symbol_ptr"; constexpr const char objcCatList[] = "__objc_catlist"; constexpr const char objcClassList[] = "__objc_classlist"; +constexpr const char objcMethList[] = "__objc_methlist"; constexpr const char objcClassRefs[] = "__objc_classrefs"; constexpr const char objcConst[] = "__objc_const"; constexpr const char objCImageInfo[] = "__objc_imageinfo"; diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp index f736360624ebd1..2a31a5c09cdd22 100644 --- a/lld/MachO/MapFile.cpp +++ b/lld/MachO/MapFile.cpp @@ -197,18 +197,24 @@ void macho::writeMapFile() { seg->name.str().c_str(), osec->name.str().c_str()); } + // Shared function to print an array of symbols. + auto printIsecArrSyms = [&](const std::vector<ConcatInputSection *> &arr) { + for (const ConcatInputSection *isec : arr) { + for (Defined *sym : isec->symbols) { + if (!(isPrivateLabel(sym->getName()) && sym->size == 0)) + os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(), + sym->size, readerToFileOrdinal[sym->getFile()], + sym->getName().str().data()); + } + } + }; + os << "# Symbols:\n"; os << "# Address\tSize \tFile Name\n"; for (const OutputSegment *seg : outputSegments) { for (const OutputSection *osec : seg->getSections()) { if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) { - for (const InputSection *isec : concatOsec->inputs) { - for (Defined *sym : isec->symbols) - if (!(isPrivateLabel(sym->getName()) && sym->size == 0)) - os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(), - sym->size, readerToFileOrdinal[sym->getFile()], - sym->getName().str().data()); - } + printIsecArrSyms(concatOsec->inputs); } else if (osec == in.cStringSection || osec == in.objcMethnameSection) { const auto &liveCStrings = info.liveCStringsForSection.lookup(osec); uint64_t lastAddr = 0; // strings will never start at address 0, so this @@ -237,6 +243,8 @@ void macho::writeMapFile() { printNonLazyPointerSection(os, in.got); } else if (osec == in.tlvPointers) { printNonLazyPointerSection(os, in.tlvPointers); + } else if (osec == in.objcMethList) { + printIsecArrSyms(in.objcMethList->getInputs()); } // TODO print other synthetic sections } diff --git a/lld/MachO/ObjC.h b/lld/MachO/ObjC.h index 9fbe984e6223ec..8081605670c519 100644 --- a/lld/MachO/ObjC.h +++ b/lld/MachO/ObjC.h @@ -22,6 +22,8 @@ constexpr const char klassPropList[] = "__OBJC_$_CLASS_PROP_LIST_"; constexpr const char metaclass[] = "_OBJC_METACLASS_$_"; constexpr const char ehtype[] = "_OBJC_EHTYPE_$_"; constexpr const char ivar[] = "_OBJC_IVAR_$_"; +constexpr const char instanceMethods[] = "__OBJC_$_INSTANCE_METHODS_"; +constexpr const char classMethods[] = "__OBJC_$_CLASS_METHODS_"; constexpr const char listProprieties[] = 
"__OBJC_$_PROP_LIST_"; constexpr const char category[] = "__OBJC_$_CATEGORY_"; diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 0d8ee2a0926be2..19f8509ba714bd 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -1284,6 +1284,12 @@ def fixup_chains_section : Flag<["-"], "fixup_chains_section">, HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, Group; +def objc_relative_method_lists : Flag<["-"], "objc_relative_method_lists">, + HelpText<"Emit relative method lists (more compact representation)">, + Group; +def no_objc_relative_method_lists : Flag<["-"], "no_objc_relative_method_lists">, + HelpText<"Don't emit relative method lists (use traditional representation)">, + Group; def flto_codegen_only : Flag<["-"], "flto-codegen-only">, HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp index 825242f2cc72ff..755ff270e2f7a9 100644 --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -377,7 +377,7 @@ static void handleSectionBoundarySymbol(const Undefined &sym, StringRef segSect, // live. Marking the isec live ensures an OutputSection is created that the // start/end symbol can refer to. assert(sym.isLive()); - isec->live = true; + assert(isec->live); // This runs after gatherInputSections(), so need to explicitly set parent // and add to inputSections. diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 0afbbd478bb9fd..6f6b66118b7a94 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -12,6 +12,7 @@ #include "ExportTrie.h" #include "InputFiles.h" #include "MachOStructs.h" +#include "ObjC.h" #include "OutputSegment.h" #include "SymbolTable.h" #include "Symbols.h" @@ -849,7 +850,7 @@ ConcatInputSection *ObjCSelRefsHelper::makeSelRef(StringRef methname) { S_LITERAL_POINTERS | S_ATTR_NO_DEAD_STRIP, ArrayRef<uint8_t>{selrefData, wordSize}, /*align=*/wordSize); - objcSelref->live = true; + assert(objcSelref->live); objcSelref->relocs.push_back({/*type=*/target->unsignedRelocType, /*pcrel=*/false, /*length=*/3, /*offset=*/0, @@ -1975,6 +1976,241 @@ void InitOffsetsSection::setUp() { } } +ObjCMethListSection::ObjCMethListSection() + : SyntheticSection(segment_names::text, section_names::objcMethList) { + flags = S_ATTR_NO_DEAD_STRIP; + align = relativeOffsetSize; +} + +// Go through all input method lists and ensure that we have selrefs for all +// their method names. The selrefs will be needed later by ::writeTo. We need to +// create them early on here to ensure they are processed correctly by the lld +// pipeline. +void ObjCMethListSection::setUp() { + for (const ConcatInputSection *isec : inputs) { + uint32_t structSizeAndFlags = 0, structCount = 0; + readMethodListHeader(isec->data.data(), structSizeAndFlags, structCount); + uint32_t originalStructSize = structSizeAndFlags & structSizeMask; + // Method name is immediately after header + uint32_t methodNameOff = methodListHeaderSize; + + // Loop through all methods, and ensure a selref for each of them exists. 
+ while (methodNameOff < isec->data.size()) { + const Reloc *reloc = isec->getRelocAt(methodNameOff); + assert(reloc && "Relocation expected at method list name slot"); + auto *def = dyn_cast_or_null<Defined>(reloc->referent.get()); + assert(def && "Expected valid Defined at method list name slot"); + auto *cisec = cast<CStringInputSection>(def->isec); + assert(cisec && "Expected method name to be in a CStringInputSection"); + auto methname = cisec->getStringRefAtOffset(def->value); + if (!ObjCSelRefsHelper::getSelRef(methname)) + ObjCSelRefsHelper::makeSelRef(methname); + + // Jump to method name offset in next struct + methodNameOff += originalStructSize; + } + } +} + +// Calculate section size and final offsets for where InputSections need to be +// written. +void ObjCMethListSection::finalize() { + // sectionSize will be the total size of the __objc_methlist section + sectionSize = 0; + for (ConcatInputSection *isec : inputs) { + // We can also use sectionSize as write offset for isec + assert(sectionSize == alignToPowerOf2(sectionSize, relativeOffsetSize) && + "expected __objc_methlist to be aligned by default with the " + "required section alignment"); + isec->outSecOff = sectionSize; + + isec->isFinal = true; + uint32_t relativeListSize = + computeRelativeMethodListSize(isec->data.size()); + sectionSize += relativeListSize; + + // If encoding the method list in relative offset format shrinks the size, + // then we also need to adjust symbol sizes to match the new size. Note that + // on 32bit platforms the size of the method list will remain the same when + // encoded in relative offset format. + if (relativeListSize != isec->data.size()) { + for (Symbol *sym : isec->symbols) { + assert(isa<Defined>(sym) && + "Unexpected undefined symbol in ObjC method list"); + auto *def = cast<Defined>(sym); + // There can be 0-size symbols, check if this is the case and ignore + // them. + if (def->size) { + assert( + def->size == isec->data.size() && + "Invalid ObjC method list symbol size: expected symbol size to " + "match isec size"); + def->size = relativeListSize; + } + } + } + } +} + +void ObjCMethListSection::writeTo(uint8_t *bufStart) const { + uint8_t *buf = bufStart; + for (const ConcatInputSection *isec : inputs) { + assert(buf - bufStart == long(isec->outSecOff) && + "Writing at unexpected offset"); + uint32_t writtenSize = writeRelativeMethodList(isec, buf); + buf += writtenSize; + } + assert(buf - bufStart == sectionSize && + "Written size does not match expected section size"); +} + +// Check if an InputSection is a method list. To do this we scan the +// InputSection for any symbols whose names match the patterns we expect clang +// to generate for method lists. +bool ObjCMethListSection::isMethodList(const ConcatInputSection *isec) { + const char *symPrefixes[] = {objc::symbol_names::classMethods, + objc::symbol_names::instanceMethods, + objc::symbol_names::categoryInstanceMethods, + objc::symbol_names::categoryClassMethods}; + if (!isec) + return false; + for (const Symbol *sym : isec->symbols) { + auto *def = dyn_cast_or_null<Defined>(sym); + if (!def) + continue; + for (const char *prefix : symPrefixes) { + if (def->getName().starts_with(prefix)) { + assert(def->size == isec->data.size() && + "Invalid ObjC method list symbol size: expected symbol size to " + "match isec size"); + assert(def->value == 0 && + "Offset of ObjC method list symbol must be 0"); + return true; + } + } + } + + return false; +} + +// Encode a single relative offset value. The input is the data/symbol at +// (&isec->data[inSecOff]). 
The output is written to (&buf[outSecOff]). +// 'useSelRef' indicates that we should not directly use the specified +// symbol, but instead get the selRef for the symbol and use that instead. +void ObjCMethListSection::writeRelativeOffsetForIsec( + const ConcatInputSection *isec, uint8_t *buf, uint32_t &inSecOff, + uint32_t &outSecOff, bool useSelRef) const { + const Reloc *reloc = isec->getRelocAt(inSecOff); + assert(reloc && "Relocation expected at __objc_methlist Offset"); + auto *def = dyn_cast_or_null<Defined>(reloc->referent.get()); + assert(def && "Expected all syms in __objc_methlist to be defined"); + uint32_t symVA = def->getVA(); + + if (useSelRef) { + auto *cisec = cast<CStringInputSection>(def->isec); + auto methname = cisec->getStringRefAtOffset(def->value); + ConcatInputSection *selRef = ObjCSelRefsHelper::getSelRef(methname); + assert(selRef && "Expected all selector names to already be " + "present in __objc_selrefs"); + symVA = selRef->getVA(); + assert(selRef->data.size() == sizeof(target->wordSize) && + "Expected one selref per ConcatInputSection"); + } + + uint32_t currentVA = isec->getVA() + outSecOff; + uint32_t delta = symVA - currentVA; + write32le(buf + outSecOff, delta); + + // Move one pointer forward in the absolute method list + inSecOff += target->wordSize; + // Move one relative offset forward in the relative method list (32 bits) + outSecOff += relativeOffsetSize; +} + +// Write a relative method list to buf, return the size of the written +// information +uint32_t +ObjCMethListSection::writeRelativeMethodList(const ConcatInputSection *isec, + uint8_t *buf) const { + // Copy over the header, and add the "this is a relative method list" magic + // value flag + uint32_t structSizeAndFlags = 0, structCount = 0; + readMethodListHeader(isec->data.data(), structSizeAndFlags, structCount); + // Set the struct size for the relative method list + uint32_t relativeStructSizeAndFlags = + (relativeOffsetSize * pointersPerStruct) & structSizeMask; + // Carry over the old flags from the input struct + relativeStructSizeAndFlags |= structSizeAndFlags & structFlagsMask; + // Set the relative method list flag + relativeStructSizeAndFlags |= relMethodHeaderFlag; + + writeMethodListHeader(buf, relativeStructSizeAndFlags, structCount); + + assert(methodListHeaderSize + + (structCount * pointersPerStruct * target->wordSize) == + isec->data.size() && + "Invalid computed ObjC method list size"); + + uint32_t inSecOff = methodListHeaderSize; + uint32_t outSecOff = methodListHeaderSize; + + // Go through the method list and encode input absolute pointers as relative + // offsets. writeRelativeOffsetForIsec will be incrementing inSecOff and + // outSecOff + for (uint32_t i = 0; i < structCount; i++) { + // Write the name of the method + writeRelativeOffsetForIsec(isec, buf, inSecOff, outSecOff, true); + // Write the type of the method + writeRelativeOffsetForIsec(isec, buf, inSecOff, outSecOff, false); + // Write the implementation of the method + writeRelativeOffsetForIsec(isec, buf, inSecOff, outSecOff, false); + } + + // Expecting to have read all the data in the isec + assert(inSecOff == isec->data.size() && + "Invalid actual ObjC method list size"); + assert( + outSecOff == computeRelativeMethodListSize(inSecOff) && + "Mismatch between input & output size when writing relative method list"); + return outSecOff; +} + +// Given the size of an ObjC method list InputSection, return the size of the +// method list when encoded in relative offsets format. 
+// Given the size of an ObjC method list InputSection, return the size of the
+// method list when encoded in relative offsets format. We can do this without
+// decoding the actual data, as it can be directly inferred from the size of
+// the isec.
+uint32_t ObjCMethListSection::computeRelativeMethodListSize(
+    uint32_t absoluteMethodListSize) const {
+  uint32_t oldPointersSize = absoluteMethodListSize - methodListHeaderSize;
+  uint32_t pointerCount = oldPointersSize / target->wordSize;
+  assert(((pointerCount % pointersPerStruct) == 0) &&
+         "__objc_methlist expects method lists to have multiple-of-3 pointers");
+
+  uint32_t newPointersSize = pointerCount * relativeOffsetSize;
+  uint32_t newTotalSize = methodListHeaderSize + newPointersSize;
+
+  assert((newTotalSize <= absoluteMethodListSize) &&
+         "Expected relative method list size to be smaller than or equal to "
+         "the original size");
+  return newTotalSize;
+}
+
+// Read a method list header from buf
+void ObjCMethListSection::readMethodListHeader(const uint8_t *buf,
+                                               uint32_t &structSizeAndFlags,
+                                               uint32_t &structCount) const {
+  structSizeAndFlags = read32le(buf);
+  structCount = read32le(buf + sizeof(uint32_t));
+}
+
+// Write a method list header to buf
+void ObjCMethListSection::writeMethodListHeader(uint8_t *buf,
+                                                uint32_t structSizeAndFlags,
+                                                uint32_t structCount) const {
+  write32le(buf, structSizeAndFlags);
+  write32le(buf + sizeof(structSizeAndFlags), structCount);
+}
+
 void macho::createSyntheticSymbols() {
   auto addHeaderSymbol = [](const char *name) {
     symtab->addSynthetic(name, in.header->isec, /*value=*/0,
diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index 4586a4a0bf4361..e8fadfef56d4b2 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -684,6 +684,54 @@ class InitOffsetsSection final : public SyntheticSection {
   std::vector<ConcatInputSection *> sections;
 };
 
+// This SyntheticSection is for the __objc_methlist section, which contains
+// relative method lists if the -objc_relative_method_lists option is enabled.
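+//
+// The on-disk method_list_t keeps its 8-byte header (entsize-and-flags,
+// count), but each 3-pointer method entry is re-encoded as three 32-bit
+// relative offsets. For example, on a 64-bit target a 3-method list shrinks
+// from 8 + 3 * 3 * 8 = 80 bytes to 8 + 3 * 3 * 4 = 44 bytes, with the
+// 0x80000000 flag set in the header to mark the relative encoding.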
+class ObjCMethListSection final : public SyntheticSection {
+public:
+  ObjCMethListSection();
+
+  static bool isMethodList(const ConcatInputSection *isec);
+  void addInput(ConcatInputSection *isec) { inputs.push_back(isec); }
+  std::vector<ConcatInputSection *> getInputs() { return inputs; }
+
+  void setUp();
+  void finalize() override;
+  bool isNeeded() const override { return !inputs.empty(); }
+  uint64_t getSize() const override { return sectionSize; }
+  void writeTo(uint8_t *bufStart) const override;
+
+private:
+  void readMethodListHeader(const uint8_t *buf, uint32_t &structSizeAndFlags,
+                            uint32_t &structCount) const;
+  void writeMethodListHeader(uint8_t *buf, uint32_t structSizeAndFlags,
+                             uint32_t structCount) const;
+  uint32_t computeRelativeMethodListSize(uint32_t absoluteMethodListSize) const;
+  void writeRelativeOffsetForIsec(const ConcatInputSection *isec, uint8_t *buf,
+                                  uint32_t &inSecOff, uint32_t &outSecOff,
+                                  bool useSelRef) const;
+  uint32_t writeRelativeMethodList(const ConcatInputSection *isec,
+                                   uint8_t *buf) const;
+
+  static constexpr uint32_t methodListHeaderSize =
+      /*structSizeAndFlags*/ sizeof(uint32_t) +
+      /*structCount*/ sizeof(uint32_t);
+  // Relative method lists are supported only for 3-pointer method lists
+  static constexpr uint32_t pointersPerStruct = 3;
+  // The runtime identifies relative method lists via this magic value
+  static constexpr uint32_t relMethodHeaderFlag = 0x80000000;
+  // In the method list header, the first 2 bytes are the size of the struct
+  static constexpr uint32_t structSizeMask = 0x0000FFFF;
+  // In the method list header, the last 2 bytes are the flags for the struct
+  static constexpr uint32_t structFlagsMask = 0xFFFF0000;
+  // Relative method lists have 4-byte alignment, as all data in the
+  // InputSection is 4 bytes wide
+  static constexpr uint32_t relativeOffsetSize = sizeof(uint32_t);
+
+  // The output size of the __objc_methlist section, computed during finalize()
+  uint32_t sectionSize = 0;
+  std::vector<ConcatInputSection *> inputs;
+};
+
 // Chained fixups are a replacement for classic dyld opcodes. In this format,
 // most of the metadata necessary for binding symbols and rebasing addresses is
 // stored directly in the memory location that will have the fixup applied.
@@ -810,6 +858,7 @@ struct InStruct {
   ObjCImageInfoSection *objCImageInfo = nullptr;
   ConcatInputSection *imageLoaderCache = nullptr;
   InitOffsetsSection *initOffsets = nullptr;
+  ObjCMethListSection *objcMethList = nullptr;
   ChainedFixupsSection *chainedFixups = nullptr;
 };
 
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index a18b5268fd42aa..1c054912551e3e 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -1292,6 +1292,8 @@ template <class LP> void Writer<LP>::run() {
   scanSymbols();
   if (in.objcStubs->isNeeded())
     in.objcStubs->setUp();
+  if (in.objcMethList->isNeeded())
+    in.objcMethList->setUp();
   scanRelocations();
   if (in.initOffsets->isNeeded())
     in.initOffsets->setUp();
@@ -1363,6 +1365,7 @@ void macho::createSyntheticSections() {
   in.unwindInfo = makeUnwindInfoSection();
   in.objCImageInfo = make<ObjCImageInfoSection>();
   in.initOffsets = make<InitOffsetsSection>();
+  in.objcMethList = make<ObjCMethListSection>();
 
   // This section contains space for just a single word, and will be used by
   // dyld to cache an address to the image loader it uses.
@@ -1372,9 +1375,7 @@ void macho::createSyntheticSections() {
       segment_names::data, section_names::data, S_REGULAR,
       ArrayRef<uint8_t>{arr, target->wordSize},
       /*align=*/target->wordSize);
-  // References from dyld are not visible to us, so ensure this section is
-  // always treated as live.
- in.imageLoaderCache->live = true; + assert(in.imageLoaderCache->live); } OutputSection *macho::firstTLVDataSection = nullptr; diff --git a/lld/test/ELF/allow-multiple-definition.s b/lld/test/ELF/allow-multiple-definition.s index 492784a3601df1..96fa2627e1bf88 100644 --- a/lld/test/ELF/allow-multiple-definition.s +++ b/lld/test/ELF/allow-multiple-definition.s @@ -9,6 +9,9 @@ # RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s # RUN: llvm-objdump --no-print-imm-hex -d %t4 | FileCheck --check-prefix=REVERT %s +# RUN: ld.lld --noinhibit-exec %t2 %t1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# WARN: warning: duplicate symbol: _bar + # RUN: ld.lld -z muldefs --fatal-warnings %t1 %t2 -o %t3 # RUN: ld.lld -z muldefs --fatal-warnings %t2 %t1 -o %t4 # RUN: llvm-objdump --no-print-imm-hex -d %t3 | FileCheck %s diff --git a/lld/test/ELF/common-gc2.s b/lld/test/ELF/common-gc2.s index fec1c4be86b5ee..1ecaef7d9af5aa 100644 --- a/lld/test/ELF/common-gc2.s +++ b/lld/test/ELF/common-gc2.s @@ -1,7 +1,9 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t -# RUN: ld.lld -gc-sections -export-dynamic %t -o %t1 -# RUN: llvm-readobj --dyn-symbols %t1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o %t2.o +# RUN: ld.lld -shared -soname=t2 %t2.o -o %t2.so +# RUN: ld.lld -gc-sections -export-dynamic %t.o %t2.so -o %t +# RUN: llvm-readobj --dyn-symbols %t | FileCheck %s # CHECK: Name: bar # CHECK: Name: foo diff --git a/lld/test/ELF/executable-undefined-ignoreall.s b/lld/test/ELF/executable-undefined-ignoreall.s index cc38e17cdf619b..073b22bd84543a 100644 --- a/lld/test/ELF/executable-undefined-ignoreall.s +++ b/lld/test/ELF/executable-undefined-ignoreall.s @@ -7,8 +7,6 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld %t.o -o %t --unresolved-symbols=ignore-all -pie # RUN: llvm-readobj -r %t | FileCheck %s -# RUN: ld.lld %t.o -o %t --unresolved-symbols=ignore-all --export-dynamic -# RUN: llvm-readobj -r %t | FileCheck %s # CHECK: Relocations [ # CHECK-NEXT: Section ({{.*}}) .rela.plt { diff --git a/lld/test/ELF/relro-non-contiguous-script-data.s b/lld/test/ELF/relro-non-contiguous-script-data.s index fd485e89167fcc..530fc7c84eb91e 100644 --- a/lld/test/ELF/relro-non-contiguous-script-data.s +++ b/lld/test/ELF/relro-non-contiguous-script-data.s @@ -1,19 +1,21 @@ // REQUIRES: x86 +// RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o %t2.o +// RUN: ld.lld -shared -soname=t2 %t2.o -o %t2.so // RUN: echo "SECTIONS { \ // RUN: .dynamic : { *(.dynamic) } \ // RUN: .non_ro : { . 
+= 1; } \
// RUN:  .jcr : { *(.jcr) } \
// RUN: } " > %t.script
// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-// RUN: not ld.lld --export-dynamic %t.o -o /dev/null --script=%t.script 2>&1 | FileCheck %s
+// RUN: not ld.lld %t.o %t2.so -o /dev/null --script=%t.script 2>&1 | FileCheck %s

// RUN: echo "SECTIONS { \
// RUN:  .dynamic : { *(.dynamic) } \
// RUN:  .non_ro : { BYTE(1); } \
// RUN:  .jcr : { *(.jcr) } \
// RUN: } " > %t2.script
-// RUN: not ld.lld --export-dynamic %t.o -o /dev/null --script=%t2.script 2>&1 | FileCheck %s
+// RUN: not ld.lld %t.o %t2.so -o /dev/null --script=%t2.script 2>&1 | FileCheck %s

// CHECK: error: section: .jcr is not contiguous with other relro sections

diff --git a/lld/test/ELF/riscv-undefined-weak.s b/lld/test/ELF/riscv-undefined-weak.s
index 303a27f920c57c..8a78e1f8383386 100644
--- a/lld/test/ELF/riscv-undefined-weak.s
+++ b/lld/test/ELF/riscv-undefined-weak.s
@@ -1,4 +1,6 @@
 # REQUIRES: riscv
+# RUN: llvm-mc -filetype=obj -triple=riscv64 /dev/null -o %t2.o
+# RUN: ld.lld -shared -soname=t2 %t2.o -o %t2.so
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -riscv-asm-relax-branches=0 %s -o %t.o
 # RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s
@@ -6,7 +8,7 @@
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefixes=CHECK,PC %s
 # RUN: llvm-readelf -x .data %t | FileCheck --check-prefixes=HEX,HEX-WITHOUT-PLT %s
 
-# RUN: ld.lld -e absolute %t.o -o %t --export-dynamic
+# RUN: ld.lld -e absolute %t.o -o %t %t2.so
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefixes=CHECK,PLT %s
 # RUN: llvm-readelf -x .data %t | FileCheck --check-prefixes=HEX,HEX-WITH-PLT %s
@@ -34,11 +36,11 @@ absolute:
 # CHECK-LABEL: <relative>:
 # CHECK-NEXT:    11{{...}}: auipc a1, 0xfffef
 # PC-NEXT:                  addi a1, a1, -0x160
-# PLT-NEXT:                 addi a1, a1, -0x318
+# PLT-NEXT:                 addi a1, a1, -0x290
 # CHECK-LABEL: <.Lpcrel_hi1>:
 # CHECK-NEXT:    11{{...}}: auipc t1, 0xfffef
 # PC-NEXT:                  sd a2, -0x166(t1)
-# PLT-NEXT:                 sd a2, -0x31e(t1)
+# PLT-NEXT:                 sd a2, -0x296(t1)
 relative:
   la a1, target
   sd a2, target+2, t1
@@ -62,7 +64,7 @@ relative:
 ## We create a PLT entry and redirect the reference to it.
 # PLT-LABEL: <branch>:
 # PLT-NEXT:   auipc ra, 0x0
-# PLT-NEXT:   jalr 0x38(ra)
+# PLT-NEXT:   jalr 0x30(ra)
 # PLT-NEXT:   [[#%x,ADDR:]]:
 # PLT-SAME:   j 0x[[#ADDR]]
 # PLT-NEXT:   [[#%x,ADDR:]]:
@@ -84,12 +86,8 @@ branch:
 ## A plt entry is created for target, so this is the offset between the
 ## plt entry and this address.
 ##
-## S = 0x11360 (the address of the plt entry for target)
-## A = 0
-## P = 0x1343c (the address of `.`)
-##
-## S - A + P = -0x20dc = 0xffffdf24
-# HEX-WITH-PLT-SAME: 24dfffff
+## S + A - P = -0x20ec = 0xffffdf14
+# HEX-WITH-PLT-SAME: 14dfffff
 
 .data
 .p2align 3
diff --git a/lld/test/ELF/static-with-export-dynamic.s b/lld/test/ELF/static-with-export-dynamic.s
deleted file mode 100644
index b0349b85e30343..00000000000000
--- a/lld/test/ELF/static-with-export-dynamic.s
+++ /dev/null
@@ -1,32 +0,0 @@
-// REQUIRES: x86
-// RUN: llvm-mc -filetype=obj -triple=i686-unknown-cloudabi %s -o %t.o
-// RUN: ld.lld --export-dynamic %t.o -o %t
-// RUN: llvm-readobj --dyn-syms %t | FileCheck %s
-
-// Ensure that a dynamic symbol table is present when --export-dynamic
-// is passed in, even when creating statically linked executables.
-// -// CHECK: DynamicSymbols [ -// CHECK-NEXT: Symbol { -// CHECK-NEXT: Name: -// CHECK-NEXT: Value: 0x0 -// CHECK-NEXT: Size: 0 -// CHECK-NEXT: Binding: Local -// CHECK-NEXT: Type: None -// CHECK-NEXT: Other: 0 -// CHECK-NEXT: Section: Undefined -// CHECK-NEXT: } -// CHECK-NEXT: Symbol { -// CHECK-NEXT: Name: _start -// CHECK-NEXT: Value: -// CHECK-NEXT: Size: 0 -// CHECK-NEXT: Binding: Global -// CHECK-NEXT: Type: None -// CHECK-NEXT: Other: 0 -// CHECK-NEXT: Section: .text -// CHECK-NEXT: } -// CHECK-NEXT: ] - -.global _start -_start: - ret diff --git a/lld/test/ELF/weak-undef.s b/lld/test/ELF/weak-undef.s index 3a9d5f462c21b6..21488023a79e10 100644 --- a/lld/test/ELF/weak-undef.s +++ b/lld/test/ELF/weak-undef.s @@ -16,10 +16,11 @@ # RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend # RELOC-NEXT: {{.*}} 0000000100000001 R_X86_64_64 0000000000000000 foo + 0 -# COMMON: Symbol table '.dynsym' contains 2 entries: -# COMMON-NEXT: Num: Value Size Type Bind Vis Ndx Name -# COMMON-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND -# COMMON-NEXT: 1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo +# NORELOC-NOT: Symbol table '.dynsym' +# RELOC: Symbol table '.dynsym' contains 2 entries: +# RELOC-NEXT: Num: Value Size Type Bind Vis Ndx Name +# RELOC-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND +# RELOC-NEXT: 1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo # COMMON: Hex dump of section '.data': # COMMON-NEXT: {{.*}} 00000000 00000000 # COMMON-EMPTY: diff --git a/lld/test/ELF/x86-64-dyn-rel-error.s b/lld/test/ELF/x86-64-dyn-rel-error.s index a03adf89072f31..1590045312d4a3 100644 --- a/lld/test/ELF/x86-64-dyn-rel-error.s +++ b/lld/test/ELF/x86-64-dyn-rel-error.s @@ -19,7 +19,7 @@ # CHECK-NOT: error: # RUN: ld.lld --noinhibit-exec %t.o %t2.so -o /dev/null 2>&1 | FileCheck --check-prefix=WARN %s -# RUN: not ld.lld --export-dynamic --unresolved-symbols=ignore-all %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=WARN %s +# RUN: not ld.lld --export-dynamic --unresolved-symbols=ignore-all %t.o %t2.so -o /dev/null 2>&1 | FileCheck --check-prefix=WARN %s # WARN: relocation R_X86_64_32 cannot be used against symbol 'zed'; recompile with -fPIC # WARN: relocation R_X86_64_PC32 cannot be used against symbol 'zed'; recompile with -fPIC diff --git a/lld/test/MachO/objc-relative-method-lists-simple.s b/lld/test/MachO/objc-relative-method-lists-simple.s new file mode 100644 index 00000000000000..5a77085c7d93d8 --- /dev/null +++ b/lld/test/MachO/objc-relative-method-lists-simple.s @@ -0,0 +1,249 @@ +# REQUIRES: aarch64 +# RUN: rm -rf %t; split-file %s %t && cd %t + +## Compile a64_rel_dylib.o +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_rel_dylib.o a64_simple_class.s + +## Test arm64 + relative method lists +# RUN: %no-lsystem-lld a64_rel_dylib.o -o a64_rel_dylib.dylib -map a64_rel_dylib.map -dylib -arch arm64 -objc_relative_method_lists +# RUN: llvm-objdump --macho --objc-meta-data a64_rel_dylib.dylib | FileCheck %s --check-prefix=CHK_REL + +## Test arm64 + relative method lists + dead-strip +# RUN: %no-lsystem-lld a64_rel_dylib.o -o a64_rel_dylib.dylib -map a64_rel_dylib.map -dylib -arch arm64 -objc_relative_method_lists -dead_strip +# RUN: llvm-objdump --macho --objc-meta-data a64_rel_dylib.dylib | FileCheck %s --check-prefix=CHK_REL + +## Test arm64 + traditional method lists (no relative offsets) +# RUN: %no-lsystem-lld a64_rel_dylib.o -o a64_rel_dylib.dylib -map a64_rel_dylib.map -dylib -arch arm64 -no_objc_relative_method_lists +# RUN: llvm-objdump --macho 
--objc-meta-data a64_rel_dylib.dylib | FileCheck %s --check-prefix=CHK_NO_REL + + +CHK_REL: Contents of (__DATA_CONST,__objc_classlist) section +CHK_REL-NEXT: _OBJC_CLASS_$_MyClass +CHK_REL: baseMethods +CHK_REL-NEXT: entsize 12 (relative) +CHK_REL-NEXT: count 3 +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_00 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_00] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_01 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_01] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_02 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_02] + +CHK_REL: Meta Class +CHK_REL-NEXT: isa 0x{{[0-9a-f]*}} _OBJC_METACLASS_$_MyClass +CHK_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_REL-NEXT: entsize 12 (relative) +CHK_REL-NEXT: count 3 +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_00 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_00] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_01 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_01] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_02 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_02] + + +CHK_NO_REL-NOT: (relative) + +CHK_NO_REL: Contents of (__DATA_CONST,__objc_classlist) section +CHK_NO_REL-NEXT: _OBJC_CLASS_$_MyClass + +CHK_NO_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_NO_REL-NEXT: entsize 24 +CHK_NO_REL-NEXT: count 3 +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_00 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_00] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_01 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_01] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_02 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_02] + + +CHK_NO_REL: Meta Class +CHK_NO_REL-NEXT: _OBJC_METACLASS_$_MyClass + +CHK_NO_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_NO_REL-NEXT: entsize 24 +CHK_NO_REL-NEXT: count 3 +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_00 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_00] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_01 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_01] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_02 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_02] + + +######################## Generate a64_simple_class.s ######################### +# clang -c simple_class.mm -s -o a64_simple_class.s -target arm64-apple-macos -arch arm64 -Oz + +######################## simple_class.mm ######################## +# __attribute__((objc_root_class)) +# @interface MyClass +# - (void)instance_method_00; +# - (void)instance_method_01; +# - (void)instance_method_02; +# 
+ (void)class_method_00; +# + (void)class_method_01; +# + (void)class_method_02; +# @end +# +# @implementation MyClass +# - (void)instance_method_00 {} +# - (void)instance_method_01 {} +# - (void)instance_method_02 {} +# + (void)class_method_00 {} +# + (void)class_method_01 {} +# + (void)class_method_02 {} +# @end +# +# void *_objc_empty_cache; +# void *_objc_empty_vtable; +# + +#--- objc-macros.s +.macro .objc_selector_def name + .p2align 2 +"\name": + .cfi_startproc + ret + .cfi_endproc +.endm + +#--- a64_simple_class.s +.include "objc-macros.s" + +.section __TEXT,__text,regular,pure_instructions +.build_version macos, 11, 0 + +.objc_selector_def "-[MyClass instance_method_00]" +.objc_selector_def "-[MyClass instance_method_01]" +.objc_selector_def "-[MyClass instance_method_02]" + +.objc_selector_def "+[MyClass class_method_00]" +.objc_selector_def "+[MyClass class_method_01]" +.objc_selector_def "+[MyClass class_method_02]" + +.globl __objc_empty_vtable +.zerofill __DATA,__common,__objc_empty_vtable,8,3 +.section __DATA,__objc_data +.globl _OBJC_CLASS_$_MyClass +.p2align 3, 0x0 + +_OBJC_CLASS_$_MyClass: + .quad _OBJC_METACLASS_$_MyClass + .quad 0 + .quad __objc_empty_cache + .quad __objc_empty_vtable + .quad __OBJC_CLASS_RO_$_MyClass + .globl _OBJC_METACLASS_$_MyClass + .p2align 3, 0x0 + +_OBJC_METACLASS_$_MyClass: + .quad _OBJC_METACLASS_$_MyClass + .quad _OBJC_CLASS_$_MyClass + .quad __objc_empty_cache + .quad __objc_empty_vtable + .quad __OBJC_METACLASS_RO_$_MyClass + + .section __TEXT,__objc_classname,cstring_literals +l_OBJC_CLASS_NAME_: + .asciz "MyClass" + .section __TEXT,__objc_methname,cstring_literals +l_OBJC_METH_VAR_NAME_: + .asciz "class_method_00" + .section __TEXT,__objc_methtype,cstring_literals +l_OBJC_METH_VAR_TYPE_: + .asciz "v16@0:8" + .section __TEXT,__objc_methname,cstring_literals +l_OBJC_METH_VAR_NAME_.1: + .asciz "class_method_01" +l_OBJC_METH_VAR_NAME_.2: + .asciz "class_method_02" + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_CLASS_METHODS_MyClass: + .long 24 + .long 3 + .quad l_OBJC_METH_VAR_NAME_ + .quad l_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_00]" + .quad l_OBJC_METH_VAR_NAME_.1 + .quad l_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_01]" + .quad l_OBJC_METH_VAR_NAME_.2 + .quad l_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_02]" + .p2align 3, 0x0 + +__OBJC_METACLASS_RO_$_MyClass: + .long 3 + .long 40 + .long 40 + .space 4 + .quad 0 + .quad l_OBJC_CLASS_NAME_ + .quad __OBJC_$_CLASS_METHODS_MyClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + + .section __TEXT,__objc_methname,cstring_literals +l_OBJC_METH_VAR_NAME_.3: + .asciz "instance_method_00" +l_OBJC_METH_VAR_NAME_.4: + .asciz "instance_method_01" +l_OBJC_METH_VAR_NAME_.5: + .asciz "instance_method_02" + + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_INSTANCE_METHODS_MyClass: + .long 24 + .long 3 + .quad l_OBJC_METH_VAR_NAME_.3 + .quad l_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_00]" + .quad l_OBJC_METH_VAR_NAME_.4 + .quad l_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_01]" + .quad l_OBJC_METH_VAR_NAME_.5 + .quad l_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_02]" + .p2align 3, 0x0 + +__OBJC_CLASS_RO_$_MyClass: + .long 2 + .long 0 + .long 0 + .space 4 + .quad 0 + .quad l_OBJC_CLASS_NAME_ + .quad __OBJC_$_INSTANCE_METHODS_MyClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .globl __objc_empty_cache + +.zerofill __DATA,__common,__objc_empty_cache,8,3 + .section __DATA,__objc_classlist,regular,no_dead_strip + .p2align 3, 0x0 
+l_OBJC_LABEL_CLASS_$:
+    .quad   _OBJC_CLASS_$_MyClass
+    .section    __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+    .long   0
+    .long   64
+.subsections_via_symbols
diff --git a/lldb/source/Host/common/Alarm.cpp b/lldb/source/Host/common/Alarm.cpp
index 245cdc7ae5c2da..afc770d20d7b1e 100644
--- a/lldb/source/Host/common/Alarm.cpp
+++ b/lldb/source/Host/common/Alarm.cpp
@@ -154,54 +154,60 @@ lldb::thread_result_t Alarm::AlarmThread() {
     //
    // Below we only deal with the timeout expiring and fall through for dealing
     // with the rest.
-    std::unique_lock<std::mutex> alarm_lock(m_alarm_mutex);
-    if (next_alarm) {
-      if (!m_alarm_cv.wait_until(alarm_lock, *next_alarm, predicate)) {
-        // The timeout for the next alarm expired.
-
-        // Clear the next timeout to signal that we need to recompute the next
-        // timeout.
-        next_alarm.reset();
-
-        // Iterate over all the callbacks. Call the ones that have expired
-        // and remove them from the list.
-        const TimePoint now = std::chrono::system_clock::now();
-        auto it = m_entries.begin();
-        while (it != m_entries.end()) {
-          if (it->expiration <= now) {
-            it->callback();
-            it = m_entries.erase(it);
-          } else {
-            it++;
-          }
-        }
-      }
-    } else {
-      m_alarm_cv.wait(alarm_lock, predicate);
-    }
-
-    // Fall through after waiting on the condition variable. At this point
-    // either the predicate is true or we woke up because an alarm expired.
-
-    // The alarm thread is shutting down.
-    if (m_exit) {
-      exit = true;
-      if (m_run_callbacks_on_exit) {
-        for (Entry &entry : m_entries)
-          entry.callback();
-      }
-      continue;
-    }
-
-    // A new alarm was added or an alarm expired. Either way we need to
-    // recompute when this thread should wake up for the next alarm.
-    if (m_recompute_next_alarm || !next_alarm) {
-      for (Entry &entry : m_entries) {
-        if (!next_alarm || entry.expiration < *next_alarm)
-          next_alarm = entry.expiration;
-      }
-      m_recompute_next_alarm = false;
-    }
+    llvm::SmallVector<Callback> callbacks;
+    {
+      std::unique_lock<std::mutex> alarm_lock(m_alarm_mutex);
+      if (next_alarm) {
+        if (!m_alarm_cv.wait_until(alarm_lock, *next_alarm, predicate)) {
+          // The timeout for the next alarm expired.
+
+          // Clear the next timeout to signal that we need to recompute the
+          // next timeout.
+          next_alarm.reset();
+
+          // Iterate over all the callbacks. Call the ones that have expired
+          // and remove them from the list.
+          const TimePoint now = std::chrono::system_clock::now();
+          auto it = m_entries.begin();
+          while (it != m_entries.end()) {
+            if (it->expiration <= now) {
+              callbacks.emplace_back(std::move(it->callback));
+              it = m_entries.erase(it);
+            } else {
+              it++;
+            }
+          }
+        }
+      } else {
+        m_alarm_cv.wait(alarm_lock, predicate);
+      }
+
+      // Fall through after waiting on the condition variable. At this point
+      // either the predicate is true or we woke up because an alarm expired.
+
+      // The alarm thread is shutting down.
+      if (m_exit) {
+        exit = true;
+        if (m_run_callbacks_on_exit) {
+          for (Entry &entry : m_entries)
+            callbacks.emplace_back(std::move(entry.callback));
+        }
+      }
+
+      // A new alarm was added or an alarm expired. Either way we need to
+      // recompute when this thread should wake up for the next alarm.
+      if (m_recompute_next_alarm || !next_alarm) {
+        for (Entry &entry : m_entries) {
+          if (!next_alarm || entry.expiration < *next_alarm)
+            next_alarm = entry.expiration;
+        }
+        m_recompute_next_alarm = false;
+      }
+    }
+
+    // Outside the lock, call the callbacks.
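+    // Invoking them while m_alarm_mutex is held could deadlock if a callback
+    // re-enters the Alarm API (e.g. to create or cancel an alarm), so they
+    // were collected above and are only run once the lock has been released.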
+    for (Callback &callback : callbacks)
+      callback();
   }
 
   return {};
 }
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index 3e5ee6f6637303..d3fc487aed4333 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -3154,7 +3154,7 @@ AppleObjCRuntimeV2::TaggedPointerVendorExtended::GetClassDescriptor(
            << m_objc_debug_taggedpointer_ext_payload_lshift) >>
           m_objc_debug_taggedpointer_ext_payload_rshift);
   int64_t data_payload_signed =
-      ((int64_t)((int64_t)unobfuscated
+      ((int64_t)((uint64_t)unobfuscated
                  << m_objc_debug_taggedpointer_ext_payload_lshift) >>
        m_objc_debug_taggedpointer_ext_payload_rshift);
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 16a7fc446fbe1d..4c9d85fd9f5140 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1333,9 +1333,9 @@ class MachineIRBuilder {
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
   MachineInstrBuilder
-  buildAtomicCmpXchgWithSuccess(Register OldValRes, Register SuccessRes,
-                                Register Addr, Register CmpVal, Register NewVal,
-                                MachineMemOperand &MMO);
+  buildAtomicCmpXchgWithSuccess(const DstOp &OldValRes, const DstOp &SuccessRes,
+                                const SrcOp &Addr, const SrcOp &CmpVal,
+                                const SrcOp &NewVal, MachineMemOperand &MMO);
 
   /// Build and insert `OldValRes = G_ATOMIC_CMPXCHG Addr, CmpVal, NewVal,
   /// MMO`.
@@ -1351,8 +1351,9 @@ class MachineIRBuilder {
   /// registers of the same type.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildAtomicCmpXchg(Register OldValRes, Register Addr,
-                                         Register CmpVal, Register NewVal,
+  MachineInstrBuilder buildAtomicCmpXchg(const DstOp &OldValRes,
+                                         const SrcOp &Addr, const SrcOp &CmpVal,
+                                         const SrcOp &NewVal,
                                          MachineMemOperand &MMO);
 
   /// Build and insert `OldValRes = G_ATOMICRMW_<Opcode> Addr, Val, MMO`.
diff --git a/llvm/include/llvm/IR/Verifier.h b/llvm/include/llvm/IR/Verifier.h
index b7db6e0bbfb71c..b25f8eb77ee38b 100644
--- a/llvm/include/llvm/IR/Verifier.h
+++ b/llvm/include/llvm/IR/Verifier.h
@@ -77,7 +77,6 @@ class TBAAVerifier {
   /// Visit an instruction and return true if it is valid, return false if an
   /// invalid TBAA is attached.
   bool visitTBAAMetadata(Instruction &I, const MDNode *MD);
-  bool visitTBAAStructMetadata(Instruction &I, const MDNode *MD);
 };
 
 /// Check a function for errors, useful for use when debugging a
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
index 346ca4ad2eb314..f05b90114d75a6 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
@@ -184,7 +184,7 @@ class BinaryCoverageReader : public CoverageMappingReader {
 private:
   std::vector<std::string> Filenames;
   std::vector<ProfileMappingRecord> MappingRecords;
-  InstrProfSymtab ProfileNames;
+  std::unique_ptr<InstrProfSymtab> ProfileNames;
   size_t CurrentRecord = 0;
   std::vector<StringRef> FunctionsFilenames;
   std::vector<CounterExpression> Expressions;
@@ -195,8 +195,9 @@ class BinaryCoverageReader : public CoverageMappingReader {
   // D69471, which can split up function records into multiple sections on ELF.
   FuncRecordsStorage FuncRecords;
 
-  BinaryCoverageReader(FuncRecordsStorage &&FuncRecords)
-      : FuncRecords(std::move(FuncRecords)) {}
+  BinaryCoverageReader(std::unique_ptr<InstrProfSymtab> Symtab,
+                       FuncRecordsStorage &&FuncRecords)
+      : ProfileNames(std::move(Symtab)), FuncRecords(std::move(FuncRecords)) {}
 
 public:
   BinaryCoverageReader(const BinaryCoverageReader &) = delete;
@@ -209,12 +210,10 @@ class BinaryCoverageReader : public CoverageMappingReader {
       SmallVectorImpl<object::BuildID> *BinaryIDs = nullptr);
 
   static Expected<std::unique_ptr<BinaryCoverageReader>>
-  createCoverageReaderFromBuffer(StringRef Coverage,
-                                 FuncRecordsStorage &&FuncRecords,
-                                 InstrProfSymtab &&ProfileNames,
-                                 uint8_t BytesInAddress,
-                                 llvm::endianness Endian,
-                                 StringRef CompilationDir = "");
+  createCoverageReaderFromBuffer(
+      StringRef Coverage, FuncRecordsStorage &&FuncRecords,
+      std::unique_ptr<InstrProfSymtab> ProfileNamesPtr, uint8_t BytesInAddress,
+      llvm::endianness Endian, StringRef CompilationDir = "");
 
   Error readNextRecord(CoverageMappingRecord &Record) override;
 };
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 25ec06a7392027..612c444faec648 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -471,6 +471,13 @@ class InstrProfSymtab {
 public:
   InstrProfSymtab() = default;
 
+  // Not copyable or movable.
+  // Consider std::unique_ptr<InstrProfSymtab> for move.
+  InstrProfSymtab(const InstrProfSymtab &) = delete;
+  InstrProfSymtab &operator=(const InstrProfSymtab &) = delete;
+  InstrProfSymtab(InstrProfSymtab &&) = delete;
+  InstrProfSymtab &operator=(InstrProfSymtab &&) = delete;
+
   /// Create InstrProfSymtab from an object file section which
   /// contains function PGO names. The section may contain raw
   /// string data or string data in compressed form. This method
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
index 66814d39527301..bd7496a799c579 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -86,9 +86,12 @@ template <> struct IRTraits<BasicBlock> {
 // SampleProfileProber.
 class PseudoProbeManager {
   DenseMap<uint64_t, PseudoProbeDescriptor> GUIDToProbeDescMap;
+  const ThinOrFullLTOPhase LTOPhase;
 
 public:
-  PseudoProbeManager(const Module &M) {
+  PseudoProbeManager(const Module &M,
+                     ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
+      : LTOPhase(LTOPhase) {
     if (NamedMDNode *FuncInfo =
             M.getNamedMetadata(PseudoProbeDescMetadataName)) {
       for (const auto *Operand : FuncInfo->operands()) {
@@ -126,17 +129,15 @@ class PseudoProbeManager {
 
   bool profileIsValid(const Function &F, const FunctionSamples &Samples) const {
     const auto *Desc = getDesc(F);
-    if (!Desc) {
-      LLVM_DEBUG(dbgs() << "Probe descriptor missing for Function "
-                        << F.getName() << "\n");
-      return false;
-    }
-    if (Desc->getFunctionHash() != Samples.getFunctionHash()) {
-      LLVM_DEBUG(dbgs() << "Hash mismatch for Function " << F.getName()
-                        << "\n");
-      return false;
-    }
-    return true;
+    assert((LTOPhase != ThinOrFullLTOPhase::ThinLTOPostLink || !Desc ||
+            profileIsHashMismatched(*Desc, Samples) ==
+                F.hasFnAttribute("profile-checksum-mismatch")) &&
+           "In post-link, profile checksum matching state doesn't match "
+           "function 'profile-checksum-mismatch' attribute.");
+    // The desc for an imported function is unavailable. Check the function
+    // attribute for a mismatch.
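+    // Put differently, the profile is treated as valid iff either no
+    // descriptor exists and the attribute is absent (an imported function
+    // whose checksum matched at pre-link), or a descriptor exists and its
+    // hash matches the profile: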
+ return (!Desc && !F.hasFnAttribute("profile-checksum-mismatch")) || + (Desc && !profileIsHashMismatched(*Desc, Samples)); } }; diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index c8195584ade378..9e17dcaa55925d 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -1190,107 +1190,113 @@ void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef Fns) { llvm::sort(ScalarDescs, compareByVectorFnName); } +static const VecDesc VecFuncs_Accelerate[] = { +#define TLI_DEFINE_ACCELERATE_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_DarwinLibSystemM[] = { +#define TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_LIBMVEC_X86[] = { +#define TLI_DEFINE_LIBMVEC_X86_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_MASSV[] = { +#define TLI_DEFINE_MASSV_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_SVML[] = { +#define TLI_DEFINE_SVML_VECFUNCS +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_SLEEFGNUABI_VF2[] = { +#define TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \ + {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX}, +#include "llvm/Analysis/VecFuncs.def" +}; +static const VecDesc VecFuncs_SLEEFGNUABI_VF4[] = { +#define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \ + {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX}, +#include "llvm/Analysis/VecFuncs.def" +}; +static const VecDesc VecFuncs_SLEEFGNUABI_VFScalable[] = { +#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ + {SCAL, VEC, VF, MASK, VABI_PREFIX}, +#include "llvm/Analysis/VecFuncs.def" +}; + +static const VecDesc VecFuncs_ArmPL[] = { +#define TLI_DEFINE_ARMPL_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ + {SCAL, VEC, VF, MASK, VABI_PREFIX}, +#include "llvm/Analysis/VecFuncs.def" +}; + +const VecDesc VecFuncs_AMDLIBM[] = { +#define TLI_DEFINE_AMDLIBM_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ + {SCAL, VEC, VF, MASK, VABI_PREFIX}, +#include "llvm/Analysis/VecFuncs.def" +}; + void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( enum VectorLibrary VecLib, const llvm::Triple &TargetTriple) { switch (VecLib) { case Accelerate: { - const VecDesc VecFuncs[] = { - #define TLI_DEFINE_ACCELERATE_VECFUNCS - #include "llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_Accelerate); break; } case DarwinLibSystemM: { - const VecDesc VecFuncs[] = { - #define TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS - #include "llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_DarwinLibSystemM); break; } case LIBMVEC_X86: { - const VecDesc VecFuncs[] = { - #define TLI_DEFINE_LIBMVEC_X86_VECFUNCS - #include "llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_LIBMVEC_X86); break; } case MASSV: { - const VecDesc VecFuncs[] = { - #define TLI_DEFINE_MASSV_VECFUNCS - #include "llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_MASSV); break; } case SVML: { - const VecDesc VecFuncs[] = { - #define TLI_DEFINE_SVML_VECFUNCS - #include 
"llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_SVML); break; } case SLEEFGNUABI: { - const VecDesc VecFuncs_VF2[] = { -#define TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS -#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \ - {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX}, -#include "llvm/Analysis/VecFuncs.def" - }; - const VecDesc VecFuncs_VF4[] = { -#define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS -#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \ - {SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX}, -#include "llvm/Analysis/VecFuncs.def" - }; - const VecDesc VecFuncs_VFScalable[] = { -#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS -#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ - {SCAL, VEC, VF, MASK, VABI_PREFIX}, -#include "llvm/Analysis/VecFuncs.def" - }; - switch (TargetTriple.getArch()) { default: break; case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: - addVectorizableFunctions(VecFuncs_VF2); - addVectorizableFunctions(VecFuncs_VF4); - addVectorizableFunctions(VecFuncs_VFScalable); + addVectorizableFunctions(VecFuncs_SLEEFGNUABI_VF2); + addVectorizableFunctions(VecFuncs_SLEEFGNUABI_VF4); + addVectorizableFunctions(VecFuncs_SLEEFGNUABI_VFScalable); break; } break; } case ArmPL: { - const VecDesc VecFuncs[] = { -#define TLI_DEFINE_ARMPL_VECFUNCS -#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ - {SCAL, VEC, VF, MASK, VABI_PREFIX}, -#include "llvm/Analysis/VecFuncs.def" - }; - switch (TargetTriple.getArch()) { default: break; case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_ArmPL); break; } break; } case AMDLIBM: { - const VecDesc VecFuncs[] = { -#define TLI_DEFINE_AMDLIBM_VECFUNCS -#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \ - {SCAL, VEC, VF, MASK, VABI_PREFIX}, -#include "llvm/Analysis/VecFuncs.def" - }; - addVectorizableFunctions(VecFuncs); + addVectorizableFunctions(VecFuncs_AMDLIBM); break; } case NoLibrary: diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 07d4cb5eaa23c8..b8ba782254c370 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -930,14 +930,14 @@ MachineIRBuilder::buildExtractVectorElement(const DstOp &Res, const SrcOp &Val, } MachineInstrBuilder MachineIRBuilder::buildAtomicCmpXchgWithSuccess( - Register OldValRes, Register SuccessRes, Register Addr, Register CmpVal, - Register NewVal, MachineMemOperand &MMO) { + const DstOp &OldValRes, const DstOp &SuccessRes, const SrcOp &Addr, + const SrcOp &CmpVal, const SrcOp &NewVal, MachineMemOperand &MMO) { #ifndef NDEBUG - LLT OldValResTy = getMRI()->getType(OldValRes); - LLT SuccessResTy = getMRI()->getType(SuccessRes); - LLT AddrTy = getMRI()->getType(Addr); - LLT CmpValTy = getMRI()->getType(CmpVal); - LLT NewValTy = getMRI()->getType(NewVal); + LLT OldValResTy = OldValRes.getLLTTy(*getMRI()); + LLT SuccessResTy = SuccessRes.getLLTTy(*getMRI()); + LLT AddrTy = Addr.getLLTTy(*getMRI()); + LLT CmpValTy = CmpVal.getLLTTy(*getMRI()); + LLT NewValTy = NewVal.getLLTTy(*getMRI()); assert(OldValResTy.isScalar() && "invalid operand type"); assert(SuccessResTy.isScalar() && "invalid operand type"); assert(AddrTy.isPointer() && "invalid operand type"); @@ -947,24 +947,25 @@ MachineInstrBuilder MachineIRBuilder::buildAtomicCmpXchgWithSuccess( assert(OldValResTy == NewValTy && "type mismatch"); 
#endif - return buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) - .addDef(OldValRes) - .addDef(SuccessRes) - .addUse(Addr) - .addUse(CmpVal) - .addUse(NewVal) - .addMemOperand(&MMO); + auto MIB = buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS); + OldValRes.addDefToMIB(*getMRI(), MIB); + SuccessRes.addDefToMIB(*getMRI(), MIB); + Addr.addSrcToMIB(MIB); + CmpVal.addSrcToMIB(MIB); + NewVal.addSrcToMIB(MIB); + MIB.addMemOperand(&MMO); + return MIB; } MachineInstrBuilder -MachineIRBuilder::buildAtomicCmpXchg(Register OldValRes, Register Addr, - Register CmpVal, Register NewVal, +MachineIRBuilder::buildAtomicCmpXchg(const DstOp &OldValRes, const SrcOp &Addr, + const SrcOp &CmpVal, const SrcOp &NewVal, MachineMemOperand &MMO) { #ifndef NDEBUG - LLT OldValResTy = getMRI()->getType(OldValRes); - LLT AddrTy = getMRI()->getType(Addr); - LLT CmpValTy = getMRI()->getType(CmpVal); - LLT NewValTy = getMRI()->getType(NewVal); + LLT OldValResTy = OldValRes.getLLTTy(*getMRI()); + LLT AddrTy = Addr.getLLTTy(*getMRI()); + LLT CmpValTy = CmpVal.getLLTTy(*getMRI()); + LLT NewValTy = NewVal.getLLTTy(*getMRI()); assert(OldValResTy.isScalar() && "invalid operand type"); assert(AddrTy.isPointer() && "invalid operand type"); assert(CmpValTy.isValid() && "invalid operand type"); @@ -973,12 +974,13 @@ MachineIRBuilder::buildAtomicCmpXchg(Register OldValRes, Register Addr, assert(OldValResTy == NewValTy && "type mismatch"); #endif - return buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG) - .addDef(OldValRes) - .addUse(Addr) - .addUse(CmpVal) - .addUse(NewVal) - .addMemOperand(&MMO); + auto MIB = buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG); + OldValRes.addDefToMIB(*getMRI(), MIB); + Addr.addSrcToMIB(MIB); + CmpVal.addSrcToMIB(MIB); + NewVal.addSrcToMIB(MIB); + MIB.addMemOperand(&MMO); + return MIB; } MachineInstrBuilder MachineIRBuilder::buildAtomicRMW( diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index e10b8bc8c5e2eb..24f69ea1b742a6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1455,6 +1455,9 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { // First store the whole vector. SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); + // Freeze the index so we don't poison the clamping code we're about to emit. + Idx = DAG.getFreeze(Idx); + // Then store the inserted part. if (PartVT.isVector()) { SDValue SubStackPtr = diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index d95e34b7be1ca1..82de5b6cc6041c 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5172,9 +5172,6 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa)) TBAAVerifyHelper.visitTBAAMetadata(I, TBAA); - if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa_struct)) - TBAAVerifyHelper.visitTBAAStructMetadata(I, TBAA); - if (MDNode *MD = I.getMetadata(LLVMContext::MD_noalias)) visitAliasScopeListMetadata(MD); if (MDNode *MD = I.getMetadata(LLVMContext::MD_alias_scope)) @@ -7529,35 +7526,6 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { return true; } -bool TBAAVerifier::visitTBAAStructMetadata(Instruction &I, const MDNode *MD) { - CheckTBAA(MD->getNumOperands() % 3 == 0, - "tbaa.struct operands must occur in groups of three", &I, MD); - - // Each group of three operands must consist of two integers and a - // tbaa node. 
Moreover, the regions described by the offset and size
-  // operands must be non-overlapping.
-  std::optional<APInt> NextFree;
-  for (unsigned int Idx = 0; Idx < MD->getNumOperands(); Idx += 3) {
-    auto *OffsetCI =
-        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx));
-    CheckTBAA(OffsetCI, "Offset must be a constant integer", &I, MD);
-
-    auto *SizeCI =
-        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx + 1));
-    CheckTBAA(SizeCI, "Size must be a constant integer", &I, MD);
-
-    MDNode *TBAA = dyn_cast_or_null<MDNode>(MD->getOperand(Idx + 2));
-    CheckTBAA(TBAA, "TBAA tag missing", &I, MD);
-    visitTBAAMetadata(I, TBAA);
-
-    bool NonOverlapping = !NextFree || NextFree->ule(OffsetCI->getValue());
-    CheckTBAA(NonOverlapping, "Overlapping tbaa.struct regions", &I, MD);
-
-    NextFree = OffsetCI->getValue() + SizeCI->getValue();
-  }
-  return true;
-}
-
 char VerifierLegacyPass::ID = 0;
 INITIALIZE_PASS(VerifierLegacyPass, "verify", "Module Verifier", false, false)
 
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index f4c6cbc8dd4442..005521bad6e014 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -141,7 +141,6 @@ struct ELFWriter {
 
   // TargetObjectWriter wrappers.
   bool is64Bit() const;
-  bool usesRela(const MCSectionELF &Sec) const;
 
   uint64_t align(Align Alignment);
 
@@ -260,6 +259,7 @@ class ELFObjectWriter : public MCObjectWriter {
   void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, uint64_t &FixedValue) override;
+  bool usesRela(const MCSectionELF &Sec) const;
 
   void executePostLayoutBinding(MCAssembler &Asm,
                                 const MCAsmLayout &Layout) override;
@@ -394,11 +394,6 @@ bool ELFWriter::is64Bit() const {
   return OWriter.TargetObjectWriter->is64Bit();
 }
 
-bool ELFWriter::usesRela(const MCSectionELF &Sec) const {
-  return OWriter.hasRelocationAddend() &&
-         Sec.getType() != ELF::SHT_LLVM_CALL_GRAPH_PROFILE;
-}
-
 // Emit the ELF header.
 void ELFWriter::writeHeader(const MCAssembler &Asm) {
   // ELF Header
@@ -825,24 +820,22 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx,
   if (OWriter.Relocations[&Sec].empty())
     return nullptr;
 
-  const StringRef SectionName = Sec.getName();
-  bool Rela = usesRela(Sec);
-  std::string RelaSectionName = Rela ? ".rela" : ".rel";
-  RelaSectionName += SectionName;
+  unsigned Flags = ELF::SHF_INFO_LINK;
+  if (Sec.getFlags() & ELF::SHF_GROUP)
+    Flags = ELF::SHF_GROUP;
 
+  const StringRef SectionName = Sec.getName();
+  const bool Rela = OWriter.usesRela(Sec);
   unsigned EntrySize;
   if (Rela)
     EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rela) : sizeof(ELF::Elf32_Rela);
   else
     EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
 
-  unsigned Flags = ELF::SHF_INFO_LINK;
-  if (Sec.getFlags() & ELF::SHF_GROUP)
-    Flags = ELF::SHF_GROUP;
-
-  MCSectionELF *RelaSection = Ctx.createELFRelSection(
-      RelaSectionName, Rela ? ELF::SHT_RELA : ELF::SHT_REL, Flags, EntrySize,
-      Sec.getGroup(), &Sec);
+  MCSectionELF *RelaSection =
+      Ctx.createELFRelSection(((Rela ? ".rela" : ".rel") + SectionName),
+                              Rela ? ELF::SHT_RELA : ELF::SHT_REL, Flags,
+                              EntrySize, Sec.getGroup(), &Sec);
   RelaSection->setAlignment(is64Bit() ? Align(8) : Align(4));
   return RelaSection;
 }
 
@@ -938,11 +931,11 @@ void ELFWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
 void ELFWriter::writeRelocations(const MCAssembler &Asm,
                                  const MCSectionELF &Sec) {
   std::vector<ELFRelocationEntry> &Relocs = OWriter.Relocations[&Sec];
+  const bool Rela = OWriter.usesRela(Sec);
 
   // Sort the relocation entries. MIPS needs this.
   OWriter.TargetObjectWriter->sortRelocs(Asm, Relocs);
 
-  const bool Rela = usesRela(Sec);
   if (OWriter.TargetObjectWriter->getEMachine() == ELF::EM_MIPS) {
     for (const ELFRelocationEntry &Entry : Relocs) {
       uint32_t Symidx = Entry.Symbol ? Entry.Symbol->getIndex() : 0;
@@ -1499,7 +1492,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
   FixedValue = !RelocateWithSymbol && SymA && !SymA->isUndefined()
                    ? C + Layout.getSymbolOffset(*SymA)
                    : C;
-  if (hasRelocationAddend()) {
+  if (usesRela(FixupSection)) {
     Addend = FixedValue;
     FixedValue = 0;
   }
@@ -1528,6 +1521,11 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
   Relocations[&FixupSection].push_back(Rec);
 }
 
+bool ELFObjectWriter::usesRela(const MCSectionELF &Sec) const {
+  return hasRelocationAddend() &&
+         Sec.getType() != ELF::SHT_LLVM_CALL_GRAPH_PROFILE;
+}
+
 bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
     const MCAssembler &Asm, const MCSymbol &SA, const MCFragment &FB,
     bool InSet, bool IsPCRel) const {
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index d328460510830a..445b48067a9755 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -894,31 +894,34 @@ static Error readCoverageMappingData(
 Expected<std::unique_ptr<BinaryCoverageReader>>
 BinaryCoverageReader::createCoverageReaderFromBuffer(
     StringRef Coverage, FuncRecordsStorage &&FuncRecords,
-    InstrProfSymtab &&ProfileNames, uint8_t BytesInAddress,
+    std::unique_ptr<InstrProfSymtab> ProfileNamesPtr, uint8_t BytesInAddress,
     llvm::endianness Endian, StringRef CompilationDir) {
-  std::unique_ptr<BinaryCoverageReader> Reader(
-      new BinaryCoverageReader(std::move(FuncRecords)));
-  Reader->ProfileNames = std::move(ProfileNames);
+  if (ProfileNamesPtr == nullptr)
+    return make_error<CoverageMapError>(coveragemap_error::malformed,
+                                        "Caller must provide ProfileNames");
+  std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader(
+      std::move(ProfileNamesPtr), std::move(FuncRecords)));
+  InstrProfSymtab &ProfileNames = *Reader->ProfileNames;
   StringRef FuncRecordsRef = Reader->FuncRecords->getBuffer();
   if (BytesInAddress == 4 && Endian == llvm::endianness::little) {
     if (Error E = readCoverageMappingData<uint32_t, llvm::endianness::little>(
-            Reader->ProfileNames, Coverage, FuncRecordsRef,
-            Reader->MappingRecords, CompilationDir, Reader->Filenames))
+            ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords,
+            CompilationDir, Reader->Filenames))
       return std::move(E);
   } else if (BytesInAddress == 4 && Endian == llvm::endianness::big) {
     if (Error E = readCoverageMappingData<uint32_t, llvm::endianness::big>(
-            Reader->ProfileNames, Coverage, FuncRecordsRef,
-            Reader->MappingRecords, CompilationDir, Reader->Filenames))
+            ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords,
+            CompilationDir, Reader->Filenames))
      return std::move(E);
   } else if (BytesInAddress == 8 && Endian == llvm::endianness::little) {
     if (Error E = readCoverageMappingData<uint64_t, llvm::endianness::little>(
-            Reader->ProfileNames, Coverage, FuncRecordsRef,
-            Reader->MappingRecords, CompilationDir, Reader->Filenames))
+            ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords,
+            CompilationDir, Reader->Filenames))
       return std::move(E);
   } else if (BytesInAddress == 8 && Endian == llvm::endianness::big) {
     if (Error E = readCoverageMappingData<uint64_t, llvm::endianness::big>(
-            Reader->ProfileNames, Coverage, FuncRecordsRef,
-            Reader->MappingRecords, CompilationDir, Reader->Filenames))
+            ProfileNames, Coverage, FuncRecordsRef, Reader->MappingRecords,
+            CompilationDir, Reader->Filenames))
       return std::move(E);
   } else
     return make_error<CoverageMapError>(
@@ -963,8 +966,8 @@ loadTestingFormat(StringRef Data, StringRef CompilationDir) {
   if (Data.size() < ProfileNamesSize)
     return make_error<CoverageMapError>(coveragemap_error::malformed,
                                         "the size of ProfileNames is too big");
-  InstrProfSymtab ProfileNames;
-  if (Error E = ProfileNames.create(Data.substr(0, ProfileNamesSize), Address))
+  auto ProfileNames = std::make_unique<InstrProfSymtab>();
+  if (Error E = ProfileNames->create(Data.substr(0, ProfileNamesSize), Address))
     return std::move(E);
   Data = Data.substr(ProfileNamesSize);
@@ -1099,7 +1102,7 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
       OF->isLittleEndian() ? llvm::endianness::little : llvm::endianness::big;
 
   // Look for the sections that we are interested in.
-  InstrProfSymtab ProfileNames;
+  auto ProfileNames = std::make_unique<InstrProfSymtab>();
   std::vector<SectionRef> NamesSectionRefs;
   // If IPSK_name is not found, fallback to search for IPK_covname, which is
   // used when binary correlation is enabled.
@@ -1116,7 +1119,7 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
     return make_error<CoverageMapError>(
         coveragemap_error::malformed,
         "the size of coverage mapping section is not one");
-  if (Error E = ProfileNames.create(NamesSectionRefs.back()))
+  if (Error E = ProfileNames->create(NamesSectionRefs.back()))
     return std::move(E);
 
   auto CoverageSection = lookupSections(*OF, IPSK_covmap);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index bddf3d958a1ae6..6e7d34f5adaa3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1594,6 +1594,9 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
         }
       }
 
+      if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
+        NewEltI->copyIRFlags(&I);
+
       NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
     }
   } else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 0788d0c3a72136..027ee1086bf4e0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -153,7 +153,7 @@
 ///    %__THREW__.val = __THREW__;
 ///    __THREW__ = 0;
 ///    %__threwValue.val = __threwValue;
-///    if (%__THREW__.val != 0) {
+///    if (%__THREW__.val != 0 & %__threwValue.val != 0) {
 ///      %label = __wasm_setjmp_test(%__THREW__.val, functionInvocationId);
 ///      if (%label == 0)
 ///        emscripten_longjmp(%__THREW__.val, %__threwValue.val);
@@ -712,10 +712,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
   BasicBlock *ThenBB1 = BasicBlock::Create(C, "if.then1", F);
   BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F);
   BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F);
+  Value *ThrewCmp = IRB.CreateICmpNE(Threw, getAddrSizeInt(M, 0));
   Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
                                      ThrewValueGV->getName() + ".val");
-  Value *ThrewCmp = IRB.CreateICmpNE(Threw, getAddrSizeInt(M, 0));
-  IRB.CreateCondBr(ThrewCmp, ThenBB1, ElseBB1);
+  Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0));
+  Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1");
+  IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1);
 
   // Generate call.em.longjmp BB once and share it within the function
   if (!CallEmLongjmpBB) {
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 2cbef8a7ae611d..7545a92c114ef2 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -453,6 +453,7 @@ class SampleProfileMatcher {
   Module &M;
   SampleProfileReader &Reader;
   const PseudoProbeManager *ProbeManager;
+  const ThinOrFullLTOPhase LTOPhase;
   SampleProfileMap FlattenedProfiles;
   // For each function, the matcher generates a map, of which each entry is a
   // mapping from the source location of current build to the source location
   // in the profile.
@@ -504,8 +505,9 @@ class SampleProfileMatcher {
 
 public:
   SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
-                       const PseudoProbeManager *ProbeManager)
-      : M(M), Reader(Reader), ProbeManager(ProbeManager){};
+                       const PseudoProbeManager *ProbeManager,
+                       ThinOrFullLTOPhase LTOPhase)
+      : M(M), Reader(Reader), ProbeManager(ProbeManager), LTOPhase(LTOPhase){};
   void runOnModule();
   void clearMatchingData() {
     // Do not clear FuncMappings, it stores IRLoc to ProfLoc remappings which
@@ -521,7 +523,7 @@ class SampleProfileMatcher {
       return &It->second;
     return nullptr;
   }
-  void runOnFunction(const Function &F);
+  void runOnFunction(Function &F);
   void findIRAnchors(const Function &F,
                      std::map<LineLocation, StringRef> &IRAnchors);
   void findProfileAnchors(
@@ -1911,15 +1913,22 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
   bool Changed = false;
 
   if (FunctionSamples::ProfileIsProbeBased) {
-    if (!ProbeManager->profileIsValid(F, *Samples)) {
+    LLVM_DEBUG({
+      if (!ProbeManager->getDesc(F))
+        dbgs() << "Probe descriptor missing for Function " << F.getName()
+               << "\n";
+    });
+
+    if (ProbeManager->profileIsValid(F, *Samples)) {
+      ++NumMatchedProfile;
+    } else {
+      ++NumMismatchedProfile;
       LLVM_DEBUG(
           dbgs() << "Profile is invalid due to CFG mismatch for Function "
                  << F.getName() << "\n");
-      ++NumMismatchedProfile;
       if (!SalvageStaleProfile)
         return false;
     }
-    ++NumMatchedProfile;
   } else {
     if (getFunctionLoc(F) == 0)
       return false;
@@ -2185,7 +2194,7 @@ bool SampleProfileLoader::doInitialization(Module &M,
 
   // Load pseudo probe descriptors for probe-based function samples.
   if (Reader->profileIsProbeBased()) {
-    ProbeManager = std::make_unique<PseudoProbeManager>(M);
+    ProbeManager = std::make_unique<PseudoProbeManager>(M, LTOPhase);
     if (!ProbeManager->moduleIsProbed(M)) {
      const char *Msg =
          "Pseudo-probe-based profile requires SampleProfileProbePass";
@@ -2197,8 +2206,8 @@ bool SampleProfileLoader::doInitialization(Module &M,
 
   if (ReportProfileStaleness || PersistProfileStaleness ||
      SalvageStaleProfile) {
-    MatchingManager =
-        std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
+    MatchingManager = std::make_unique<SampleProfileMatcher>(
+        M, *Reader, ProbeManager.get(), LTOPhase);
   }
 
   return true;
@@ -2452,7 +2461,7 @@ void SampleProfileMatcher::runStaleProfileMatching(
   }
 }
 
-void SampleProfileMatcher::runOnFunction(const Function &F) {
+void SampleProfileMatcher::runOnFunction(Function &F) {
   // We need to use flattened function samples for matching.
   // Unlike IR, which includes all callsites from the source code, the callsites
   // in profile only show up when they are hit by samples, i.e. the profile
@@ -2481,8 +2490,16 @@ void SampleProfileMatcher::runOnFunction(const Function &F) {
   // support for pseudo-probe.
   if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased &&
       !ProbeManager->profileIsValid(F, *FSFlattened)) {
-    // The matching result will be saved to IRToProfileLocationMap, create a new
-    // map for each function.
+    // For imported functions, the checksum metadata (pseudo_probe_desc) is
+    // dropped, so we leverage the function attribute (profile-checksum-mismatch)
+    // to transfer the info: add the attribute during the pre-link phase and
+    // check it during the post-link phase (see "profileIsValid").
+    if (FunctionSamples::ProfileIsProbeBased &&
+        LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink)
+      F.addFnAttr("profile-checksum-mismatch");
+
+    // The matching result will be saved to IRToProfileLocationMap, create a
+    // new map for each function.
     auto &IRToProfileLocationMap = getIRToProfileLocationMap(F);
     runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
                             IRToProfileLocationMap);
@@ -2758,8 +2775,9 @@ void SampleProfileMatcher::distributeIRToProfileLocationMap(
     FS.setIRToProfileLocationMap(&(ProfileMappings->second));
   }
 
-  for (auto &Inlinees : FS.getCallsiteSamples()) {
-    for (auto FS : Inlinees.second) {
+  for (auto &Callees :
+       const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
+    for (auto &FS : Callees.second) {
       distributeIRToProfileLocationMap(FS.second);
     }
   }
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 5d366e3d6dee0a..f7d4803ded155a 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -422,6 +422,7 @@ class HWAddressSanitizer {
   bool InstrumentLandingPads;
   bool InstrumentWithCalls;
   bool InstrumentStack;
+  bool InstrumentGlobals;
   bool DetectUseAfterScope;
   bool UsePageAliases;
   bool UseMatchAllCallback;
@@ -639,11 +640,13 @@ void HWAddressSanitizer::initializeModule() {
   // If we don't have personality function support, fall back to landing pads.
   InstrumentLandingPads = optOr(ClInstrumentLandingPads, !NewRuntime);
 
+  InstrumentGlobals =
+      !CompileKernel && !UsePageAliases && optOr(ClGlobals, NewRuntime);
+
   if (!CompileKernel) {
     createHwasanCtorComdat();
 
-    bool InstrumentGlobals = optOr(ClGlobals, NewRuntime);
-    if (InstrumentGlobals && !UsePageAliases)
+    if (InstrumentGlobals)
       instrumentGlobals();
 
     bool InstrumentPersonalityFunctions =
@@ -787,6 +790,13 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
     if (SSI && SSI->stackAccessIsSafe(*Inst))
       return true;
   }
+
+  if (isa<GlobalVariable>(getUnderlyingObject(Ptr))) {
+    if (!InstrumentGlobals)
+      return true;
+    // TODO: Optimize inbound global accesses, like Asan `instrumentMop`.
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1f26b922dbe4d..961380ce4ad9f2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13928,26 +13928,29 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
   // that feed it. The type of the loaded value may indicate a more suitable
   // width than V's type. We want to base the vector element size on the width
   // of memory operations where possible.
-  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
+  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
   SmallPtrSet<Instruction *, 16> Visited;
   if (auto *I = dyn_cast<Instruction>(V)) {
-    Worklist.emplace_back(I, I->getParent());
+    Worklist.emplace_back(I, I->getParent(), 0);
     Visited.insert(I);
   }
 
   // Traverse the expression tree in bottom-up order looking for loads. If we
   // encounter an instruction we don't yet handle, we give up.
auto Width = 0u; + Value *FirstNonBool = nullptr; while (!Worklist.empty()) { - Instruction *I; - BasicBlock *Parent; - std::tie(I, Parent) = Worklist.pop_back_val(); + auto [I, Parent, Level] = Worklist.pop_back_val(); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, skip. auto *Ty = I->getType(); if (isa<VectorType>(Ty)) continue; + if (Ty != Builder.getInt1Ty() && !FirstNonBool) + FirstNonBool = I; + if (Level > RecursionMaxDepth) + continue; // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. @@ -13960,11 +13963,16 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // user or the use is a PHI node, we add it to the worklist. else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst, BinaryOperator, UnaryOperator>(I)) { - for (Use &U : I->operands()) + for (Use &U : I->operands()) { if (auto *J = dyn_cast<Instruction>(U.get())) if (Visited.insert(J).second && - (isa<PHINode>(I) || J->getParent() == Parent)) - Worklist.emplace_back(J, J->getParent()); + (isa<PHINode>(I) || J->getParent() == Parent)) { + Worklist.emplace_back(J, J->getParent(), Level + 1); + continue; + } + if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty()) + FirstNonBool = U.get(); + } } else { break; } @@ -13974,8 +13982,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // gave up for some reason, just return the width of V. Otherwise, return the // maximum width we found. if (!Width) { - if (auto *CI = dyn_cast<CastInst>(V)) - V = CI->getOperand(0); + if (V->getType() == Builder.getInt1Ty() && FirstNonBool) + V = FirstNonBool; Width = DL->getTypeSizeInBits(V->getType()); } @@ -14415,6 +14423,13 @@ void BoUpSLP::computeMinimumValueSizes() { unsigned MaxBitWidth = ComputeMaxBitWidth( TreeRoot, VectorizableTree[NodeIdx]->getVectorFactor(), IsTopRoot, IsProfitableToDemoteRoot, Opcode, Limit, IsTruncRoot); + if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { + if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) + ReductionBitWidth = bit_ceil(MaxBitWidth); + else if (MaxBitWidth == 0) + ReductionBitWidth = 0; + } + for (unsigned Idx : RootDemotes) ToDemote.append(VectorizableTree[Idx]->Scalars.begin(), VectorizableTree[Idx]->Scalars.end()); @@ -15831,7 +15846,9 @@ class HorizontalReduction { RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize); unsigned ReduxWidth = std::min( - llvm::bit_floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts)); + llvm::bit_floor(NumReducedVals), + std::clamp(MaxElts, RedValsMaxNumber, + RegMaxNumber * RedValsMaxNumber)); unsigned Start = 0; unsigned Pos = Start; // Restarts vectorization attempt with lower vector factor.
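To make the shape of the SLPVectorizer change above concrete: getVectorElementSize() now carries a depth alongside each worklist entry and stops expanding entries past RecursionMaxDepth, so a pathological expression tree can no longer blow up compile time. Below is a minimal standalone C++ sketch of that depth-bounded worklist pattern, for illustration only; Node, MaxDepth, and visitBounded are invented names, not LLVM API.

// Depth-bounded worklist traversal: each node is queued together with the
// depth at which it was reached. Nodes beyond MaxDepth are still popped but
// never expanded, so the walk terminates quickly on deep expression trees.
#include <cstdio>
#include <set>
#include <tuple>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Operands;
};

static constexpr unsigned MaxDepth = 12; // stand-in for RecursionMaxDepth

static void visitBounded(Node *Root) {
  std::vector<std::tuple<Node *, unsigned>> Worklist; // (node, depth)
  std::set<Node *> Visited;
  Worklist.emplace_back(Root, 0u);
  Visited.insert(Root);
  while (!Worklist.empty()) {
    auto [N, Level] = Worklist.back();
    Worklist.pop_back();
    std::printf("visiting node %d at depth %u\n", N->Id, Level);
    if (Level > MaxDepth)
      continue; // too deep: keep what we learned, stop expanding this branch
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second) // queue each node at most once
        Worklist.emplace_back(Op, Level + 1);
  }
}

int main() {
  Node A{0, {}}, B{1, {&A}}, C{2, {&A, &B}};
  visitBounded(&C);
  return 0;
}

Carrying the depth in the work item, as the patch does with its (Instruction *, BasicBlock *, unsigned) tuple, keeps the traversal iterative and makes the cutoff explicit at the single point where entries are expanded.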
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir index 0cf9602adbb094..499c08fa4966f9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -40,11 +40,12 @@ body: | ; CHECK-LABEL: name: ldrxrox_breg_oreg ; CHECK: liveins: $x0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[COPY1]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK: $x0 = COPY [[LDRXroX]] - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY]], [[COPY1]], 0, 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $x0 = COPY [[LDRXroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -65,11 +66,12 @@ body: | liveins: $d0, $x1 ; CHECK-LABEL: name: ldrdrox_breg_oreg ; CHECK: liveins: $d0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY]], [[COPY1]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK: $d0 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY]], [[COPY1]], 0, 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d0 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:gpr(p0) = COPY $d0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -78,6 +80,9 @@ body: | RET_ReallyLR implicit $d0 ... 
--- +# This shouldn't be folded, since we reuse the result of the G_PTR_ADD outside +# the G_LOAD + name: more_than_one_use alignment: 4 legalized: true @@ -87,18 +92,17 @@ machineFunctionInfo: {} body: | bb.0: liveins: $x0, $x1 - ; This shouldn't be folded, since we reuse the result of the G_PTR_ADD outside - ; the G_LOAD ; CHECK-LABEL: name: more_than_one_use ; CHECK: liveins: $x0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY]], [[COPY1]] - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr) - ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] - ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[LDRXui]] - ; CHECK: $x0 = COPY [[ADDXrr1]] - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[LDRXui]] + ; CHECK-NEXT: $x0 = COPY [[ADDXrr1]] + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -121,11 +125,12 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: ldrxrox_shl ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $x2 = COPY [[LDRXroX]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $x2 = COPY [[LDRXroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -148,11 +153,12 @@ body: | liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: ldrdrox_shl ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $d2 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d2 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -175,11 +181,12 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: ldrxrox_mul_rhs ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $x2 = COPY [[LDRXroX]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX 
[[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $x2 = COPY [[LDRXroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 8 %2:gpr(s64) = G_MUL %0, %1(s64) @@ -202,11 +209,12 @@ body: | liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: ldrdrox_mul_rhs ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $d2 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d2 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 8 %2:gpr(s64) = G_MUL %0, %1(s64) @@ -229,11 +237,12 @@ body: | liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: ldrxrox_mul_lhs ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $x2 = COPY [[LDRXroX]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $x2 = COPY [[LDRXroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 8 %2:gpr(s64) = G_MUL %1, %0(s64) @@ -256,11 +265,12 @@ body: | liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: ldrdrox_mul_lhs ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: $d2 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d2 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 8 %2:gpr(s64) = G_MUL %1, %0(s64) @@ -272,6 +282,9 @@ body: | ... --- +# Show that we don't get a shifted load from a mul when we don't have a +# power of 2. (The bit isn't set on the load.) + name: mul_not_pow_2 alignment: 4 legalized: true @@ -280,19 +293,18 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that we don't get a shifted load from a mul when we don't have a - ; power of 2. (The bit isn't set on the load.) 
liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: mul_not_pow_2 ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 7 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 - ; CHECK: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[SUBREG_TO_REG]], [[COPY]], $xzr - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[MADDXrrr]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK: $d2 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 7 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 + ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[SUBREG_TO_REG]], [[COPY]], $xzr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[MADDXrrr]], 0, 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d2 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 7 %2:gpr(s64) = G_MUL %1, %0(s64) @@ -304,6 +316,9 @@ body: | ... --- +# Show that we don't get a shifted load from a mul when we don't have +# the right power of 2. (The bit isn't set on the load.) + name: mul_wrong_pow_2 alignment: 4 legalized: true @@ -312,19 +327,18 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that we don't get a shifted load from a mul when we don't have - ; the right power of 2. (The bit isn't set on the load.) liveins: $x0, $x1, $d2 ; CHECK-LABEL: name: mul_wrong_pow_2 ; CHECK: liveins: $x0, $x1, $d2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 16 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 - ; CHECK: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[SUBREG_TO_REG]], [[COPY]], $xzr - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[MADDXrrr]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK: $d2 = COPY [[LDRDroX]] - ; CHECK: RET_ReallyLR implicit $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 16 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 + ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = MADDXrrr [[SUBREG_TO_REG]], [[COPY]], $xzr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRDroX:%[0-9]+]]:fpr64 = LDRDroX [[COPY1]], [[MADDXrrr]], 0, 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: $d2 = COPY [[LDRDroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $d2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 16 %2:gpr(s64) = G_MUL %1, %0(s64) @@ -336,6 +350,9 @@ body: | ... --- +# Show that we can still fall back to the register-register addressing +# mode when we fail to pull in the shift. + name: more_than_one_use_shl_1 alignment: 4 legalized: true @@ -344,19 +361,18 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that we can still fall back to the register-register addressing - ; mode when we fail to pull in the shift. 
liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: more_than_one_use_shl_1 ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load (s64) from %ir.addr) - ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] - ; CHECK: $x2 = COPY [[ADDXrr]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[UBFMXri]], 0, 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -370,6 +386,9 @@ body: | ... --- +# Show that when the GEP is used outside a memory op, we don't do any +# folding at all. + name: more_than_one_use_shl_2 alignment: 4 legalized: true @@ -378,22 +397,21 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that when the GEP is used outside a memory op, we don't do any - ; folding at all. liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: more_than_one_use_shl_2 ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr) - ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 - ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]] - ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] - ; CHECK: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]] - ; CHECK: $x2 = COPY [[ADDXrr2]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64common = ADDXrr [[COPY1]], [[UBFMXri]] + ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrr]], 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[ADDXri]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDXrr]] + ; CHECK-NEXT: [[ADDXrr2:%[0-9]+]]:gpr64 = ADDXrr [[COPY2]], [[ADDXrr1]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr2]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -409,6 +427,9 @@ body: | ... --- +# Show that when we have a fastpath for shift-left, we perform the folding +# if it has more than one use. + name: more_than_one_use_shl_lsl_fast alignment: 4 legalized: true @@ -417,18 +438,17 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that when we have a fastpath for shift-left, we perform the folding - ; if it has more than one use. 
liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: more_than_one_use_shl_lsl_fast ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]] - ; CHECK: $x2 = COPY [[ADDXrr]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -442,6 +462,9 @@ body: | ... --- +# Show that we don't fold into multiple memory ops when we don't have a +# fastpath for shift-left. + name: more_than_one_use_shl_lsl_slow alignment: 4 legalized: true @@ -450,19 +473,18 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that we don't fold into multiple memory ops when we don't have a - ; fastpath for shift-left. liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: more_than_one_use_shl_lsl_slow ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3 - ; CHECK: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) - ; CHECK: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]] - ; CHECK: $x2 = COPY [[ADDXrr]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3 + ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -476,6 +498,9 @@ body: | ... --- +# Show that when we're optimizing for size, we'll do the folding no matter +# what. + name: more_than_one_use_shl_minsize alignment: 4 legalized: true @@ -484,22 +509,21 @@ tracksRegLiveness: true machineFunctionInfo: {} body: | bb.0: - ; Show that when we're optimizing for size, we'll do the folding no matter - ; what. 
liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: more_than_one_use_shl_minsize ; CHECK: liveins: $x0, $x1, $x2 - ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] - ; CHECK: [[ADDXrs:%[0-9]+]]:gpr64 = ADDXrs [[COPY2]], [[COPY]], 3 - ; CHECK: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) - ; CHECK: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 - ; CHECK: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] - ; CHECK: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrs]], [[ADDXrr]] - ; CHECK: $x2 = COPY [[ADDXrr1]] - ; CHECK: RET_ReallyLR implicit $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64common = UBFMXri [[COPY]], 61, 60 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64 = ADDXrs [[COPY2]], [[COPY]], 3 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[UBFMXri]], 3, 0 + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[ADDXri]] + ; CHECK-NEXT: [[ADDXrr1:%[0-9]+]]:gpr64 = ADDXrr [[ADDXrs]], [[ADDXrr]] + ; CHECK-NEXT: $x2 = COPY [[ADDXrr1]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 %1:gpr(s64) = G_CONSTANT i64 3 %2:gpr(s64) = G_SHL %0, %1(s64) @@ -525,11 +549,12 @@ body: | liveins: $x0, $x1 ; CHECK-LABEL: name: ldrwrox ; CHECK: liveins: $x0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], [[COPY1]], 0, 0 :: (load (s32) from %ir.addr) - ; CHECK: $w2 = COPY [[LDRWroX]] - ; CHECK: RET_ReallyLR implicit $w2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRWroX:%[0-9]+]]:gpr32 = LDRWroX [[COPY]], [[COPY1]], 0, 0 :: (load (s32) from %ir.addr) + ; CHECK-NEXT: $w2 = COPY [[LDRWroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $w2 %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -549,11 +574,12 @@ body: | liveins: $d0, $x1 ; CHECK-LABEL: name: ldrsrox ; CHECK: liveins: $d0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], [[COPY1]], 0, 0 :: (load (s32) from %ir.addr) - ; CHECK: $s2 = COPY [[LDRSroX]] - ; CHECK: RET_ReallyLR implicit $h2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], [[COPY1]], 0, 0 :: (load (s32) from %ir.addr) + ; CHECK-NEXT: $s2 = COPY [[LDRSroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $h2 %0:gpr(p0) = COPY $d0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -573,11 +599,12 @@ body: | liveins: $x0, $x1 ; CHECK-LABEL: name: ldrhrox ; CHECK: liveins: $x0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRHroX:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], [[COPY1]], 0, 0 :: (load (s16) from %ir.addr) - ; CHECK: $h2 = COPY [[LDRHroX]] - ; CHECK: RET_ReallyLR implicit $h2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRHroX:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], [[COPY1]], 0, 0 :: (load (s16) from %ir.addr) + ; CHECK-NEXT: $h2 = COPY [[LDRHroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $h2 %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -597,11 +624,12 @@ body: | liveins: $x0, $x1 ; CHECK-LABEL: name: ldbbrox ; CHECK: liveins: $x0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRBBroX:%[0-9]+]]:gpr32 = LDRBBroX [[COPY]], [[COPY1]], 0, 0 :: (load (s8) from %ir.addr) - ; CHECK: $w2 = COPY [[LDRBBroX]] - ; CHECK: RET_ReallyLR implicit $w2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRBBroX:%[0-9]+]]:gpr32 = LDRBBroX [[COPY]], [[COPY1]], 0, 0 :: (load (s8) from %ir.addr) + ; CHECK-NEXT: $w2 = COPY [[LDRBBroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $w2 %0:gpr(p0) = COPY $x0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 @@ -621,11 +649,12 @@ body: | liveins: $d0, $x1 ; CHECK-LABEL: name: ldrqrox ; CHECK: liveins: $d0, $x1 - ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK: [[LDRQroX:%[0-9]+]]:fpr128 = LDRQroX [[COPY]], [[COPY1]], 0, 0 :: (load (<2 x s64>) from %ir.addr) - ; CHECK: $q0 = COPY [[LDRQroX]] - ; CHECK: RET_ReallyLR implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[LDRQroX:%[0-9]+]]:fpr128 = LDRQroX [[COPY]], [[COPY1]], 0, 0 :: (load (<2 x s64>) from %ir.addr) + ; CHECK-NEXT: $q0 = COPY [[LDRQroX]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:gpr(p0) = COPY $d0 %1:gpr(s64) = COPY $x1 %2:gpr(p0) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll index cf9ed4d5f0e16a..573f921e638cf8 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll @@ -20,7 +20,7 @@ entry: define i8 @test2(i32 %a) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #135 +; CHECK-NEXT: mov w8, #135 // =0x87 ; CHECK-NEXT: and w8, w0, w8 ; CHECK-NEXT: cmp w8, #1024 ; CHECK-NEXT: cset w0, eq @@ -37,7 +37,7 @@ entry: define i8 @test3(i32 %a) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1024 +; CHECK-NEXT: mov w8, #1024 // =0x400 ; CHECK-NEXT: movk w8, #33, lsl #16 ; CHECK-NEXT: and w8, w0, w8 ; CHECK-NEXT: cmp w8, #1024 @@ -84,7 +84,7 @@ entry: define i8 @test6(i64 %a) { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #135 +; CHECK-NEXT: mov w8, #135 // =0x87 ; CHECK-NEXT: and x8, x0, x8 ; CHECK-NEXT: cmp x8, #1024 ; CHECK-NEXT: cset w0, eq @@ -101,7 +101,7 @@ entry: define i8 @test7(i64 %a) { ; CHECK-LABEL: test7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1024 +; CHECK-NEXT: mov w8, #1024 // =0x400 ; CHECK-NEXT: movk w8, #33, lsl #16 ; CHECK-NEXT: and x8, x0, x8 ; CHECK-NEXT: cmp x8, #1024 @@ -175,7 +175,7 @@ define i32 @test9(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; CHECK-NEXT: cmp w2, #1 ; CHECK-NEXT: b.lt .LBB8_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w9, #1024 +; CHECK-NEXT: mov w9, #1024 // =0x400 ; CHECK-NEXT: mov w8, w2 ; CHECK-NEXT: 
movk w9, #32, lsl #16 ; CHECK-NEXT: .LBB8_2: // %for.body @@ -226,7 +226,7 @@ define void @test10(ptr nocapture %x, ptr nocapture readonly %y, ptr nocapture % ; CHECK-LABEL: test10: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x1] -; CHECK-NEXT: mov w9, #1024 +; CHECK-NEXT: mov w9, #1024 // =0x400 ; CHECK-NEXT: movk w9, #32, lsl #16 ; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: str w8, [x0] @@ -253,7 +253,7 @@ entry: define i8 @test11(i64 %a) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #-1610612736 +; CHECK-NEXT: mov w8, #-1610612736 // =0xa0000000 ; CHECK-NEXT: and x8, x0, x8 ; CHECK-NEXT: cmp x8, #1024 ; CHECK-NEXT: cset w0, eq diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll index c9fd2d38e27acd..089e171e5a4a79 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll @@ -518,6 +518,4 @@ attributes #5 = { nobuiltin } !1 = !{!"omnipotent char", !2} !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"short", !1} -!4 = !{i64 0, i64 4, !5, i64 4, i64 2, !6, i64 8, i64 4, !5, i64 12, i64 2, !6, i64 16, i64 4, !5, i64 20, i64 2, !6} -!5 = !{!0, !0, i64 0} -!6 = !{!3, !3, i64 0} +!4 = !{i64 0, i64 4, !0, i64 4, i64 2, !3, i64 8, i64 4, !0, i64 12, i64 2, !3, i64 16, i64 4, !0, i64 20, i64 2, !3} diff --git a/llvm/test/CodeGen/AArch64/pr86717.ll b/llvm/test/CodeGen/AArch64/pr86717.ll new file mode 100644 index 00000000000000..aa8be954be72d0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr86717.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s + +define <16 x i8> @f(i32 %0) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: bfxil x9, x8, #0, #4 +; CHECK-NEXT: mov w8, #3 // =0x3 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %2 = sub nuw i32 1, %0 + %3 = insertelement <16 x i8> zeroinitializer, i8 3, i32 %2 + ret <16 x i8> %3 +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index d9001656f308e1..2ad28b8dd6ecf5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -10668,3 +10668,111 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x store <2 x i64> %r, ptr addrspace(1) %out ret void } + +define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) { +; CHECK-LABEL: @v_sdiv_i32_exact( +; CHECK: %1 = extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_sdiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_sdiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_sdiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} + +define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) { +; CHECK-LABEL: @v_udiv_i32_exact( +; CHECK: %1 = extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_udiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_udiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_udiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index 25106b456d2f7a..6629d34405492c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -123,9 +123,10 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ;
CHECK-NEXT: xvld $xr0, $a0, 0 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0 -; CHECK-NEXT: st.b $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 4, 0 +; CHECK-NEXT: st.b $a2, $a3, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 @@ -149,9 +150,10 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1 -; CHECK-NEXT: st.h $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 4, 1 +; CHECK-NEXT: st.h $a2, $a3, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 @@ -175,9 +177,10 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2 -; CHECK-NEXT: st.w $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 4, 2 +; CHECK-NEXT: st.w $a2, $a3, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 @@ -201,9 +204,10 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3 -; CHECK-NEXT: st.d $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 4, 3 +; CHECK-NEXT: st.d $a2, $a3, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 @@ -227,9 +231,10 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 ; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 -; CHECK-NEXT: fst.s $fa0, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 +; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2 +; CHECK-NEXT: fst.s $fa0, $a2, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 @@ -253,9 +258,10 @@ define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounw ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 ; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 -; CHECK-NEXT: fst.d $fa0, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 +; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3 +; CHECK-NEXT: fst.d $fa0, $a2, 0 ; CHECK-NEXT: xvld $xr0, $sp, 0 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $fp, -64 diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll index 7f232073ae129c..19171b7d8ed784 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll @@ 
-87,9 +87,10 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 3, 0 -; CHECK-NEXT: st.b $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 3, 0 +; CHECK-NEXT: st.b $a2, $a3, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -106,9 +107,10 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 3, 1 -; CHECK-NEXT: st.h $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 3, 1 +; CHECK-NEXT: st.h $a2, $a3, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -125,9 +127,10 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 3, 2 -; CHECK-NEXT: st.w $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 3, 2 +; CHECK-NEXT: st.w $a2, $a3, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -144,9 +147,10 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a3, 3, 3 -; CHECK-NEXT: st.d $a2, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 +; CHECK-NEXT: addi.d $a3, $sp, 0 +; CHECK-NEXT: bstrins.d $a3, $a0, 3, 3 +; CHECK-NEXT: st.d $a2, $a3, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -163,9 +167,10 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr1, $a0, 0 ; CHECK-NEXT: vst $vr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a2, 3, 2 -; CHECK-NEXT: fst.s $fa0, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 +; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: bstrins.d $a2, $a0, 3, 2 +; CHECK-NEXT: fst.s $fa0, $a2, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -182,9 +187,10 @@ define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) noun ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: vld $vr1, $a0, 0 ; CHECK-NEXT: vst $vr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 -; CHECK-NEXT: bstrins.d $a0, $a2, 3, 3 -; CHECK-NEXT: fst.d $fa0, $a0, 0 +; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 +; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: bstrins.d $a2, $a0, 3, 3 +; CHECK-NEXT: fst.d $fa0, $a2, 0 ; CHECK-NEXT: vld $vr0, $sp, 0 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: addi.d $sp, $sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index f810f51f6bc07a..d9d83633a8537f 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -1282,6 +1282,96 @@ define zeroext i32 
@sext_ashr_zext_i8(i8 %a) nounwind { ret i32 %1 } +define i64 @sh6_sh3_add1(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: sh6_sh3_add1: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add a1, a1, a2 +; RV64ZBA-NEXT: sh3add a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %shl + %add2 = add nsw i64 %add, %x + ret i64 %add2 +} + +define i64 @sh6_sh3_add2(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: sh6_sh3_add2: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a1, a1, 6 +; RV64ZBA-NEXT: add a0, a1, a0 +; RV64ZBA-NEXT: sh3add a0, a2, a0 +; RV64ZBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %x + %add2 = add nsw i64 %add, %shl + ret i64 %add2 +} + +define i64 @sh6_sh3_add3(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: sh6_sh3_add3: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add a1, a1, a2 +; RV64ZBA-NEXT: sh3add a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %shl1, %shl + %add2 = add nsw i64 %x, %add + ret i64 %add2 +} + +define i64 @sh6_sh3_add4(i64 noundef %x, i64 noundef %y, i64 noundef %z) { +; RV64I-LABEL: sh6_sh3_add4: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: slli a1, a1, 6 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: sh6_sh3_add4: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a1, a1, 6 +; RV64ZBA-NEXT: sh3add a0, a2, a0 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %shl = shl i64 %z, 3 + %shl1 = shl i64 %y, 6 + %add = add nsw i64 %x, %shl + %add2 = add nsw i64 %add, %shl1 + ret i64 %add2 +} + ; Make sure we use sext.h+slli+srli for Zba+Zbb. ; FIXME: The RV64I and Zba only cases can be done with only 3 shifts. define zeroext i32 @sext_ashr_zext_i16(i16 %a) nounwind { diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj.ll index d88f42a4dc5847..32942cd92e684f 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj.ll @@ -22,8 +22,10 @@ entry: to label %try.cont unwind label %lpad ; CHECK: entry.split.split: -; CHECK: %__threwValue.val = load i32, ptr @__threwValue -; CHECK-NEXT: %[[CMP:.*]] = icmp ne i32 %__THREW__.val, 0 +; CHECK: %[[CMP0:.*]] = icmp ne i32 %__THREW__.val, 0 +; CHECK-NEXT: %__threwValue.val = load i32, ptr @__threwValue +; CHECK-NEXT: %[[CMP1:.*]] = icmp ne i32 %__threwValue.val, 0 +; CHECK-NEXT: %[[CMP:.*]] = and i1 %[[CMP0]], %[[CMP1]] ; CHECK-NEXT: br i1 %[[CMP]], label %if.then1, label %if.else1 ; This is exception checking part. 
%if.else1 leads here @@ -119,7 +121,6 @@ if.end: ; preds = %entry ; CHECK-NEXT: unreachable ; CHECK: normal: -; CHECK-NEXT: %__threwValue.val = load i32, ptr @__threwValue, align 4 ; CHECK-NEXT: icmp ne i32 %__THREW__.val, 0 return: ; preds = %if.end, %entry diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll index dca4c59d7c8740..27ec95a2c462ab 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll @@ -37,8 +37,10 @@ entry: ; CHECK-NEXT: call cc{{.*}} void @__invoke_void_[[PTR]]_i32(ptr @emscripten_longjmp, [[PTR]] %[[JMPBUF]], i32 1) ; CHECK-NEXT: %[[__THREW__VAL:.*]] = load [[PTR]], ptr @__THREW__ ; CHECK-NEXT: store [[PTR]] 0, ptr @__THREW__ +; CHECK-NEXT: %[[CMP0:.*]] = icmp ne [[PTR]] %__THREW__.val, 0 ; CHECK-NEXT: %[[THREWVALUE_VAL:.*]] = load i32, ptr @__threwValue -; CHECK-NEXT: %[[CMP:.*]] = icmp ne [[PTR]] %__THREW__.val, 0 +; CHECK-NEXT: %[[CMP1:.*]] = icmp ne i32 %[[THREWVALUE_VAL]], 0 +; CHECK-NEXT: %[[CMP:.*]] = and i1 %[[CMP0]], %[[CMP1]] ; CHECK-NEXT: br i1 %[[CMP]], label %if.then1, label %if.else1 ; CHECK: entry.split.split.split: diff --git a/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll b/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll index 535450a52ff60e..695a2d0cd806e0 100644 --- a/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll +++ b/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll @@ -9,11 +9,11 @@ define <2 x i64> @_mm_insert_epi16(<2 x i64> %a, i32 %b, i32 %imm) nounwind read ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $32, %esp -; X86-NEXT: movzwl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: andl $7, %ecx +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movzwl 8(%ebp), %ecx +; X86-NEXT: andl $7, %eax ; X86-NEXT: movaps %xmm0, (%esp) -; X86-NEXT: movw %ax, (%esp,%ecx,2) +; X86-NEXT: movw %cx, (%esp,%eax,2) ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll index f73288dc58de31..37583f2ba07b4f 100644 --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -1009,18 +1009,19 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind { ; X86AVX2-NEXT: pushl %esi ; X86AVX2-NEXT: andl $-16, %esp ; X86AVX2-NEXT: subl $48, %esp -; X86AVX2-NEXT: movl 8(%ebp), %eax -; X86AVX2-NEXT: movl 12(%ebp), %ecx -; X86AVX2-NEXT: movl 16(%ebp), %edx +; X86AVX2-NEXT: movl 8(%ebp), %edx +; X86AVX2-NEXT: movl 12(%ebp), %eax +; X86AVX2-NEXT: movl 16(%ebp), %ecx ; X86AVX2-NEXT: vmovaps %xmm0, (%esp) -; X86AVX2-NEXT: leal (%edx,%edx), %esi +; X86AVX2-NEXT: addl %ecx, %ecx +; X86AVX2-NEXT: movl %ecx, %esi ; X86AVX2-NEXT: andl $3, %esi -; X86AVX2-NEXT: movl %eax, (%esp,%esi,4) +; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %xmm0 ; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: leal 1(%edx,%edx), %eax -; X86AVX2-NEXT: andl $3, %eax -; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4) +; X86AVX2-NEXT: incl %ecx +; X86AVX2-NEXT: andl $3, %ecx +; X86AVX2-NEXT: movl %eax, 16(%esp,%ecx,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ; X86AVX2-NEXT: leal -4(%ebp), %esp ; X86AVX2-NEXT: popl %esi @@ -1362,12 +1363,13 @@ define <2 x i64> @load_i64_v2i64(<2 x i64> %v, i64* %p, i32 %y) nounwind { ; X86AVX2-NEXT: movl (%ecx), %edx ; X86AVX2-NEXT: movl 4(%ecx), %ecx ; 
X86AVX2-NEXT: vmovaps %xmm0, (%esp) -; X86AVX2-NEXT: leal (%eax,%eax), %esi +; X86AVX2-NEXT: addl %eax, %eax +; X86AVX2-NEXT: movl %eax, %esi ; X86AVX2-NEXT: andl $3, %esi ; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %xmm0 ; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: leal 1(%eax,%eax), %eax +; X86AVX2-NEXT: incl %eax ; X86AVX2-NEXT: andl $3, %eax ; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 @@ -1742,18 +1744,19 @@ define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind { ; X86AVX2-NEXT: pushl %esi ; X86AVX2-NEXT: andl $-32, %esp ; X86AVX2-NEXT: subl $96, %esp -; X86AVX2-NEXT: movl 8(%ebp), %eax -; X86AVX2-NEXT: movl 12(%ebp), %ecx -; X86AVX2-NEXT: movl 16(%ebp), %edx +; X86AVX2-NEXT: movl 8(%ebp), %edx +; X86AVX2-NEXT: movl 12(%ebp), %eax +; X86AVX2-NEXT: movl 16(%ebp), %ecx ; X86AVX2-NEXT: vmovaps %ymm0, (%esp) -; X86AVX2-NEXT: leal (%edx,%edx), %esi +; X86AVX2-NEXT: addl %ecx, %ecx +; X86AVX2-NEXT: movl %ecx, %esi ; X86AVX2-NEXT: andl $7, %esi -; X86AVX2-NEXT: movl %eax, (%esp,%esi,4) +; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %ymm0 ; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: leal 1(%edx,%edx), %eax -; X86AVX2-NEXT: andl $7, %eax -; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4) +; X86AVX2-NEXT: incl %ecx +; X86AVX2-NEXT: andl $7, %ecx +; X86AVX2-NEXT: movl %eax, 32(%esp,%ecx,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 ; X86AVX2-NEXT: leal -4(%ebp), %esp ; X86AVX2-NEXT: popl %esi @@ -2128,12 +2131,13 @@ define <4 x i64> @load_i64_v4i64(<4 x i64> %v, i64* %p, i32 %y) nounwind { ; X86AVX2-NEXT: movl (%ecx), %edx ; X86AVX2-NEXT: movl 4(%ecx), %ecx ; X86AVX2-NEXT: vmovaps %ymm0, (%esp) -; X86AVX2-NEXT: leal (%eax,%eax), %esi +; X86AVX2-NEXT: addl %eax, %eax +; X86AVX2-NEXT: movl %eax, %esi ; X86AVX2-NEXT: andl $7, %esi ; X86AVX2-NEXT: movl %edx, (%esp,%esi,4) ; X86AVX2-NEXT: vmovaps (%esp), %ymm0 ; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X86AVX2-NEXT: leal 1(%eax,%eax), %eax +; X86AVX2-NEXT: incl %eax ; X86AVX2-NEXT: andl $7, %eax ; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4) ; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll b/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll index 84f57f80ab339e..f9040afd1c0166 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --global-value-regex "x" --version 4 -; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64 -hwasan-globals=0 | FileCheck %s --check-prefixes=NOGLOB -; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64 -hwasan-globals=1 | FileCheck %s +; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64-linux-gnu -hwasan-globals=0 | FileCheck %s --check-prefixes=NOGLOB +; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64-linux-gnu -hwasan-globals=1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @x = dso_local global i32 0, align 4 @@ -13,29 +15,13 @@ define dso_local noundef i32 @_Z3tmpv() sanitize_hwaddress { ; NOGLOB-LABEL: define dso_local noundef i32 @_Z3tmpv( ; NOGLOB-SAME: ) #[[ATTR0:[0-9]+]] { ; NOGLOB-NEXT: entry: -; NOGLOB-NEXT: [[TMP12:%.*]] = load i64, ptr @__hwasan_tls, align 4 -; NOGLOB-NEXT: [[TMP1:%.*]] = or i64 [[TMP12]], 4294967295 -; 
NOGLOB-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP1]], 1 -; NOGLOB-NEXT: [[TMP2:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr -; NOGLOB-NEXT: [[TMP3:%.*]] = lshr i64 ptrtoint (ptr @x to i64), 56 -; NOGLOB-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8 -; NOGLOB-NEXT: [[TMP5:%.*]] = and i64 ptrtoint (ptr @x to i64), 72057594037927935 -; NOGLOB-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4 -; NOGLOB-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP2]], i64 [[TMP6]] -; NOGLOB-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1 -; NOGLOB-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP4]], [[TMP8]] -; NOGLOB-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1:![0-9]+]] -; NOGLOB: 10: -; NOGLOB-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP2]], ptr @x, i32 2) -; NOGLOB-NEXT: br label [[TMP11]] -; NOGLOB: 11: ; NOGLOB-NEXT: [[TMP0:%.*]] = load i32, ptr @x, align 4 ; NOGLOB-NEXT: ret i32 [[TMP0]] ; ; CHECK-LABEL: define dso_local noundef i32 @_Z3tmpv( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__hwasan_tls, align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__hwasan_tls, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[TMP12]], 4294967295 ; CHECK-NEXT: [[HWASAN_SHADOW:%.*]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll index 079d7224128301..62fd7a16715693 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll @@ -54,7 +54,6 @@ define dso_local noundef i1 @_Z6targetv() sanitize_hwaddress { ; CHECK: sw.bb1: ; CHECK-NEXT: br label [[RETURN]] ; CHECK: while.body: -; CHECK-NEXT: call void @llvm.hwasan.check.memaccess(ptr [[TMP16]], ptr @stackbuf, i32 19) ; CHECK-NEXT: store ptr [[BUF_HWASAN]], ptr @stackbuf, align 8 ; CHECK-NEXT: call void @may_jump() ; CHECK-NEXT: br label [[RETURN]] diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 2f264a2432fc3d..50b0e7a0f5471b 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -141,4 +141,4 @@ attributes #1 = { argmemonly nounwind } !5 = distinct !{!5, !"some domain"} !6 = !{!7} !7 = distinct !{!7, !5, !"some scope 2"} -!8 = !{i64 0, i64 8, !0} +!8 = !{i64 0, i64 8, null} diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll index d079c03f1dcb93..996d2c0e67e165 100644 --- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll +++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -75,7 +75,7 @@ entry: !1 = !{!"omnipotent char", !0} !2 = !{!5, !5, i64 0} !3 = !{i64 0, i64 4, !2} -!4 = !{i64 0, i64 8, !2} +!4 = !{i64 0, i64 8, null} !5 = !{!"float", !0} !6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2} !7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-extension-after-bitwidth.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-extension-after-bitwidth.ll new file mode 100644 index 00000000000000..7771e8369b6198 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-extension-after-bitwidth.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions 
have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=riscv64-unknown-linux-gnu -mattr="+v" --passes=slp-vectorizer < %s | FileCheck %s + +define i32 @test(ptr %0, ptr %1) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> <i8 1, i8 1, i8 1, i8 1>) +; CHECK-NEXT: [[TMP3:%.*]] = sext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[LOAD_5]] +; CHECK-NEXT: ret i32 [[OP_RDX]] +; +entry: + %zext.0 = zext i8 1 to i32 + %zext.1 = zext i8 1 to i32 + %zext.2 = zext i8 1 to i32 + %zext.3 = zext i8 1 to i32 + %select.zext.0 = select i1 false, i32 -1, i32 %zext.0 + %select.zext.1 = select i1 false, i32 0, i32 %zext.1 + %select.zext.2 = select i1 false, i32 0, i32 %zext.2 + %select.zext.3 = select i1 false, i32 0, i32 %zext.3 + + %load.5 = load i32, ptr %1, align 4 + + %and.0 = and i32 %load.5, %select.zext.0 + %and.1 = and i32 %and.0, %select.zext.1 + %and.2 = and i32 %and.1, %select.zext.2 + %and.3 = and i32 %and.2, %select.zext.3 + + ret i32 %and.3 +} + diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll index cfe3ca9f8f9e5f..7b4e2b0ce9112e 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll @@ -11,9 +11,8 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i16> [[TMP8]] to <4 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = zext i1 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP9]] to i64 ; CHECK-NEXT: [[OP_RDX:%.*]] = and i64 [[TMP11]], 1 ; CHECK-NEXT: store i64 [[OP_RDX]], ptr [[A]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll index 61034de81e4b27..0fcd787fef9769 100644 --- a/llvm/test/Transforms/SROA/tbaa-struct3.ll +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -539,7 +539,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias !6 = !{!5, !5, i64 0} !7 = !{i64 0, i64 8, !6, i64 8, i64 4, !1} !8 = !{i64 0, i64 4, !1, i64 4, i64 8, !6} -!9 = !{i64 0, i64 8, !6, i64 8, i64 8, !1} +!9 = !{i64 0, i64 8, !6, i64 4, i64 8, !1} !10 = !{i64 0, i64 2, !1, i64 2, i64 2, !1} !11 = !{i64 0, i64 1, !1, i64 1, i64 3, !1} !12 = !{i64 0, i64 2, !1, i64 2, i64 6, !1} diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-callee-profile-mismatch.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-callee-profile-mismatch.prof new file mode 100644 index 00000000000000..76a8fc9d19a85d --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-callee-profile-mismatch.prof @@ -0,0 +1,16 @@ +main:252:0 + 1: 0 + 2: 50 + 5: 50 + 7: bar:102 + 1: 51 + 2: baz:51 + 1: 51 + !CFGChecksum: 4294967295 + !Attributes: 3 + !CFGChecksum: 281479271677951 + 
!Attributes: 2 + !CFGChecksum: 281582081721716 +bar:1:1 + 1: 1 + !CFGChecksum: 281479271677951 diff --git a/llvm/test/Transforms/SampleProfile/csspgo-profile-checksum-mismatch-attr.ll b/llvm/test/Transforms/SampleProfile/csspgo-profile-checksum-mismatch-attr.ll new file mode 100644 index 00000000000000..df56b55dcdf3c0 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/csspgo-profile-checksum-mismatch-attr.ll @@ -0,0 +1,67 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes='thinlto-pre-link' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof -pass-remarks=inline -S -o %t 2>&1 | FileCheck %s --check-prefix=INLINE +; RUN: FileCheck %s < %t +; RUN: FileCheck %s < %t --check-prefix=MERGE + + +; Make sure bar is inlined into main for attr merging verification. +; INLINE: 'bar' inlined into 'main' + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @baz() #0 { +entry: + ret i32 0 +} + +define i32 @bar() #0 !dbg !11 { +; CHECK: define {{.*}} @bar() {{.*}} #[[#BAR_ATTR:]] ! +entry: + %call = call i32 @baz() + ret i32 0 +} + +define i32 @main() #0 { +; MERGE: define {{.*}} @main() {{.*}} #[[#MAIN_ATTR:]] ! +entry: + br label %for.cond + +for.cond: ; preds = %for.cond, %entry + %call = call i32 @bar(), !dbg !14 + br label %for.cond +} + +; CHECK: attributes #[[#BAR_ATTR]] = {{{.*}} "profile-checksum-mismatch" {{.*}}} + +; Verify the attribute is not merged into the caller. +; MERGE-NOT: attributes #[[#MAIN_ATTR]] = {{{.*}} "profile-checksum-mismatch" {{.*}}} + +attributes #0 = { "use-sample-profile" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7} +!llvm.pseudo_probe_desc = !{!8, !9, !10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home", checksumkind: CSK_MD5, checksum: "0df0c950a93a603a7d13f0a9d4623642") +!2 = !{!3} +!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) +!4 = distinct !DIGlobalVariable(name: "x", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true) +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{i32 2, !"Debug Info Version", i32 3} +!8 = !{i64 7546896869197086323, i64 4294967295, !"baz"} +!9 = !{i64 -2012135647395072713, i64 281530612780802, !"bar"} +!10 = !{i64 -2624081020897602054, i64 281582081721716, !"main"} +!11 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !12, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!12 = distinct !DISubroutineType(types: !13) +!13 = !{} +!14 = !DILocation(line: 15, column: 10, scope: !15) +!15 = !DILexicalBlockFile(scope: !16, file: !1, discriminator: 186646591) +!16 = distinct !DILexicalBlock(scope: !17, file: !1, line: 14, column: 40) +!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 14, column: 3) +!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 14, column: 3) +!19 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 12, type: !20, scopeLine: 13, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: 
!13) +!20 = !DISubroutineType(types: !13) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll new file mode 100644 index 00000000000000..e00b737cae4e85 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll @@ -0,0 +1,63 @@ +; REQUIRES: x86_64-linux +; REQUIRES: asserts +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s + + +; CHECK: Run stale profile matching for bar +; CHECK: Callsite with callee:baz is matched from 4 to 2 +; CHECK: 'baz' inlined into 'main' to match profiling context with (cost=always): preinliner at callsite bar:3:8.4 @ main:3:10.7 + +; CHECK: Probe descriptor missing for Function bar +; CHECK: Profile is invalid due to CFG mismatch for Function bar + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { + %1 = call i32 @bar(), !dbg !13 + ret i32 0 +} + +define available_externally i32 @bar() #1 !dbg !21 { + %1 = call i32 @baz(), !dbg !23 + ret i32 0 +} + +define available_externally i32 @baz() #0 !dbg !25 { + ret i32 0 +} + +attributes #0 = { "use-sample-profile" } +attributes #1 = { "profile-checksum-mismatch" "use-sample-profile" } + +!llvm.dbg.cu = !{!0, !7, !9} +!llvm.module.flags = !{!11} +!llvm.pseudo_probe_desc = !{!12} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "7220f1a2d70ff869f1a6ab7958e3c393") +!2 = !{!3} +!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) +!4 = distinct !DIGlobalVariable(name: "x", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true) +!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6) +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = distinct !DICompileUnit(language: DW_LANG_C11, file: !8, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!8 = !DIFile(filename: "test1.v1.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "76696bd6bfe16a9f227fe03cfdb6a82c") +!9 = distinct !DICompileUnit(language: DW_LANG_C11, file: !10, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!10 = !DIFile(filename: "test2.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "553093afc026f9c73562eb3b0c5b7532") +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i64 -2624081020897602054, i64 281582081721716, !"main"} +!13 = !DILocation(line: 8, column: 10, scope: !14) +!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 186646591) +!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 7, column: 40) +!16 = distinct !DILexicalBlock(scope: !17, file: !1, line: 7, column: 3) +!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 7, column: 3) +!18 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !19, scopeLine: 6, flags: 
DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20) +!19 = distinct !DISubroutineType(types: !20) +!20 = !{} +!21 = distinct !DISubprogram(name: "bar", scope: !8, file: !8, line: 3, type: !22, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !7, retainedNodes: !20) +!22 = !DISubroutineType(types: !20) +!23 = !DILocation(line: 6, column: 8, scope: !24) +!24 = !DILexicalBlockFile(scope: !21, file: !8, discriminator: 186646567) +!25 = distinct !DISubprogram(name: "baz", scope: !10, file: !10, line: 1, type: !22, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !9, retainedNodes: !20) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll index 55225b415d4abc..270beee4ebc2bd 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll @@ -106,7 +106,7 @@ define available_externally dso_local i32 @bar(i32 noundef %0) local_unnamed_add ret i32 %2, !dbg !132 } -attributes #0 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" "profile-checksum-mismatch"} attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" } diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll index 89477ea5fecf1e..29877fb22a2c2e 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll @@ -48,6 +48,8 @@ ; } ; } +; Verify not running profile matching for checksum matched function. 
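+; (bar is checksum-matched, so no stale matching is expected for it; the
+; checks below still expect matching to run for main.)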
+; CHECK-NOT: Run stale profile matching for bar
 ; CHECK: Run stale profile matching for main
diff --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
index 73ae66dd76c66e..bbcdcb6f586742 100644
--- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
+++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
@@ -836,6 +836,5 @@ define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) {
 !2 = !{ !"set2", !0 }
 !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} }
 !4 = !{ float 4.0 }
-!5 = !{ i64 0, i64 8, !6 }
-!6 = !{ !1, !1, i64 0 }
+!5 = !{ i64 0, i64 8, null }
 !13 = distinct !{}
diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll
index 87a70ccd3fc7c5..db7c5f535f7e9d 100644
--- a/llvm/test/Transforms/Scalarizer/basic.ll
+++ b/llvm/test/Transforms/Scalarizer/basic.ll
@@ -870,6 +870,5 @@ define <2 x float> @f25(<2 x float> %src) {
 !2 = !{ !"set2", !0 }
 !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} }
 !4 = !{ float 4.0 }
-!5 = !{ i64 0, i64 8, !6 }
-!6 = !{ !1, !1, i64 0 }
+!5 = !{ i64 0, i64 8, null }
 !13 = distinct !{}
diff --git a/llvm/test/Verifier/tbaa-struct.ll b/llvm/test/Verifier/tbaa-struct.ll
index 14c19a19d5ae89..b8ddc7cee496a9 100644
--- a/llvm/test/Verifier/tbaa-struct.ll
+++ b/llvm/test/Verifier/tbaa-struct.ll
@@ -1,36 +1,28 @@
-; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+; RUN: llvm-as < %s 2>&1
+
+; FIXME: The verifier should reject the invalid !tbaa.struct nodes below.
 
 define void @test_overlapping_regions(ptr %a1) {
-; CHECK: Overlapping tbaa.struct regions
-; CHECK-NEXT: %ld = load i8, ptr %a1, align 1, !tbaa.struct !0
   %ld = load i8, ptr %a1, align 1, !tbaa.struct !0
   ret void
 }
 
 define void @test_size_not_integer(ptr %a1) {
-; CHECK: Size must be a constant integer
-; CHECK-NEXT: store i8 1, ptr %a1, align 1, !tbaa.struct !5
   store i8 1, ptr %a1, align 1, !tbaa.struct !5
   ret void
 }
 
 define void @test_offset_not_integer(ptr %a1, ptr %a2) {
-; CHECK: Offset must be a constant integer
-; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6
   ret void
 }
 
 define void @test_tbaa_missing(ptr %a1, ptr %a2) {
-; CHECK: TBAA tag missing
-; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7
   ret void
 }
 
 define void @test_tbaa_invalid(ptr %a1) {
-; CHECK: Old-style TBAA is no longer allowed, use struct-path TBAA instead
-; CHECK-NEXT: store i8 1, ptr %a1, align 1, !tbaa.struct !8
   store i8 1, ptr %a1, align 1, !tbaa.struct !8
   ret void
 }
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 1fd81bd407becb..0a947f6e206fef 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -63,6 +63,10 @@ Error SubprocessMemory::addMemoryDefinition(
     SharedMemoryNames.push_back(SharedMemoryName);
     int SharedMemoryFD =
         shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+    if (SharedMemoryFD == -1)
+      return make_error<Failure>(
+          "Failed to create shared memory object for memory definition: " +
+          Twine(strerror(errno)));
     if (ftruncate(SharedMemoryFD, MemVal.SizeBytes) != 0) {
       return make_error<Failure>("Truncating a memory definiton failed: " +
                                  Twine(strerror(errno)));
@@ -100,7 +104,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
       shm_open(AuxiliaryMemoryName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFileDescriptor == -1)
     return make_error<Failure>(
-        "Getting file descriptor for auxiliary memory failed");
+        "Getting file descriptor for auxiliary memory failed: " +
+        Twine(strerror(errno)));
   // set up memory value file descriptors
   int *AuxiliaryMemoryMapping =
       (int *)mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 193f95443b16ef..19d42b7688dac8 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -1077,30 +1077,25 @@ OperandPredicateMatcher::~OperandPredicateMatcher() {}
 bool OperandPredicateMatcher::isHigherPriorityThan(
     const OperandPredicateMatcher &B) const {
   // Generally speaking, an instruction is more important than an Int or a
-  // LiteralInt because it can cover more nodes but theres an exception to
+  // LiteralInt because it can cover more nodes but there's an exception to
   // this. G_CONSTANT's are less important than either of those two because they
   // are more permissive.
 
-  const InstructionOperandMatcher *AOM =
-      dyn_cast<InstructionOperandMatcher>(this);
-  const InstructionOperandMatcher *BOM =
-      dyn_cast<InstructionOperandMatcher>(&B);
+  const auto *AOM = dyn_cast<InstructionOperandMatcher>(this);
+  const auto *BOM = dyn_cast<InstructionOperandMatcher>(&B);
   bool AIsConstantInsn = AOM && AOM->getInsnMatcher().isConstantInstruction();
   bool BIsConstantInsn = BOM && BOM->getInsnMatcher().isConstantInstruction();
 
-  if (AOM && BOM) {
-    // The relative priorities between a G_CONSTANT and any other instruction
-    // don't actually matter but this code is needed to ensure a strict weak
-    // ordering. This is particularly important on Windows where the rules will
-    // be incorrectly sorted without it.
-    if (AIsConstantInsn != BIsConstantInsn)
-      return AIsConstantInsn < BIsConstantInsn;
-    return false;
-  }
+  // The relative priorities between a G_CONSTANT and any other instruction
+  // don't actually matter but this code is needed to ensure a strict weak
+  // ordering. This is particularly important on Windows where the rules will
+  // be incorrectly sorted without it.
+  if (AOM && BOM)
+    return !AIsConstantInsn && BIsConstantInsn;
 
-  if (AOM && AIsConstantInsn && (B.Kind == OPM_Int || B.Kind == OPM_LiteralInt))
+  if (AIsConstantInsn && (B.Kind == OPM_Int || B.Kind == OPM_LiteralInt))
     return false;
-  if (BOM && BIsConstantInsn && (Kind == OPM_Int || Kind == OPM_LiteralInt))
+  if (BIsConstantInsn && (Kind == OPM_Int || Kind == OPM_LiteralInt))
     return true;
 
   return Kind < B.Kind;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
index 5ceb85e7d9903b..7fd88dec71d491 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
@@ -552,6 +553,305 @@ class BubbleUpPackThroughPadOp final : public OpRewritePattern<tensor::PackOp> {
 
   ControlPropagationFn controlFn;
 };
 
+/// Project dimsPos to the inner-most non-unit dim pos with reassocIndices.
+///
+/// For example, given dimsPos [0, 2], reassocIndices [[0, 1], [2, 3]], and
+/// targetShape [16, 16, 32, 1], it returns [1, 2]: for pos 0, the inner-most
+/// projected dim from [0, 1] is 1; for pos 2, the inner-most non-unit
+/// projected dim from [2, 3] is 2.
+///
+/// If all candidates in a reassociation are unit dims, it chooses the
+/// inner-most dim pos.
+static SmallVector<int64_t>
+projectToInnerMostNonUnitDimsPos(ArrayRef<int64_t> dimsPos,
+                                 ArrayRef<ReassociationIndices> reassocIndices,
+                                 ArrayRef<int64_t> targetShape) {
+  SmallVector<int64_t> projectedDimsPos;
+  for (auto pos : dimsPos) {
+    // In the case all dims are unit, this will return the inner-most one.
+    int64_t projectedPos = reassocIndices[pos].back();
+    for (auto i : llvm::reverse(reassocIndices[pos])) {
+      int64_t dim = targetShape[i];
+      if (dim > 1 || ShapedType::isDynamic(dim)) {
+        projectedPos = i;
+        break;
+      }
+    }
+    projectedDimsPos.push_back(projectedPos);
+  }
+  return projectedDimsPos;
+}
+
+/// Check if all dims in dimsPos are divisible by the corresponding tile sizes.
+static bool isDimsDivisibleByTileSizes(ArrayRef<int64_t> dimsPos,
+                                       ArrayRef<int64_t> shape,
+                                       ArrayRef<int64_t> tileSizes) {
+  for (auto [pos, tileSize] : llvm::zip_equal(dimsPos, tileSizes)) {
+    int64_t dim = shape[pos];
+    if (ShapedType::isDynamic(dim) || (dim % tileSize) != 0)
+      return false;
+  }
+  return true;
+}
+
+/// Permute the reassociation indices and reindex them in the sequence order.
+/// Returns the next dim pos in the sequence.
+///
+/// For example, given reassocIndices [[0, 1], [2]] and permutation [1, 0], it
+/// applies the permutation to get [[2], [0, 1]] and reindexes the indices into
+/// [[0], [1, 2]].
+static int64_t applyPermutationAndReindexReassoc(
+    SmallVector<ReassociationIndices> &reassocIndices,
+    ArrayRef<int64_t> permutation) {
+  applyPermutationToVector(reassocIndices, permutation);
+  int64_t nextPos = 0;
+  for (ReassociationIndices &indices : reassocIndices) {
+    for (auto &index : indices) {
+      index = nextPos;
+      nextPos += 1;
+    }
+  }
+  return nextPos;
+}
+
+/// Bubble up pack op through collapse shape op when the packed dims can be
+/// projected to the dims before collapsing. This is possible when the inner
+/// tile sizes can divide the projected dims.
+///
+/// For example:
+///
+///   %collapsed = tensor.collapse_shape %in [[0, 1], [2]]
+///       : tensor into tensor
+///   %pack = tensor.pack %collapsed outer_dims_perm = [0, 1]
+///       inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %empty
+///       : tensor -> tensor
+///
+/// can be transformed into:
+///
+///   %pack = tensor.pack %in outer_dims_perm = [1, 2]
+///       inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %empty
+///       : tensor -> tensor
+///   %collapsed = tensor.collapse_shape %pack [[0, 1], [2], [3], [4]]
+///       : tensor into tensor
+static LogicalResult
+bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp,
+                                   tensor::PackOp packOp,
+                                   PatternRewriter &rewriter) {
+  SmallVector<int64_t> innerTileSizes = packOp.getStaticTiles();
+  ArrayRef<int64_t> innerDimsPos = packOp.getInnerDimsPos();
+  ArrayRef<int64_t> outerDimsPerm = packOp.getOuterDimsPerm();
+
+  ArrayRef<int64_t> srcShape = collapseOp.getSrcType().getShape();
+  SmallVector<ReassociationIndices> reassocIndices =
+      collapseOp.getReassociationIndices();
+  // Project inner tile pos to the dim pos before collapsing. For example, if
+  // dims [x, y] is collapsed into [z], packing on dim z can be projected back
+  // to pack on dim y.
+  //
+  // Project to inner-most non-unit dims to increase the chance that they can
+  // be divided by the inner tile sizes. This is correct because for [..., x, 1],
+  // packing on dim 1 is equivalent to packing on dim x.
+  SmallVector<int64_t> projectedInnerDimsPos =
+      projectToInnerMostNonUnitDimsPos(innerDimsPos, reassocIndices, srcShape);
+
+  if (!isDimsDivisibleByTileSizes(projectedInnerDimsPos, srcShape,
+                                  innerTileSizes)) {
+    return failure();
+  }
+  // Expand the outer dims permutation with the associated source dims for the
+  // new permutation after bubbling. This is because moving a collapsed dim is
+  // equivalent to moving the associated source dims together.
+  SmallVector<int64_t> newOuterDimsPerm;
+  for (auto outerPos : outerDimsPerm) {
+    newOuterDimsPerm.insert(newOuterDimsPerm.end(),
+                            reassocIndices[outerPos].begin(),
+                            reassocIndices[outerPos].end());
+  }
+
+  auto emptyOp = tensor::PackOp::createDestinationTensor(
+      rewriter, packOp.getLoc(), collapseOp.getSrc(), packOp.getMixedTiles(),
+      projectedInnerDimsPos, newOuterDimsPerm);
+  auto newPackOp = rewriter.create<tensor::PackOp>(
+      packOp.getLoc(), collapseOp.getSrc(), emptyOp, projectedInnerDimsPos,
+      packOp.getMixedTiles(), packOp.getPaddingValue(), newOuterDimsPerm);
+
+  SmallVector<ReassociationIndices> newReassocIndices = reassocIndices;
+  // First apply the permutation on the reassociations of the outer dims.
+  // For example given the permutation [1, 0], the reassociations [[0, 1], [2]]
+  // -> [[0], [1, 2]]
+  int64_t nextPos =
+      applyPermutationAndReindexReassoc(newReassocIndices, outerDimsPerm);
+  // Then add direct mapping for the inner tile dims.
+  for (size_t i = 0; i < innerDimsPos.size(); ++i) {
+    newReassocIndices.push_back({nextPos});
+    nextPos += 1;
+  }
+
+  auto newCollapseOp = rewriter.create<tensor::CollapseShapeOp>(
+      collapseOp.getLoc(), packOp.getType(), newPackOp, newReassocIndices);
+  rewriter.replaceOp(packOp, newCollapseOp);
+
+  return success();
+}
+
+class BubbleUpPackOpThroughReshapeOp final
+    : public OpRewritePattern<tensor::PackOp> {
+public:
+  BubbleUpPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun)
+      : OpRewritePattern<tensor::PackOp>(context), controlFn(std::move(fun)) {}
+
+  LogicalResult matchAndRewrite(tensor::PackOp packOp,
+                                PatternRewriter &rewriter) const override {
+    Operation *srcOp = packOp.getSource().getDefiningOp();
+    // Currently only support when the pack op is the only user.
+    if (!srcOp || !(srcOp->getNumResults() == 1) ||
+        !srcOp->getResult(0).hasOneUse()) {
+      return failure();
+    }
+    // Currently only support static inner tile sizes.
+    if (llvm::any_of(packOp.getStaticTiles(), [](int64_t size) {
+          return ShapedType::isDynamic(size);
+        })) {
+      return failure();
+    }
+
+    // User controlled propagation function.
+    if (!controlFn(srcOp))
+      return failure();
+
+    return TypeSwitch<Operation *, LogicalResult>(srcOp)
+        .Case([&](tensor::CollapseShapeOp op) {
+          return bubbleUpPackOpThroughCollapseShape(op, packOp, rewriter);
+        })
+        .Default([](Operation *) { return failure(); });
+  }
+
+private:
+  ControlPropagationFn controlFn;
+};
+
+/// Push down unpack op through expand shape op when the packed dims can be
+/// projected to the dims after expanding. This is possible when the inner tile
+/// sizes can divide the projected dims.
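+/// (This mirrors bubbleUpPackOpThroughCollapseShape above, with expand_shape
+/// taking the place of collapse_shape and unpack the place of pack.)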
+///
+/// For example:
+///
+///   %unpack = tensor.unpack %in outer_dims_perm = [0, 1]
+///       inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %empty
+///       : tensor -> tensor
+///   %expanded = tensor.expand_shape %unpack [[0, 1], [2]]
+///       : tensor into tensor
+///
+/// can be transformed into:
+///
+///   %expanded = tensor.expand_shape %in [[0, 1], [2], [3], [4]]
+///       : tensor into tensor
+///   %unpack = tensor.unpack %expanded outer_dims_perm = [0, 1, 2]
+///       inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %empty
+///       : tensor -> tensor
+static LogicalResult
+pushDownUnPackOpThroughExpandShape(tensor::UnPackOp unPackOp,
+                                   tensor::ExpandShapeOp expandOp,
+                                   PatternRewriter &rewriter) {
+  SmallVector<int64_t> innerTileSizes = unPackOp.getStaticTiles();
+  ArrayRef<int64_t> innerDimsPos = unPackOp.getInnerDimsPos();
+  ArrayRef<int64_t> outerDimsPerm = unPackOp.getOuterDimsPerm();
+
+  ArrayRef<int64_t> dstShape = expandOp.getType().getShape();
+  SmallVector<ReassociationIndices> reassocIndices =
+      expandOp.getReassociationIndices();
+  // Project inner tile pos to the dim pos after expanding. For example, if
+  // dims [z] is expanded into [x, y], unpacking on dim z can be projected to
+  // unpack on dim y.
+  //
+  // Project to inner-most non-unit dims to increase the chance that they can
+  // be divided by the inner tile sizes. This is correct because for [..., x, 1],
+  // unpacking on dim 1 is equivalent to unpacking on dim x.
+  SmallVector<int64_t> projectedInnerDimsPos =
+      projectToInnerMostNonUnitDimsPos(innerDimsPos, reassocIndices, dstShape);
+
+  if (!isDimsDivisibleByTileSizes(projectedInnerDimsPos, dstShape,
+                                  innerTileSizes)) {
+    return failure();
+  }
+  // Expand the outer dims permutation with the associated expanded dims for
+  // the new permutation after pushing. This is because moving a source dim is
+  // equivalent to moving the associated expanded dims together.
+  SmallVector<int64_t> newOuterDimsPerm;
+  for (auto outerPos : outerDimsPerm) {
+    newOuterDimsPerm.insert(newOuterDimsPerm.end(),
+                            reassocIndices[outerPos].begin(),
+                            reassocIndices[outerPos].end());
+  }
+
+  SmallVector<ReassociationIndices> newReassocIndices = reassocIndices;
+  // First apply the permutation on the reassociations of the outer dims.
+  // For example given the permutation [1, 0], the reassociations [[0, 1], [2]]
+  // -> [[0], [1, 2]]
+  int64_t nextPos =
+      applyPermutationAndReindexReassoc(newReassocIndices, outerDimsPerm);
+  // Then add direct mapping for the inner tile dims.
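+  // For example, with two inner tile dims, reassociations [[0], [1, 2]] grow
+  // to [[0], [1, 2], [3], [4]].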
+  for (size_t i = 0; i < innerDimsPos.size(); ++i) {
+    newReassocIndices.push_back({nextPos});
+    nextPos += 1;
+  }
+
+  RankedTensorType newExpandType =
+      tensor::PackOp::inferPackedType(expandOp.getType(), innerTileSizes,
+                                      projectedInnerDimsPos, newOuterDimsPerm);
+  auto newExpandOp = rewriter.create<tensor::ExpandShapeOp>(
+      expandOp.getLoc(), newExpandType, unPackOp.getSource(),
+      newReassocIndices);
+
+  auto emptyOp = tensor::UnPackOp::createDestinationTensor(
+      rewriter, unPackOp.getLoc(), newExpandOp, unPackOp.getMixedTiles(),
+      projectedInnerDimsPos, newOuterDimsPerm);
+  auto newUnPackOp = rewriter.create<tensor::UnPackOp>(
+      unPackOp.getLoc(), newExpandOp.getResult(), emptyOp,
+      projectedInnerDimsPos, unPackOp.getMixedTiles(), newOuterDimsPerm);
+  rewriter.replaceOp(expandOp, newUnPackOp);
+
+  return success();
+}
+
+class PushDownUnPackOpThroughReshapeOp final
+    : public OpRewritePattern<tensor::UnPackOp> {
+public:
+  PushDownUnPackOpThroughReshapeOp(MLIRContext *context,
+                                   ControlPropagationFn fun)
+      : OpRewritePattern<tensor::UnPackOp>(context), controlFn(std::move(fun)) {
+  }
+
+  LogicalResult matchAndRewrite(tensor::UnPackOp unPackOp,
+                                PatternRewriter &rewriter) const override {
+    Value result = unPackOp.getResult();
+    // Currently only support unpack op with a single user.
+    if (!result.hasOneUse()) {
+      return failure();
+    }
+    // Currently only support static inner tile sizes.
+    if (llvm::any_of(unPackOp.getStaticTiles(), [](int64_t size) {
+          return ShapedType::isDynamic(size);
+        })) {
+      return failure();
+    }
+
+    Operation *consumerOp = *result.user_begin();
+    // User controlled propagation function.
+    if (!controlFn(consumerOp))
+      return failure();
+
+    return TypeSwitch<Operation *, LogicalResult>(consumerOp)
+        .Case([&](tensor::ExpandShapeOp op) {
+          return pushDownUnPackOpThroughExpandShape(unPackOp, op, rewriter);
+        })
+        .Default([](Operation *) { return failure(); });
+  }
+
+private:
+  ControlPropagationFn controlFn;
+};
+
 // TODO: Relax this restriction. We should unpack a generic op also
 // in the presence of multiple unpack ops as producers.
 /// Return the unpacked operand, if present, for the current generic op.
@@ -774,6 +1074,7 @@ void mlir::linalg::populateDataLayoutPropagationPatterns(
     const ControlPropagationFn &controlPackUnPackPropagation) {
   patterns
       .insert<BubbleUpPackOpThroughGenericOpPattern, BubbleUpPackThroughPadOp,
-              PushDownUnPackOpThroughGenericOp, PushDownUnPackThroughPadOp>(
+              BubbleUpPackOpThroughReshapeOp, PushDownUnPackOpThroughGenericOp,
+              PushDownUnPackThroughPadOp, PushDownUnPackOpThroughReshapeOp>(
           patterns.getContext(), controlPackUnPackPropagation);
 }
diff --git a/mlir/lib/ExecutionEngine/CRunnerUtils.cpp b/mlir/lib/ExecutionEngine/CRunnerUtils.cpp
index 48e4b8cd88b58e..41c619566b55df 100644
--- a/mlir/lib/ExecutionEngine/CRunnerUtils.cpp
+++ b/mlir/lib/ExecutionEngine/CRunnerUtils.cpp
@@ -51,8 +51,20 @@ void stdSort(uint64_t n, V *p) {
 // details of our vectors. Also useful for direct LLVM IR output.
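+// The C standard leaves the sign of a printed NaN implementation-defined, so
+// negative NaNs are special-cased below to keep runner output stable.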
extern "C" void printI64(int64_t i) { fprintf(stdout, "%" PRId64, i); } extern "C" void printU64(uint64_t u) { fprintf(stdout, "%" PRIu64, u); } -extern "C" void printF32(float f) { fprintf(stdout, "%g", f); } -extern "C" void printF64(double d) { fprintf(stdout, "%lg", d); } +extern "C" void printF32(float f) { + if (std::isnan(f) && std::signbit(f)) { + fprintf(stdout, "-nan"); + } else { + fprintf(stdout, "%g", f); + } +} +extern "C" void printF64(double d) { + if (std::isnan(d) && std::signbit(d)) { + fprintf(stdout, "-nan"); + } else { + fprintf(stdout, "%lg", d); + } +} extern "C" void printString(char const *s) { fputs(s, stdout); } extern "C" void printOpen() { fputs("( ", stdout); } extern "C" void printClose() { fputs(" )", stdout); } diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index e036695a2ac9fd..79d61ab757e327 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -905,3 +905,163 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[UNPACK_NEW_DEST]] // CHECK: return %[[UNPACK]] : tensor<16x540x960xi32> + +// ----- + +func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) -> tensor { + %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor + %2 = tensor.empty(%dim) : tensor + %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor + func.return %pack : tensor +} +// CHECK-LABEL: func.func @bubble_up_pack_through_collapse +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor +// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor +// CHECK: return %[[COLLAPSED]] : tensor + +// ----- + +func.func @bubble_up_permuted_pack_through_collapse(%1: tensor<4x192x16x256xf32>) -> tensor<4x32x3072x8x1xf32> { + %collapsed = tensor.collapse_shape %1 [[0], [1, 2], [3]] : tensor<4x192x16x256xf32> into tensor<4x3072x256xf32> + %2 = tensor.empty() : tensor<4x32x3072x8x1xf32> + %pack = tensor.pack %collapsed outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %2 : tensor<4x3072x256xf32> -> tensor<4x32x3072x8x1xf32> + func.return %pack : tensor<4x32x3072x8x1xf32> +} +// CHECK-LABEL: func.func @bubble_up_permuted_pack_through_collapse +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x32x192x16x8x1xf32> +// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<4x192x16x256xf32> -> tensor<4x32x192x16x8x1xf32> +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %pack {{\[}}[0], [1], [2, 3], [4], [5]] : tensor<4x32x192x16x8x1xf32> into tensor<4x32x3072x8x1xf32> +// CHECK: return %[[COLLAPSED]] : tensor<4x32x3072x8x1xf32> + +// ----- + +func.func @bubble_up_pack_through_unit_collapse(%1: tensor<1x64x1x4xf32>) -> tensor<8x4x8x1xf32> { + %collapsed = tensor.collapse_shape %1 [[0, 1, 2], 
[3]] : tensor<1x64x1x4xf32> into tensor<64x4xf32> + %2 = tensor.empty() : tensor<8x4x8x1xf32> + %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor<64x4xf32> -> tensor<8x4x8x1xf32> + func.return %pack : tensor<8x4x8x1xf32> +} +// CHECK-LABEL: func.func @bubble_up_pack_through_unit_collapse +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x8x1x4x8x1xf32> +// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<1x64x1x4xf32> -> tensor<1x8x1x4x8x1xf32> +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1, 2], [3], [4], [5]] : tensor<1x8x1x4x8x1xf32> into tensor<8x4x8x1xf32> +// CHECK: return %[[COLLAPSED]] : tensor<8x4x8x1xf32> + +// ----- + +func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, %dim : index) -> tensor { + %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor + %2 = tensor.empty(%dim) : tensor + %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [4] into %2 : tensor -> tensor + func.return %pack : tensor +} +// CHECK-LABEL: func.func @bubble_up_pack_through_collapse_on_outer_dims +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor +// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3]] : tensor into tensor +// CHECK: return %[[COLLAPSED]] : tensor + +// ----- + +func.func @no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4xf32>) -> tensor<384x32x8x8xf32> { + %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> + %2 = tensor.empty() : tensor<384x32x8x8xf32> + %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %2 : tensor<3072x256xf32> -> tensor<384x32x8x8xf32> + func.return %pack : tensor<384x32x8x8xf32> +} +// CHECK-LABEL: func.func @no_bubble_up_pack_through_non_divisible_collapse +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> +// CHECK: %[[PACK:.+]] = tensor.pack %[[COLLAPSED]] +// CHECK: return %[[PACK]] : tensor<384x32x8x8xf32> + +// ----- + +func.func @push_down_unpack_through_expand(%5: tensor, %dim: index) -> tensor { + %6 = tensor.empty(%dim) : tensor + %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor + %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor into tensor + func.return %expanded : tensor +} +// CHECK-LABEL: func.func @push_down_unpack_through_expand +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor +// CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] 
outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor +// CHECK: return %[[UNPACK]] : tensor + +// ----- + +func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32>) -> tensor<4x12x256x256xf32> { + %6 = tensor.empty() : tensor<4x3072x256xf32> + %unpack = tensor.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32> + %expanded = tensor.expand_shape %unpack [[0], [1, 2], [3]] : tensor<4x3072x256xf32> into tensor<4x12x256x256xf32> + func.return %expanded : tensor<4x12x256x256xf32> +} +// CHECK-LABEL: @push_down_permuted_unpack_through_expand +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2, 3], [4], [5]] : tensor<4x32x384x8x8xf32> into tensor<4x32x12x32x8x8xf32> +// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x12x256x256xf32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> -> tensor<4x12x256x256xf32> +// CHECK: return %[[UNPACK]] : tensor<4x12x256x256xf32> + +// ----- + +func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> tensor<3x16x1x256xf32> { + %6 = tensor.empty() : tensor<48x256xf32> + %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32> + %expanded = tensor.expand_shape %unpack [[0, 1, 2], [3]] : tensor<48x256xf32> into tensor<3x16x1x256xf32> + func.return %expanded : tensor<3x16x1x256xf32> +} +// CHECK-LABEL: func.func @push_down_unpack_through_unit_expand +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3], [4], [5]] : tensor<6x32x8x8xf32> into tensor<3x2x1x32x8x8xf32> +// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x16x1x256xf32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32> +// CHECK: return %[[UNPACK]] : tensor<3x16x1x256xf32> + +// ----- + +func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, %dim: index) -> tensor { + %6 = tensor.empty(%dim) : tensor + %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor -> tensor + %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor into tensor + func.return %expanded : tensor +} +// CHECK-LABEL: func.func @push_down_unpack_through_expand_on_outer_dims +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] : tensor into tensor +// CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor -> tensor +// CHECK: return %[[UNPACK]] : tensor + +// ----- + +func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x8xf32>) -> tensor<256x12x256xf32> { + %6 = tensor.empty() : tensor<3072x256xf32> + %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] 
inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32> + %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor<3072x256xf32> into tensor<256x12x256xf32> + func.return %expanded : tensor<256x12x256xf32> +} +// CHECK-LABEL: func.func @no_push_down_unpack_through_non_divisible_expand +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] : tensor<3072x256xf32> into tensor<256x12x256xf32> +// CHECK: return %[[EXPANDED]] : tensor<256x12x256xf32> diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir index e2229a392bbf76..340ef30bf59c29 100644 --- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir +++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir @@ -190,6 +190,12 @@ func.func @func_powff64(%a : f64, %b : f64) { return } +func.func @func_powff32(%a : f32, %b : f32) { + %r = math.powf %a, %b : f32 + vector.print %r : f32 + return +} + func.func @powf() { // CHECK-NEXT: 16 %a = arith.constant 4.0 : f64 @@ -230,7 +236,17 @@ func.func @powf() { %j = arith.constant 29385.0 : f64 %j_p = arith.constant 23598.0 : f64 call @func_powff64(%j, %j_p) : (f64, f64) -> () - return + + // CHECK-NEXT: -nan + %k = arith.constant 1.0 : f64 + %k_p = arith.constant 0xfff0000001000000 : f64 + call @func_powff64(%k, %k_p) : (f64, f64) -> () + + // CHECK-NEXT: -nan + %l = arith.constant 1.0 : f32 + %l_p = arith.constant 0xffffffff : f32 + call @func_powff32(%l, %l_p) : (f32, f32) -> () + return } // -------------------------------------------------------------------------- // diff --git a/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp index c79daa79858171..c586ad1c1969b3 100644 --- a/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp +++ b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp @@ -11,6 +11,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DynamicLibrary.h" + +#include "Shared/Debug.h" #include #include "DLWrap.h" @@ -37,15 +39,21 @@ uint32_t ffi_init() { std::string ErrMsg; auto DynlibHandle = std::make_unique( llvm::sys::DynamicLibrary::getPermanentLibrary(FFI_PATH, &ErrMsg)); - if (!DynlibHandle->isValid()) + + if (!DynlibHandle->isValid()) { + DP("Unable to load library '%s': %s!\n", FFI_PATH, ErrMsg.c_str()); return DYNAMIC_FFI_FAIL; + } for (size_t I = 0; I < dlwrap::size(); I++) { const char *Sym = dlwrap::symbol(I); void *P = DynlibHandle->getAddressOfSymbol(Sym); - if (P == nullptr) + if (P == nullptr) { + DP("Unable to find '%s' in '%s'!\n", Sym, FFI_PATH); return DYNAMIC_FFI_FAIL; + } + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); *dlwrap::pointer(I) = P; } @@ -53,8 +61,10 @@ uint32_t ffi_init() { #define DYNAMIC_INIT(SYMBOL) \ { \ void *SymbolPtr = DynlibHandle->getAddressOfSymbol(#SYMBOL); \ - if (!SymbolPtr) \ + if (!SymbolPtr) { \ + DP("Unable to find '%s' in '%s'!\n", #SYMBOL, FFI_PATH); \ return DYNAMIC_FFI_FAIL; \ + } \ SYMBOL = *reinterpret_cast(SymbolPtr); \ } DYNAMIC_INIT(ffi_type_void); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 7c0b981a35c396..8ba54d44a7d4f8 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -80,9 +80,7 @@ DeviceTy::~DeviceTy() { llvm::Error DeviceTy::init() { // Make call to 
init_requires if it exists for this plugin. int32_t Ret = 0; - if (RTL->init_requires) - Ret = RTL->init_requires(PM->getRequirements()); - + Ret = RTL->init_requires(PM->getRequirements()); if (Ret != OFFLOAD_SUCCESS) return llvm::createStringError( llvm::inconvertibleErrorCode(), @@ -175,11 +173,11 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) - if (ForceSynchronousTargetRegions || !AsyncInfo || #ifdef OMPT_SUPPORT - ompt::CallbacksInitialized || + if (ForceSynchronousTargetRegions || !AsyncInfo || ompt::CallbacksInitialized) +#else + if (ForceSynchronousTargetRegions || !AsyncInfo) #endif - !RTL->data_submit_async || !RTL->synchronize) return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, AsyncInfo); @@ -207,11 +205,11 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) - if (ForceSynchronousTargetRegions || !RTL->data_retrieve_async || #ifdef OMPT_SUPPORT - ompt::CallbacksInitialized || + if (ForceSynchronousTargetRegions || ompt::CallbacksInitialized) +#else + if (ForceSynchronousTargetRegions) #endif - !RTL->synchronize) return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, AsyncInfo); @@ -237,11 +235,13 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, .getTraceGenerators(), RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) - if (ForceSynchronousTargetRegions || !AsyncInfo || + #ifdef OMPT_SUPPORT - ompt::CallbacksInitialized || + if (ForceSynchronousTargetRegions || !AsyncInfo || + ompt::CallbacksInitialized) { +#else + if (ForceSynchronousTargetRegions || !AsyncInfo) { #endif - !RTL->data_exchange_async || !RTL->synchronize) { assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size); @@ -251,9 +251,6 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, } int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { - if (!RTL->data_notify_mapped) - return OFFLOAD_SUCCESS; - DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); @@ -265,9 +262,6 @@ int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { } int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) { - if (!RTL->data_notify_unmapped) - return OFFLOAD_SUCCESS; - DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr)); if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) { @@ -294,70 +288,46 @@ int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr, // Run region on device bool DeviceTy::printDeviceInfo() { - if (!RTL->print_device_info) - return false; RTL->print_device_info(RTLDeviceID); return true; } // Whether data can be copied to DstDevice directly bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) { - if (RTL != DstDevice.RTL || !RTL->is_data_exchangable) + if (RTL != DstDevice.RTL) return false; if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID)) - return (RTL->data_exchange != nullptr) || - (RTL->data_exchange_async != nullptr); - + return true; return false; } int32_t 
DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) { - if (RTL->synchronize) - return RTL->synchronize(RTLDeviceID, AsyncInfo); - return OFFLOAD_SUCCESS; + return RTL->synchronize(RTLDeviceID, AsyncInfo); } int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) { - if (RTL->query_async) - return RTL->query_async(RTLDeviceID, AsyncInfo); - - return synchronize(AsyncInfo); + return RTL->query_async(RTLDeviceID, AsyncInfo); } int32_t DeviceTy::createEvent(void **Event) { - if (RTL->create_event) - return RTL->create_event(RTLDeviceID, Event); - - return OFFLOAD_SUCCESS; + return RTL->create_event(RTLDeviceID, Event); } int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) { - if (RTL->record_event) - return RTL->record_event(RTLDeviceID, Event, AsyncInfo); - - return OFFLOAD_SUCCESS; + return RTL->record_event(RTLDeviceID, Event, AsyncInfo); } int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) { - if (RTL->wait_event) - return RTL->wait_event(RTLDeviceID, Event, AsyncInfo); - - return OFFLOAD_SUCCESS; + return RTL->wait_event(RTLDeviceID, Event, AsyncInfo); } int32_t DeviceTy::syncEvent(void *Event) { - if (RTL->sync_event) - return RTL->sync_event(RTLDeviceID, Event); - - return OFFLOAD_SUCCESS; + return RTL->sync_event(RTLDeviceID, Event); } int32_t DeviceTy::destroyEvent(void *Event) { - if (RTL->create_event) - return RTL->destroy_event(RTLDeviceID, Event); - - return OFFLOAD_SUCCESS; + return RTL->destroy_event(RTLDeviceID, Event); } void DeviceTy::dumpOffloadEntries() { diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 0d67661e6485ac..7fcbe7dcff2375 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -482,10 +482,8 @@ EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { assert(PM && "Runtime not initialized"); std::atomic &InfoLevel = getInfoLevelInternal(); InfoLevel.store(NewInfoLevel); - for (auto &R : PM->pluginAdaptors()) { - if (R.set_info_flag) - R.set_info_flag(NewInfoLevel); - } + for (auto &R : PM->pluginAdaptors()) + R.set_info_flag(NewInfoLevel); } EXTERN int __tgt_print_device_info(int64_t DeviceId) { diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 514b1d61334481..0c85bf7897b2b6 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -463,12 +463,10 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); int32_t Err = 0; - if (!DeviceOrErr->RTL->data_lock) { - Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC); - if (Err) { - DP("Could not lock ptr %p\n", HostPtr); - return nullptr; - } + Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC); + if (Err) { + DP("Could not lock ptr %p\n", HostPtr); + return nullptr; } DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC)); return RC; @@ -481,9 +479,7 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { if (!DeviceOrErr) FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); - if (!DeviceOrErr->RTL->data_unlock) - DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr); - + DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr); DP("%s returns\n", Name); } diff --git a/revert_patches.txt b/revert_patches.txt index 70553bd34c974f..983d470c44336a 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -12,8 +12,3 @@ Revert :breaks hip catch tests. 
 Revert: breaks sles build
 36146d2b6be [ELF] Make LinkerDrive::link a template. NFC
 
-Revert: breaks openmp aomp build
-dcbddc252501 [Libomptarget] Unify and simplify plugin CMake (#86191)
-85af772f3b40 [Libomptarget][FIX] Fix unintentinally used PUBLIC interface
-3f5e649ff64a [Libomptarget] Fix linking to LLVM dylib (#86397
-9f0321ccf118 [Libomptarget] Make plugins depend explicitly
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index eb0afbb6dd6ffe..acdf9349fd5868 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -68,6 +68,7 @@ libc_support_library(
     name = "llvm_libc_macros_math_macros",
     hdrs = ["include/llvm-libc-macros/math-macros.h"],
     deps = [":llvm_libc_macros_limits_macros"],
+    defines = ["__FP_LOGBNAN_MIN"],
 )
 
 libc_support_library(