From 897b00f3c563dd3f7b8f7263c41eaebb3520ec86 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 11:55:04 -0700 Subject: [PATCH 01/31] Reuse getBinOpIdentity in createAnyOfTargetReduction [nfc] Consolidating code so that we have one copy instead of multiple reasoning about identity element. Note that we're (deliberately) not passing the FMF flags to common utility to preserve behavior in this change. --- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index a49d3b0b990bc7..8a8d8afece6cb4 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1210,6 +1210,11 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); + auto getIdentity = [&]() { + Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); + unsigned Opc = getArithmeticReductionInstruction(ID); + return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy); + }; switch (RdxKind) { case RecurKind::Add: case RecurKind::Mul: @@ -1227,10 +1232,9 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); case RecurKind::FMulAdd: case RecurKind::FAdd: - return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); + return Builder.CreateFAddReduce(getIdentity(), Src); case RecurKind::FMul: - return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); + return Builder.CreateFMulReduce(getIdentity(), Src); default: llvm_unreachable("Unhandled opcode"); } From 5eda4988117021b36ebe01b49082f63365846507 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 30 Aug 2024 12:34:41 -0700 Subject: [PATCH 
02/31] Revert "[mlir][Transforms] Dialect conversion: Make materializations optional" (#106778) Reverts llvm/llvm-project#104668 This commit triggers an edge case that can cause circular `unrealized_conversion_cast` ops. https://github.com/llvm/llvm-project/pull/106760 may fix it, but it has other issues. Reverting this PR for now, until I find a solution for that problem. --- .../mlir/Transforms/DialectConversion.h | 11 - .../Transforms/Utils/DialectConversion.cpp | 393 +++++++++++++----- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 5 +- .../Transforms/finalizing-bufferize.mlir | 1 - .../test-legalize-type-conversion.mlir | 6 +- 5 files changed, 298 insertions(+), 118 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5f680e8eca7559..60113bdef16a23 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1124,17 +1124,6 @@ struct ConversionConfig { // already been modified) and iterators into past IR state cannot be // represented at the moment. RewriterBase::Listener *listener = nullptr; - - /// If set to "true", the dialect conversion attempts to build source/target/ - /// argument materializations through the type converter API in lieu of - /// builtin.unrealized_conversion_cast ops. The conversion process fails if - /// at least one materialization could not be built. - /// - /// If set to "false", the dialect conversion does not does not build any - /// custom materializations and instead inserts - /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR - /// is valid. 
- bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index cc9c9495e5155c..b23fb97959ed67 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,12 +702,14 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } - void rollback() override; - UnrealizedConversionCastOp getOperation() const { return cast(op); } + void rollback() override; + + void cleanup(RewriterBase &rewriter) override; + /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -764,7 +766,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), eraseRewriter(ctx), config(config) {} + : context(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -832,7 +834,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// - /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -881,7 +882,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). 
void eraseOp(Operation *op) override { - if (wasErased(op)) + if (erased.contains(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -889,24 +890,17 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (wasErased(block)) + if (erased.contains(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } - bool wasErased(void *ptr) const { return erased.contains(ptr); } - - bool wasErased(OperationRewrite *rewrite) const { - return wasErased(rewrite->getOperation()); - } - void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } - private: /// Pointers to all erased operations and blocks. DenseSet erased; }; @@ -918,11 +912,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; - /// A rewriter that keeps track of ops/block that were already erased and - /// skips duplicate op/block erasures. This rewriter is used during the - /// "cleanup" phase. - SingleEraseRewriter eraseRewriter; - // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1069,6 +1058,10 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } +void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { + rewriter.eraseOp(op); +} + void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. IRRewriter rewriter(context, config.listener); @@ -1076,6 +1069,7 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. 
+ SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2359,6 +2353,12 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); + /// Legalize any unresolved type materializations. + LogicalResult legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping); + /// Legalize an operation result that was marked as "erased". LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2405,56 +2405,6 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } -static LogicalResult -legalizeUnresolvedMaterialization(RewriterBase &rewriter, - UnresolvedMaterializationRewrite *rewrite) { - UnrealizedConversionCastOp op = rewrite->getOperation(); - assert(!op.use_empty() && - "expected that dead materializations have already been DCE'd"); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = op.getResultTypes()[0]; - - // Try to materialize the conversion. - if (const TypeConverter *converter = rewrite->getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (rewrite->getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. 
- [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - rewriter.replaceOp(op, newMaterialization); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - diag.attachNote(op->getUsers().begin()->getLoc()) - << "see existing live user here: " << *op->getUsers().begin(); - return failure(); -} - LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2496,37 +2446,6 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } - - // Gather all unresolved materializations. - SmallVector allCastOps; - DenseMap rewriteMap; - for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - if (rewriterImpl.eraseRewriter.wasErased(mat)) - continue; - allCastOps.push_back(mat->getOperation()); - rewriteMap[mat->getOperation()] = mat; - } - - // Reconcile all UnrealizedConversionCastOps that were inserted by the - // dialect conversion frameworks. (Not the one that were inserted by - // patterns.) - SmallVector remainingCastOps; - reconcileUnrealizedCasts(allCastOps, &remainingCastOps); - - // Try to legalize all unresolved materializations. 
- if (config.buildMaterializations) { - IRRewriter rewriter(rewriterImpl.context, config.listener); - for (UnrealizedConversionCastOp castOp : remainingCastOps) { - auto it = rewriteMap.find(castOp.getOperation()); - assert(it != rewriteMap.end() && "inconsistent state"); - if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) - return failure(); - } - } - return success(); } @@ -2540,6 +2459,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); + if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, + inverseMapping))) + return failure(); return success(); } @@ -2655,6 +2577,279 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } +/// Replace the results of a materialization operation with the given values. +static void +replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, + ResultRange matResults, ValueRange values, + DenseMap> &inverseMapping) { + matResults.replaceAllUsesWith(values); + + // For each of the materialization results, update the inverse mappings to + // point to the replacement values. + for (auto [matResult, newValue] : llvm::zip(matResults, values)) { + auto inverseMapIt = inverseMapping.find(matResult); + if (inverseMapIt == inverseMapping.end()) + continue; + + // Update the reverse mapping, or remove the mapping if we couldn't update + // it. Not being able to update signals that the mapping would have become + // circular (i.e. %foo -> newValue -> %foo), which may occur as values are + // propagated through temporary materializations. We simply drop the + // mapping, and let the post-conversion replacement logic handle updating + // uses. 
+ for (Value inverseMapVal : inverseMapIt->second) + if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) + rewriterImpl.mapping.erase(inverseMapVal); + } +} + +/// Compute all of the unresolved materializations that will persist beyond the +/// conversion process, and require inserting a proper user materialization for. +static void computeNecessaryMaterializations( + DenseMap + &materializationOps, + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping, + SetVector &necessaryMaterializations) { + // Helper function to check if the given value or a not yet materialized + // replacement of the given value is live. + // Note: `inverseMapping` maps from replaced values to original values. + auto isLive = [&](Value value) { + auto findFn = [&](Operation *user) { + auto matIt = materializationOps.find(user); + if (matIt != materializationOps.end()) + return !necessaryMaterializations.count(matIt->second); + return rewriterImpl.isOpIgnored(user); + }; + // A worklist is needed because a value may have gone through a chain of + // replacements and each of the replaced values may have live users. + SmallVector worklist; + worklist.push_back(value); + while (!worklist.empty()) { + Value next = worklist.pop_back_val(); + if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) + return true; + // This value may be replacing another value that has a live user. + llvm::append_range(worklist, inverseMapping.lookup(next)); + } + return false; + }; + + llvm::unique_function lookupRemappedValue = + [&](Value invalidRoot, Value value, Type type) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); + if (remappedValue.getType() == type && remappedValue != invalidRoot) + return remappedValue; + + // Check to see if the input is a materialization operation that + // provides an inverse conversion. 
We just check blindly for + // UnrealizedConversionCastOp here, but it has no effect on correctness. + auto inputCastOp = value.getDefiningOp(); + if (inputCastOp && inputCastOp->getNumOperands() == 1) + return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), + type); + + return Value(); + }; + + SetVector worklist; + for (auto &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + materializationOps.try_emplace(mat->getOperation(), mat); + worklist.insert(mat); + } + while (!worklist.empty()) { + UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); + UnrealizedConversionCastOp op = mat->getOperation(); + + // We currently only handle target materializations here. + assert(op->getNumResults() == 1 && "unexpected materialization type"); + OpResult opResult = op->getOpResult(0); + Type outputType = opResult.getType(); + Operation::operand_range inputOperands = op.getOperands(); + + // Try to forward propagate operands for user conversion casts that result + // in the input types of the current cast. + for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { + auto castOp = dyn_cast(user); + if (!castOp) + continue; + if (castOp->getResultTypes() == inputOperands.getTypes()) { + replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, + inverseMapping); + necessaryMaterializations.remove(materializationOps.lookup(user)); + } + } + + // Try to avoid materializing a resolved materialization if possible. + // Handle the case of a 1-1 materialization. + if (inputOperands.size() == 1) { + // Check to see if the input operation was remapped to a variant of the + // output. 
+ Value remappedValue = + lookupRemappedValue(opResult, inputOperands[0], outputType); + if (remappedValue && remappedValue != opResult) { + replaceMaterialization(rewriterImpl, opResult, remappedValue, + inverseMapping); + necessaryMaterializations.remove(mat); + continue; + } + } else { + // TODO: Avoid materializing other types of conversions here. + } + + // If the materialization does not have any live users, we don't need to + // generate a user materialization for it. + bool isMaterializationLive = isLive(opResult); + if (!isMaterializationLive) + continue; + if (!necessaryMaterializations.insert(mat)) + continue; + + // Reprocess input materializations to see if they have an updated status. + for (Value input : inputOperands) { + if (auto parentOp = input.getDefiningOp()) { + if (auto *mat = materializationOps.lookup(parentOp)) + worklist.insert(mat); + } + } + } +} + +/// Legalize the given unresolved materialization. Returns success if the +/// materialization was legalized, failure otherise. +static LogicalResult legalizeUnresolvedMaterialization( + UnresolvedMaterializationRewrite &mat, + DenseMap + &materializationOps, + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping) { + auto findLiveUser = [&](auto &&users) { + auto liveUserIt = llvm::find_if_not( + users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); }); + return liveUserIt == users.end() ? nullptr : *liveUserIt; + }; + + llvm::unique_function lookupRemappedValue = + [&](Value value, Type type) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); + if (remappedValue.getType() == type) + return remappedValue; + return Value(); + }; + + UnrealizedConversionCastOp op = mat.getOperation(); + if (!rewriterImpl.ignoredOps.insert(op)) + return success(); + + // We currently only handle target materializations here. 
+ OpResult opResult = op->getOpResult(0); + Operation::operand_range inputOperands = op.getOperands(); + Type outputType = opResult.getType(); + + // If any input to this materialization is another materialization, resolve + // the input first. + for (Value value : op->getOperands()) { + auto valueCast = value.getDefiningOp(); + if (!valueCast) + continue; + + auto matIt = materializationOps.find(valueCast); + if (matIt != materializationOps.end()) + if (failed(legalizeUnresolvedMaterialization( + *matIt->second, materializationOps, rewriter, rewriterImpl, + inverseMapping))) + return failure(); + } + + // Perform a last ditch attempt to avoid materializing a resolved + // materialization if possible. + // Handle the case of a 1-1 materialization. + if (inputOperands.size() == 1) { + // Check to see if the input operation was remapped to a variant of the + // output. + Value remappedValue = lookupRemappedValue(inputOperands[0], outputType); + if (remappedValue && remappedValue != opResult) { + replaceMaterialization(rewriterImpl, opResult, remappedValue, + inverseMapping); + return success(); + } + } else { + // TODO: Avoid materializing other types of conversions here. + } + + // Try to materialize the conversion. + if (const TypeConverter *converter = mat.getConverter()) { + rewriter.setInsertionPoint(op); + Value newMaterialization; + switch (mat.getMaterializationKind()) { + case MaterializationKind::Argument: + // Try to materialize an argument conversion. + newMaterialization = converter->materializeArgumentConversion( + rewriter, op->getLoc(), outputType, inputOperands); + if (newMaterialization) + break; + // If an argument materialization failed, fallback to trying a target + // materialization. 
+ [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + replaceMaterialization(rewriterImpl, opResult, newMaterialization, + inverseMapping); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + if (Operation *liveUser = findLiveUser(op->getUsers())) { + diag.attachNote(liveUser->getLoc()) + << "see existing live user here: " << *liveUser; + } + return failure(); +} + +LogicalResult OperationConverter::legalizeUnresolvedMaterializations( + ConversionPatternRewriter &rewriter, + ConversionPatternRewriterImpl &rewriterImpl, + DenseMap> &inverseMapping) { + // As an initial step, compute all of the inserted materializations that we + // expect to persist beyond the conversion process. + DenseMap materializationOps; + SetVector necessaryMaterializations; + computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, + inverseMapping, necessaryMaterializations); + + // Once computed, legalize any necessary materializations. 
+ for (auto *mat : necessaryMaterializations) { + if (failed(legalizeUnresolvedMaterialization( + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) + return failure(); + } + return success(); +} + LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 75362378daaaaa..156a8a468d5b42 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,6 +1286,7 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 +// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1298,8 +1299,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index ab18ce05e355d3..a192434c5accf8 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,7 +80,6 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> - // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git 
a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index f130adff42f8cd..cf2c9f6a8ec441 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,7 +4,6 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { - // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -23,7 +22,6 @@ func.func @test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -32,7 +30,6 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -52,7 +49,6 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 - // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -103,9 +99,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved 
materialization from ('f64') to 'f32' that remained live after conversion}} + // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () - // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return From c315d787e3680e7f48d9de0502bb83300b190f84 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 12:25:50 -0700 Subject: [PATCH 03/31] [VP] Reduce duplicate code in vp.reduce expansions Primary goal is having one way of doing this, to ensure that we don't end up with accidental divergence. --- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 65 +++++--------------- 1 file changed, 15 insertions(+), 50 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 675d88d6d38cd9..5140f5951d6d3f 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include using namespace llvm; @@ -437,69 +438,33 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder, default: llvm_unreachable("Impossible reduction kind"); case Intrinsic::vp_reduce_add: - Reduction = Builder.CreateAddReduce(RedOp); - Reduction = Builder.CreateAdd(Reduction, Start); - break; case Intrinsic::vp_reduce_mul: - Reduction = Builder.CreateMulReduce(RedOp); - Reduction = Builder.CreateMul(Reduction, Start); - break; case Intrinsic::vp_reduce_and: - Reduction = Builder.CreateAndReduce(RedOp); - Reduction = Builder.CreateAnd(Reduction, Start); - break; case Intrinsic::vp_reduce_or: - Reduction = Builder.CreateOrReduce(RedOp); - Reduction = Builder.CreateOr(Reduction, Start); - break; - case Intrinsic::vp_reduce_xor: - Reduction = Builder.CreateXorReduce(RedOp); - Reduction = 
Builder.CreateXor(Reduction, Start); - break; - case Intrinsic::vp_reduce_smax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true); + case Intrinsic::vp_reduce_xor: { + Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID(); + unsigned Opc = getArithmeticReductionInstruction(RedID); + assert(Instruction::isBinaryOp(Opc)); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start); + Builder.CreateBinOp((Instruction::BinaryOps)Opc, Reduction, Start); break; + } + case Intrinsic::vp_reduce_smax: case Intrinsic::vp_reduce_smin: - Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start); - break; case Intrinsic::vp_reduce_umax: - Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start); - break; case Intrinsic::vp_reduce_umin: - Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start); - break; case Intrinsic::vp_reduce_fmax: - Reduction = Builder.CreateFPMaxReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmin: - Reduction = Builder.CreateFPMinReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start); - break; case Intrinsic::vp_reduce_fmaximum: - Reduction = Builder.CreateFPMaximumReduce(RedOp); - transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::maximum, Reduction, Start); - break; - case Intrinsic::vp_reduce_fminimum: - Reduction = Builder.CreateFPMinimumReduce(RedOp); + case Intrinsic::vp_reduce_fminimum: { + Intrinsic::ID RedID = 
*VPI.getFunctionalIntrinsicID(); + Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RedID); + Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp); transferDecorations(*Reduction, VPI); - Reduction = - Builder.CreateBinaryIntrinsic(Intrinsic::minimum, Reduction, Start); + Reduction = Builder.CreateBinaryIntrinsic(ScalarID, Reduction, Start); break; + } case Intrinsic::vp_reduce_fadd: Reduction = Builder.CreateFAddReduce(Start, RedOp); break; From a3f8790901cafaec8bcd863bd30b4f9ab7917bd8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 30 Aug 2024 15:38:02 -0400 Subject: [PATCH 04/31] [libc++][NFC] Minor reformatting in --- libcxx/include/cstddef | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 1a4049e4d34f2d..592f6261a6de3f 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -66,8 +66,8 @@ using ::max_align_t _LIBCPP_USING_IF_EXISTS; _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -namespace std // purposefully not versioned -{ +namespace std { // purposefully not versioned + enum class byte : unsigned char {}; _LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { @@ -127,7 +127,6 @@ template ::value, int> = 0> } } // namespace std - -#endif +#endif // _LIBCPP_STD_VER >= 17 #endif // _LIBCPP_CSTDDEF From c53008de899653818b22c44eafd7e5eaab524e2b Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 30 Aug 2024 12:44:02 -0700 Subject: [PATCH 05/31] [VPlan] Manually jumpthread a bit of reduction code for readability [nfc] --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f84317ba51257a..c9cee652d2d326 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1802,18 +1802,18 @@ 
void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; + NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); + else + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, + PrevInChain); } - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), - NewRed, PrevInChain); - } else if (IsOrdered) - NextInChain = NewRed; - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part, /*IsScalar*/ true); } } From 923a1c1fc348f7c30ff4726b54ed63ce403dc3ce Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Fri, 30 Aug 2024 13:01:16 -0700 Subject: [PATCH 06/31] [WebAssembly] Update FP16 opcodes to match current spec. 
(#106759) https://github.com/WebAssembly/half-precision/blob/f267a3d54432e5723dcc13ad4530c3581a0cc4b3/proposals/half-precision/Overview.md#binary-format --- .../WebAssembly/WebAssemblyInstrSIMD.td | 24 ++++----- llvm/test/MC/WebAssembly/simd-encodings.s | 54 +++++++++---------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index da4b8d228f627d..9d17d90f530541 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -763,7 +763,7 @@ multiclass SIMDConditionInt baseInst> { multiclass SIMDConditionFP baseInst> { defm "" : SIMDCondition; defm "" : SIMDCondition; - defm "" : HalfPrecisionCondition; + defm "" : HalfPrecisionCondition; } // Equality: eq @@ -1218,7 +1218,7 @@ multiclass SIMDUnaryFP baseInst> { // Unlike F32x4 and F64x2 there's not a gap in the opcodes between "neg" and // "sqrt" so subtract one from the offset. defm "" : HalfPrecisionUnary; + !add(baseInst,!if(!eq(name, "sqrt"), 79, 80))>; } // Absolute value: abs @@ -1239,10 +1239,10 @@ defm CEIL : SIMDUnary; defm FLOOR : SIMDUnary; defm TRUNC: SIMDUnary; defm NEAREST: SIMDUnary; -defm CEIL : HalfPrecisionUnary; -defm FLOOR : HalfPrecisionUnary; -defm TRUNC : HalfPrecisionUnary; -defm NEAREST : HalfPrecisionUnary; +defm CEIL : HalfPrecisionUnary; +defm FLOOR : HalfPrecisionUnary; +defm TRUNC : HalfPrecisionUnary; +defm NEAREST : HalfPrecisionUnary; // WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. 
def : Pat<(v4f32 (frint (v4f32 V128:$src))), (NEAREST_F32x4 V128:$src)>; @@ -1261,7 +1261,7 @@ def : Pat<(v8f16 (froundeven (v8f16 V128:$src))), (NEAREST_F16x8 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; - defm "" : HalfPrecisionBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1362,8 +1362,8 @@ multiclass HalfPrecisionConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Support the saturating variety as well. def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>; @@ -1394,8 +1394,8 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Extending operations // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. 
@@ -1538,7 +1538,7 @@ multiclass SIMDMADD simdopA, bits<32> simdopS, list defm "" : SIMDMADD; defm "" : SIMDMADD; -defm "" : SIMDMADD; +defm "" : SIMDMADD; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 45335b348b7e8f..48aec4bc52a0c5 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -854,85 +854,85 @@ main: # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01] f16x8.replace_lane 1 - # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + # CHECK: f16x8.add # encoding: [0xfd,0xbd,0x02] f16x8.add - # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + # CHECK: f16x8.sub # encoding: [0xfd,0xbe,0x02] f16x8.sub - # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + # CHECK: f16x8.mul # encoding: [0xfd,0xbf,0x02] f16x8.mul - # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + # CHECK: f16x8.div # encoding: [0xfd,0xc0,0x02] f16x8.div - # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + # CHECK: f16x8.min # encoding: [0xfd,0xc1,0x02] f16x8.min - # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + # CHECK: f16x8.max # encoding: [0xfd,0xc2,0x02] f16x8.max - # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + # CHECK: f16x8.pmin # encoding: [0xfd,0xc3,0x02] f16x8.pmin - # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + # CHECK: f16x8.pmax # encoding: [0xfd,0xc4,0x02] f16x8.pmax - # CHECK: f16x8.eq # encoding: [0xfd,0xc0,0x02] + # CHECK: f16x8.eq # encoding: [0xfd,0xb7,0x02] f16x8.eq - # CHECK: f16x8.ne # encoding: [0xfd,0xc1,0x02] + # CHECK: f16x8.ne # encoding: [0xfd,0xb8,0x02] f16x8.ne - # CHECK: f16x8.lt # encoding: [0xfd,0xc2,0x02] + # CHECK: f16x8.lt # encoding: [0xfd,0xb9,0x02] f16x8.lt - # CHECK: f16x8.gt # encoding: [0xfd,0xc3,0x02] + # CHECK: f16x8.gt # encoding: [0xfd,0xba,0x02] f16x8.gt - # CHECK: f16x8.le # encoding: [0xfd,0xc4,0x02] + # CHECK: f16x8.le # encoding: 
[0xfd,0xbb,0x02] f16x8.le - # CHECK: f16x8.ge # encoding: [0xfd,0xc5,0x02] + # CHECK: f16x8.ge # encoding: [0xfd,0xbc,0x02] f16x8.ge - # CHECK: f16x8.abs # encoding: [0xfd,0xb1,0x02] + # CHECK: f16x8.abs # encoding: [0xfd,0xb0,0x02] f16x8.abs - # CHECK: f16x8.neg # encoding: [0xfd,0xb2,0x02] + # CHECK: f16x8.neg # encoding: [0xfd,0xb1,0x02] f16x8.neg - # CHECK: f16x8.sqrt # encoding: [0xfd,0xb3,0x02] + # CHECK: f16x8.sqrt # encoding: [0xfd,0xb2,0x02] f16x8.sqrt - # CHECK: f16x8.ceil # encoding: [0xfd,0xbc,0x02] + # CHECK: f16x8.ceil # encoding: [0xfd,0xb3,0x02] f16x8.ceil - # CHECK: f16x8.floor # encoding: [0xfd,0xbd,0x02] + # CHECK: f16x8.floor # encoding: [0xfd,0xb4,0x02] f16x8.floor - # CHECK: f16x8.trunc # encoding: [0xfd,0xbe,0x02] + # CHECK: f16x8.trunc # encoding: [0xfd,0xb5,0x02] f16x8.trunc - # CHECK: f16x8.nearest # encoding: [0xfd,0xbf,0x02] + # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02] f16x8.nearest - # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xc6,0x02] + # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02] f16x8.relaxed_madd - # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xc7,0x02] + # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02] f16x8.relaxed_nmadd - # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc8,0x02] + # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02] i16x8.trunc_sat_f16x8_s - # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc9,0x02] + # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc6,0x02] i16x8.trunc_sat_f16x8_u - # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xca,0x02] + # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xc7,0x02] f16x8.convert_i16x8_s - # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xcb,0x02] + # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02] f16x8.convert_i16x8_u end_function From 5e7f0dcd69fd666bbb2a93d20e6a56a11261b519 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 30 Aug 2024 13:16:26 -0700 Subject: [PATCH 07/31] [lldb] Include checksum 
in source cache dump (#106773) This patch updates the source cache dump command to print both the actual (on-disk) checksum and the expected (line table) checksum. To achieve that we now read and store the on-disk checksum in the cached object. The same information will be used in a future path to print a warning when the checksums differ. --- lldb/include/lldb/Core/SourceManager.h | 6 ++++++ lldb/source/Core/SourceManager.cpp | 27 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index ae7bd3d2311f96..172824dc78a6bc 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -9,6 +9,7 @@ #ifndef LLDB_CORE_SOURCEMANAGER_H #define LLDB_CORE_SOURCEMANAGER_H +#include "lldb/Utility/Checksum.h" #include "lldb/Utility/FileSpec.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-forward.h" @@ -71,6 +72,8 @@ class SourceManager { llvm::sys::TimePoint<> GetTimestamp() const { return m_mod_time; } + const Checksum &GetChecksum() const { return m_checksum; } + protected: /// Set file and update modification time. void SetSupportFile(lldb::SupportFileSP support_file_sp); @@ -81,6 +84,9 @@ class SourceManager { /// different from the original support file passed to the constructor. lldb::SupportFileSP m_support_file_sp; + /// Keep track of the on-disk checksum. 
+ Checksum m_checksum; + // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index c427bb91f4643a..f6e59ce731a573 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -447,13 +447,14 @@ void SourceManager::FindLinesMatchingRegex(SupportFileSP support_file_sp, SourceManager::File::File(SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), - m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { CommonInitializer(support_file_sp, {}); } SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) - : m_support_file_sp(std::make_shared()), m_mod_time(), + : m_support_file_sp(std::make_shared()), m_checksum(), + m_mod_time(), m_debugger_wp(target_sp ? target_sp->GetDebugger().shared_from_this() : DebuggerSP()), m_target_wp(target_sp) { @@ -532,9 +533,11 @@ void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, } // If the file exists, read in the data. 
- if (m_mod_time != llvm::sys::TimePoint<>()) + if (m_mod_time != llvm::sys::TimePoint<>()) { m_data_sp = FileSystem::Instance().CreateDataBuffer( m_support_file_sp->GetSpecOnly()); + m_checksum = llvm::MD5::hash(m_data_sp->GetData()); + } } void SourceManager::File::SetSupportFile(lldb::SupportFileSP support_file_sp) { @@ -835,14 +838,24 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile( return {}; } +static std::string toString(const Checksum &checksum) { + if (!checksum) + return ""; + return std::string(llvm::formatv("{0}", checksum.digest())); +} + void SourceManager::SourceFileCache::Dump(Stream &stream) const { - stream << "Modification time Lines Path\n"; - stream << "------------------- -------- --------------------------------\n"; + // clang-format off + stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n"; + stream << "------------------- -------------------------------- -------------------------------- -------- --------------------------------\n"; + // clang-format on for (auto &entry : m_file_cache) { if (!entry.second) continue; FileSP file = entry.second; - stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,8:d} {2}\n", file->GetTimestamp(), + stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,32} {2,32} {3,8:d} {4}\n", + file->GetTimestamp(), toString(file->GetChecksum()), + toString(file->GetSupportFile()->GetChecksum()), file->GetNumLines(), entry.first.GetPath()); } } From 432e9f44101e44bb996c350cf5693038916953f3 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 13:19:31 -0700 Subject: [PATCH 08/31] [llvm][LoongArch] Avoid shift overflow (#106785) Follow up fix to #106332 `LoongArchMatInt.cpp:96:33: runtime error: shift exponent 64 is too large for 64-bit type` https://lab.llvm.org/buildbot/#/builders/169/builds/2681 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index 6ad2c003558a51..a7823470382756 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -92,8 +92,9 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { break; } - for (uint64_t Msb = 32; Msb < 64; ++Msb) { - uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + uint64_t Msb = 32; + uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + for (; Msb < 64; ++Msb, HighMask = (HighMask << 1) + 1) { for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) { uint64_t LowMask = (1ULL << Lsb) - 1; uint64_t Mask = HighMask | LowMask; From 982d2445f2a5bad96c501ff23923648ffa094ef2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 13:51:53 -0700 Subject: [PATCH 09/31] Revert "AtomicExpand: Allow incrementally legalizing atomicrmw" (#106792) Reverts llvm/llvm-project#103371 There is `heap-use-after-free`, commented on 206b5aff44a95754f6dd7a5696efa024e983ac59 Maybe `if (Next == E || BB != Next->getParent()) {` is enough, but not sure, what was the intent there, --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 35 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 373 +++++++++----------- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 373 +++++++++----------- 5 files changed, 691 insertions(+), 836 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index b9732e816ea7e6..39a705599f90cc 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -351,30 +351,17 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { bool MadeChange = false; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) { - BasicBlock *BB 
= &*BBI; - ++BBI; - - BasicBlock::iterator Next; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Instruction &Inst = *I; - Next = std::next(I); - - if (processAtomicInstr(&Inst)) { - MadeChange = true; - - // Detect control flow change and resume iteration from the original - // block to inspect any newly inserted blocks. This allows incremental - // legalizaton of atomicrmw and cmpxchg. - if (BB != Next->getParent()) { - BBI = BB->getIterator(); - BBE = F.end(); - break; - } - } - } + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (Instruction &I : instructions(F)) + if (I.isAtomic() && !isa(&I)) + AtomicInsts.push_back(&I); + + for (auto *I : AtomicInsts) { + if (processAtomicInstr(I)) + MadeChange = true; } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc7..0d230bb9dcc6e9 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 
-; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 
-; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp 
w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __adddf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: 
// Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; 
SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 
+; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7df..bfe0d20ca814bc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; 
SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; 
SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore 
+; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, 
w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmax -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child 
Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp 
x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, 
[x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 
killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e428608..6b7d2df044460a 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -45,49 +45,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, 
#0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -133,49 +130,46 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -240,40 +234,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 @@ -339,40 +329,36 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, 
w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 4 @@ -415,38 +401,35 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 @@ -488,40 +471,36 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl fmin -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 @@ -588,18 +567,18 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB6_5 ; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child 
Loop BB6_3 Depth 2 @@ -612,33 +591,29 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB6_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB6_3 +; SOFTFP-NOLSE-NEXT: b .LBB6_1 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp 
x22, x21, [sp, #32] // 16-byte Folded Reload @@ -748,18 +723,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 @@ -769,28 +743,25 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, 
[x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -838,49 +809,45 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; SOFTFP-NOLSE-NEXT: // kill: def $w22 
killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // 
%atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e731859..67e164037d5ce7 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -43,49 +43,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB0_5 ; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, 
#0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB0_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB0_3 +; SOFTFP-NOLSE-NEXT: b .LBB0_1 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 ret half %res @@ -131,49 +128,46 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; SOFTFP-NOLSE: // %bb.0: -; SOFTFP-NOLSE-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; SOFTFP-NOLSE-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w23 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5 ; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w22, w0 -; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w23, w20, #0xffff +; SOFTFP-NOLSE-NEXT: mov w22, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB1_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB1_3 +; SOFTFP-NOLSE-NEXT: b .LBB1_1 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; SOFTFP-NOLSE-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, half %value seq_cst, align 4 ret half %res @@ -238,40 +232,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB2_5 ; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB2_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; 
SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB2_3 +; SOFTFP-NOLSE-NEXT: b .LBB2_1 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 @@ -337,40 +327,36 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldrh w20, [x0] +; SOFTFP-NOLSE-NEXT: lsl w21, w1, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] -; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 ; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB3_5 ; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w21, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w20 +; SOFTFP-NOLSE-NEXT: lsl w0, w20, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB3_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp 
w0, w21, uxth +; SOFTFP-NOLSE-NEXT: ldaxrh w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20, uxth ; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxrh wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB3_3 +; SOFTFP-NOLSE-NEXT: b .LBB3_1 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 4 @@ -413,38 +399,35 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr w20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: ldr w0, [x0] -; SOFTFP-NOLSE-NEXT: mov w20, w1 +; SOFTFP-NOLSE-NEXT: mov w21, w1 ; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 +; SOFTFP-NOLSE-NEXT: cmp w8, w20 +; SOFTFP-NOLSE-NEXT: mov w20, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB4_5 ; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w1, w20 -; SOFTFP-NOLSE-NEXT: mov w21, w0 +; SOFTFP-NOLSE-NEXT: mov w0, w20 +; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB4_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] -; SOFTFP-NOLSE-NEXT: cmp w0, w21 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x19] +; SOFTFP-NOLSE-NEXT: cmp w8, w20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB4_3 +; SOFTFP-NOLSE-NEXT: b .LBB4_1 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; 
SOFTFP-NOLSE-NEXT: mov w0, w20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 @@ -486,40 +469,36 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; SOFTFP-NOLSE: // %bb.0: ; SOFTFP-NOLSE-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldr x21, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; SOFTFP-NOLSE-NEXT: ldr x20, [x0] ; SOFTFP-NOLSE-NEXT: mov x19, x0 -; SOFTFP-NOLSE-NEXT: mov x20, x1 +; SOFTFP-NOLSE-NEXT: mov x21, x1 ; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_6 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 +; SOFTFP-NOLSE-NEXT: mov x20, x8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB5_5 ; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov x0, x21 -; SOFTFP-NOLSE-NEXT: mov x1, x20 +; SOFTFP-NOLSE-NEXT: mov x0, x20 +; SOFTFP-NOLSE-NEXT: mov x1, x21 ; SOFTFP-NOLSE-NEXT: bl __subdf3 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB5_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x8, [x19] -; SOFTFP-NOLSE-NEXT: cmp x8, x21 +; SOFTFP-NOLSE-NEXT: cmp x8, x20 ; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // 
%bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, x0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w9, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: mov x21, x8 -; SOFTFP-NOLSE-NEXT: cbz w9, .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: stlxr wzr, x0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB5_3 +; SOFTFP-NOLSE-NEXT: b .LBB5_1 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov x0, x20 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; SOFTFP-NOLSE-NEXT: mov x0, x21 ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ret %res = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 @@ -708,18 +687,18 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w23, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] -; SOFTFP-NOLSE-NEXT: mov w21, w1 +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] +; SOFTFP-NOLSE-NEXT: mov w22, w1 ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 +; SOFTFP-NOLSE-NEXT: lsr w23, w8, #16 +; SOFTFP-NOLSE-NEXT: cmp w8, w21 +; SOFTFP-NOLSE-NEXT: mov w21, w8 +; SOFTFP-NOLSE-NEXT: b.eq .LBB7_5 ; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: 
// Child Loop BB7_3 Depth 2 @@ -732,33 +711,29 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w25, w0 -; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff +; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff ; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee -; SOFTFP-NOLSE-NEXT: mov w8, w22 +; SOFTFP-NOLSE-NEXT: bfi w21, w23, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB7_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] -; SOFTFP-NOLSE-NEXT: cmp w22, w8 +; SOFTFP-NOLSE-NEXT: ldaxr w8, [x20] +; SOFTFP-NOLSE-NEXT: cmp w8, w21 ; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB7_3 +; SOFTFP-NOLSE-NEXT: b .LBB7_1 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: mov w1, w23 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; 
SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload @@ -824,18 +799,17 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: ldrh w1, [x0, #2] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: ldrh w22, [x0] +; SOFTFP-NOLSE-NEXT: ldrh w21, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w2, #16 -; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 +; SOFTFP-NOLSE-NEXT: lsl w22, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 +; SOFTFP-NOLSE-NEXT: lsr w1, w21, #16 +; SOFTFP-NOLSE-NEXT: cmp w21, w23 +; SOFTFP-NOLSE-NEXT: b.eq .LBB8_5 ; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 @@ -845,28 +819,25 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: lsl w0, w22, #16 -; SOFTFP-NOLSE-NEXT: mov w1, w21 +; SOFTFP-NOLSE-NEXT: lsl w0, w21, #16 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 -; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 +; SOFTFP-NOLSE-NEXT: bfxil w23, w21, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: .LBB8_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; 
SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] -; SOFTFP-NOLSE-NEXT: cmp w22, w23 +; SOFTFP-NOLSE-NEXT: ldaxr w21, [x19] +; SOFTFP-NOLSE-NEXT: cmp w21, w23 ; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: stlxr wzr, w0, [x19] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB8_3 +; SOFTFP-NOLSE-NEXT: b .LBB8_1 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w21 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload @@ -914,49 +885,45 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w21, w1 -; SOFTFP-NOLSE-NEXT: ldp w22, w23, [x0] +; SOFTFP-NOLSE-NEXT: ldp w23, w22, [x0] ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 ; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 +; SOFTFP-NOLSE-NEXT: lsr x22, x23, #32 +; SOFTFP-NOLSE-NEXT: cmp x23, x8 +; 
SOFTFP-NOLSE-NEXT: // kill: def $w22 killed $w22 killed $x22 def $x22 +; SOFTFP-NOLSE-NEXT: b.eq .LBB9_5 ; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 -; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w24, w0 -; SOFTFP-NOLSE-NEXT: mov w0, w22 +; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: mov w9, w22 -; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 -; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 -; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: mov w8, w23 +; SOFTFP-NOLSE-NEXT: mov w9, w0 +; SOFTFP-NOLSE-NEXT: orr x9, x9, x24, lsl #32 +; SOFTFP-NOLSE-NEXT: orr x8, x8, x22, lsl #32 +; SOFTFP-NOLSE-NEXT: .LBB9_3: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] -; SOFTFP-NOLSE-NEXT: cmp x22, x9 +; SOFTFP-NOLSE-NEXT: ldaxr x23, [x20] +; SOFTFP-NOLSE-NEXT: cmp x23, x8 ; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // %bb.4: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end -; SOFTFP-NOLSE-NEXT: mov w0, w22 -; SOFTFP-NOLSE-NEXT: mov w1, w23 +; SOFTFP-NOLSE-NEXT: stlxr wzr, x9, [x20] +; SOFTFP-NOLSE-NEXT: cbnz wzr, .LBB9_3 +; SOFTFP-NOLSE-NEXT: b .LBB9_1 
+; SOFTFP-NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; SOFTFP-NOLSE-NEXT: mov w0, w23 +; SOFTFP-NOLSE-NEXT: mov w1, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload From 57fe53cae40351ebd079a9a0105addf4ad2e97dd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 30 Aug 2024 16:54:09 -0400 Subject: [PATCH 10/31] [libc++] First attempt to regroup a few modules in the modulemap (#98214) We split up all the headers into top-level modules when we broke up cycles with the C compatibility headers. However, this resulted in a large number of small modules, which is awkward and clearly against the philosophy of Clang modules. This was necessary to make things work. This patch regroups a few headers from two leaf modules: stop_token and pstl. It should be pretty uncontroversial that grouping these headers into a single module doesn't introduce any cyclic dependency, yet it's a first step towards reducing the number of top-level modules we have in our modulemap. 
--- libcxx/include/module.modulemap | 66 ++++++------------- .../atomic_unique_lock.pass.cpp | 7 +- .../intrusive_list_view.pass.cpp | 1 + .../intrusive_shared_ptr.pass.cpp | 1 + 4 files changed, 25 insertions(+), 50 deletions(-) diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 13d0dce34d97e3..f193b5d95f49f5 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -245,8 +245,15 @@ module std_stdexcept [system] { header "stdexcept" export * } -module std_stop_token { +module std_stop_token [system] { header "stop_token" + private header "__stop_token/atomic_unique_lock.h" + private header "__stop_token/intrusive_list_view.h" + private header "__stop_token/intrusive_shared_ptr.h" + private header "__stop_token/stop_callback.h" + private header "__stop_token/stop_source.h" + private header "__stop_token/stop_state.h" + private header "__stop_token/stop_token.h" export * } module std_streambuf [system] { @@ -1592,41 +1599,25 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } -module std_private_pstl_backend [system] { +module std_private_pstl [system] { header "__pstl/backend.h" - export * -} -module std_private_pstl_backend_fwd [system] { header "__pstl/backend_fwd.h" - export * -} -module std_private_pstl_backends_default [system] { header "__pstl/backends/default.h" - export * -} -module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" - export * -} -module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" - export * -} -module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" - export * + header "__pstl/cpu_algos/any_of.h" + header "__pstl/cpu_algos/cpu_traits.h" + 
header "__pstl/cpu_algos/fill.h" + header "__pstl/cpu_algos/find_if.h" + header "__pstl/cpu_algos/for_each.h" + header "__pstl/cpu_algos/merge.h" + header "__pstl/cpu_algos/stable_sort.h" + header "__pstl/cpu_algos/transform.h" + header "__pstl/cpu_algos/transform_reduce.h" + header "__pstl/dispatch.h" + header "__pstl/handle_exception.h" } -module std_private_pstl_cpu_algos_any_of [system] { header "__pstl/cpu_algos/any_of.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } -module std_private_pstl_cpu_algos_fill [system] { header "__pstl/cpu_algos/fill.h" } -module std_private_pstl_cpu_algos_find_if [system] { header "__pstl/cpu_algos/find_if.h" } -module std_private_pstl_cpu_algos_for_each [system] { header "__pstl/cpu_algos/for_each.h" } -module std_private_pstl_cpu_algos_merge [system] { header "__pstl/cpu_algos/merge.h" } -module std_private_pstl_cpu_algos_stable_sort [system] { header "__pstl/cpu_algos/stable_sort.h" } -module std_private_pstl_cpu_algos_transform [system] { header "__pstl/cpu_algos/transform.h" } -module std_private_pstl_cpu_algos_transform_reduce [system] { header "__pstl/cpu_algos/transform_reduce.h" } -module std_private_pstl_dispatch [system] { header "__pstl/dispatch.h" } -module std_private_pstl_handle_exception [system] { header "__pstl/handle_exception.h" } module std_private_queue_fwd [system] { header "__fwd/queue.h" } @@ -1781,23 +1772,6 @@ module std_private_span_span_fwd [system] { header "__fwd/span.h" } module std_private_stack_fwd [system] { header "__fwd/stack.h" } -module std_private_stop_token_atomic_unique_lock [system] { header "__stop_token/atomic_unique_lock.h" } -module std_private_stop_token_intrusive_list_view [system] { header "__stop_token/intrusive_list_view.h" } -module std_private_stop_token_intrusive_shared_ptr [system] { header "__stop_token/intrusive_shared_ptr.h" } -module std_private_stop_token_stop_callback [system] { header "__stop_token/stop_callback.h" 
} -module std_private_stop_token_stop_source [system] { - header "__stop_token/stop_source.h" - export * -} -module std_private_stop_token_stop_state [system] { - header "__stop_token/stop_state.h" - export * -} -module std_private_stop_token_stop_token [system] { - header "__stop_token/stop_token.h" - export * -} - module std_private_string_char_traits [system] { header "__string/char_traits.h" export * diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp index 2a9b828f4389ce..44d51921ac74ad 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp @@ -5,12 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads - -// XFAIL: availability-synchronization_library-missing +// UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/atomic_unique_lock.h> #include diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp index 85cd9786258955..d8cd2fb68e132e 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_list_view.h> #include diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp index 47440015f2c50c..99d4226662a0b7 100644 --- 
a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_shared_ptr.h> #include From 06c531e808ceeafdf996867a2e8e66960ae4774e Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Fri, 30 Aug 2024 14:00:33 -0700 Subject: [PATCH 11/31] BPF: Generate locked insn for __sync_fetch_and_add() with cpu v1/v2 (#106494) This patch contains two pars: - first to revert the patch https://github.com/llvm/llvm-project/pull/101428. - second to remove `atomic_fetch_and_*()` to `atomic_()` conversion (when return value is not used), but preserve `__sync_fetch_and_add()` to locked insn with cpu v1/v2. --- llvm/lib/Target/BPF/BPF.h | 2 + llvm/lib/Target/BPF/BPFInstrInfo.td | 76 +++------- llvm/lib/Target/BPF/BPFMIChecking.cpp | 181 +++++++++++++++++++++++ llvm/lib/Target/BPF/BPFTargetMachine.cpp | 1 + llvm/lib/Target/BPF/CMakeLists.txt | 1 + llvm/test/CodeGen/BPF/atomics.ll | 15 +- llvm/test/CodeGen/BPF/atomics_2.ll | 2 +- llvm/test/CodeGen/BPF/objdump_atomics.ll | 4 +- llvm/test/CodeGen/BPF/xadd.ll | 59 ++++++++ llvm/test/CodeGen/BPF/xadd_legal.ll | 2 +- 10 files changed, 280 insertions(+), 63 deletions(-) create mode 100644 llvm/lib/Target/BPF/BPFMIChecking.cpp create mode 100644 llvm/test/CodeGen/BPF/xadd.ll diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index f7bc6f958470b9..f07ae4c9baf1c6 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -28,6 +28,7 @@ FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); FunctionPass *createBPFMIPreEmitPeepholePass(); +FunctionPass *createBPFMIPreEmitCheckingPass(); InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, const BPFSubtarget &, @@ -36,6 +37,7 
@@ InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, void initializeBPFCheckAndAdjustIRPass(PassRegistry&); void initializeBPFDAGToDAGISelLegacyPass(PassRegistry &); void initializeBPFMIPeepholePass(PassRegistry &); +void initializeBPFMIPreEmitCheckingPass(PassRegistry &); void initializeBPFMIPreEmitPeepholePass(PassRegistry &); void initializeBPFMISimplifyPatchablePass(PassRegistry &); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 4baeeb017699d6..6c750af5c2fd92 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -786,45 +786,13 @@ let Predicates = [BPFNoALU32] in { def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>; } -// Atomic XADD for BPFNoALU32 -class XADD - : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = BPF_ADD.Value; - let BPFClass = BPF_STX; -} - // Atomic add, and, or, xor -class ATOMIC_NOFETCH - : TYPE_LD_ST + : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = Opc.Value; - let BPFClass = BPF_STX; -} - -class ATOMIC32_NOFETCH - : TYPE_LD_ST { bits<4> dst; bits<20> addr; @@ -838,16 +806,23 @@ class ATOMIC32_NOFETCH let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XADDW32 : ATOMIC32_NOFETCH; - def XANDW32 : ATOMIC32_NOFETCH; - def XORW32 : ATOMIC32_NOFETCH; - def XXORW32 : ATOMIC32_NOFETCH; + def XADDW32 : ATOMIC_NOFETCH; + def XANDW32 : ATOMIC_NOFETCH; + def XORW32 : ATOMIC_NOFETCH; + def XXORW32 : ATOMIC_NOFETCH; } + def XADDW : ATOMIC_NOFETCH; + def XADDD : ATOMIC_NOFETCH; + def XANDD : ATOMIC_NOFETCH; + def XORD : ATOMIC_NOFETCH; + def XXORD : ATOMIC_NOFETCH; +} - def XADDD : ATOMIC_NOFETCH; - def XANDD : 
ATOMIC_NOFETCH; - def XORD : ATOMIC_NOFETCH; - def XXORD : ATOMIC_NOFETCH; +let Predicates = [BPFNoALU32] in { + def : Pat<(atomic_load_add_i32 ADDRri:$addr, GPR:$val), + (XADDW ADDRri:$addr, GPR:$val)>; + def : Pat<(atomic_load_add_i64 ADDRri:$addr, GPR:$val), + (XADDD ADDRri:$addr, GPR:$val)>; } // Atomic Fetch-and- operations @@ -887,13 +862,6 @@ class XFALU32; - def XFADDW : XFALU64; - } -} - let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { def XFADDW32 : XFALU32; @@ -902,7 +870,9 @@ let Constraints = "$dst = $val" in { def XFXORW32 : XFALU32; } - def XFADDD : XFALU64; + let Predicates = [BPFHasALU32] in { + def XFADDD : XFALU64; + } def XFANDD : XFALU64; def XFORD : XFALU64; def XFXORD : XFALU64; diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp new file mode 100644 index 00000000000000..24224f6c1e9e66 --- /dev/null +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ -0,0 +1,181 @@ +//===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass performs checking to signal errors for certain illegal usages at +// MachineInstruction layer. Specially, the result of XADD{32,64} insn should +// not be used. The pass is done at the PreEmit pass right before the +// machine code is emitted at which point the register liveness information +// is still available. 
+// +//===----------------------------------------------------------------------===// + +#include "BPF.h" +#include "BPFInstrInfo.h" +#include "BPFTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "bpf-mi-checking" + +namespace { + +struct BPFMIPreEmitChecking : public MachineFunctionPass { + + static char ID; + MachineFunction *MF; + const TargetRegisterInfo *TRI; + + BPFMIPreEmitChecking() : MachineFunctionPass(ID) { + initializeBPFMIPreEmitCheckingPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + void processAtomicInsts(); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (!skipFunction(MF.getFunction())) { + initialize(MF); + processAtomicInsts(); + } + return false; + } +}; + +// Initialize class variables. +void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) { + MF = &MFParm; + TRI = MF->getSubtarget().getRegisterInfo(); + LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n"); +} + +// Make sure all Defs of XADD are dead, meaning any result of XADD insn is not +// used. +// +// NOTE: BPF backend hasn't enabled sub-register liveness track, so when the +// source and destination operands of XADD are GPR32, there is no sub-register +// dead info. If we rely on the generic MachineInstr::allDefsAreDead, then we +// will raise false alarm on GPR32 Def. +// +// To support GPR32 Def, ideally we could just enable sub-registr liveness track +// on BPF backend, then allDefsAreDead could work on GPR32 Def. This requires +// implementing TargetSubtargetInfo::enableSubRegLiveness on BPF. 
+// +// However, sub-register liveness tracking module inside LLVM is actually +// designed for the situation where one register could be split into more than +// one sub-registers for which case each sub-register could have their own +// liveness and kill one of them doesn't kill others. So, tracking liveness for +// each make sense. +// +// For BPF, each 64-bit register could only have one 32-bit sub-register. This +// is exactly the case which LLVM think brings no benefits for doing +// sub-register tracking, because the live range of sub-register must always +// equal to its parent register, therefore liveness tracking is disabled even +// the back-end has implemented enableSubRegLiveness. The detailed information +// is at r232695: +// +// Author: Matthias Braun +// Date: Thu Mar 19 00:21:58 2015 +0000 +// Do not track subregister liveness when it brings no benefits +// +// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given the solo +// sub-register always has the same liveness as its parent register, LLVM is +// already attaching a implicit 64-bit register Def whenever the there is +// a sub-register Def. The liveness of the implicit 64-bit Def is available. +// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info could +// be: +// +// $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0), +// implicit killed $r9, implicit-def dead $r9 +// +// Even though w9 is not marked as Dead, the parent register r9 is marked as +// Dead correctly, and it is safe to use such information or our purpose. 
+static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) { + const MCRegisterClass *GPR64RegClass = + &BPFMCRegisterClasses[BPF::GPRRegClassID]; + std::vector GPR32LiveDefs; + std::vector GPR64DeadDefs; + + for (const MachineOperand &MO : MI.operands()) { + bool RegIsGPR64; + + if (!MO.isReg() || MO.isUse()) + continue; + + RegIsGPR64 = GPR64RegClass->contains(MO.getReg()); + if (!MO.isDead()) { + // It is a GPR64 live Def, we are sure it is live. */ + if (RegIsGPR64) + return true; + // It is a GPR32 live Def, we are unsure whether it is really dead due to + // no sub-register liveness tracking. Push it to vector for deferred + // check. + GPR32LiveDefs.push_back(MO.getReg()); + continue; + } + + // Record any GPR64 dead Def as some unmarked GPR32 could be alias of its + // low 32-bit. + if (RegIsGPR64) + GPR64DeadDefs.push_back(MO.getReg()); + } + + // No GPR32 live Def, safe to return false. + if (GPR32LiveDefs.empty()) + return false; + + // No GPR64 dead Def, so all those GPR32 live Def can't have alias, therefore + // must be truely live, safe to return true. + if (GPR64DeadDefs.empty()) + return true; + + // Otherwise, return true if any aliased SuperReg of GPR32 is not dead. 
+ for (auto I : GPR32LiveDefs) + for (MCPhysReg SR : TRI->superregs(I)) + if (!llvm::is_contained(GPR64DeadDefs, SR)) + return true; + + return false; +} + +void BPFMIPreEmitChecking::processAtomicInsts() { + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD) + continue; + + LLVM_DEBUG(MI.dump()); + if (hasLiveDefs(MI, TRI)) { + DebugLoc Empty; + const DebugLoc &DL = MI.getDebugLoc(); + const Function &F = MF->getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, "Invalid usage of the XADD return value", DL}); + } + } + } +} + +} // namespace + +INITIALIZE_PASS(BPFMIPreEmitChecking, "bpf-mi-pemit-checking", + "BPF PreEmit Checking", false, false) + +char BPFMIPreEmitChecking::ID = 0; +FunctionPass *llvm::createBPFMIPreEmitCheckingPass() { + return new BPFMIPreEmitChecking(); +} diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 64b115b8fc8afa..7d91fa8bb824cf 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -178,6 +178,7 @@ void BPFPassConfig::addMachineSSAOptimization() { } void BPFPassConfig::addPreEmitPass() { + addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOptLevel::None) if (!DisableMIPeephole) addPass(createBPFMIPreEmitPeepholePass()); diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 253660d4d62e37..eade4cacb7100e 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(BPFCodeGen BPFSubtarget.cpp BPFTargetMachine.cpp BPFMIPeephole.cpp + BPFMIChecking.cpp BPFMISimplifyPatchable.cpp BTFDebug.cpp diff --git a/llvm/test/CodeGen/BPF/atomics.ll b/llvm/test/CodeGen/BPF/atomics.ll index 0c16c49f2a873b..c17b94af5f7bd9 100644 --- a/llvm/test/CodeGen/BPF/atomics.ll +++ b/llvm/test/CodeGen/BPF/atomics.ll @@ -1,10 +1,11 @@ -; RUN: 
llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck --check-prefixes=CHECK,CHECK-V2 %s -; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefixes=CHECK,CHECK-V3 %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck %s +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefix=CHECK-V3 %s ; CHECK-LABEL: test_load_add_32 -; CHECK-V2: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK: lock *(u32 *)(r1 + 0) += r2 +; CHECK: encoding: [0xc3,0x21 ; CHECK-V3: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) -; CHECK: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK-V3: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -12,8 +13,10 @@ entry: } ; CHECK-LABEL: test_load_add_64 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) -; CHECK: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] +; CHECK: lock *(u64 *)(r1 + 0) += r2 +; CHECK: encoding: [0xdb,0x21 +; CHECK-V3: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK-V3: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00] define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/atomics_2.ll b/llvm/test/CodeGen/BPF/atomics_2.ll index c670ddb05b6a77..6371e3b875638e 100644 --- a/llvm/test/CodeGen/BPF/atomics_2.ll +++ b/llvm/test/CodeGen/BPF/atomics_2.ll @@ -224,7 +224,7 @@ entry: } ; CHECK-LABEL: test_atomic_xor_64 -; CHECK: r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2) +; CHECK: atomic_fetch_xor((u64 *)(r1 + 0), r2) ; CHECK: encoding: [0xdb,0x21,0x00,0x00,0xa1,0x00,0x00,0x00] ; CHECK: w0 = 0 define dso_local i32 @test_atomic_xor_64(ptr nocapture %p, i64 %v) local_unnamed_addr { diff --git a/llvm/test/CodeGen/BPF/objdump_atomics.ll b/llvm/test/CodeGen/BPF/objdump_atomics.ll index 
c4cb16b2c36418..fcc889ba300e39 100644 --- a/llvm/test/CodeGen/BPF/objdump_atomics.ll +++ b/llvm/test/CodeGen/BPF/objdump_atomics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: test_load_add_32 ; CHECK: c3 21 -; CHECK: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) +; CHECK: lock *(u32 *)(r1 + 0) += w2 define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst @@ -11,7 +11,7 @@ entry: ; CHECK-LABEL: test_load_add_64 ; CHECK: db 21 -; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2) +; CHECK: lock *(u64 *)(r1 + 0) += r2 define void @test_load_add_64(ptr %p, i64 zeroext %v) { entry: atomicrmw add ptr %p, i64 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll new file mode 100644 index 00000000000000..5aeeb9baf7b892 --- /dev/null +++ b/llvm/test/CodeGen/BPF/xadd.ll @@ -0,0 +1,59 @@ +; RUN: not llc -march=bpfel < %s 2>&1 | FileCheck %s +; RUN: not llc -march=bpfeb < %s 2>&1 | FileCheck %s + +; This file is generated with the source command and source +; $ clang -target bpf -O2 -g -S -emit-llvm t.c +; $ cat t.c +; int test(int *ptr) { +; int r; +; __sync_fetch_and_add(ptr, 4); +; r = __sync_fetch_and_add(ptr, 6); +; return r; +; } + +; ModuleID = 't.c' +source_filename = "t.c" +target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128" +target triple = "bpf" + +; Function Attrs: nounwind +define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15 + %0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16 + %1 = atomicrmw add ptr %ptr, i32 6 seq_cst, !dbg !17 +; CHECK: in function test i32 (ptr): Invalid usage of the XADD return value + call void @llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !18 + ret i32 %1, !dbg !19 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind 
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "t.c", directory: "/home/yhs/work/tests/llvm/sync/test1") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !11} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "ptr", arg: 1, scope: !7, file: !1, line: 1, type: !11) +!14 = !DILocalVariable(name: "r", scope: !7, file: !1, line: 2, type: !10) +!15 = !DILocation(line: 1, column: 15, scope: !7) +!16 = !DILocation(line: 3, column: 4, scope: !7) +!17 = !DILocation(line: 4, column: 8, scope: !7) +!18 = !DILocation(line: 2, column: 8, scope: !7) +!19 = !DILocation(line: 5, column: 4, scope: !7) diff --git a/llvm/test/CodeGen/BPF/xadd_legal.ll b/llvm/test/CodeGen/BPF/xadd_legal.ll index 88f04d85a779f8..9b07afade3fee9 100644 --- a/llvm/test/CodeGen/BPF/xadd_legal.ll 
+++ b/llvm/test/CodeGen/BPF/xadd_legal.ll @@ -19,7 +19,7 @@ define dso_local i32 @test(ptr nocapture %ptr, i64 %a) { entry: %conv = trunc i64 %a to i32 %0 = atomicrmw add ptr %ptr, i32 %conv seq_cst -; CHECK-64: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK-64: lock *(u32 *)(r1 + 0) += r2 ; CHECK-32: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) %1 = load i32, ptr %ptr, align 4 ret i32 %1 From d66765ddf1ae9e16676a49cebd966258f8b5c6e0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 30 Aug 2024 21:01:09 +0000 Subject: [PATCH 12/31] [gn build] Port 06c531e808ce --- llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn index 243a92f2e62587..aa594df8c164a1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn @@ -71,6 +71,7 @@ static_library("LLVMBPFCodeGen") { "BPFISelLowering.cpp", "BPFInstrInfo.cpp", "BPFMCInstLower.cpp", + "BPFMIChecking.cpp", "BPFMIPeephole.cpp", "BPFMISimplifyPatchable.cpp", "BPFPreserveDIType.cpp", From 02654f7370638889b989b4d776d35c3d47c87cdd Mon Sep 17 00:00:00 2001 From: Chris B Date: Fri, 30 Aug 2024 16:18:46 -0500 Subject: [PATCH 13/31] [HLSL][Doc] Document multi-argument resolution (#104474) This updates the expected differences document to capture the difference in multi-argument overload resolution between Clang and DXC. Fixes #99530 --- clang/docs/HLSL/ExpectedDifferences.rst | 121 +++++++++++++++++++++--- 1 file changed, 109 insertions(+), 12 deletions(-) diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst index 4782eb3cda754a..e143c5b71575aa 100644 --- a/clang/docs/HLSL/ExpectedDifferences.rst +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -54,6 +54,19 @@ HLSL 202x based on proposal and `0008 `_.
+The largest difference between Clang and DXC's overload resolution is the +algorithm used for identifying best-match overloads. There are more details +about the algorithmic differences in the :ref:`multi_argument_overloads` section +below. There are three high level differences that should be highlighted: + +* **There should be no cases** where DXC and Clang both successfully + resolve an overload where the resolved overload is different between the two. +* There are cases where Clang will successfully resolve an overload that DXC + wouldn't because we've trimmed the overload set in Clang to remove ambiguity. +* There are cases where DXC will successfully resolve an overload that Clang + will not for two reasons: (1) DXC only generates partial overload sets for + builtin functions and (2) DXC resolves cases that probably should be ambiguous. + Clang's implementation extends standard overload resolution rules to HLSL library functionality. This causes subtle changes in overload resolution behavior between Clang and DXC. Some examples include: @@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include: uint U; int I; float X, Y, Z; - double3 A, B; + double3 R, G; } - void twoParams(int, int); - void twoParams(float, float); + void takesSingleDouble(double); + void takesSingleDouble(vector); + + void scalarOrVector(double); + void scalarOrVector(vector); export void call() { - halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads - // Clang: Resolves to halfOrInt16(uint16_t). - halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). half H; + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + #ifndef IGNORE_ERRORS + halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t + // overloads + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. H = asfloat16(I); // DXC: Fails to resolve overload for int. // Clang: Resolves to asfloat16(int16_t). 
@@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include: takesDoubles(X, Y, Z); // Works on all compilers #ifndef IGNORE_ERRORS - fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to + // double. // Clang: Resolves to fma(double,double,double). - #endif - double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. // FXC: Expands to compute double dot product with fmul/fadd - // Clang: Resolves to dot(float3, float3), emits conversion warnings. + // Clang: Fails to resolve as ambiguous against + // dot(half, half) or dot(float, float) + #endif #ifndef IGNORE_ERRORS tan(B); // DXC: resolves to tan(float). // Clang: Fails to resolve, ambiguous between integer types. - twoParams(I, X); // DXC: resolves twoParams(int, int). - // Clang: Fails to resolve ambiguous conversions. #endif + + double D; + takesSingleDouble(D); // All: Fails to resolve ambiguous conversions. + takesSingleDouble(R); // All: Fails to resolve ambiguous conversions. + + scalarOrVector(D); // All: Resolves to scalarOrVector(double). + scalarOrVector(R); // All: Fails to resolve ambiguous conversions. } .. note:: @@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include: diagnostic notifying the user of the conversion rather than silently altering precision relative to the other overloads (as FXC does) or generating code that will fail validation (as DXC does). + +.. _multi_argument_overloads: + +Multi-Argument Overloads +------------------------ + +In addition to the differences in single-element conversions, Clang and DXC +differ dramatically in multi-argument overload resolution. C++ multi-argument +overload resolution behavior (or something very similar) is required to +implement +`non-member operator overloading `_. 
+ +Clang adopts the C++ inspired language from the +`draft HLSL specification `_, +where an overload ``f1`` is a better candidate than ``f2`` if for all arguments the +conversion sequences is not worse than the corresponding conversion sequence and +for at least one argument it is better. + +.. code-block:: c++ + + cbuffer CB { + int I; + float X; + float4 V; + } + + void twoParams(int, int); + void twoParams(float, float); + void threeParams(float, float, float); + void threeParams(float4, float4, float4); + + export void call() { + twoParams(I, X); // DXC: resolves twoParams(int, int). + // Clang: Fails to resolve ambiguous conversions. + + threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4). + // Clang: Fails to resolve ambiguous conversions. + } + +For the examples above since ``twoParams`` called with mixed parameters produces +implicit conversion sequences that are { ExactMatch, FloatingIntegral } and { +FloatingIntegral, ExactMatch }. In both cases an argument has a worse conversion +in the other sequence, so the overload is ambiguous. + +In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation, +VectorTruncation } or { VectorSplat, ExactMatch, ExactMatch }, again in both +cases at least one parameter has a worse conversion in the other sequence, so +the overload is ambiguous. + +.. note:: + + The behavior of DXC documented below is undocumented so this is gleaned from + observation and a bit of reading the source. + +DXC's approach for determining the best overload produces an integer score value +for each implicit conversion sequence for each argument expression. Scores for +casts are based on a bitmask construction that is complicated to reverse +engineer. It seems that: + +* Exact match is 0 +* Dimension increase is 1 +* Promotion is 2 +* Integral -> Float conversion is 4 +* Float -> Integral conversion is 8 +* Cast is 16 + +The masks are or'd against each other to produce a score for the cast. 
+ +The scores of each conversion sequence are then summed to generate a score for +the overload candidate. The overload candidate with the lowest score is the best +candidate. If more than one overload are matched for the lowest score the call +is ambiguous. From 1293ab35e406e8b50030335ccf98580a7b719ff5 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 30 Aug 2024 14:43:15 -0700 Subject: [PATCH 14/31] [Github] Cancel previous in-progress code formatting jobs (#106701) --- .github/workflows/pr-code-format.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 1a1700b75cfdb7..f2bb37316d3a8b 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -13,6 +13,9 @@ jobs: code_formatter: runs-on: ubuntu-latest timeout-minutes: 30 + concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: true if: github.repository == 'llvm/llvm-project' steps: - name: Fetch LLVM sources From 8f0c014b12663129d8bfe0cc89f06e7a1d8b48c2 Mon Sep 17 00:00:00 2001 From: Yinying Li Date: Fri, 30 Aug 2024 17:47:37 -0400 Subject: [PATCH 15/31] [mlir][sparse] add parallelization options to mini pipeline (#104233) --- .../Dialect/SparseTensor/Transforms/Passes.h | 3 +- .../Dialect/SparseTensor/Transforms/Passes.td | 17 +++++++++ .../Pipelines/SparseTensorPipelines.cpp | 3 +- .../SparsificationAndBufferizationPass.cpp | 13 +++++-- .../SparseTensor/minipipeline_parallel.mlir | 38 +++++++++++++++++++ 5 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index 8413691910189a..d22df6a7857c1d 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h 
@@ -263,7 +263,8 @@ std::unique_ptr createSparsificationAndBufferizationPass( bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vectorLength, bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen, - SparseEmitStrategy emitStrategy); + SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy); //===----------------------------------------------------------------------===// // Sparse Iteration Transform Passes diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 8ec18a1e186481..a534381bd5c2f3 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -496,6 +496,23 @@ def SparsificationAndBufferization : Pass<"sparsification-and-bufferization", "M "Emit (experimental) loops (with sparse.iterate)."), clEnumValN(mlir::SparseEmitStrategy::kDebugInterface, "debug-interface", "Emit non-functional but easy-to-read interfaces to debug."))}]>, + Option<"parallelization", "parallelization-strategy", "mlir::SparseParallelizationStrategy", + "mlir::SparseParallelizationStrategy::kNone", + "Set the parallelization strategy", [{llvm::cl::values( + clEnumValN(mlir::SparseParallelizationStrategy::kNone, "none", + "Turn off sparse parallelization."), + clEnumValN(mlir::SparseParallelizationStrategy::kDenseOuterLoop, + "dense-outer-loop", + "Enable dense outer loop sparse parallelization."), + clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageOuterLoop, + "any-storage-outer-loop", + "Enable sparse parallelization regardless of storage for the outer loop."), + clEnumValN(mlir::SparseParallelizationStrategy::kDenseAnyLoop, + "dense-any-loop", + "Enable dense parallelization for any loop."), + clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop, + "any-storage-any-loop", + "Enable sparse 
parallelization for any storage and loop."))}]>, ]; } diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index 12e330ac7efbdf..abc4a4c252841b 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -45,7 +45,8 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, /*enableVLAVectorization=*/options.armSVE, /*enableSIMDIndex32=*/options.force32BitVectorIndices, options.enableGPULibgen, - options.sparsificationOptions().sparseEmitStrategy)); + options.sparsificationOptions().sparseEmitStrategy, + options.sparsificationOptions().parallelizationStrategy)); // Bail-early for test setup. if (options.testBufferizationAnalysisOnly) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp index e088328848c9c8..6e882a8d0ff30a 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp @@ -78,7 +78,8 @@ class SparsificationAndBufferizationPass const SparsificationOptions &sparsificationOptions, bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vl, bool vla, bool index32, - bool gpu, SparseEmitStrategy emitStrategy) + bool gpu, SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy) : bufferizationOptions(bufferizationOptions), sparsificationOptions(sparsificationOptions), createSparseDeallocs(createSparseDeallocs), @@ -90,6 +91,7 @@ class SparsificationAndBufferizationPass enableSIMDIndex32 = index32; enableGPULibgen = gpu; sparseEmitStrategy = emitStrategy; + parallelization = parallelizationStrategy; } /// Bufferize all dense ops. 
This assumes that no further analysis is needed @@ -124,6 +126,9 @@ class SparsificationAndBufferizationPass // Overrides the default emit strategy using user-provided value. this->sparsificationOptions.sparseEmitStrategy = sparseEmitStrategy; + // Overrides the default parallelization strategy using user-provided value. + this->sparsificationOptions.parallelizationStrategy = parallelization; + // Run enabling transformations. { OpPassManager pm("builtin.module"); @@ -248,10 +253,12 @@ std::unique_ptr mlir::createSparsificationAndBufferizationPass( bool createSparseDeallocs, bool enableRuntimeLibrary, bool enableBufferInitialization, unsigned vectorLength, bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen, - SparseEmitStrategy emitStrategy) { + SparseEmitStrategy emitStrategy, + SparseParallelizationStrategy parallelizationStrategy) { return std::make_unique< mlir::sparse_tensor::SparsificationAndBufferizationPass>( bufferizationOptions, sparsificationOptions, createSparseDeallocs, enableRuntimeLibrary, enableBufferInitialization, vectorLength, - enableVLAVectorization, enableSIMDIndex32, enableGPULibgen, emitStrategy); + enableVLAVectorization, enableSIMDIndex32, enableGPULibgen, emitStrategy, + parallelizationStrategy); } diff --git a/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir b/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir new file mode 100644 index 00000000000000..d97d6e58a3df2d --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/minipipeline_parallel.mlir @@ -0,0 +1,38 @@ +// RUN: mlir-opt %s --sparsification-and-bufferization | FileCheck %s --check-prefix=CHECK-NOPARA +// RUN: mlir-opt %s --sparsification-and-bufferization="parallelization-strategy=any-storage-any-loop" | FileCheck %s --check-prefix=CHECK-PARA + +// Test to ensure we can pass parallelization flags into +// the mini sparsification and bufferization pipeline. 
+ +#SparseMatrix = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed, d1 : compressed) +}> + +#trait_ss = { + indexing_maps = [ + affine_map<(i,j) -> (i,j)>, // A + affine_map<(i,j) -> (i,j)> // X (out) + ], + iterator_types = ["parallel", "parallel"], + doc = "X(i,j) = A(i,j) * SCALE" +} + +// +// CHECK-NOPARA-LABEL: func.func @scale_ss +// CHECK-NOPARA: scf.for +// +// CHECK-PARA-LABEL: func.func @scale_ss +// CHECK-PARA: scf.parallel +// +func.func @scale_ss(%scale: f32, + %arga: tensor, + %argx: tensor) -> tensor { + %0 = linalg.generic #trait_ss + ins(%arga: tensor) + outs(%argx: tensor) { + ^bb(%a: f32, %x: f32): + %0 = arith.mulf %a, %scale : f32 + linalg.yield %0 : f32 + } -> tensor + return %0 : tensor +} From 10affaf894a72bee9b84ada77dc943b1bb03d02e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 14:50:51 -0700 Subject: [PATCH 16/31] [AArch64][AsmParser] Stop parsing on error (#106804) Fixes buffer overflow after #106625 --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 34c0fad45fc499..373f844b239081 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6995,7 +6995,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { }); if (It == std::end(ExtensionMap)) - Error(CurLoc, "unsupported architectural extension: " + Name); + return Error(CurLoc, "unsupported architectural extension: " + Name); if (EnableFeature) STI.SetFeatureBitsTransitively(It->Features); From 0dcd68c28a6170391b4643b737950689723d35fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 30 Aug 2024 14:55:09 -0700 Subject: [PATCH 17/31] 
[flang][cuda] Set allocator index for module allocatable variable (#106777) Descriptor for module variable with cuda attribute must be set with the correct allocator index. This patch updates the embox operation used in the global to carry the allocator index. --- flang/lib/Lower/ConvertVariable.cpp | 20 ++++++++++++++++++-- flang/test/Lower/CUDA/cuda-allocatable.cuf | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 70fa32d621e2f1..f76d44f5479d32 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -478,6 +478,20 @@ void Fortran::lower::createGlobalInitialization( builder.restoreInsertionPoint(insertPt); } +static unsigned getAllocatorIdx(cuf::DataAttributeAttr dataAttr) { + if (dataAttr) { + if (dataAttr.getValue() == cuf::DataAttribute::Pinned) + return kPinnedAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Device) + return kDeviceAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Managed) + return kManagedAllocatorPos; + if (dataAttr.getValue() == cuf::DataAttribute::Unified) + return kUnifiedAllocatorPos; + } + return kDefaultAllocator; +} + /// Create the global op and its init if it has one static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, const Fortran::lower::pft::Variable &var, @@ -540,8 +554,10 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, // Create unallocated/disassociated descriptor if no explicit init Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &b) { - mlir::Value box = - fir::factory::createUnallocatedBox(b, loc, symTy, std::nullopt); + mlir::Value box = fir::factory::createUnallocatedBox( + b, loc, symTy, + /*nonDeferredParams=*/std::nullopt, + /*typeSourceBox=*/{}, getAllocatorIdx(dataAttr)); b.create(loc, box); }); } diff --git 
a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index fb72f88fe415ca..6479425c58d8be 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -2,6 +2,21 @@ ! Test lowering of CUDA allocatable allocate/deallocate statements. +module globals + real, device, allocatable :: a_device(:) + real, managed, allocatable :: a_managed(:) + real, pinned, allocatable :: a_pinned(:) +end module + +! CHECK-LABEL: fir.global @_QMglobalsEa_device {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 2 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + +! CHECK-LABEL: fir.global @_QMglobalsEa_managed {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 3 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + +! CHECK-LABEL: fir.global @_QMglobalsEa_pinned {data_attr = #cuf.cuda} : !fir.box>> +! CHECK: %{{.*}} = fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 1 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + subroutine sub1() real, allocatable, device :: a(:) allocate(a(10)) From d004ebc14a7ad9c0fa0509497ce75eaf9d073faa Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 30 Aug 2024 15:59:44 -0600 Subject: [PATCH 18/31] Fix tsan race in PerfJITEventListener.cpp (#106800) Static destructor can race with calls to notify and trigger tsan warning. 
``` WARNING: ThreadSanitizer: data race (pid=5787) Write of size 1 at 0x55bec9df8de8 by thread T23: #0 pthread_mutex_destroy [third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:1344](third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp?l=1344&cl=669089572):3 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x1b12affb) (BuildId: ff25ace8b17d9863348bb1759c47246c) #1 __libcpp_recursive_mutex_destroy [third_party/crosstool/v18/stable/src/libcxx/include/__thread/support/pthread.h:91](third_party/crosstool/v18/stable/src/libcxx/include/__thread/support/pthread.h?l=91&cl=669089572):10 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x4523d4e9) (BuildId: ff25ace8b17d9863348bb1759c47246c) #2 std::__tsan::recursive_mutex::~recursive_mutex() [third_party/crosstool/v18/stable/src/libcxx/src/mutex.cpp:52](third_party/crosstool/v18/stable/src/libcxx/src/mutex.cpp?l=52&cl=669089572):11 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x4523d4e9) #3 ~SmartMutex [third_party/llvm/llvm-project/llvm/include/llvm/Support/Mutex.h:28](third_party/llvm/llvm-project/llvm/include/llvm/Support/Mutex.h?l=28&cl=669089572):11 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcaedfe) (BuildId: ff25ace8b17d9863348bb1759c47246c) #4 (anonymous namespace)::PerfJITEventListener::~PerfJITEventListener() [third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp:65](third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp?l=65&cl=669089572):3 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcaedfe) #5 cxa_at_exit_callback_installed_at(void*) 
[third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:437](third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp?l=437&cl=669089572):3 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x1b172cb9) (BuildId: ff25ace8b17d9863348bb1759c47246c) #6 llvm::JITEventListener::createPerfJITEventListener() [third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp:496](third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp?l=496&cl=669089572):3 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcad8f5) (BuildId: ff25ace8b17d9863348bb1759c47246c) ``` ``` Previous atomic read of size 1 at 0x55bec9df8de8 by thread T192 (mutexes: write M0, write M1): #0 pthread_mutex_unlock [third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:1387](third_party/llvm/llvm-project/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp?l=1387&cl=669089572):3 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x1b12b6bb) (BuildId: ff25ace8b17d9863348bb1759c47246c) #1 __libcpp_recursive_mutex_unlock [third_party/crosstool/v18/stable/src/libcxx/include/__thread/support/pthread.h:87](third_party/crosstool/v18/stable/src/libcxx/include/__thread/support/pthread.h?l=87&cl=669089572):10 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x4523d589) (BuildId: ff25ace8b17d9863348bb1759c47246c) #2 std::__tsan::recursive_mutex::unlock() [third_party/crosstool/v18/stable/src/libcxx/src/mutex.cpp:64](third_party/crosstool/v18/stable/src/libcxx/src/mutex.cpp?l=64&cl=669089572):11 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x4523d589) #3 unlock [third_party/llvm/llvm-project/llvm/include/llvm/Support/Mutex.h:47](third_party/llvm/llvm-project/llvm/include/llvm/Support/Mutex.h?l=47&cl=669089572):16 
(be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcaf968) (BuildId: ff25ace8b17d9863348bb1759c47246c) #4 ~lock_guard [third_party/crosstool/v18/stable/src/libcxx/include/__mutex/lock_guard.h:39](third_party/crosstool/v18/stable/src/libcxx/include/__mutex/lock_guard.h?l=39&cl=669089572):101 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcaf968) #5 (anonymous namespace)::PerfJITEventListener::notifyObjectLoaded(unsigned long, llvm::object::ObjectFile const&, llvm::RuntimeDyld::LoadedObjectInfo const&) [third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp:290](https://cs.corp.google.com/piper///depot/google3/third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp?l=290&cl=669089572):1 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bcaf968) #6 llvm::orc::RTDyldObjectLinkingLayer::onObjEmit(llvm::orc::MaterializationResponsibility&, llvm::object::OwningBinary, std::__tsan::unique_ptr>, std::__tsan::unique_ptr>, std::__tsan::unique_ptr>, llvm::DenseMapInfo, llvm::detail::DenseMapPair>>>, std::__tsan::default_delete>, llvm::DenseMapInfo, llvm::detail::DenseMapPair>>>>>, llvm::Error) [third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp:386](https://cs.corp.google.com/piper///depot/google3/third_party/llvm/llvm-project/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp?l=386&cl=669089572):10 (be1eb158bb70fc9cf7be2db70407e512890e5c6e20720cd88c69d7d9c26ea531_0200d5f71908+0x2bc404a8) (BuildId: ff25ace8b17d9863348bb1759c47246c) ``` --- .../ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp index e2b5ce49ba2ec1..cf9ed7dbff1536 100644 --- 
a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +++ b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp @@ -60,6 +60,10 @@ class PerfJITEventListener : public JITEventListener { public: PerfJITEventListener(); ~PerfJITEventListener() { + // Lock a mutex to correctly synchronize with prior calls to + // `notifyObjectLoaded` and `notifyFreeingObject` that happened on other + // threads to prevent tsan from complaining. + std::lock_guard Guard(Mutex); if (MarkerAddr) CloseMarker(); } From 8b77aa990b5f2f75ea7128a87bdbb8b905162e90 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 30 Aug 2024 15:13:23 -0700 Subject: [PATCH 19/31] [libc] Use correct names for locale variants in spec.td (#106806) This addresses issue introduced in #105718. --- clang/cmake/caches/Fuchsia-stage2.cmake | 4 ++-- libc/spec/stdc.td | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 4ef37a5fad67f5..bf74f69296142d 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -366,7 +366,7 @@ foreach(target riscv32-unknown-elf) set(BUILTINS_${target}_CMAKE_SYSROOT "" CACHE STRING "") set(BUILTINS_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") foreach(lang C;CXX;ASM) - set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f" CACHE STRING "") + set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imc_zicsr_zifencei -mabi=ilp32" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(BUILTINS_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") @@ -382,7 +382,7 @@ foreach(target riscv32-unknown-elf) foreach(lang C;CXX;ASM) # TODO: The preprocessor defines workaround various issues in libc and libc++ integration. # These should be addressed and removed over time. 
- set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imc_zicsr_zifencei -mabi=ilp32 -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 1742e1f7b0ef33..a4ae3e1ff7d9c6 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -1321,13 +1321,13 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtof", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtod", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtold", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoll", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtof_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtod_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtold_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtol_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoll_l", RetValSpec, [ArgSpec, ArgSpec, 
ArgSpec, ArgSpec]>, + FunctionSpec<"strtoul_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, + FunctionSpec<"strtoull_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"malloc", RetValSpec, [ArgSpec]>, FunctionSpec<"calloc", RetValSpec, [ArgSpec, ArgSpec]>, From 332e6f86c50218ce60cafc9bf6d38d907da535ea Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 30 Aug 2024 15:23:32 -0700 Subject: [PATCH 20/31] [Fuchsia] Support F extension for riscv32-unknown-elf (#106808) This is used by some targets we support. --- clang/cmake/caches/Fuchsia-stage2.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index bf74f69296142d..a4d1ceed8c1c4b 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -366,7 +366,7 @@ foreach(target riscv32-unknown-elf) set(BUILTINS_${target}_CMAKE_SYSROOT "" CACHE STRING "") set(BUILTINS_${target}_CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "") foreach(lang C;CXX;ASM) - set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imc_zicsr_zifencei -mabi=ilp32" CACHE STRING "") + set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(BUILTINS_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") @@ -382,7 +382,7 @@ foreach(target riscv32-unknown-elf) foreach(lang C;CXX;ASM) # TODO: The preprocessor defines workaround various issues in libc and libc++ integration. # These should be addressed and removed over time. 
- set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imc_zicsr_zifencei -mabi=ilp32 -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") + set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -march=rv32imafc -mabi=ilp32f -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "") endforeach() foreach(type SHARED;MODULE;EXE) set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "") From 5013cf682cf010c299e64acf68d35248b7c3e883 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Sat, 31 Aug 2024 08:10:22 +0900 Subject: [PATCH 21/31] [cmake] Add symbolic links for MSVC libraries (#106710) When cross-compiling a Windows clang with `-DLLVM_BUILD_INSTRUMENTED`, the profiling compiler-rt is linked to binaries, as one would expect, but the profiling compiler-rt contains objects with `/DEFAULTLIB:LIBCMT` and `/DEFAULTLIB:OLDNAMES` directives, which makes the build expect `LIBCMT.lib` and `OLDNAMES.lib`, but they are nowhere to be found because they are in lowercase. While the WinMsvc.cmake helper recreates symbolic links to work around such case sensitivity issues for the Windows SDK libs, it doesn't do so for the MSVC libs, which we add here. 
--- llvm/cmake/platforms/WinMsvc.cmake | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/cmake/platforms/WinMsvc.cmake b/llvm/cmake/platforms/WinMsvc.cmake index e5d1ba8ec4a7c2..40d47f12c53ab7 100644 --- a/llvm/cmake/platforms/WinMsvc.cmake +++ b/llvm/cmake/platforms/WinMsvc.cmake @@ -95,6 +95,7 @@ list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES LLVM_WINSYSROOT MSVC_VER WINSDK_VER + msvc_lib_symlinks_dir winsdk_lib_symlinks_dir winsdk_vfs_overlay_path ) @@ -156,6 +157,24 @@ function(generate_winsdk_lib_symlinks winsdk_um_lib_dir output_dir) endforeach() endfunction() +function(generate_msvc_lib_symlinks msvc_lib_dir output_dir) + execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${output_dir}") + file(GLOB libraries RELATIVE "${msvc_lib_dir}" "${msvc_lib_dir}/*.lib") + foreach(library ${libraries}) + get_filename_component(name_wle "${library}" NAME_WLE) + get_filename_component(ext "${library}" LAST_EXT) + string(TOLOWER "${ext}" lowercase_ext) + string(TOUPPER "${name_wle}" all_uppercase_symlink_name_wle) + set(uppercase_symlink_name "${all_uppercase_symlink_name_wle}${lowercase_ext}") + if(NOT library STREQUAL uppercase_symlink_name) + execute_process(COMMAND "${CMAKE_COMMAND}" + -E create_symlink + "${msvc_lib_dir}/${library}" + "${output_dir}/${uppercase_symlink_name}") + endif() + endforeach() +endfunction() + function(get_highest_version the_dir the_ver) file(GLOB entries LIST_DIRECTORIES true RELATIVE "${the_dir}" "${the_dir}/[0-9.]*") foreach(entry ${entries}) @@ -297,6 +316,12 @@ if(case_sensitive_filesystem) endif() list(APPEND LINK_FLAGS -libpath:"${winsdk_lib_symlinks_dir}") + if(NOT msvc_lib_symlinks_dir) + set(msvc_lib_symlinks_dir "${CMAKE_BINARY_DIR}/msvc_lib_symlinks") + generate_msvc_lib_symlinks("${MSVC_LIB}/${WINSDK_ARCH}" "${msvc_lib_symlinks_dir}") + endif() + list(APPEND LINK_FLAGS + -libpath:"${msvc_lib_symlinks_dir}") endif() string(REPLACE ";" " " LINK_FLAGS "${LINK_FLAGS}") From 
02eb03d5e0fef68a37751bd4865eff98c0e20a8c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 16:12:37 -0700 Subject: [PATCH 22/31] [RISCV] Use DwarfRegAlias instead of DwarfRegNum for 32-bit and 64-bit FP registers. There should only be one register that specifies a particular DwarfRegNum. --- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index efdf6bebfce301..73649129e4f93f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -234,12 +234,12 @@ let RegAltNameIndices = [ABIRegAltName] in { foreach Index = 0-31 in { def F#Index#_F : RISCVReg32("F"#Index#"_H")>, - DwarfRegNum<[!add(Index, 32)]>; + DwarfRegAlias("F"#Index#"_H")>; } foreach Index = 0-31 in { def F#Index#_D : RISCVReg64("F"#Index#"_F")>, - DwarfRegNum<[!add(Index, 32)]>; + DwarfRegAlias("F"#Index#"_H")>; } } From 3745a2e8ab10029f8f401f5ff3c3c76c12e94822 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sat, 31 Aug 2024 01:50:59 +0200 Subject: [PATCH 23/31] [clang][bytecode][NFC] Cache the BuiltinID in Function (#106745) FunctionDecl::getBuiltinID() is surprisingly slow and we tend to call it quite a bit, especially when interpreting builtin functions. Caching the BuiltinID here reduces the time I need to compile the floating_comparison namespace from builtin-functions.cpp from 7.2s to 6.3s locally. 
--- clang/lib/AST/ByteCode/ByteCodeEmitter.cpp | 18 ++---------------- clang/lib/AST/ByteCode/Function.cpp | 20 +++++++++++++++++--- clang/lib/AST/ByteCode/Function.h | 14 +++++--------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index 35ae1547939fdd..b8778f6027894c 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -21,17 +21,6 @@ using namespace clang; using namespace clang::interp; -/// Unevaluated builtins don't get their arguments put on the stack -/// automatically. They instead operate on the AST of their Call -/// Expression. -/// Similar information is available via ASTContext::BuiltinInfo, -/// but that is not correct for our use cases. -static bool isUnevaluatedBuiltin(unsigned BuiltinID) { - return BuiltinID == Builtin::BI__builtin_classify_type || - BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size || - BuiltinID == Builtin::BI__builtin_constant_p; -} - Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Manually created functions that haven't been assigned proper @@ -147,14 +136,11 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Create a handle over the emitted code. 
Function *Func = P.getFunction(FuncDecl); if (!Func) { - bool IsUnevaluatedBuiltin = false; - if (unsigned BI = FuncDecl->getBuiltinID()) - IsUnevaluatedBuiltin = isUnevaluatedBuiltin(BI); - + unsigned BuiltinID = FuncDecl->getBuiltinID(); Func = P.createFunction(FuncDecl, ParamOffset, std::move(ParamTypes), std::move(ParamDescriptors), std::move(ParamOffsets), - HasThisPointer, HasRVO, IsUnevaluatedBuiltin); + HasThisPointer, HasRVO, BuiltinID); } assert(Func); diff --git a/clang/lib/AST/ByteCode/Function.cpp b/clang/lib/AST/ByteCode/Function.cpp index e3fab3f6720b41..25da6ae1bc7b61 100644 --- a/clang/lib/AST/ByteCode/Function.cpp +++ b/clang/lib/AST/ByteCode/Function.cpp @@ -20,11 +20,10 @@ Function::Function(Program &P, FunctionDeclTy Source, unsigned ArgSize, llvm::SmallVectorImpl &&ParamTypes, llvm::DenseMap &&Params, llvm::SmallVectorImpl &&ParamOffsets, - bool HasThisPointer, bool HasRVO, bool UnevaluatedBuiltin) + bool HasThisPointer, bool HasRVO, unsigned BuiltinID) : P(P), Source(Source), ArgSize(ArgSize), ParamTypes(std::move(ParamTypes)), Params(std::move(Params)), ParamOffsets(std::move(ParamOffsets)), - HasThisPointer(HasThisPointer), HasRVO(HasRVO), - IsUnevaluatedBuiltin(UnevaluatedBuiltin) { + HasThisPointer(HasThisPointer), HasRVO(HasRVO), BuiltinID(BuiltinID) { if (const auto *F = Source.dyn_cast()) Variadic = F->isVariadic(); } @@ -53,3 +52,18 @@ bool Function::isVirtual() const { return M->isVirtual(); return false; } + +/// Unevaluated builtins don't get their arguments put on the stack +/// automatically. They instead operate on the AST of their Call +/// Expression. +/// Similar information is available via ASTContext::BuiltinInfo, +/// but that is not correct for our use cases. 
+static bool isUnevaluatedBuiltin(unsigned BuiltinID) { + return BuiltinID == Builtin::BI__builtin_classify_type || + BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size || + BuiltinID == Builtin::BI__builtin_constant_p; +} + +bool Function::isUnevaluatedBuiltin() const { + return ::isUnevaluatedBuiltin(BuiltinID); +} diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index f254db20d4f594..b21fa8497130ea 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -193,15 +193,11 @@ class Function final { bool isVariadic() const { return Variadic; } - unsigned getBuiltinID() const { - return Source.get()->getBuiltinID(); - } + unsigned getBuiltinID() const { return BuiltinID; } - bool isBuiltin() const { - return Source.get()->getBuiltinID() != 0; - } + bool isBuiltin() const { return getBuiltinID() != 0; } - bool isUnevaluatedBuiltin() const { return IsUnevaluatedBuiltin; } + bool isUnevaluatedBuiltin() const; unsigned getNumParams() const { return ParamTypes.size(); } @@ -232,7 +228,7 @@ class Function final { llvm::SmallVectorImpl &&ParamTypes, llvm::DenseMap &&Params, llvm::SmallVectorImpl &&ParamOffsets, bool HasThisPointer, - bool HasRVO, bool UnevaluatedBuiltin); + bool HasRVO, unsigned BuiltinID); /// Sets the code of a function. void setCode(unsigned NewFrameSize, std::vector &&NewCode, @@ -289,7 +285,7 @@ class Function final { bool HasBody = false; bool Defined = false; bool Variadic = false; - bool IsUnevaluatedBuiltin = false; + unsigned BuiltinID = 0; public: /// Dumps the disassembled bytecode to \c llvm::errs(). From 18e35d8f665177a971d0f4ea93b2008dac5e7f33 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 17:17:50 -0700 Subject: [PATCH 24/31] [LoongArch] Don't left shift negative value (#106812) Fixed another UB from #106332. 
Detected here https://lab.llvm.org/buildbot/#/builders/169/builds/2662 --- llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index a7823470382756..08e5ccc7bc0be5 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -82,9 +82,9 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { TmpVal1 = Insts[1].Imm; if (N == 3) break; - TmpVal2 = Insts[3].Imm << 52 | TmpVal1; + TmpVal2 = static_cast(Insts[3].Imm) << 52 | TmpVal1; } - TmpVal1 |= Insts[0].Imm << 12; + TmpVal1 |= static_cast(Insts[0].Imm) << 12; break; case LoongArch::ORI: case LoongArch::ADDI_W: From 0ab3d6e14305ce8a97bfe3af7ddc52c416e698a6 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 30 Aug 2024 17:24:40 -0700 Subject: [PATCH 25/31] Reapply "[MemProf] Reduce cloning overhead by sharing nodes when possible" (#102932) with fixes (#106623) This reverts commit 11aa31f595325d6b2dede3364e4b86d78fffe635, restoring commit 055e4319112282354327af9908091fdb25149e9b, with added fixes for linker unsats. In some cases multiple calls to different targets may end up with the same debug information, and therefore callsite id. We will end up sharing the node between these calls. We don't know which one matches the callees until all nodes are matched with calls, at which point any non-matching calls should be removed from the node. The fix extends the handling in handleCallsitesWithMultipleTargets to do this, and adds tests for various permutations of this situation. 
--- .../IPO/MemProfContextDisambiguation.cpp | 236 +++++++++++++-- .../ThinLTO/X86/memprof-aliased-location1.ll | 116 ++++++++ .../ThinLTO/X86/memprof-aliased-location2.ll | 116 ++++++++ .../X86/memprof-tailcall-aliased-location1.ll | 99 +++++++ .../X86/memprof-tailcall-aliased-location2.ll | 99 +++++++ .../aliased-location1.ll | 274 ++++++++++++++++++ .../aliased-location2.ll | 274 ++++++++++++++++++ .../tailcall-aliased-location1.ll | 100 +++++++ .../tailcall-aliased-location2.ll | 100 +++++++ 9 files changed, 1386 insertions(+), 28 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof-aliased-location1.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-aliased-location2.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll create mode 100644 llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 66b68d5cd457fb..52def8f21312de 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -242,9 +242,16 @@ class CallsiteContextGraph { // recursion. bool Recursive = false; - // The corresponding allocation or interior call. + // The corresponding allocation or interior call. This is the primary call + // for which we have created this node. CallInfo Call; + // List of other calls that can be treated the same as the primary call + // through cloning. I.e. located in the same function and have the same + // (possibly pruned) stack ids. 
They will be updated the same way as the + // primary call when assigning to function clones. + std::vector MatchingCalls; + // For alloc nodes this is a unique id assigned when constructed, and for // callsite stack nodes it is the original stack id when the node is // constructed from the memprof MIB metadata on the alloc nodes. Note that @@ -457,6 +464,9 @@ class CallsiteContextGraph { /// iteration. MapVector> FuncToCallsWithMetadata; + /// Records the function each call is located in. + DenseMap CallToFunc; + /// Map from callsite node to the enclosing caller function. std::map NodeToCallingFunc; @@ -474,7 +484,8 @@ class CallsiteContextGraph { /// StackIdToMatchingCalls map. void assignStackNodesPostOrder( ContextNode *Node, DenseSet &Visited, - DenseMap> &StackIdToMatchingCalls); + DenseMap> &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -521,6 +532,11 @@ class CallsiteContextGraph { Call, Func, CallerFunc, FoundCalleeChain); } + /// Returns true if both call instructions have the same callee. + bool sameCallee(CallTy Call1, CallTy Call2) { + return static_cast(this)->sameCallee(Call1, Call2); + } + /// Get a list of nodes corresponding to the stack ids in the given /// callsite's context. 
std::vector getStackIdsWithContextNodesForCall(CallTy Call) { @@ -667,6 +683,7 @@ class ModuleCallsiteContextGraph bool calleeMatchesFunc( Instruction *Call, const Function *Func, const Function *CallerFunc, std::vector> &FoundCalleeChain); + bool sameCallee(Instruction *Call1, Instruction *Call2); bool findProfiledCalleeThroughTailCalls( const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -744,6 +761,7 @@ class IndexCallsiteContextGraph IndexCall &Call, const FunctionSummary *Func, const FunctionSummary *CallerFunc, std::vector> &FoundCalleeChain); + bool sameCallee(IndexCall &Call1, IndexCall &Call2); bool findProfiledCalleeThroughTailCalls( ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -1230,10 +1248,11 @@ static void checkNode(const ContextNode *Node, template void CallsiteContextGraph:: - assignStackNodesPostOrder(ContextNode *Node, - DenseSet &Visited, - DenseMap> - &StackIdToMatchingCalls) { + assignStackNodesPostOrder( + ContextNode *Node, DenseSet &Visited, + DenseMap> + &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1246,7 +1265,8 @@ void CallsiteContextGraph:: // Skip any that have been removed during the recursion. if (!Edge) continue; - assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); + assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, + CallToMatchingCall); } // If this node's stack id is in the map, update the graph to contain new @@ -1289,8 +1309,19 @@ void CallsiteContextGraph:: auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; // Skip any for which we didn't assign any ids, these don't get a node in // the graph. 
- if (SavedContextIds.empty()) + if (SavedContextIds.empty()) { + // If this call has a matching call (located in the same function and + // having the same stack ids), simply add it to the context node created + // for its matching call earlier. These can be treated the same through + // cloning and get updated at the same time. + if (!CallToMatchingCall.contains(Call)) + continue; + auto MatchingCall = CallToMatchingCall[Call]; + assert(NonAllocationCallToContextNodeMap.contains(MatchingCall)); + NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( + Call); continue; + } assert(LastId == Ids.back()); @@ -1422,6 +1453,10 @@ void CallsiteContextGraph::updateStackNodes() { // there is more than one call with the same stack ids. Their (possibly newly // duplicated) context ids are saved in the StackIdToMatchingCalls map. DenseMap> OldToNewContextIds; + // Save a map from each call to any that are found to match it. I.e. located + // in the same function and have the same (possibly pruned) stack ids. We use + // this to avoid creating extra graph nodes as they can be treated the same. + DenseMap CallToMatchingCall; for (auto &It : StackIdToMatchingCalls) { auto &Calls = It.getSecond(); // Skip single calls with a single stack id. These don't need a new node. @@ -1460,6 +1495,13 @@ void CallsiteContextGraph::updateStackNodes() { DenseSet LastNodeContextIds = LastNode->getContextIds(); assert(!LastNodeContextIds.empty()); + // Map from function to the first call from the below list (with matching + // stack ids) found in that function. Note that calls from different + // functions can have the same stack ids because this is the list of stack + // ids that had (possibly pruned) nodes after building the graph from the + // allocation MIBs. 
+ DenseMap FuncToCallMap; + for (unsigned I = 0; I < Calls.size(); I++) { auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; assert(SavedContextIds.empty()); @@ -1533,6 +1575,18 @@ void CallsiteContextGraph::updateStackNodes() { continue; } + const FuncTy *CallFunc = CallToFunc[Call]; + + // If the prior call had the same stack ids this map would not be empty. + // Check if we already have a call that "matches" because it is located + // in the same function. + if (FuncToCallMap.contains(CallFunc)) { + // Record the matching call found for this call, and skip it. We + // will subsequently combine it into the same node. + CallToMatchingCall[Call] = FuncToCallMap[CallFunc]; + continue; + } + // Check if the next set of stack ids is the same (since the Calls vector // of tuples is sorted by the stack ids we can just look at the next one). bool DuplicateContextIds = false; @@ -1562,7 +1616,14 @@ void CallsiteContextGraph::updateStackNodes() { set_subtract(LastNodeContextIds, StackSequenceContextIds); if (LastNodeContextIds.empty()) break; - } + // No longer possibly in a sequence of calls with duplicate stack ids, + // clear the map. + FuncToCallMap.clear(); + } else + // Record the call with its function, so we can locate it the next time + // we find a call from this function when processing the calls with the + // same stack ids. + FuncToCallMap[CallFunc] = Call; } } @@ -1579,7 +1640,8 @@ void CallsiteContextGraph::updateStackNodes() { // associated context ids over to the new nodes. 
DenseSet Visited; for (auto &Entry : AllocationCallToContextNodeMap) - assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); + assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, + CallToMatchingCall); if (VerifyCCG) check(); } @@ -1679,6 +1741,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( continue; if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { CallsWithMetadata.push_back(&I); + CallToFunc[&I] = &F; auto *AllocNode = addAllocNode(&I, &F); auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); assert(CallsiteMD); @@ -1700,8 +1763,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( I.setMetadata(LLVMContext::MD_callsite, nullptr); } // For callsite metadata, add to list for this function for later use. - else if (I.getMetadata(LLVMContext::MD_callsite)) + else if (I.getMetadata(LLVMContext::MD_callsite)) { CallsWithMetadata.push_back(&I); + CallToFunc[&I] = &F; + } } } if (!CallsWithMetadata.empty()) @@ -1756,8 +1821,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( // correlate properly in applyImport in the backends. if (AN.MIBs.empty()) continue; - CallsWithMetadata.push_back({&AN}); - auto *AllocNode = addAllocNode({&AN}, FS); + IndexCall AllocCall(&AN); + CallsWithMetadata.push_back(AllocCall); + CallToFunc[AllocCall] = FS; + auto *AllocNode = addAllocNode(AllocCall, FS); // Pass an empty CallStack to the CallsiteContext (second) // parameter, since for ThinLTO we already collapsed out the inlined // stack ids on the allocation call during ModuleSummaryAnalysis. @@ -1788,8 +1855,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } // For callsite metadata, add to list for this function for later use. 
if (!FS->callsites().empty()) - for (auto &SN : FS->mutableCallsites()) - CallsWithMetadata.push_back({&SN}); + for (auto &SN : FS->mutableCallsites()) { + IndexCall StackNodeCall(&SN); + CallsWithMetadata.push_back(StackNodeCall); + CallToFunc[StackNodeCall] = FS; + } if (!CallsWithMetadata.empty()) FuncToCallsWithMetadata[FS] = CallsWithMetadata; @@ -1829,26 +1899,76 @@ void CallsiteContextGraph TailCallToContextNodeMap; + std::vector> NewCallToNode; for (auto &Entry : NonAllocationCallToContextNodeMap) { auto *Node = Entry.second; assert(Node->Clones.empty()); // Check all node callees and see if in the same function. - auto Call = Node->Call.call(); - for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); - ++EI) { - auto Edge = *EI; - if (!Edge->Callee->hasCall()) - continue; - assert(NodeToCallingFunc.count(Edge->Callee)); - // Check if the called function matches that of the callee node. - if (calleesMatch(Call, EI, TailCallToContextNodeMap)) - continue; + // We need to check all of the calls recorded in this Node, because in some + // cases we may have had multiple calls with the same debug info calling + // different callees. This can happen, for example, when an object is + // constructed in the parameter list - the destructor call of the object has + // the same debug info (line/col) as the call the object was passed to. + // Here we will prune any that don't match all callee nodes. + std::vector AllCalls; + AllCalls.reserve(Node->MatchingCalls.size() + 1); + AllCalls.push_back(Node->Call); + AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(), + Node->MatchingCalls.end()); + auto It = AllCalls.begin(); + // Iterate through the calls until we find the first that matches. 
+ for (; It != AllCalls.end(); ++It) { + auto ThisCall = *It; + bool Match = true; + for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); + ++EI) { + auto Edge = *EI; + if (!Edge->Callee->hasCall()) + continue; + assert(NodeToCallingFunc.count(Edge->Callee)); + // Check if the called function matches that of the callee node. + if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) { + Match = false; + break; + } + } + // Found a call that matches the callee nodes, we can quit now. + if (Match) { + // If the first match is not the primary call on the Node, update it + // now. We will update the list of matching calls further below. + if (Node->Call != ThisCall) { + Node->setCall(ThisCall); + // We need to update the NonAllocationCallToContextNodeMap, but don't + // want to do this during iteration over that map, so save the calls + // that need updated entries. + NewCallToNode.push_back({ThisCall, Node}); + // We should only have shared this node between calls from the same + // function. + assert(NodeToCallingFunc[Node] == CallToFunc[Node->Call]); + } + break; + } + } + // We will update this list below (or leave it cleared if there was no + // match found above). + Node->MatchingCalls.clear(); + // If we hit the end of the AllCalls vector, no call matching the callee + // nodes was found, clear the call information in the node. + if (It == AllCalls.end()) { RemovedEdgesWithMismatchedCallees++; // Work around by setting Node to have a null call, so it gets // skipped during cloning. Otherwise assignFunctions will assert // because its data structures are not designed to handle this case. Node->setCall(CallInfo()); - break; + continue; + } + // Now add back any matching calls that call the same function as the + // matching primary call on Node. 
+ for (++It; It != AllCalls.end(); ++It) { + auto ThisCall = *It; + if (!sameCallee(Node->Call.call(), ThisCall.call())) + continue; + Node->MatchingCalls.push_back(ThisCall); } } @@ -1856,8 +1976,14 @@ void CallsiteContextGraphhasCall(); }); + // Also remove any entries if we updated the node's primary call above. + NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { + return !it.second->hasCall() || it.second->Call != it.first; + }); + + // Add entries for any new primary calls recorded above. + for (auto &[Call, Node] : NewCallToNode) + NonAllocationCallToContextNodeMap[Call] = Node; // Add the new nodes after the above loop so that the iteration is not // invalidated. @@ -2083,6 +2209,21 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc( return true; } +bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, + Instruction *Call2) { + auto *CB1 = cast(Call1); + if (!CB1->getCalledOperand() || CB1->isIndirectCall()) + return false; + auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc1 = dyn_cast(CalleeVal1); + auto *CB2 = cast(Call2); + if (!CB2->getCalledOperand() || CB2->isIndirectCall()) + return false; + auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); + auto *CalleeFunc2 = dyn_cast(CalleeVal2); + return CalleeFunc1 == CalleeFunc2; +} + bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, std::vector> &FoundCalleeChain, @@ -2209,6 +2350,14 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( return true; } +bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { + ValueInfo Callee1 = + dyn_cast_if_present(Call1.getBase())->Callee; + ValueInfo Callee2 = + dyn_cast_if_present(Call2.getBase())->Callee; + return Callee1 == Callee2; +} + template void CallsiteContextGraph::ContextNode::dump() const { @@ -2225,6 +2374,14 @@ void CallsiteContextGraph::ContextNode::print( if (Recursive) OS << 
" (recursive)"; OS << "\n"; + if (!MatchingCalls.empty()) { + OS << "\tMatchingCalls:\n"; + for (auto &MatchingCall : MatchingCalls) { + OS << "\t"; + MatchingCall.print(OS); + OS << "\n"; + } + } OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; OS << "\tContextIds:"; // Make a copy of the computed context ids that we can sort for stability. @@ -2478,6 +2635,7 @@ CallsiteContextGraph::moveEdgeToNewCalleeClone( std::make_unique(Node->IsAllocation, Node->Call)); ContextNode *Clone = NodeOwner.back().get(); Node->addClone(Clone); + Clone->MatchingCalls = Node->MatchingCalls; assert(NodeToCallingFunc.count(Node)); NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true, @@ -3021,6 +3179,14 @@ bool CallsiteContextGraph::assignFunctions() { if (CallMap.count(Call)) CallClone = CallMap[Call]; CallsiteClone->setCall(CallClone); + // Need to do the same for all matching calls. + for (auto &MatchingCall : Node->MatchingCalls) { + CallInfo CallClone(MatchingCall); + if (CallMap.count(MatchingCall)) + CallClone = CallMap[MatchingCall]; + // Updates the call in the list. + MatchingCall = CallClone; + } }; // Keep track of the clones of callsite Node that need to be assigned to @@ -3187,6 +3353,16 @@ bool CallsiteContextGraph::assignFunctions() { CallInfo NewCall(CallMap[OrigCall]); assert(NewCall); NewClone->setCall(NewCall); + // Need to do the same for all matching calls. + for (auto &MatchingCall : NewClone->MatchingCalls) { + CallInfo OrigMatchingCall(MatchingCall); + OrigMatchingCall.setCloneNo(0); + assert(CallMap.count(OrigMatchingCall)); + CallInfo NewCall(CallMap[OrigMatchingCall]); + assert(NewCall); + // Updates the call in the list. 
+ MatchingCall = NewCall; + } } } // Fall through to handling below to perform the recording of the @@ -3373,6 +3549,7 @@ bool CallsiteContextGraph::assignFunctions() { if (Node->IsAllocation) { updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + assert(Node->MatchingCalls.empty()); return; } @@ -3381,6 +3558,9 @@ bool CallsiteContextGraph::assignFunctions() { auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; updateCall(Node->Call, CalleeFunc); + // Update all the matching calls as well. + for (auto &Call : Node->MatchingCalls) + updateCall(Call, CalleeFunc); }; // Performs DFS traversal starting from allocation nodes to update calls to diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll new file mode 100644 index 00000000000000..42819d5421ca0f --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll @@ -0,0 +1,116 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as memprof-basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,blah, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +source_filename = "memprof-aliased-location1.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @blah() + +define internal ptr @_Z3barv() #0 { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #0 { +entry: + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + %call = call ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() #0 { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + +; 
REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 
memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll new file mode 100644 index 00000000000000..663f8525043c2f --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll @@ -0,0 +1,116 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as memprof-basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,blah, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +source_filename = "memprof-aliased-location2.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call ptr @_Z3foov(), !callsite !0 + %call1 = call ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +declare void @blah() + +define internal ptr @_Z3barv() #0 { +entry: + %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) + +define internal ptr @_Z3bazv() #0 { +entry: + %call = call ptr @_Z3barv(), !callsite !8 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + ret ptr null +} + +define internal ptr @_Z3foov() #0 { +entry: + %call = call ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { noinline optnone } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + 
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 
memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll new file mode 100644 index 00000000000000..3f5dc7732dc5c3 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location1.ll @@ -0,0 +1,99 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as memprof-tailcall.ll. 
+ +;; -stats requires asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,blah, \ +; RUN: -stats -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +source_filename = "memprof-tailcall-aliased-location1.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call {{.*}} @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The first call to foo is part of a non-cold context, and should use the + ;; original 
functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. + ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll new file mode 100644 index 00000000000000..3085b4e41938b2 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-tailcall-aliased-location2.ll @@ -0,0 +1,99 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as memprof-tailcall.ll. 
+ +;; -stats requires asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,_Z3barv,plx \ +; RUN: -r=%t.o,_Z3bazv,plx \ +; RUN: -r=%t.o,_Z3foov,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,blah, \ +; RUN: -stats -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +source_filename = "memprof-tailcall-aliased-location2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call {{.*}} @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; The first call to foo is part of a non-cold context, and should use the + ;; original functions. 
+ ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. + ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll new file mode 100644 index 00000000000000..8f9df20471e41c --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll @@ -0,0 +1,274 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #2 + +define internal ptr @_Z3barv() #3 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) +declare void @blah() + +define internal ptr @_Z3bazv() #4 { +entry: + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + %call = call noundef ptr @_Z3barv(), !callsite !8 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #5 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin } +attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #4 = { "stack-protector-buffer-size"="8" } +attributes #5 = { noinline } +attributes #6 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 
9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: 
CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAR2:0x[a-z0-9]+]] + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[FOO2]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; 
DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[FOO]] + +; DUMP: Node [[BAZ2]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAZ]] + +; DUMP: Node [[BAR2]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAR]] + + +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. 
It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } + + +; DOTCLONED: digraph "cloned" { +; DOTCLONED: label="cloned"; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 
2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll new file mode 100644 index 00000000000000..c3c164d4928632 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll @@ -0,0 +1,274 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent call in the alloc context +;; does not cause missing or incorrect cloning. This test is otherwise the same +;; as basic.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main() #0 { +entry: + %call = call noundef ptr @_Z3foov(), !callsite !0 + %call1 = call noundef ptr @_Z3foov(), !callsite !1 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nobuiltin +declare void @_ZdaPv() #2 + +define internal ptr @_Z3barv() #3 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !2, !callsite !7 + ret ptr null +} + +declare ptr @_Znam(i64) +declare void @blah() + +define internal ptr @_Z3bazv() #4 { +entry: + %call = call noundef ptr @_Z3barv(), !callsite !8 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !8 + ret ptr null +} + +; Function Attrs: noinline +define internal ptr @_Z3foov() #5 { +entry: + %call = call noundef ptr @_Z3bazv(), !callsite !9 + ret ptr null +} + +; uselistorder directives +uselistorder ptr @_Z3foov, { 1, 0 } + +attributes #0 = { "tune-cpu"="generic" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #2 = { nobuiltin } +attributes #3 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #4 = { "stack-protector-buffer-size"="8" } +attributes #5 = { noinline } +attributes #6 = { builtin } + +!0 = !{i64 8632435727821051414} +!1 = !{i64 -3421689549917153178} +!2 = !{!3, !5} +!3 = !{!4, !"notcold", i64 100} +!4 = !{i64 
9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} +!5 = !{!6, !"cold", i64 400} +!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} +!7 = !{i64 9086428284934609951} +!8 = !{i64 -5964873800580613432} +!9 = !{i64 2732490490862098848} + + +; DUMP: CCG before cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 1 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: 
CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[BAR:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAR2:0x[a-z0-9]+]] + +; DUMP: Node [[BAZ]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[BAZ2:0x[a-z0-9]+]] + +; DUMP: Node [[FOO]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1 +; DUMP: Clones: [[FOO2:0x[a-z0-9]+]] + +; DUMP: Node [[MAIN1]] +; DUMP: %call = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1 +; DUMP: CallerEdges: + +; DUMP: Node [[MAIN2]] +; DUMP: %call1 = call noundef ptr @_Z3foov() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: + +; DUMP: Node [[FOO2]] +; DUMP: %call = call noundef ptr @_Z3bazv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; 
DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[FOO]] + +; DUMP: Node [[BAZ2]] +; DUMP: %call = call noundef ptr @_Z3barv() (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAZ2]] to Caller: [[FOO2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAZ]] + +; DUMP: Node [[BAR2]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 2 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 +; DUMP: Clone of [[BAR]] + + +; REMARKS: created clone _Z3barv.memprof.1 +; REMARKS: created clone _Z3bazv.memprof.1 +; REMARKS: created clone _Z3foov.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 +; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1 +; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1 +; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. 
It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin "memprof"="cold" } + + +; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis + + +; DOT: digraph "postbuild" { +; DOT: label="postbuild"; +; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 
2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"]; +; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOT: } + + +; DOTCLONED: digraph "cloned" { +; DOTCLONED: label="cloned"; +; DOTCLONED: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"]; +; DOTCLONED: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 
2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"]; +; DOTCLONED: Node[[MAIN2]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3foov -\> _Z3bazv}"]; +; DOTCLONED: Node[[FOO2]] -> Node[[BAZ2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAZ2]] [shape=record,tooltip="N[[BAZ2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: 0\n_Z3bazv -\> _Z3barv}"]; +; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"]; +; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"]; +; DOTCLONED: } diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll new file mode 100644 index 00000000000000..e0bcd284c097c2 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location1.ll @@ -0,0 +1,100 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a preceding tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as tailcall.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -stats %s -S 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=IR + +source_filename = "tailcall-aliased-location1.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16 + +declare void @_Z2a1v() #0 + +declare void @_Z2a2v() #0 + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call ptr @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; Preceding call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The first call to foo is part of a cold context, and should use the + ;; original functions. + ;; allocation. The latter should call the cloned functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. 
+ ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + %2 = load ptr, ptr @a, align 16 + call void %2(), !callsite !10 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call ptr @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3, !8} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} +!8 = !{!9, !"notcold"} +!9 = !{i64 3186456655321080972, i64 1} +!10 = !{i64 1} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll new file mode 100644 index 00000000000000..1e76243fe0f48b --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-aliased-location2.ll @@ -0,0 +1,100 @@ +;; Test to ensure a call to a different callee but with the same debug info +;; (and therefore callsite metadata) as a subsequent tail call in the alloc +;; context does not cause missing or incorrect cloning. This test is otherwise +;; the same as tailcall.ll. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -stats %s -S 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=IR + +source_filename = "tailcall-aliased-location2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16 + +declare void @_Z2a1v() #0 + +declare void @_Z2a2v() #0 + +; Function Attrs: noinline +; IR-LABEL: @_Z3barv() +define ptr @_Z3barv() local_unnamed_addr #0 { +entry: + ; IR: call ptr @_Znam(i64 10) #[[NOTCOLD:[0-9]+]] + %call = tail call ptr @_Znam(i64 10) #2, !memprof !0, !callsite !5 + ret ptr %call +} + +; Function Attrs: nobuiltin allocsize(0) +declare ptr @_Znam(i64) #1 +declare void @blah() + +; Function Attrs: noinline +; IR-LABEL: @_Z3bazv() +define ptr @_Z3bazv() #0 { +entry: + ; IR: call ptr @_Z3barv() + %call = tail call ptr @_Z3barv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @_Z3foov() +define ptr @_Z3foov() #0 { +entry: + ; IR: call ptr @_Z3bazv() + %call = tail call ptr @_Z3bazv() + ret ptr %call +} + +; Function Attrs: noinline +; IR-LABEL: @main() +define i32 @main() #0 { + ;; The first call to foo is part of a cold context, and should use the + ;; original functions. + ;; allocation. The latter should call the cloned functions. + ; IR: call ptr @_Z3foov() + %call = tail call ptr @_Z3foov(), !callsite !6 + ;; Subsequent call to another callee but with the same debug location / callsite id + call void @blah(), !callsite !6 + ;; The second call to foo is part of a cold context, and should call the + ;; cloned functions. 
+ ; IR: call ptr @_Z3foov.memprof.1() + %call1 = tail call ptr @_Z3foov(), !callsite !7 + %2 = load ptr, ptr @a, align 16 + call void %2(), !callsite !10 + ret i32 0 +} + +; IR-LABEL: @_Z3barv.memprof.1() +; IR: call ptr @_Znam(i64 10) #[[COLD:[0-9]+]] +; IR-LABEL: @_Z3bazv.memprof.1() +; IR: call ptr @_Z3barv.memprof.1() +; IR-LABEL: @_Z3foov.memprof.1() +; IR: call ptr @_Z3bazv.memprof.1() + +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + +; STATS: 2 memprof-context-disambiguation - Number of profiled callees found via tail calls +; STATS: 4 memprof-context-disambiguation - Aggregate depth of profiled callees found via tail calls +; STATS: 2 memprof-context-disambiguation - Maximum depth of profiled callees found via tail calls + +attributes #0 = { noinline } +attributes #1 = { nobuiltin allocsize(0) } +attributes #2 = { builtin allocsize(0) } + +!0 = !{!1, !3, !8} +!1 = !{!2, !"notcold"} +!2 = !{i64 3186456655321080972, i64 8632435727821051414} +!3 = !{!4, !"cold"} +!4 = !{i64 3186456655321080972, i64 -3421689549917153178} +!5 = !{i64 3186456655321080972} +!6 = !{i64 8632435727821051414} +!7 = !{i64 -3421689549917153178} +!8 = !{!9, !"notcold"} +!9 = !{i64 3186456655321080972, i64 1} +!10 = !{i64 1} From d8bffa9018c88ef6ce441bb44d7b7d7a9091e583 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 30 Aug 2024 17:37:48 -0700 Subject: [PATCH 26/31] Revert "[AArch64][AsmParser] Directives should clear transitively implied features (#106625)" (#106813) Revert #106625 and fix attempt #106804" There is another issue in https://lab.llvm.org/buildbot/#/builders/169/builds/2690 directive-cpu-err.s and the fix like #106804 fixed the overflow but fails CHECKs. This reverts commit 10affaf894a72bee9b84ada77dc943b1bb03d02e. This reverts commit 24977395592fb3a47d0356b6e9e6d25358a521c5. 
--- .../AArch64/AsmParser/AArch64AsmParser.cpp | 102 ++++++++++-------- .../MC/AArch64/SVE/directive-arch-negative.s | 8 -- .../SVE/directive-arch_extension-negative.s | 7 +- .../MC/AArch64/SVE/directive-cpu-negative.s | 7 +- .../test/MC/AArch64/directive-arch-negative.s | 5 +- .../directive-arch_extension-negative.s | 14 +-- 6 files changed, 59 insertions(+), 84 deletions(-) delete mode 100644 llvm/test/MC/AArch64/SVE/directive-arch-negative.s diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 373f844b239081..37add682b150e7 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6947,14 +6947,10 @@ static void ExpandCryptoAEK(const AArch64::ArchInfo &ArchInfo, } } -static SMLoc incrementLoc(SMLoc L, int Offset) { - return SMLoc::getFromPointer(L.getPointer() + Offset); -} - /// parseDirectiveArch /// ::= .arch token bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { - SMLoc CurLoc = getLoc(); + SMLoc ArchLoc = getLoc(); StringRef Arch, ExtensionString; std::tie(Arch, ExtensionString) = @@ -6962,7 +6958,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { const AArch64::ArchInfo *ArchInfo = AArch64::parseArch(Arch); if (!ArchInfo) - return Error(CurLoc, "unknown arch name"); + return Error(ArchLoc, "unknown arch name"); if (parseToken(AsmToken::EndOfStatement)) return true; @@ -6982,30 +6978,27 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { ExtensionString.split(RequestedExtensions, '+'); ExpandCryptoAEK(*ArchInfo, RequestedExtensions); - CurLoc = incrementLoc(CurLoc, Arch.size()); + FeatureBitset Features = STI.getFeatureBits(); + setAvailableFeatures(ComputeAvailableFeatures(Features)); for (auto Name : RequestedExtensions) { - // Advance source location past '+'. 
- CurLoc = incrementLoc(CurLoc, 1); - bool EnableFeature = !Name.consume_front_insensitive("no"); - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); - - if (It == std::end(ExtensionMap)) - return Error(CurLoc, "unsupported architectural extension: " + Name); + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); - CurLoc = incrementLoc(CurLoc, Name.size()); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + break; + } } - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); return false; } @@ -7025,21 +7018,28 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { Name = Name.substr(2); } - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); + MCSubtargetInfo &STI = copySTI(); + FeatureBitset Features = STI.getFeatureBits(); + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; + + if (Extension.Features.none()) + return Error(ExtLoc, "unsupported architectural extension: " + Name); + + FeatureBitset ToggleFeatures = + EnableFeature + ? 
STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + return false; + } - if (It == std::end(ExtensionMap)) - return Error(ExtLoc, "unsupported architectural extension: " + Name); + return Error(ExtLoc, "unknown architectural extension: " + Name); +} - MCSubtargetInfo &STI = copySTI(); - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); - return false; +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); } /// parseDirectiveCPU @@ -7075,22 +7075,30 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { bool EnableFeature = !Name.consume_front_insensitive("no"); - auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) { - return Extension.Name == Name; - }); + bool FoundExtension = false; + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; - if (It == std::end(ExtensionMap)) - Error(CurLoc, "unsupported architectural extension: " + Name); + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); - if (EnableFeature) - STI.SetFeatureBitsTransitively(It->Features); - else - STI.ClearFeatureBitsTransitively(It->Features); + FeatureBitset Features = STI.getFeatureBits(); + FeatureBitset ToggleFeatures = + EnableFeature + ? 
STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); + FoundExtension = true; + + break; + } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); CurLoc = incrementLoc(CurLoc, Name.size()); } - FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits()); - setAvailableFeatures(Features); return false; } diff --git a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s deleted file mode 100644 index e3029c16ffc8a6..00000000000000 --- a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s - -// Check that setting +nosve implies +nosve2 -.arch armv9-a+nosve - -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s index 31118f7490d00d..661f13974d0bc8 100644 --- a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s @@ -1,12 +1,7 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.arch_extension sve2+nosve +.arch_extension nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 - -// Check that setting +nosve implies +nosve2 -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s index 6ba537ca70609e..82acc1b0b0be9b 100644 --- a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s @@ -1,11 
+1,6 @@ // RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s -.cpu generic+sve2+nosve +.cpu generic+sve+nosve ptrue p0.b, pow2 // CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 - -// Check that setting +nosve implies +nosve2 -adclb z0.s, z1.s, z31.s -// CHECK: error: instruction requires: sve2 -// CHECK-NEXT: adclb z0.s, z1.s, z31.s diff --git a/llvm/test/MC/AArch64/directive-arch-negative.s b/llvm/test/MC/AArch64/directive-arch-negative.s index 406507d5fc8f4d..f60759899aa6c9 100644 --- a/llvm/test/MC/AArch64/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/directive-arch-negative.s @@ -12,13 +12,10 @@ # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ +// We silently ignore invalid features. .arch armv8+foo aese v0.8h, v1.8h -# CHECK: error: unsupported architectural extension: foo -# CHECK-NEXT: .arch armv8+foo -# CHECK-NEXT: ^ - # CHECK: error: invalid operand for instruction # CHECK-NEXT: aese v0.8h, v1.8h # CHECK-NEXT: ^ diff --git a/llvm/test/MC/AArch64/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/directive-arch_extension-negative.s index 1843af56555461..1c1cfc9d33e3ed 100644 --- a/llvm/test/MC/AArch64/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/directive-arch_extension-negative.s @@ -4,7 +4,7 @@ // RUN: -filetype asm -o - %s 2>&1 | FileCheck %s .arch_extension axp64 -// CHECK: error: unsupported architectural extension: axp64 +// CHECK: error: unknown architectural extension: axp64 // CHECK-NEXT: .arch_extension axp64 crc32cx w0, w1, x3 @@ -49,8 +49,6 @@ fminnm d0, d0, d1 // CHECK: [[@LINE-1]]:1: error: instruction requires: fp // CHECK-NEXT: fminnm d0, d0, d1 -// nofp implied nosimd, so reinstate it -.arch_extension simd addp v0.4s, v0.4s, v0.4s // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: neon .arch_extension nosimd @@ -72,8 +70,6 @@ casa w5, w7, [x20] // CHECK: [[@LINE-1]]:1: error: instruction requires: lse // CHECK-NEXT: casa w5, w7, [x20] -// nolse 
implied nolse128, so reinstate it -.arch_extension lse128 swpp x0, x2, [x3] // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: lse128 .arch_extension nolse128 @@ -88,8 +84,6 @@ cfp rctx, x0 // CHECK: [[@LINE-1]]:5: error: CFPRCTX requires: predres // CHECK-NEXT: cfp rctx, x0 -// nopredres implied nopredres2, so reinstate it -.arch_extension predres2 cosp rctx, x0 // CHECK-NOT: [[@LINE-1]]:6: error: COSP requires: predres2 .arch_extension nopredres2 @@ -139,8 +133,6 @@ ldapr x0, [x1] // CHECK: [[@LINE-1]]:1: error: instruction requires: rcpc // CHECK-NEXT: ldapr x0, [x1] -// norcpc implied norcpc3, so reinstate it -.arch_extension rcpc3 stilp w24, w0, [x16, #-8]! // CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: rcpc3 .arch_extension norcpc3 @@ -177,8 +169,6 @@ cpyfp [x0]!, [x1]!, x2! // CHECK: [[@LINE-1]]:1: error: instruction requires: mops // CHECK-NEXT: cpyfp [x0]!, [x1]!, x2! -// nolse128 implied nod128, so reinstate it -.arch_extension d128 // This needs to come before `.arch_extension nothe` as it uses an instruction // that requires both the and d128 sysp #0, c2, c0, #0, x0, x1 @@ -214,8 +204,6 @@ umax x0, x1, x2 // CHECK: [[@LINE-1]]:1: error: instruction requires: cssc // CHECK-NEXT: umax x0, x1, x2 -// noras implied norasv2, so reinstate it -.arch_extension rasv2 mrs x0, ERXGSR_EL1 // CHECK-NOT: [[@LINE-1]]:9: error: expected readable system register .arch_extension norasv2 From e6e429179ecd425040af2bd475f090b503b047c9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 18:57:50 -0700 Subject: [PATCH 27/31] [RISCV] Cleanup CHECK prefixes in half-arith.ll. NFC Remove prefixes that donn't appear on RUN lines. Rename prefixes for consistency. Add RV32/RV64 prefixes where necessary to fix a conflict. 
--- llvm/test/CodeGen/RISCV/half-arith.ll | 2380 +++++++++++++------------ 1 file changed, 1247 insertions(+), 1133 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 59981a282ab43e..f00829530bb97e 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -4,21 +4,21 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zfh -verify-machineinstrs \ ; RUN: -target-abi lp64f < %s | FileCheck -check-prefix=CHECKIZFH %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinx -verify-machineinstrs \ -; RUN: -target-abi ilp32 < %s | FileCheck -check-prefix=CHECK-ZHINX %s +; RUN: -target-abi ilp32 < %s | FileCheck -check-prefix=CHECKIZHINX %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinx -verify-machineinstrs \ -; RUN: -target-abi lp64 < %s | FileCheck -check-prefix=CHECK-ZHINX %s +; RUN: -target-abi lp64 < %s | FileCheck -check-prefix=CHECKIZHINX %s ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv32 -mattr=+zfhmin -verify-machineinstrs \ -; RUN: -target-abi ilp32f < %s | FileCheck -check-prefixes=CHECKIZFHMIN,CHECK-RV32-FSGNJ %s +; RUN: -target-abi ilp32f < %s | FileCheck -check-prefixes=CHECKIZFHMIN,RV32IZFHMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zfhmin -verify-machineinstrs \ -; RUN: -target-abi lp64f < %s | FileCheck --check-prefixes=CHECKIZFHMIN,CHECK-RV64-FSGNJ %s +; RUN: -target-abi lp64f < %s | FileCheck --check-prefixes=CHECKIZFHMIN,RV64IZFHMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ -; RUN: -target-abi ilp32 < %s | FileCheck --check-prefixes=CHECKZHINXMIN %s +; RUN: -target-abi ilp32 < %s | FileCheck --check-prefixes=CHECKIZHINXMIN,RV32IZHINXMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ -; RUN: -target-abi lp64 < %s | FileCheck --check-prefixes=CHECKZHINXMIN %s 
+; RUN: -target-abi lp64 < %s | FileCheck --check-prefixes=CHECKIZHINXMIN,RV64IZHINXMIN %s ; These tests are each targeted at a particular RISC-V FPU instruction. ; Compares and conversions can be found in half-fcmp.ll and half-convert.ll @@ -31,10 +31,10 @@ define half @fadd_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fadd.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fadd_s: ; RV32I: # %bb.0: @@ -96,20 +96,13 @@ define half @fadd_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fadd_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fadd half %a, %b ret half %1 } @@ -120,10 +113,10 @@ define half @fsub_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsub.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsub.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsub.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsub_s: ; RV32I: # %bb.0: @@ -185,20 
+178,13 @@ define half @fsub_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fsub_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsub_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fsub half %a, %b ret half %1 } @@ -209,10 +195,10 @@ define half @fmul_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmul.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmul_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmul.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmul_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmul.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmul_s: ; RV32I: # %bb.0: @@ -274,20 +260,13 @@ define half @fmul_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmul_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmul_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmul_s: +; 
CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fmul half %a, %b ret half %1 } @@ -298,10 +277,10 @@ define half @fdiv_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fdiv.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fdiv_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fdiv.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fdiv_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fdiv.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fdiv_s: ; RV32I: # %bb.0: @@ -363,20 +342,13 @@ define half @fdiv_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fdiv_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fdiv.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fdiv_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fdiv.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fdiv_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fdiv.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fdiv half %a, %b ret half %1 } @@ -389,10 +361,10 @@ define half @fsqrt_s(half %a) nounwind { ; CHECKIZFH-NEXT: fsqrt.h fa0, fa0 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsqrt_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsqrt.h a0, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsqrt_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsqrt.h a0, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsqrt_s: ; RV32I: # %bb.0: @@ -427,18 
+399,12 @@ define half @fsqrt_s(half %a) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fsqrt_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsqrt.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsqrt_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsqrt.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fsqrt_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsqrt.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.sqrt.f16(half %a) ret half %1 } @@ -451,10 +417,10 @@ define half @fsgnj_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsgnj.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnj_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsgnj.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnj_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsgnj.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnj_s: ; RV32I: # %bb.0: @@ -474,79 +440,65 @@ define half @fsgnj_s(half %a, half %b) nounwind { ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnj_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fsh fa1, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV32-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnj_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; 
CHECK-RV64-FSGNJ-NEXT: fsh fa1, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV64-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsgnj_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: addi sp, sp, -16 -; CHECK-ZHINXMIN-NEXT: addi a2, sp, 12 -; CHECK-ZHINXMIN-NEXT: sh a1, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi a1, sp, 8 -; CHECK-ZHINXMIN-NEXT: sh a0, 0(a1) -; CHECK-ZHINXMIN-NEXT: lbu a0, 13(sp) -; CHECK-ZHINXMIN-NEXT: lbu a2, 9(sp) -; CHECK-ZHINXMIN-NEXT: andi a0, a0, 128 -; CHECK-ZHINXMIN-NEXT: andi a2, a2, 127 -; CHECK-ZHINXMIN-NEXT: or a0, a2, a0 -; CHECK-ZHINXMIN-NEXT: sb a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lh a0, 0(a1) -; CHECK-ZHINXMIN-NEXT: addi sp, sp, 16 -; CHECK-ZHINXMIN-NEXT: ret -; CHECKFSGNJ-LABEL: fsgnj_s: -; CHECKFSGNJ: # %bb.0: -; CHECKFSGNJ-NEXT: addi sp, sp, -16 -; CHECKFSGNJ-NEXT: fsh fa1, 12(sp) -; CHECKFSGNJ-NEXT: fsh fa0, 8(sp) -; CHECKFSGNJ-NEXT: lbu a0, 13(sp) -; CHECKFSGNJ-NEXT: lbu a1, 9(sp) -; CHECKFSGNJ-NEXT: andi a0, a0, 128 -; CHECKFSGNJ-NEXT: andi a1, a1, 127 -; CHECKFSGNJ-NEXT: or a0, a1, a0 -; CHECKFSGNJ-NEXT: sb a0, 9(sp) -; CHECKFSGNJ-NEXT: flh fa0, 8(sp) -; CHECKFSGNJ-NEXT: addi sp, sp, 16 -; CHECKFSGNJ-NEXT: ret -; CHECK64FSGNJ-LABEL: fsgnj_s: -; CHECK64FSGNJ: # %bb.0: -; CHECK64FSGNJ-NEXT: addi sp, sp, -16 -; CHECK64FSGNJ-NEXT: fsh fa1, 8(sp) -; CHECK64FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK64FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK64FSGNJ-NEXT: andi a0, a0, 128 -; CHECK64FSGNJ-NEXT: andi a1, a1, 127 -; CHECK64FSGNJ-NEXT: or a0, a1, a0 -; CHECK64FSGNJ-NEXT: sb a0, 1(sp) -; CHECK64FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: addi sp, sp, 16 -; CHECK64FSGNJ-NEXT: ret +; 
RV32IZFHMIN-LABEL: fsgnj_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fsh fa1, 12(sp) +; RV32IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: lbu a1, 9(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 128 +; RV32IZFHMIN-NEXT: andi a1, a1, 127 +; RV32IZFHMIN-NEXT: or a0, a1, a0 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa0, 8(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnj_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fsh fa1, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa0, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: lbu a1, 1(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 128 +; RV64IZFHMIN-NEXT: andi a1, a1, 127 +; RV64IZFHMIN-NEXT: or a0, a1, a0 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa0, 0(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnj_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 128 +; RV32IZHINXMIN-NEXT: andi a1, a1, 127 +; RV32IZHINXMIN-NEXT: or a0, a1, a0 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnj_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 1(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 128 +; RV64IZHINXMIN-NEXT: andi a1, a1, 127 +; RV64IZHINXMIN-NEXT: or a0, a1, a0 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = call half 
@llvm.copysign.f16(half %a, half %b) ret half %1 } @@ -561,12 +513,12 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: feq.h a0, fa5, fa4 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fneg_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a0 -; CHECK-ZHINX-NEXT: fneg.h a1, a0 -; CHECK-ZHINX-NEXT: feq.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fneg_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a0 +; CHECKIZHINX-NEXT: fneg.h a1, a0 +; CHECKIZHINX-NEXT: feq.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fneg_s: ; RV32I: # %bb.0: @@ -630,50 +582,73 @@ define i32 @fneg_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fneg_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: feq.s a0, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fneg_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: feq.s a0, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: addi 
sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fneg_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: feq.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fneg_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa4 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fneg_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa4, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa4 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fneg_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: 
lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: feq.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fneg_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: feq.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %a %2 = fneg half %1 %3 = fcmp oeq half %1, %2 @@ -690,11 +665,11 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fsgnjn.h fa0, fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnjn_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a0, a1 -; CHECK-ZHINX-NEXT: fsgnjn.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnjn_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a1, a0, a1 +; CHECKIZHINX-NEXT: fsgnjn.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnjn_s: ; RV32I: # %bb.0: @@ -774,118 +749,101 @@ define half @fsgnjn_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnjn_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 4(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 5(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 5(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 4(sp) -; 
CHECK-RV32-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 128 -; CHECK-RV32-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnjn_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -32 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 16(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 24(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 17(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 25(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 128 -; CHECK-RV64-FSGNJ-NEXT: or a0, a0, a1 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 17(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 16(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 32 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fsgnjn_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: addi sp, sp, -16 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a2, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: addi a2, sp, 8 -; CHECK-ZHINXMIN-NEXT: sh a0, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi a0, sp, 12 -; CHECK-ZHINXMIN-NEXT: sh a1, 0(a0) -; CHECK-ZHINXMIN-NEXT: lbu a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lbu a1, 13(sp) -; 
CHECK-ZHINXMIN-NEXT: andi a0, a0, 127 -; CHECK-ZHINXMIN-NEXT: andi a1, a1, 128 -; CHECK-ZHINXMIN-NEXT: or a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: sb a0, 9(sp) -; CHECK-ZHINXMIN-NEXT: lh a0, 0(a2) -; CHECK-ZHINXMIN-NEXT: addi sp, sp, 16 -; CHECK-ZHINXMIN-NEXT: ret -; CHECKFSGNJ-LABEL: fsgnjn_s: -; CHECKFSGNJ: # %bb.0: -; CHECKFSGNJ-NEXT: addi sp, sp, -16 -; CHECKFSGNJ-NEXT: fcvt.s.h ft0, fa1 -; CHECKFSGNJ-NEXT: fcvt.s.h ft1, fa0 -; CHECKFSGNJ-NEXT: fadd.s ft0, ft1, ft0 -; CHECKFSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECKFSGNJ-NEXT: fcvt.s.h ft0, ft0 -; CHECKFSGNJ-NEXT: fneg.s ft0, ft0 -; CHECKFSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECKFSGNJ-NEXT: fsh fa0, 8(sp) -; CHECKFSGNJ-NEXT: fsh ft0, 12(sp) -; CHECKFSGNJ-NEXT: lbu a0, 9(sp) -; CHECKFSGNJ-NEXT: lbu a1, 13(sp) -; CHECKFSGNJ-NEXT: andi a0, a0, 127 -; CHECKFSGNJ-NEXT: andi a1, a1, 128 -; CHECKFSGNJ-NEXT: or a0, a0, a1 -; CHECKFSGNJ-NEXT: sb a0, 9(sp) -; CHECKFSGNJ-NEXT: flh fa0, 8(sp) -; CHECKFSGNJ-NEXT: addi sp, sp, 16 -; CHECKFSGNJ-NEXT: ret -; CHECK64FSGNJ-LABEL: fsgnjn_s: -; CHECK64FSGNJ: # %bb.0: -; CHECK64FSGNJ-NEXT: addi sp, sp, -16 -; CHECK64FSGNJ-NEXT: fcvt.s.h ft0, fa1 -; CHECK64FSGNJ-NEXT: fcvt.s.h ft1, fa0 -; CHECK64FSGNJ-NEXT: fadd.s ft0, ft1, ft0 -; CHECK64FSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fcvt.s.h ft0, ft0 -; CHECK64FSGNJ-NEXT: fneg.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fcvt.h.s ft0, ft0 -; CHECK64FSGNJ-NEXT: fsh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: fsh ft0, 8(sp) -; CHECK64FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK64FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK64FSGNJ-NEXT: andi a0, a0, 127 -; CHECK64FSGNJ-NEXT: andi a1, a1, 128 -; CHECK64FSGNJ-NEXT: or a0, a0, a1 -; CHECK64FSGNJ-NEXT: sb a0, 1(sp) -; CHECK64FSGNJ-NEXT: flh fa0, 0(sp) -; CHECK64FSGNJ-NEXT: addi sp, sp, 16 -; CHECK64FSGNJ-NEXT: ret +; RV32IZFHMIN-LABEL: fsgnjn_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; 
RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 4(sp) +; RV32IZFHMIN-NEXT: lbu a0, 5(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 5(sp) +; RV32IZFHMIN-NEXT: flh fa5, 4(sp) +; RV32IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: lbu a1, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: andi a1, a1, 128 +; RV32IZFHMIN-NEXT: or a0, a0, a1 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa0, 8(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnjn_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -32 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa0, 16(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 24(sp) +; RV64IZFHMIN-NEXT: lbu a0, 17(sp) +; RV64IZFHMIN-NEXT: lbu a1, 25(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: andi a1, a1, 128 +; RV64IZFHMIN-NEXT: or a0, a0, a1 +; RV64IZFHMIN-NEXT: sb a0, 17(sp) +; RV64IZFHMIN-NEXT: flh fa0, 16(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 32 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnjn_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a0 +; RV32IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 4(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 5(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 5(sp) +; RV32IZHINXMIN-NEXT: lh a1, 4(sp) +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) +; 
RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 127 +; RV32IZHINXMIN-NEXT: andi a1, a1, 128 +; RV32IZHINXMIN-NEXT: or a0, a0, a1 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnjn_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -32 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a0 +; RV64IZHINXMIN-NEXT: fadd.s a1, a2, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: sh a0, 16(sp) +; RV64IZHINXMIN-NEXT: sh a1, 24(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 17(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 25(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 127 +; RV64IZHINXMIN-NEXT: andi a1, a1, 128 +; RV64IZHINXMIN-NEXT: or a0, a0, a1 +; RV64IZHINXMIN-NEXT: sb a0, 17(sp) +; RV64IZHINXMIN-NEXT: lh a0, 16(sp) +; RV64IZHINXMIN-NEXT: addi sp, sp, 32 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %b %2 = fneg half %1 %3 = call half @llvm.copysign.f16(half %a, half %2) @@ -904,12 +862,12 @@ define half @fabs_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fadd.h fa0, fa4, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fabs_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, a1 -; CHECK-ZHINX-NEXT: fabs.h a1, a0 -; CHECK-ZHINX-NEXT: fadd.h a0, a1, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fabs_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, a1 +; CHECKIZHINX-NEXT: fabs.h a1, a0 +; CHECKIZHINX-NEXT: fadd.h a0, a1, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fabs_s: ; RV32I: # %bb.0: @@ -985,56 +943,81 @@ define half @fabs_s(half %a, half %b) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fabs_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; 
CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fabs_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa0 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 127 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fabs_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fabs.s a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fabs_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, 
sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 127 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa4, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fabs_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa0 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 127 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa4, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fabs_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: andi a1, a1, 127 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a0, a1, a0 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fabs_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, 
-16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: andi a1, a1, 127 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a0, a1, a0 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %b %2 = call half @llvm.fabs.f16(half %1) %3 = fadd half %2, %1 @@ -1049,10 +1032,10 @@ define half @fmin_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmin.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmin_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmin.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmin_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmin.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmin_s: ; RV32I: # %bb.0: @@ -1114,20 +1097,13 @@ define half @fmin_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmin_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmin.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmin_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmin.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmin_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmin.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.minnum.f16(half %a, 
half %b) ret half %1 } @@ -1140,10 +1116,10 @@ define half @fmax_s(half %a, half %b) nounwind { ; CHECKIZFH-NEXT: fmax.h fa0, fa0, fa1 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmax_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmax.h a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmax_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmax.h a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmax_s: ; RV32I: # %bb.0: @@ -1205,20 +1181,13 @@ define half @fmax_s(half %a, half %b) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmax_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmax.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmax_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmax.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmax_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmax.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.maxnum.f16(half %a, half %b) ret half %1 } @@ -1231,10 +1200,10 @@ define half @fmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmadd_s: ; RV32I: # %bb.0: @@ -1311,22 +1280,14 @@ define half @fmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; 
CHECKZHINXMIN-LABEL: fmadd_s: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmadd_s: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.fma.f16(half %a, half %b, half %c) ret half %1 } @@ -1339,11 +1300,11 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmsub.h fa0, fa0, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmsub_s: ; RV32I: # %bb.0: @@ -1433,59 +1394,83 @@ define half @fmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fmsub_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; 
CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fmsub_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fmsub_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: 
fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fmsub_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fmsub_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fmsub_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; 
RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %negc = fsub half -0.0, %c_ %1 = call half @llvm.fma.f16(half %a, half %b, half %negc) @@ -1501,12 +1486,12 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fnmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s: ; RV32I: # %bb.0: @@ -1624,81 +1609,115 @@ define half @fnmadd_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb 
a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa4, fa3, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s 
a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) 
+; RV64IZFHMIN-NEXT: flh fa4, 0(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa4, fa3, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 1(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) 
+; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a %c_ = fadd half 0.0, %c %nega = fsub half -0.0, %a_ @@ -1716,12 +1735,12 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_2: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a1, a0, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_2: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fnmadd.h a0, a1, a0, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_2: ; RV32I: # %bb.0: @@ -1839,81 +1858,115 @@ define half @fnmadd_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_2: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa4, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 
13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_2: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa4, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa4 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, 
a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fneg.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s_2: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 9(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa4, 8(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_2: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 1(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa4, 0(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; 
RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa4 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s_2: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 9(sp) +; RV32IZHINXMIN-NEXT: lh a1, 8(sp) +; RV32IZHINXMIN-NEXT: sh a2, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 13(sp) +; RV32IZHINXMIN-NEXT: xori a2, a2, 128 +; RV32IZHINXMIN-NEXT: sb a2, 13(sp) +; RV32IZHINXMIN-NEXT: lh a2, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s_2: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 1(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fadd.s a2, a2, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a2, a2 +; RV64IZHINXMIN-NEXT: xori a1, a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 1(sp) +; RV64IZHINXMIN-NEXT: lh a1, 0(sp) +; RV64IZHINXMIN-NEXT: sh a2, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV64IZHINXMIN-NEXT: xori a2, 
a2, 128 +; RV64IZHINXMIN-NEXT: sb a2, 9(sp) +; RV64IZHINXMIN-NEXT: lh a2, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %b_ = fadd half 0.0, %b %c_ = fadd half 0.0, %c %negb = fsub half -0.0, %b_ @@ -1941,12 +1994,12 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fneg.h fa0, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_3: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: lui a1, 1048568 -; CHECK-ZHINX-NEXT: xor a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_3: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: lui a1, 1048568 +; CHECKIZHINX-NEXT: xor a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_3: ; RV32I: # %bb.0: @@ -2018,58 +2071,48 @@ define half @fnmadd_s_3(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_3: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_3: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; 
CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; -; CHECKZHINXMIN-LABEL: fnmadd_s_3: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: lui a1, 1048568 -; CHECKZHINXMIN-NEXT: xor a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_3: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: lui a1, 1048568 -; CHECK-ZHINXMIN-NEXT: xor a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s_3: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_3: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: 
fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: fnmadd_s_3: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: lui a1, 1048568 +; CHECKIZHINXMIN-NEXT: xor a0, a0, a1 +; CHECKIZHINXMIN-NEXT: ret %1 = call half @llvm.fma.f16(half %a, half %b, half %c) %neg = fneg half %1 ret half %neg @@ -2092,12 +2135,12 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_nsz: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: lui a1, 1048568 -; CHECK-ZHINX-NEXT: xor a0, a0, a1 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_nsz: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: lui a1, 1048568 +; CHECKIZHINX-NEXT: xor a0, a0, a1 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_nsz: ; RV32I: # %bb.0: @@ -2169,58 +2212,48 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_nsz: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; 
CHECK-RV32-FSGNJ-NEXT: flh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_nsz: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; -; CHECKZHINXMIN-LABEL: fnmadd_nsz: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: lui a1, 1048568 -; CHECKZHINXMIN-NEXT: xor a0, a0, a1 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_nsz: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: lui a1, 1048568 -; CHECK-ZHINXMIN-NEXT: xor a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_nsz: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa0, 12(sp) +; RV32IZFHMIN-NEXT: addi sp, sp, 
16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_nsz: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa0, 8(sp) +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: fnmadd_nsz: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: lui a1, 1048568 +; CHECKIZHINXMIN-NEXT: xor a0, a0, a1 +; CHECKIZHINXMIN-NEXT: ret %1 = call nsz half @llvm.fma.f16(half %a, half %b, half %c) %neg = fneg nsz half %1 ret half %neg @@ -2234,11 +2267,11 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmsub.h fa0, fa5, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s: ; RV32I: # %bb.0: @@ -2326,59 +2359,83 @@ define half @fnmsub_s(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmsub_s: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: 
fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmsub_s: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa5, fa3, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmsub_s: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 
+; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmsub_s: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa5, fa3, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmsub_s: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 13(sp) +; RV32IZHINXMIN-NEXT: lh a0, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmsub_s: +; 
RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 9(sp) +; RV64IZHINXMIN-NEXT: lh a0, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a %nega = fsub half -0.0, %a_ %1 = call half @llvm.fma.f16(half %nega, half %b, half %c) @@ -2393,11 +2450,11 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmsub.h fa0, fa5, fa0, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s_2: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a1, a0, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s_2: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a1, a0, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s_2: ; RV32I: # %bb.0: @@ -2487,59 +2544,83 @@ define half @fnmsub_s_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmsub_s_2: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV32-FSGNJ-NEXT: 
fcvt.s.h fa3, fa0 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmsub_s_2: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa1 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa2 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa0 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmadd.s fa5, fa3, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s_2: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fneg.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmsub_s_2: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh 
fa5, 12(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmsub_s_2: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa1 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa5, 8(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa2 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmadd.s fa5, fa3, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmsub_s_2: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: sh a1, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a1, 13(sp) +; RV32IZHINXMIN-NEXT: xori a1, a1, 128 +; RV32IZHINXMIN-NEXT: sb a1, 13(sp) +; RV32IZHINXMIN-NEXT: lh a1, 12(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmsub_s_2: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: sh a1, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a1, 9(sp) +; RV64IZHINXMIN-NEXT: xori a1, 
a1, 128 +; RV64IZHINXMIN-NEXT: sb a1, 9(sp) +; RV64IZHINXMIN-NEXT: lh a1, 8(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a2, a2 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fmadd.s a0, a0, a1, a2 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %b_ = fadd half 0.0, %b %negb = fsub half -0.0, %b_ %1 = call half @llvm.fma.f16(half %a, half %negb, half %c) @@ -2552,10 +2633,10 @@ define half @fmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmadd.h fa0, fa0, fa1, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmadd_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmadd_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmadd_s_contract: ; RV32I: # %bb.0: @@ -2645,28 +2726,17 @@ define half @fmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmadd_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmadd_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmadd_s_contract: +; CHECKIZHINXMIN: # 
%bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %1 = fmul contract half %a, %b %2 = fadd contract half %1, %c ret half %2 @@ -2680,11 +2750,11 @@ define half @fmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fmsub.h fa0, fa0, fa1, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fmsub_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fmsub_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fmsub_s_contract: ; RV32I: # %bb.0: @@ -2790,34 +2860,20 @@ define half @fmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fmsub_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECKZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fmsub_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, 
a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fmsub_s_contract: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a2 +; CHECKIZHINXMIN-NEXT: fadd.s a2, a2, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a2, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %c_ = fadd half 0.0, %c ; avoid negation using xor %1 = fmul contract half %a, %b %2 = fsub contract half %1, %c_ @@ -2834,13 +2890,13 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmadd.h fa0, fa4, fa3, fa5 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmadd_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fadd.h a2, a2, zero -; CHECK-ZHINX-NEXT: fnmadd.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmadd_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fadd.h a2, a2, zero +; CHECKIZHINX-NEXT: fnmadd.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmadd_s_contract: ; RV32I: # %bb.0: @@ -2964,86 +3020,119 @@ define half @fnmadd_s_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fnmadd_s_contract: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV32-FSGNJ-NEXT: fmv.w.x fa4, zero -; 
CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa3, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa3, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV32-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa3, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa3 -; CHECK-RV32-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fnmadd_s_contract: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa0 -; CHECK-RV64-FSGNJ-NEXT: fmv.w.x fa4, zero -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa1 -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa3, fa3, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa3, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa3, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa3 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa2 -; CHECK-RV64-FSGNJ-NEXT: xori a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa3, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fadd.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; 
CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa3 -; CHECK-RV64-FSGNJ-NEXT: fsub.s fa5, fa4, fa5 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmadd_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a2, a2 -; CHECK-ZHINXMIN-NEXT: fadd.s a2, a2, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a2, a2 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fneg.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; RV32IZFHMIN-LABEL: fnmadd_s_contract: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV32IZFHMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa3, fa3 +; RV32IZFHMIN-NEXT: fcvt.s.h fa3, fa3 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fsh fa5, 12(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IZFHMIN-NEXT: xori a0, a0, 128 +; RV32IZFHMIN-NEXT: sb a0, 13(sp) +; RV32IZFHMIN-NEXT: flh fa3, 12(sp) +; RV32IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, 
fa5 +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa3 +; RV32IZFHMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fnmadd_s_contract: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFHMIN-NEXT: fmv.w.x fa4, zero +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa1 +; RV64IZFHMIN-NEXT: fadd.s fa3, fa3, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa3, fa3 +; RV64IZFHMIN-NEXT: fcvt.s.h fa3, fa3 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmul.s fa5, fa5, fa3 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IZFHMIN-NEXT: xori a0, a0, 128 +; RV64IZFHMIN-NEXT: sb a0, 9(sp) +; RV64IZFHMIN-NEXT: flh fa3, 8(sp) +; RV64IZFHMIN-NEXT: fadd.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa3 +; RV64IZFHMIN-NEXT: fsub.s fa5, fa4, fa5 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fnmadd_s_contract: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a2 +; RV32IZHINXMIN-NEXT: xori a0, a0, 128 +; RV32IZHINXMIN-NEXT: sb a0, 13(sp) +; RV32IZHINXMIN-NEXT: lh a0, 12(sp) +; 
RV32IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV32IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fsub.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fnmadd_s_contract: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fadd.s a0, a0, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a2 +; RV64IZHINXMIN-NEXT: xori a0, a0, 128 +; RV64IZHINXMIN-NEXT: sb a0, 9(sp) +; RV64IZHINXMIN-NEXT: lh a0, 8(sp) +; RV64IZHINXMIN-NEXT: fadd.s a1, a1, zero +; RV64IZHINXMIN-NEXT: fcvt.h.s a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fsub.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a ; avoid negation using xor %b_ = fadd half 0.0, %b ; avoid negation using xor %c_ = fadd half 0.0, %c ; avoid negation using xor @@ -3062,12 +3151,12 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFH-NEXT: fnmsub.h fa0, fa4, fa5, fa2 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fnmsub_s_contract: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fadd.h a0, a0, zero -; CHECK-ZHINX-NEXT: fadd.h a1, a1, zero -; CHECK-ZHINX-NEXT: fnmsub.h a0, a0, a1, a2 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fnmsub_s_contract: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fadd.h a0, a0, zero +; 
CHECKIZHINX-NEXT: fadd.h a1, a1, zero +; CHECKIZHINX-NEXT: fnmsub.h a0, a0, a1, a2 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fnmsub_s_contract: ; RV32I: # %bb.0: @@ -3190,40 +3279,23 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind { ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; -; CHECKZHINXMIN-LABEL: fnmsub_s_contract: -; CHECKZHINXMIN: # %bb.0: -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECKZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECKZHINXMIN-NEXT: fsub.s a0, a1, a0 -; CHECKZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECKZHINXMIN-NEXT: ret -; CHECK-ZHINXMIN-LABEL: fnmsub_s_contract: -; CHECK-ZHINXMIN: # %bb.0: -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fadd.s a0, a0, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fadd.s a1, a1, zero -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fmul.s a0, a0, a1 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.s.h a1, a2 -; CHECK-ZHINXMIN-NEXT: fsub.s a0, a1, a0 -; CHECK-ZHINXMIN-NEXT: fcvt.h.s a0, a0 -; CHECK-ZHINXMIN-NEXT: ret +; CHECKIZHINXMIN-LABEL: fnmsub_s_contract: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fadd.s a1, a1, zero +; CHECKIZHINXMIN-NEXT: fcvt.h.s a1, a1 +; CHECKIZHINXMIN-NEXT: 
fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fmul.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a2 +; CHECKIZHINXMIN-NEXT: fsub.s a0, a1, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret %a_ = fadd half 0.0, %a ; avoid negation using xor %b_ = fadd half 0.0, %b ; avoid negation using xor %1 = fmul contract half %a_, %b_ @@ -3237,10 +3309,10 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; CHECKIZFH-NEXT: fsgnjx.h fa0, fa1, fa0 ; CHECKIZFH-NEXT: ret ; -; CHECK-ZHINX-LABEL: fsgnjx_f16: -; CHECK-ZHINX: # %bb.0: -; CHECK-ZHINX-NEXT: fsgnjx.h a0, a1, a0 -; CHECK-ZHINX-NEXT: ret +; CHECKIZHINX-LABEL: fsgnjx_f16: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: fsgnjx.h a0, a1, a0 +; CHECKIZHINX-NEXT: ret ; ; RV32I-LABEL: fsgnjx_f16: ; RV32I: # %bb.0: @@ -3294,47 +3366,89 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; CHECK-RV32-FSGNJ-LABEL: fsgnjx_f16: -; CHECK-RV32-FSGNJ: # %bb.0: -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16 -; CHECK-RV32-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0) -; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 12(sp) -; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp) -; CHECK-RV32-FSGNJ-NEXT: lbu a1, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV32-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp) -; CHECK-RV32-FSGNJ-NEXT: flh fa5, 8(sp) -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa4 -; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV32-FSGNJ-NEXT: ret -; -; CHECK-RV64-FSGNJ-LABEL: fsgnjx_f16: -; CHECK-RV64-FSGNJ: # %bb.0: -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16 -; 
CHECK-RV64-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0) -; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 8(sp) -; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp) -; CHECK-RV64-FSGNJ-NEXT: lbu a1, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 128 -; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 127 -; CHECK-RV64-FSGNJ-NEXT: or a0, a1, a0 -; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp) -; CHECK-RV64-FSGNJ-NEXT: flh fa5, 0(sp) -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1 -; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5 -; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa4 -; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5 -; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16 -; CHECK-RV64-FSGNJ-NEXT: ret +; RV32IZFHMIN-LABEL: fsgnjx_f16: +; RV32IZFHMIN: # %bb.0: +; RV32IZFHMIN-NEXT: addi sp, sp, -16 +; RV32IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) +; RV32IZFHMIN-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV32IZFHMIN-NEXT: fsh fa0, 12(sp) +; RV32IZFHMIN-NEXT: fsh fa5, 8(sp) +; RV32IZFHMIN-NEXT: lbu a0, 13(sp) +; RV32IZFHMIN-NEXT: lbu a1, 9(sp) +; RV32IZFHMIN-NEXT: andi a0, a0, 128 +; RV32IZFHMIN-NEXT: andi a1, a1, 127 +; RV32IZFHMIN-NEXT: or a0, a1, a0 +; RV32IZFHMIN-NEXT: sb a0, 9(sp) +; RV32IZFHMIN-NEXT: flh fa5, 8(sp) +; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV32IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 +; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IZFHMIN-NEXT: addi sp, sp, 16 +; RV32IZFHMIN-NEXT: ret +; +; RV64IZFHMIN-LABEL: fsgnjx_f16: +; RV64IZFHMIN: # %bb.0: +; RV64IZFHMIN-NEXT: addi sp, sp, -16 +; RV64IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) +; RV64IZFHMIN-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV64IZFHMIN-NEXT: fsh fa0, 8(sp) +; RV64IZFHMIN-NEXT: fsh fa5, 0(sp) +; RV64IZFHMIN-NEXT: lbu a0, 9(sp) +; RV64IZFHMIN-NEXT: lbu a1, 1(sp) +; RV64IZFHMIN-NEXT: andi a0, a0, 128 +; RV64IZFHMIN-NEXT: andi a1, a1, 127 +; RV64IZFHMIN-NEXT: or a0, a1, a0 +; RV64IZFHMIN-NEXT: sb a0, 1(sp) +; RV64IZFHMIN-NEXT: flh fa5, 0(sp) +; RV64IZFHMIN-NEXT: 
fcvt.s.h fa4, fa1 +; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; RV64IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 +; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IZFHMIN-NEXT: addi sp, sp, 16 +; RV64IZFHMIN-NEXT: ret +; +; RV32IZHINXMIN-LABEL: fsgnjx_f16: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: addi sp, sp, -16 +; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) +; RV32IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) +; RV32IZHINXMIN-NEXT: sh a0, 12(sp) +; RV32IZHINXMIN-NEXT: sh a2, 8(sp) +; RV32IZHINXMIN-NEXT: lbu a0, 13(sp) +; RV32IZHINXMIN-NEXT: lbu a2, 9(sp) +; RV32IZHINXMIN-NEXT: andi a0, a0, 128 +; RV32IZHINXMIN-NEXT: andi a2, a2, 127 +; RV32IZHINXMIN-NEXT: or a0, a2, a0 +; RV32IZHINXMIN-NEXT: sb a0, 9(sp) +; RV32IZHINXMIN-NEXT: lh a0, 8(sp) +; RV32IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV32IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: addi sp, sp, 16 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: fsgnjx_f16: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: addi sp, sp, -16 +; RV64IZHINXMIN-NEXT: lui a2, %hi(.LCPI23_0) +; RV64IZHINXMIN-NEXT: lh a2, %lo(.LCPI23_0)(a2) +; RV64IZHINXMIN-NEXT: sh a0, 8(sp) +; RV64IZHINXMIN-NEXT: sh a2, 0(sp) +; RV64IZHINXMIN-NEXT: lbu a0, 9(sp) +; RV64IZHINXMIN-NEXT: lbu a2, 1(sp) +; RV64IZHINXMIN-NEXT: andi a0, a0, 128 +; RV64IZHINXMIN-NEXT: andi a2, a2, 127 +; RV64IZHINXMIN-NEXT: or a0, a2, a0 +; RV64IZHINXMIN-NEXT: sb a0, 1(sp) +; RV64IZHINXMIN-NEXT: lh a0, 0(sp) +; RV64IZHINXMIN-NEXT: fcvt.s.h a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 +; RV64IZHINXMIN-NEXT: fmul.s a0, a0, a1 +; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: addi sp, sp, 16 +; RV64IZHINXMIN-NEXT: ret %z = call half @llvm.copysign.f16(half 1.0, half %x) %mul = fmul half %z, %y ret half %mul From e0f2368cdeb7312973a92fb2d22199d1de540db8 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 30 Aug 2024 19:23:45 -0700 Subject: [PATCH 28/31] [clang-format] Correctly annotate 
braces in ObjC square brackets (#106654) See https://github.com/llvm/llvm-project/pull/88238#issuecomment-2316954781. --- clang/lib/Format/UnwrappedLineParser.cpp | 1 + clang/unittests/Format/TokenAnnotatorTest.cpp | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 5b518bf6c859e8..246b29d308bfaf 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2682,6 +2682,7 @@ void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) { break; } case tok::at: + case tok::colon: nextToken(); if (FormatTok->is(tok::l_brace)) { nextToken(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 5aa5d93c1cb067..497b911f4efbba 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3286,6 +3286,15 @@ TEST_F(TokenAnnotatorTest, BlockLBrace) { EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_BlockLBrace); EXPECT_BRACE_KIND(Tokens[5], BK_Block); + + Tokens = annotate("[foo bar:{{0, 1}} baz:baz];", + getLLVMStyle(FormatStyle::LK_ObjC)); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_Unknown); // Not TT_BlockLBrace. + EXPECT_BRACE_KIND(Tokens[4], BK_Unknown); // Not BK_Block. + EXPECT_BRACE_KIND(Tokens[5], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_Unknown); // Not BK_Block. + EXPECT_BRACE_KIND(Tokens[10], BK_Unknown); // Not BK_Block. } TEST_F(TokenAnnotatorTest, SwitchExpression) { From d0dfcea608169e02293cb23905518481f3e8fedf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 19:32:21 -0700 Subject: [PATCH 29/31] [RISCV][LoongArch] Don't store Dwarf register in MCRegister. 
--- .../lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp | 2 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index e40981f5b5cd57..595ce9fc815bf0 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -65,7 +65,7 @@ static MCAsmInfo *createLoongArchMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI = new LoongArchMCAsmInfo(TT); // Initial state of the frame pointer is sp(r3). - MCRegister SP = MRI.getDwarfRegNum(LoongArch::R3, true); + unsigned SP = MRI.getDwarfRegNum(LoongArch::R3, true); MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index e051312d61a7bc..53329af093de0f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -71,7 +71,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const MCTargetOptions &Options) { MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); - MCRegister SP = MRI.getDwarfRegNum(RISCV::X2, true); + unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); From 4f9ea258c4f36e01e3a71a3603c588ee52b54a6f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2024 19:32:42 -0700 Subject: [PATCH 30/31] [AsmPrinter] Don't store Dwarf register in Register. 
--- llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index c1e8355353cfdc..0a1ff189bedbc4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -347,8 +347,9 @@ void DwarfCompileUnit::addLocationAttribute( Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym)); // Base register Register BaseReg = Asm->getObjFileLowering().getStaticBase(); - BaseReg = Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false); - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + BaseReg); + unsigned DwarfBaseReg = + Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false); + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DwarfBaseReg); // Offset from base register addSInt(*Loc, dwarf::DW_FORM_sdata, 0); // Operation From ef50970204384643acca42ba4c7ca8f14865a0c2 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 30 Aug 2024 19:46:33 -0700 Subject: [PATCH 31/31] workflows/release-binaries: Remove .git/config file from artifacts (#106310) The .git/config file contains an auth token that can be leaked if the .git directory is included in a workflow artifact. --- .github/workflows/release-binaries-save-stage/action.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml index e2f3eeadd15bea..f08088c7bc56f1 100644 --- a/.github/workflows/release-binaries-save-stage/action.yml +++ b/.github/workflows/release-binaries-save-stage/action.yml @@ -10,6 +10,9 @@ inputs: required: true type: 'string' +permissions: + contents: read + runs: using: "composite" steps: @@ -18,6 +21,9 @@ runs: - name: Package Build and Source Directories shell: bash run: | + # Remove .git/config to avoid leaking GITHUB_TOKEN stored there. 
+ # See https://unit42.paloaltonetworks.com/github-repo-artifacts-leak-tokens/ + rm -Rf .git/config # Windows does not support symlinks, so we need to dereference them. tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst mv ../llvm-project.tar.zst .