From 77523f9d5f7eac860a39728f031e3db9a53ee3b6 Mon Sep 17 00:00:00 2001
From: wanglei
Date: Mon, 2 Sep 2024 11:07:30 +0800
Subject: [PATCH 01/33] [LoongArch] Remove unnecessary increment operations

`HighMask` is the value whose bits from `Msb+1` through 63 are set to 1,
while all other bits are 0.
---
 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
index 08e5ccc7bc0be5..6343817a0616d1 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp
@@ -94,7 +94,7 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) {
   uint64_t Msb = 32;
   uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1);
-  for (; Msb < 64; ++Msb, HighMask = (HighMask << 1) + 1) {
+  for (; Msb < 64; ++Msb, HighMask = HighMask << 1) {
     for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) {
       uint64_t LowMask = (1ULL << Lsb) - 1;
       uint64_t Mask = HighMask | LowMask;

From 27e244f51435f8f0933969782a6faddfcbe809a6 Mon Sep 17 00:00:00 2001
From: Jake Egan
Date: Sun, 1 Sep 2024 23:37:43 -0400
Subject: [PATCH 02/33] [clang][AIX] Fix -print-runtime-dir on AIX (#104806)

Currently the option prints a path to a nonexistent directory with the
full triple, `lib/powerpc64-ibm-aix7.2.0.0`. It should only be `lib/aix`.
---
 clang/lib/Driver/ToolChain.cpp            |  4 ++--
 clang/test/Driver/aix-print-runtime-dir.c | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/aix-print-runtime-dir.c

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 76901875c66959..64f23d43e87ee8 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -843,8 +843,8 @@ std::optional<std::string> ToolChain::getRuntimePath() const {
   llvm::sys::path::append(P, "lib");
   if (auto Ret = getTargetSubDirPath(P))
     return Ret;
-  // Darwin does not use per-target runtime directory.
-  if (Triple.isOSDarwin())
+  // Darwin and AIX do not use per-target runtime directory.
+  if (Triple.isOSDarwin() || Triple.isOSAIX())
     return {};
   llvm::sys::path::append(P, Triple.str());
   return std::string(P);
diff --git a/clang/test/Driver/aix-print-runtime-dir.c b/clang/test/Driver/aix-print-runtime-dir.c
new file mode 100644
index 00000000000000..0d68ad6fee005e
--- /dev/null
+++ b/clang/test/Driver/aix-print-runtime-dir.c
@@ -0,0 +1,11 @@
+// Test output of -print-runtime-dir on AIX
+
+// RUN: %clang -print-runtime-dir --target=powerpc-ibm-aix \
+// RUN:   -resource-dir=%S/Inputs/resource_dir \
+// RUN:   | FileCheck --check-prefix=PRINT-RUNTIME-DIR %s
+
+// RUN: %clang -print-runtime-dir --target=powerpc64-ibm-aix \
+// RUN:   -resource-dir=%S/Inputs/resource_dir \
+// RUN:   | FileCheck --check-prefix=PRINT-RUNTIME-DIR %s
+
+// PRINT-RUNTIME-DIR: lib{{/|\\}}aix{{$}}

From c74cc73f2bfc1a82c2c68c2bfe9c4d70299aa060 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 1 Sep 2024 21:02:06 -0700
Subject: [PATCH 03/33] [RISCV] Move VLDSX0Pred from RISCVSchedSiFive7.td to
 RISCVScheduleV.td. NFC (#106671)

This predicate isn't bound to the scheduler model and we may want to
reuse it in the future. We have already moved it in our downstream to
reuse it.
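A scheduler model consumes a predicate like this through a SchedWriteVariant.
The sketch below is illustrative only and is not part of this patch:
`MyWriteVLDS`, `MyVLDSX0Write`, and `MyVLDSGenericWrite` are hypothetical
names standing in for a model's own write resources.

  // Hypothetical model: pick a cheaper write resource when the stride
  // register (rs2) of vlse/vsse is x0, otherwise use the generic
  // per-element cost.
  def MyWriteVLDS : SchedWriteVariant<[
    SchedVar<VLDSX0Pred, [MyVLDSX0Write]>,
    SchedVar<NoSchedPred, [MyVLDSGenericWrite]>
  ]>;

`SchedWriteVariant`, `SchedVar`, and `NoSchedPred` are the common scheduling
classes defined in llvm/include/llvm/Target/TargetSchedule.td.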
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 4 ----
 llvm/lib/Target/RISCV/RISCVScheduleV.td    | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 0b0ac0c368d070..3f2e8dee76fd66 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -479,10 +479,6 @@ foreach mx = SchedMxList in {
 // resource, we do not need to use LMULSEWXXX constructors. However, we do
 // use the SEW from the name to determine the number of Cycles.
 
-// This predicate is true when the rs2 operand of vlse or vsse is x0, false
-// otherwise.
-def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
-
 foreach mx = SchedMxList in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault.c;
   defvar Cycles = SiFive7GetCyclesOnePerElement.c;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 95fde1e53c805f..ee041ea142b94c 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -9,6 +9,10 @@
 //===----------------------------------------------------------------------===//
 /// Define scheduler resources associated with def operands.
 
+// This predicate is true when the rs2 operand of vlse or vsse is x0, false
+// otherwise.
+def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
+
 defvar SchedMxList = ["MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"];
 // Used for widening and narrowing instructions as it doesn't contain M8.
 defvar SchedMxListW = !listremove(SchedMxList, ["M8"]);

From 647f892a7281e99c4209cee07097f6a052ed474f Mon Sep 17 00:00:00 2001
From: Brad Smith
Date: Mon, 2 Sep 2024 00:29:24 -0400
Subject: [PATCH 04/33] [llvm][Support] Simplify
 HAVE_PTHREAD_GETNAME/SETNAME_NP handling. NFCI (#106486)

---
 llvm/lib/Support/Unix/Threading.inc | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index 839c00c5ebbf96..6ba6395e6b8d63 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -141,12 +141,8 @@ static constexpr uint32_t get_max_thread_name_length_impl() {
   return PTHREAD_MAX_NAMELEN_NP;
 #elif defined(__APPLE__)
   return 64;
-#elif defined(__linux__)
-#if HAVE_PTHREAD_SETNAME_NP
+#elif defined(__linux__) && HAVE_PTHREAD_SETNAME_NP
   return 16;
-#else
-  return 0;
-#endif
 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   return 16;
 #elif defined(__OpenBSD__)
@@ -174,12 +170,8 @@ void llvm::set_thread_name(const Twine &Name) {
   if (get_max_thread_name_length() > 0)
     NameStr = NameStr.take_back(get_max_thread_name_length() - 1);
   (void)NameStr;
-#if defined(__linux__)
-#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
-#if HAVE_PTHREAD_SETNAME_NP
+#if defined(__linux__) && HAVE_PTHREAD_SETNAME_NP
   ::pthread_setname_np(::pthread_self(), NameStr.data());
-#endif
-#endif
 #elif defined(__FreeBSD__) || defined(__OpenBSD__)
   ::pthread_set_name_np(::pthread_self(), NameStr.data());
 #elif defined(__NetBSD__)
@@ -241,14 +233,12 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
   ::pthread_get_name_np(::pthread_self(), buf, len);
 
   Name.append(buf, buf + strlen(buf));
-#elif defined(__linux__)
-#if HAVE_PTHREAD_GETNAME_NP
+#elif defined(__linux__) && HAVE_PTHREAD_GETNAME_NP
   constexpr uint32_t len = get_max_thread_name_length_impl();
   char Buffer[len] = {'\0'}; // FIXME: working around MSan false positive.
if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len)) Name.append(Buffer, Buffer + strlen(Buffer)); #endif -#endif } SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) { From 358165ded3c45115ce587d56ef792a9e7c0214ea Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Mon, 2 Sep 2024 13:42:42 +0800 Subject: [PATCH 05/33] [Clang][Concepts] Correct the CurContext for friend declarations (#106890) `FindInstantiatedDecl()` relies on the `CurContext` to find the corresponding class template instantiation for a class template declaration. Previously, we pushed the semantic declaration context for constraint comparison, which is incorrect for constraints on friend declarations. In issue #78101, the semantic context of the friend is the TU, so we missed the implicit template specialization `Template` when looking for the instantiation of the primary template `Template` at the time of checking the member instantiation; instead, we mistakenly picked up the explicit specialization `Template`, hence the error. As a bonus, this also fixes a crash when diagnosing constraints. The DeclarationName is not necessarily an identifier, so it's incorrect to call `getName()` on e.g. overloaded operators. Since the DiagnosticBuilder has correctly handled Decl printing, we don't need to find the printable name ourselves. Fixes https://github.com/llvm/llvm-project/issues/78101 --- clang/docs/ReleaseNotes.rst | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaConcept.cpp | 9 +++++++- clang/lib/Sema/SemaTemplateInstantiate.cpp | 3 +-- .../temp.constr/temp.constr.normal/p1.cpp | 2 +- clang/test/SemaTemplate/concepts-friends.cpp | 23 +++++++++++++++++++ 6 files changed, 35 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 98fb0174d4a37e..fc940db4813948 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -338,6 +338,7 @@ Bug Fixes to C++ Support - Fixed an assertion failure when converting vectors to int/float with invalid expressions. (#GH105486) - Template parameter names are considered in the name lookup of out-of-line class template specialization right before its declaration context. (#GH64082) +- Fixed a constraint comparison bug for friend declarations. (#GH78101) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 66f8890da75e5d..dcb49d8a67604a 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5603,7 +5603,7 @@ def note_checking_constraints_for_function_here : Note< def note_constraint_substitution_here : Note< "while substituting template arguments into constraint expression here">; def note_constraint_normalization_here : Note< - "while calculating associated constraint of template '%0' here">; + "while calculating associated constraint of template %0 here">; def note_parameter_mapping_substitution_here : Note< "while substituting into concept arguments here; substitution failures not " "allowed in concept arguments">; diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 86d6f308a51cc2..6a1b32598bb4a6 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -1012,7 +1012,14 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( // possible that e.g. constraints involving C> and C are // perceived identical. 
std::optional ContextScope; - if (auto *RD = dyn_cast(DeclInfo.getDeclContext())) { + const DeclContext *DC = [&] { + if (!DeclInfo.getDecl()) + return DeclInfo.getDeclContext(); + return DeclInfo.getDecl()->getFriendObjectKind() + ? DeclInfo.getLexicalDeclContext() + : DeclInfo.getDeclContext(); + }(); + if (auto *RD = dyn_cast(DC)) { ThisScope.emplace(S, const_cast(RD), Qualifiers()); ContextScope.emplace(S, const_cast(cast(RD)), /*NewThisContext=*/false); diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 776297479e141e..c42cc250bb904a 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1226,8 +1226,7 @@ void Sema::PrintInstantiationStack() { case CodeSynthesisContext::ConstraintNormalization: Diags.Report(Active->PointOfInstantiation, diag::note_constraint_normalization_here) - << cast(Active->Entity)->getName() - << Active->InstantiationRange; + << cast(Active->Entity) << Active->InstantiationRange; break; case CodeSynthesisContext::ParameterMappingSubstitution: Diags.Report(Active->PointOfInstantiation, diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp index d80710937cdfa1..3992835c444027 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp @@ -15,7 +15,7 @@ template requires Bar2 struct S2 { }; // expected-note@-1{{template is declared here}} template requires Bar2 && true struct S2 { }; // expected-error@-1{{class template partial specialization is not more specialized than the primary template}} -// expected-note@-2{{while calculating associated constraint of template 'S2' here}} +// expected-note@-2{{while calculating associated constraint of template 'S2' here}} namespace type_pack { template diff --git a/clang/test/SemaTemplate/concepts-friends.cpp b/clang/test/SemaTemplate/concepts-friends.cpp index 14b37d78d951dc..d05be423a8cfcd 100644 --- a/clang/test/SemaTemplate/concepts-friends.cpp +++ b/clang/test/SemaTemplate/concepts-friends.cpp @@ -525,3 +525,26 @@ struct S { }; } + +namespace GH78101 { + +template +concept True = true; + +template struct Template { + static constexpr int i = I; + + friend constexpr auto operator+(True auto f) { return i; } +}; + +template struct Template { + static constexpr int i = I; + + friend constexpr auto operator+(True auto f) { return i; } +}; + +Template f{}; + +static_assert(+Template{} == 5); + +} // namespace GH78101 From da13754103b8880811f4c164d858c6dd3c393927 Mon Sep 17 00:00:00 2001 From: Akshat Oke <76596238+Akshat-Oke@users.noreply.github.com> Date: Mon, 2 Sep 2024 11:41:56 +0530 Subject: [PATCH 06/33] AMDGPU/NewPM Port SILoadStoreOptimizer to NPM (#106362) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 6 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 64 ++++++++++++++----- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h | 30 +++++++++ .../CodeGen/AMDGPU/load-store-opt-dlc.mir | 1 + .../CodeGen/AMDGPU/load-store-opt-scc.mir | 1 + 7 files changed, 86 insertions(+), 22 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 717e5f511ef2f9..6cc6863841a4c5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -40,7 +40,7 @@ FunctionPass 
*createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesLegacyPass(); FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass(); FunctionPass *createSIShrinkInstructionsPass(); -FunctionPass *createSILoadStoreOptimizerPass(); +FunctionPass *createSILoadStoreOptimizerLegacyPass(); FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); @@ -190,8 +190,8 @@ extern char &AMDGPUMarkLastScratchLoadID; void initializeSILowerSGPRSpillsPass(PassRegistry &); extern char &SILowerSGPRSpillsID; -void initializeSILoadStoreOptimizerPass(PassRegistry &); -extern char &SILoadStoreOptimizerID; +void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &); +extern char &SILoadStoreOptimizerLegacyID; void initializeSIWholeQuadModePass(PassRegistry &); extern char &SIWholeQuadModeID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 9976a8199d7047..d01e3f0b97ddd1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -99,4 +99,5 @@ MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) +MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) #undef MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7df39f35478077..757a575841e61a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -36,6 +36,7 @@ #include "R600TargetMachine.h" #include "SIFixSGPRCopies.h" #include "SIFoldOperands.h" +#include "SILoadStoreOptimizer.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" @@ -417,7 +418,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIShrinkInstructionsPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); - initializeSILoadStoreOptimizerPass(*PR); + initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); @@ -1271,7 +1272,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SIFoldOperandsLegacyID); if (EnableDPPCombine) addPass(&GCNDPPCombineLegacyID); - addPass(&SILoadStoreOptimizerID); + addPass(&SILoadStoreOptimizerLegacyID); if (isPassEnabled(EnableSDWAPeephole)) { addPass(&SIPeepholeSDWAID); addPass(&EarlyMachineLICMID); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ddce80b2ae129e..1e993aa4ad52a0 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -57,6 +57,7 @@ // //===----------------------------------------------------------------------===// +#include "SILoadStoreOptimizer.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -104,7 +105,7 @@ struct AddressRegs { // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 
const unsigned MaxAddressRegs = 12 + 1 + 1; -class SILoadStoreOptimizer : public MachineFunctionPass { +class SILoadStoreOptimizer { struct CombineInfo { MachineBasicBlock::iterator I; unsigned EltSize; @@ -295,17 +296,21 @@ class SILoadStoreOptimizer : public MachineFunctionPass { static InstClassEnum getCommonInstClass(const CombineInfo &CI, const CombineInfo &Paired); -public: - static char ID; - - SILoadStoreOptimizer() : MachineFunctionPass(ID) { - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - } - bool optimizeInstsWithSameBaseAddr(std::list &MergeList, bool &OptimizeListAgain); bool optimizeBlock(std::list > &MergeableInsts); +public: + SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {} + bool run(MachineFunction &MF); +}; + +class SILoadStoreOptimizerLegacy : public MachineFunctionPass { +public: + static char ID; + + SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI Load Store Optimizer"; } @@ -882,18 +887,18 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } // end anonymous namespace. -INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", - false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE, + "SI Load Store Optimizer", false, false) -char SILoadStoreOptimizer::ID = 0; +char SILoadStoreOptimizerLegacy::ID = 0; -char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; +char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID; -FunctionPass *llvm::createSILoadStoreOptimizerPass() { - return new SILoadStoreOptimizer(); +FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() { + return new SILoadStoreOptimizerLegacy(); } static void addDefsUsesToList(const MachineInstr &MI, @@ -2522,10 +2527,15 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( return Modified; } -bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { +bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + return SILoadStoreOptimizer( + &getAnalysis().getAAResults()) + .run(MF); +} +bool SILoadStoreOptimizer::run(MachineFunction &MF) { STM = &MF.getSubtarget(); if (!STM->loadStoreOptEnabled()) return false; @@ -2534,7 +2544,6 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - AA = &getAnalysis().getAAResults(); LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); @@ -2571,3 +2580,24 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { return Modified; } + +PreservedAnalyses +SILoadStoreOptimizerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFPropsModifier _(*this, MF); + + if (MF.getFunction().hasOptNone()) + return PreservedAnalyses::all(); + + auto &FAM = MFAM.getResult(MF) + .getManager(); + AAResults &AA = FAM.getResult(MF.getFunction()); + + bool Changed = SILoadStoreOptimizer(&AA).run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h 
b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
new file mode 100644
index 00000000000000..6c20401d6bf5c1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
@@ -0,0 +1,30 @@
+//===--- SILoadStoreOptimizer.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class SILoadStoreOptimizerPass
+    : public PassInfoMixin<SILoadStoreOptimizerPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+
+  MachineFunctionProperties getRequiredProperties() {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
index f4cdedf9cf6eb8..9295bd59621039 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
 
 # The purpose of this test is to make sure we are combining relevant memory
 # operations correctly with/without DLC bit.
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
index c4e131b90deb48..c0cc3e9f4edd7f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
 
 # The purpose of this test is to make sure we are combining relevant memory
 # operations correctly with/without SCC bit.

From dd5d73007240712957f2b633f795d9965afaadd6 Mon Sep 17 00:00:00 2001
From: Pavel Labath
Date: Mon, 2 Sep 2024 08:34:14 +0200
Subject: [PATCH 07/33] [lldb] Better matching of types in anonymous
 namespaces (#102111)

This patch extends TypeQuery matching to support anonymous namespaces.
A new flag is added to control the behavior. In the "strict" mode, the
query must match the type exactly -- all anonymous namespaces included.
The dynamic type resolver in the Itanium ABI (the motivating use case
for this) uses this flag, as it queries using the name from the
demangler, which includes anonymous namespaces.

This ensures we don't confuse a type with a same-named type in an
anonymous namespace. However, this does *not* ensure we don't confuse
two types in anonymous namespaces (in different CUs). To resolve this,
we would need to use a completely different lookup algorithm, which
probably also requires a DWARF extension.

In the "lax" mode (the default), the anonymous namespaces in the query
are optional, and this allows one to search for the type using the
usual language rules (`::A` matches `::(anonymous namespace)::A`).
This patch also changes the type context computation algorithm in DWARFDIE, so that it includes anonymous namespace information. This causes a slight change in behavior: the algorithm previously stopped computing the context after encountering an anonymous namespace, which caused the outer namespaces to be ignored. This meant that a type like `NS::(anonymous namespace)::A` would be (incorrectly) recognized as `::A`). This can cause code depending on the old behavior to misbehave. The fix is to specify all the enclosing namespaces in the query, or use a non-exact match. --- lldb/include/lldb/Symbol/Type.h | 24 +++++++- .../ItaniumABI/ItaniumABILanguageRuntime.cpp | 1 + .../Plugins/SymbolFile/DWARF/DWARFDIE.cpp | 8 +-- lldb/source/Symbol/Type.cpp | 36 ++++++++++-- lldb/test/API/lang/cpp/dynamic-value/Makefile | 2 +- .../cpp/dynamic-value/TestDynamicValue.py | 15 ++++- lldb/test/API/lang/cpp/dynamic-value/a.h | 25 +++++++++ .../lang/cpp/dynamic-value/anonymous-b.cpp | 13 +++++ .../lang/cpp/dynamic-value/pass-to-base.cpp | 38 +++---------- .../API/lang/cpp/namespace/TestNamespace.py | 6 ++ lldb/unittests/Symbol/TestType.cpp | 56 +++++++++++++++++++ .../SymbolFile/DWARF/DWARFDIETest.cpp | 26 ++++++++- 12 files changed, 202 insertions(+), 48 deletions(-) create mode 100644 lldb/test/API/lang/cpp/dynamic-value/a.h create mode 100644 lldb/test/API/lang/cpp/dynamic-value/anonymous-b.cpp diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index 420307e0dbcf02..03d9f927997476 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -77,10 +77,13 @@ FLAGS_ENUM(TypeQueryOptions){ /// If set, the query will ignore all Module entries in the type context, /// even for exact matches. e_ignore_modules = (1u << 2), + /// If set, all anonymous namespaces in the context must be matched exactly + /// by the pattern. Otherwise, superfluous namespaces are skipped. + e_strict_namespaces = (1u << 3), /// When true, the find types call should stop the query as soon as a single /// matching type is found. When false, the type query should find all /// matching types. - e_find_one = (1u << 3), + e_find_one = (1u << 4), }; LLDB_MARK_AS_BITMASK_ENUM(TypeQueryOptions) @@ -264,7 +267,22 @@ class TypeQuery { bool GetExactMatch() const { return (m_options & e_exact_match) != 0; } bool GetIgnoreModules() const { return (m_options & e_ignore_modules) != 0; } - void SetIgnoreModules() { m_options &= ~e_ignore_modules; } + void SetIgnoreModules(bool b) { + if (b) + m_options |= e_ignore_modules; + else + m_options &= ~e_ignore_modules; + } + + bool GetStrictNamespaces() const { + return (m_options & e_strict_namespaces) != 0; + } + void SetStrictNamespaces(bool b) { + if (b) + m_options |= e_strict_namespaces; + else + m_options &= ~e_strict_namespaces; + } /// The \a m_context can be used in two ways: normal types searching with /// the context containing a stanadard declaration context for a type, or @@ -279,7 +297,7 @@ class TypeQuery { if (b) m_options |= e_find_one; else - m_options &= (e_exact_match | e_find_one); + m_options &= ~e_find_one; } /// Access the internal compiler context array. 
diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp index 7af768aad0bc19..4c547afe30fe81 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp @@ -90,6 +90,7 @@ TypeAndOrName ItaniumABILanguageRuntime::GetTypeInfo( TypeResults results; TypeQuery query(const_lookup_name.GetStringRef(), TypeQueryOptions::e_exact_match | + TypeQueryOptions::e_strict_namespaces | TypeQueryOptions::e_find_one); if (module_sp) { module_sp->FindTypes(query, results); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index fb32e2adeb3fea..0a13c457a307ae 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -440,12 +440,6 @@ static void GetTypeLookupContextImpl(DWARFDIE die, continue; } - // If there is no name, then there is no need to look anything up for this - // DIE. - const char *name = die.GetName(); - if (!name || !name[0]) - return; - // Add this DIE's contribution at the end of the chain. auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) { context.push_back({kind, ConstString(name)}); @@ -471,7 +465,7 @@ static void GetTypeLookupContextImpl(DWARFDIE die, push_ctx(CompilerContextKind::Typedef, die.GetName()); break; case DW_TAG_base_type: - push_ctx(CompilerContextKind::Builtin, name); + push_ctx(CompilerContextKind::Builtin, die.GetName()); break; // If any of the tags below appear in the parent chain, stop the decl // context and return. Prior to these being in here, if a type existed in a diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index eb321407e3734c..f7b44ade0da165 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -134,6 +134,20 @@ bool TypeQuery::ContextMatches( if (ctx == ctx_end) return false; // Pattern too long. + if (ctx->kind == CompilerContextKind::Namespace && ctx->name.IsEmpty()) { + // We're matching an anonymous namespace. These are optional, so we check + // if the pattern expects an anonymous namespace. + if (pat->name.IsEmpty() && (pat->kind & CompilerContextKind::Namespace) == + CompilerContextKind::Namespace) { + // Match, advance both iterators. + ++pat; + } + // Otherwise, only advance the context to skip over the anonymous + // namespace, and try matching again. + ++ctx; + continue; + } + // See if there is a kind mismatch; they should have 1 bit in common. if ((ctx->kind & pat->kind) == CompilerContextKind()) return false; @@ -145,10 +159,16 @@ bool TypeQuery::ContextMatches( ++pat; } - // Skip over any remaining module entries if we were asked to do that. - while (GetIgnoreModules() && ctx != ctx_end && - ctx->kind == CompilerContextKind::Module) - ++ctx; + // Skip over any remaining module and anonymous namespace entries if we were + // asked to do that. + auto should_skip = [this](const CompilerContext &ctx) { + if (ctx.kind == CompilerContextKind::Module) + return GetIgnoreModules(); + if (ctx.kind == CompilerContextKind::Namespace && ctx.name.IsEmpty()) + return !GetStrictNamespaces(); + return false; + }; + ctx = std::find_if_not(ctx, ctx_end, should_skip); // At this point, we have exhausted the pattern and we have a partial match at // least. If that's all we're looking for, we're done. 
@@ -788,7 +808,13 @@ Type::GetTypeScopeAndBasename(llvm::StringRef name) { switch (pos.value()) { case ':': if (prev_is_colon && template_depth == 0) { - result.scope.push_back(name.slice(name_begin, pos.index() - 1)); + llvm::StringRef scope_name = name.slice(name_begin, pos.index() - 1); + // The itanium demangler uses this string to represent anonymous + // namespaces. Convert it to a more language-agnostic form (which is + // also used in DWARF). + if (scope_name == "(anonymous namespace)") + scope_name = ""; + result.scope.push_back(scope_name); name_begin = pos.index() + 1; } break; diff --git a/lldb/test/API/lang/cpp/dynamic-value/Makefile b/lldb/test/API/lang/cpp/dynamic-value/Makefile index 2bba8e757f79b7..ce91dc63f473f5 100644 --- a/lldb/test/API/lang/cpp/dynamic-value/Makefile +++ b/lldb/test/API/lang/cpp/dynamic-value/Makefile @@ -1,3 +1,3 @@ -CXX_SOURCES := pass-to-base.cpp +CXX_SOURCES := pass-to-base.cpp anonymous-b.cpp include Makefile.rules diff --git a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py index 60a2590e1559d3..e016168f047c19 100644 --- a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py +++ b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py @@ -170,7 +170,7 @@ def test_get_dynamic_vals(self): self.assertTrue(reallyA_value) reallyA_loc = int(reallyA_value.GetLocation(), 16) - # Finally continue to doSomething again, and make sure we get the right value for anotherA, + # Continue to doSomething again, and make sure we get the right value for anotherA, # which this time around is just an "A". threads = lldbutil.continue_to_breakpoint(process, do_something_bpt) @@ -184,6 +184,19 @@ def test_get_dynamic_vals(self): self.assertEqual(anotherA_loc, reallyA_loc) self.assertEqual(anotherA_value.GetTypeName().find("B"), -1) + # Finally do the same with a B in an anonymous namespace. 
+ threads = lldbutil.continue_to_breakpoint(process, do_something_bpt) + self.assertEqual(len(threads), 1) + thread = threads[0] + + frame = thread.GetFrameAtIndex(0) + anotherA_value = frame.FindVariable("anotherA", use_dynamic) + self.assertTrue(anotherA_value) + self.assertIn("B", anotherA_value.GetTypeName()) + anon_b_value = anotherA_value.GetChildMemberWithName("m_anon_b_value") + self.assertTrue(anon_b_value) + self.assertEqual(anon_b_value.GetValueAsSigned(), 47) + def examine_value_object_of_this_ptr( self, this_static, this_dynamic, dynamic_location ): diff --git a/lldb/test/API/lang/cpp/dynamic-value/a.h b/lldb/test/API/lang/cpp/dynamic-value/a.h new file mode 100644 index 00000000000000..708cbb79fee5cd --- /dev/null +++ b/lldb/test/API/lang/cpp/dynamic-value/a.h @@ -0,0 +1,25 @@ +#ifndef A_H +#define A_H + +#include +#include + +class A { +public: + A(int value) : m_a_value(value) {} + A(int value, A *client_A) : m_a_value(value), m_client_A(client_A) {} + + virtual ~A() {} + + virtual void doSomething(A &anotherA); + + int Value() { return m_a_value; } + +private: + int m_a_value; + std::auto_ptr m_client_A; +}; + +A *make_anonymous_B(); + +#endif diff --git a/lldb/test/API/lang/cpp/dynamic-value/anonymous-b.cpp b/lldb/test/API/lang/cpp/dynamic-value/anonymous-b.cpp new file mode 100644 index 00000000000000..755afcbf12a988 --- /dev/null +++ b/lldb/test/API/lang/cpp/dynamic-value/anonymous-b.cpp @@ -0,0 +1,13 @@ +#include "a.h" + +namespace { +class B : public A { +public: + B() : A(42) {} + +private: + int m_anon_b_value = 47; +}; +} // namespace + +A *make_anonymous_B() { return new B(); } diff --git a/lldb/test/API/lang/cpp/dynamic-value/pass-to-base.cpp b/lldb/test/API/lang/cpp/dynamic-value/pass-to-base.cpp index 2bccf3303823c1..be763390cc6f90 100644 --- a/lldb/test/API/lang/cpp/dynamic-value/pass-to-base.cpp +++ b/lldb/test/API/lang/cpp/dynamic-value/pass-to-base.cpp @@ -1,5 +1,10 @@ -#include -#include +#include "a.h" + +void A::doSomething(A &anotherA) { + printf("In A %p doing something with %d.\n", this, m_a_value); + int tmp_value = anotherA.Value(); + printf("Also have another A at %p: %d.\n", &anotherA, tmp_value); // Break here in doSomething. +} class Extra { @@ -11,33 +16,6 @@ class Extra int m_extra_two; }; -class A -{ -public: - A(int value) : m_a_value (value) {} - A(int value, A* client_A) : m_a_value (value), m_client_A (client_A) {} - - virtual ~A() {} - - virtual void - doSomething (A &anotherA) - { - printf ("In A %p doing something with %d.\n", this, m_a_value); - int tmp_value = anotherA.Value(); - printf ("Also have another A at %p: %d.\n", &anotherA, tmp_value); // Break here in doSomething. - } - - int - Value() - { - return m_a_value; - } - -private: - int m_a_value; - std::auto_ptr m_client_A; -}; - class B : public Extra, public virtual A { public: @@ -65,5 +43,7 @@ main (int argc, char **argv) A reallyA (500); myB.doSomething (reallyA); // Break here and get real address of reallyA. + myB.doSomething(*make_anonymous_B()); + return 0; } diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespace.py b/lldb/test/API/lang/cpp/namespace/TestNamespace.py index 84891b322180c3..8b013d928f9ca5 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespace.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespace.py @@ -208,6 +208,12 @@ def test_with_run_command(self): patterns=[" = 3"], ) + # Search for a type in an anonymous namespace, both with and without the + # namespace prefix. 
+ self.expect("type lookup -- my_uint_t", substrs=["unsigned int"]) + self.expect("type lookup -- (anonymous namespace)::my_uint_t", + substrs=["unsigned int"]) + # rdar://problem/8660275 # test/namespace: 'expression -- i+j' not working # This has been fixed. diff --git a/lldb/unittests/Symbol/TestType.cpp b/lldb/unittests/Symbol/TestType.cpp index e4b56b9ff02f7c..e3bb2cf6e69e2a 100644 --- a/lldb/unittests/Symbol/TestType.cpp +++ b/lldb/unittests/Symbol/TestType.cpp @@ -16,6 +16,7 @@ using namespace lldb; using namespace lldb_private; +using testing::ElementsAre; using testing::Not; TEST(Type, GetTypeScopeAndBasename) { @@ -59,8 +60,33 @@ MATCHER_P(MatchesIgnoringModules, pattern, "") { TypeQuery query(pattern, TypeQueryOptions::e_ignore_modules); return query.ContextMatches(arg); } +MATCHER_P(MatchesWithStrictNamespaces, pattern, "") { + TypeQuery query(pattern, TypeQueryOptions::e_strict_namespaces); + return query.ContextMatches(arg); +} } // namespace +TEST(Type, TypeQueryFlags) { + TypeQuery q("foo", e_none); + auto get = [](const TypeQuery &q) -> std::vector { + return {q.GetFindOne(), q.GetExactMatch(), q.GetModuleSearch(), + q.GetIgnoreModules(), q.GetStrictNamespaces()}; + }; + EXPECT_THAT(get(q), ElementsAre(false, false, false, false, false)); + + q.SetFindOne(true); + EXPECT_THAT(get(q), ElementsAre(true, false, false, false, false)); + + q.SetIgnoreModules(true); + EXPECT_THAT(get(q), ElementsAre(true, false, false, true, false)); + + q.SetStrictNamespaces(true); + EXPECT_THAT(get(q), ElementsAre(true, false, false, true, true)); + + q.SetIgnoreModules(false); + EXPECT_THAT(get(q), ElementsAre(true, false, false, false, true)); +} + TEST(Type, CompilerContextPattern) { auto make_module = [](llvm::StringRef name) { return CompilerContext(CompilerContextKind::Module, ConstString(name)); @@ -103,6 +129,10 @@ TEST(Type, CompilerContextPattern) { (std::vector{make_module("A"), make_module("B"), make_class("C")}), Matches( std::vector{make_module("A"), make_module("B"), make_any_type("C")})); + EXPECT_THAT((std::vector{make_module("A"), make_module("B"), + make_namespace(""), make_class("C")}), + Matches(std::vector{make_module("A"), make_module("B"), + make_any_type("C")})); EXPECT_THAT( (std::vector{make_module("A"), make_module("B"), make_enum("C2")}), Not(Matches(std::vector{make_module("A"), make_module("B"), @@ -111,4 +141,30 @@ TEST(Type, CompilerContextPattern) { Matches(std::vector{make_class("C")})); EXPECT_THAT((std::vector{make_namespace("NS"), make_class("C")}), Not(Matches(std::vector{make_any_type("C")}))); + + EXPECT_THAT((std::vector{make_namespace(""), make_class("C")}), + Matches(std::vector{make_class("C")})); + EXPECT_THAT((std::vector{make_namespace(""), make_class("C")}), + Not(MatchesWithStrictNamespaces(std::vector{make_class("C")}))); + EXPECT_THAT((std::vector{make_namespace(""), make_class("C")}), + Matches(std::vector{make_namespace(""), make_class("C")})); + EXPECT_THAT((std::vector{make_namespace(""), make_class("C")}), + MatchesWithStrictNamespaces( + std::vector{make_namespace(""), make_class("C")})); + EXPECT_THAT((std::vector{make_class("C")}), + Not(Matches(std::vector{make_namespace(""), make_class("C")}))); + EXPECT_THAT((std::vector{make_class("C")}), + Not(MatchesWithStrictNamespaces( + std::vector{make_namespace(""), make_class("C")}))); + EXPECT_THAT((std::vector{make_namespace(""), make_namespace("NS"), + make_namespace(""), make_class("C")}), + Matches(std::vector{make_namespace("NS"), make_class("C")})); + EXPECT_THAT( + 
(std::vector{make_namespace(""), make_namespace(""), make_namespace("NS"), + make_namespace(""), make_namespace(""), make_class("C")}), + Matches(std::vector{make_namespace("NS"), make_class("C")})); + EXPECT_THAT((std::vector{make_module("A"), make_namespace("NS"), + make_namespace(""), make_class("C")}), + MatchesIgnoringModules( + std::vector{make_namespace("NS"), make_class("C")})); } diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 122b7de7516b6d..1e4c8f3ba07787 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -222,6 +222,9 @@ TEST(DWARFDIETest, GetContext) { Attributes: - Attribute: DW_AT_name Form: DW_FORM_string + - Code: 0x4 + Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes debug_info: - Version: 4 AddrSize: 8 @@ -235,6 +238,11 @@ TEST(DWARFDIETest, GetContext) { - AbbrCode: 0x3 Values: - CStr: STRUCT + - AbbrCode: 0x4 + - AbbrCode: 0x3 + Values: + - CStr: STRUCT + - AbbrCode: 0x0 - AbbrCode: 0x0 - AbbrCode: 0x0 )"; @@ -245,15 +253,17 @@ TEST(DWARFDIETest, GetContext) { DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0); ASSERT_TRUE(unit); - auto make_namespace = [](llvm::StringRef name) { + auto make_namespace = [](const char *name) { return CompilerContext(CompilerContextKind::Namespace, ConstString(name)); }; - auto make_struct = [](llvm::StringRef name) { + auto make_struct = [](const char *name) { return CompilerContext(CompilerContextKind::ClassOrStruct, ConstString(name)); }; DWARFDIE struct_die = unit->DIE().GetFirstChild().GetFirstChild(); ASSERT_TRUE(struct_die); + DWARFDIE anon_struct_die = struct_die.GetSibling().GetFirstChild(); + ASSERT_TRUE(anon_struct_die); EXPECT_THAT( struct_die.GetDeclContext(), testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT"))); @@ -263,6 +273,18 @@ TEST(DWARFDIETest, GetContext) { EXPECT_THAT(struct_die.GetDWARFDeclContext(), DWARFDeclContext({{DW_TAG_structure_type, "STRUCT"}, {DW_TAG_namespace, "NAMESPACE"}})); + EXPECT_THAT(anon_struct_die.GetDeclContext(), + testing::ElementsAre(make_namespace("NAMESPACE"), + make_namespace(nullptr), + make_struct("STRUCT"))); + EXPECT_THAT(anon_struct_die.GetTypeLookupContext(), + testing::ElementsAre(make_namespace("NAMESPACE"), + make_namespace(nullptr), + make_struct("STRUCT"))); + EXPECT_THAT(anon_struct_die.GetDWARFDeclContext(), + DWARFDeclContext({{DW_TAG_structure_type, "STRUCT"}, + {DW_TAG_namespace, nullptr}, + {DW_TAG_namespace, "NAMESPACE"}})); } TEST(DWARFDIETest, GetContextInFunction) { From d2ce9dc85e5d94e19a69d4a72e7b9197447d480a Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 2 Sep 2024 02:38:23 -0400 Subject: [PATCH 08/33] Add support for retrieving the thread ID on DragonFly BSD (#106938) --- llvm/lib/Support/Unix/Threading.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc index 6ba6395e6b8d63..1812d990f21ac1 100644 --- a/llvm/lib/Support/Unix/Threading.inc +++ b/llvm/lib/Support/Unix/Threading.inc @@ -29,7 +29,7 @@ #include -#if defined(__FreeBSD__) || defined(__OpenBSD__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) #include // For pthread_getthreadid_np() / pthread_set_name_np() #endif @@ -121,7 +121,7 @@ uint64_t llvm::get_threadid() { return InitSelf; }(); return Self; -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined(__DragonFly__) return 
uint64_t(pthread_getthreadid_np()); #elif defined(__NetBSD__) return uint64_t(_lwp_self()); From f044564db1cbc588d0cad4f953d38f6c787dadd4 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 2 Sep 2024 09:09:21 +0200 Subject: [PATCH 09/33] [InstCombine] Make backedge check in op of phi transform more precise (#106075) The op of phi transform wants to prevent moving an operation across a backedge, as this may lead to an infinite combine loop. Currently, this is done using isPotentiallyReachable(). The problem with that is that all blocks inside a loop are reachable from each other. This means that the op of phi transform is effectively completely disabled for code inside loops, even when it's not actually operating on a loop phi (just a phi that happens to be in a loop). Fix this by explicitly computing the backedges inside the function instead. Do this via RPOT, which is a bit more efficient than using FindFunctionBackedges() (which does it without any pre-computed analyses). For irreducible cycles, there may be multiple possible choices of backedge, and this just picks one of them. This is still sufficient to prevent combine loops. This also removes the last use of LoopInfo in InstCombine -- I'll drop the analysis in a followup. --- .../llvm/Transforms/InstCombine/InstCombiner.h | 13 +++++++++++++ .../InstCombine/InstructionCombining.cpp | 18 ++++++++++++++---- llvm/test/Transforms/InstCombine/phi.ll | 6 +++--- .../test/Transforms/LoopVectorize/induction.ll | 6 +++--- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h index c2ea88a107c32a..05322f7650efc7 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -94,6 +94,12 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { /// Order of predecessors to canonicalize phi nodes towards. SmallDenseMap, 8> PredOrder; + /// Backedges, used to avoid pushing instructions across backedges in cases + /// where this may result in infinite combine loops. For irreducible loops + /// this picks an arbitrary backedge. + SmallDenseSet, 8> BackEdges; + bool ComputedBackEdges = false; + public: InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder, bool MinimizeSize, AAResults *AA, AssumptionCache &AC, @@ -359,6 +365,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { std::function SimplifyAndSetOp); + void computeBackEdges(); + bool isBackEdge(const BasicBlock *From, const BasicBlock *To) { + if (!ComputedBackEdges) + computeBackEdges(); + return BackEdges.contains({From, To}); + } + /// Inserts an instruction \p New before instruction \p Old /// /// Also adds the new instruction to the worklist and returns \p New so that diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 866e5f8a00b52d..9ee1f0bb7d3577 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1812,12 +1812,10 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { if (cast(InVal)->getParent() == NonSimplifiedBB) return nullptr; - // If the incoming non-constant value is reachable from the phis block, - // we'll push the operation across a loop backedge. This could result in + // Do not push the operation across a loop backedge. 
This could result in // an infinite combine loop, and is generally non-profitable (especially // if the operation was originally outside the loop). - if (isPotentiallyReachable(PN->getParent(), NonSimplifiedBB, nullptr, &DT, - LI)) + if (isBackEdge(NonSimplifiedBB, PN->getParent())) return nullptr; } @@ -5390,6 +5388,18 @@ bool InstCombinerImpl::prepareWorklist(Function &F) { return MadeIRChange; } +void InstCombiner::computeBackEdges() { + // Collect backedges. + SmallPtrSet Visited; + for (BasicBlock *BB : RPOT) { + Visited.insert(BB); + for (BasicBlock *Succ : successors(BB)) + if (Visited.contains(Succ)) + BackEdges.insert({BB, Succ}); + } + ComputedBackEdges = true; +} + static bool combineInstructionsOverFunction( Function &F, InstructionWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll index ba29f4290a9fa4..3b1fa3a97d9cd7 100644 --- a/llvm/test/Transforms/InstCombine/phi.ll +++ b/llvm/test/Transforms/InstCombine/phi.ll @@ -2721,11 +2721,11 @@ define void @phi_op_in_loop(i1 %c, i32 %x) { ; CHECK: loop: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LOOP_LATCH:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 1 ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[IF]] ], [ 0, [[LOOP]] ] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[PHI]], 1 -; CHECK-NEXT: call void @use(i32 [[AND]]) +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP1]], [[IF]] ], [ 0, [[LOOP]] ] +; CHECK-NEXT: call void @use(i32 [[PHI]]) ; CHECK-NEXT: br label [[LOOP]] ; br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 08d05a1e2db69f..c51ba0e5b6ea4e 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -5504,11 +5504,11 @@ define i32 @PR32419(i32 %a, i16 %b) { ; INTERLEAVE-NEXT: br i1 [[VAR2]], label [[FOR_INC]], label [[FOR_COND:%.*]] ; INTERLEAVE: for.cond: ; INTERLEAVE-NEXT: [[VAR3:%.*]] = urem i16 [[B]], [[VAR1]] +; INTERLEAVE-NEXT: [[TMP50:%.*]] = sext i16 [[VAR3]] to i32 ; INTERLEAVE-NEXT: br label [[FOR_INC]] ; INTERLEAVE: for.inc: -; INTERLEAVE-NEXT: [[VAR4:%.*]] = phi i16 [ [[VAR3]], [[FOR_COND]] ], [ 0, [[FOR_BODY]] ] -; INTERLEAVE-NEXT: [[VAR5:%.*]] = sext i16 [[VAR4]] to i32 -; INTERLEAVE-NEXT: [[VAR6]] = or i32 [[VAR0]], [[VAR5]] +; INTERLEAVE-NEXT: [[VAR4:%.*]] = phi i32 [ [[TMP50]], [[FOR_COND]] ], [ 0, [[FOR_BODY]] ] +; INTERLEAVE-NEXT: [[VAR6]] = or i32 [[VAR0]], [[VAR4]] ; INTERLEAVE-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 ; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0 ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] From c950ecb90e1945012ef3180aacbf92c994b7ee83 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 2 Sep 2024 00:18:52 -0700 Subject: [PATCH 10/33] [RISCV] Remove zfbfmin.ll. NFC (#106937) Most of it is redundant with bfloat-convert.ll. One testcase is found in bfloat-imm.ll. The load and stores are more thoroughly tested in bfloat-mem.ll. 
---
 llvm/test/CodeGen/RISCV/zfbfmin.ll | 92 ------------------------------
 1 file changed, 92 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/zfbfmin.ll

diff --git a/llvm/test/CodeGen/RISCV/zfbfmin.ll b/llvm/test/CodeGen/RISCV/zfbfmin.ll
deleted file mode 100644
index f120185bbec003..00000000000000
--- a/llvm/test/CodeGen/RISCV/zfbfmin.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zfbfmin -verify-machineinstrs \
-; RUN:   -target-abi ilp32d < %s | FileCheck -check-prefix=CHECKIZFBFMIN %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zfbfmin -verify-machineinstrs \
-; RUN:   -target-abi lp64d < %s | FileCheck -check-prefix=CHECKIZFBFMIN %s
-
-define bfloat @bitcast_bf16_i16(i16 %a) nounwind {
-; CHECKIZFBFMIN-LABEL: bitcast_bf16_i16:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fmv.h.x fa0, a0
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = bitcast i16 %a to bfloat
-  ret bfloat %1
-}
-
-define i16 @bitcast_i16_bf16(bfloat %a) nounwind {
-; CHECKIZFBFMIN-LABEL: bitcast_i16_bf16:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fmv.x.h a0, fa0
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = bitcast bfloat %a to i16
-  ret i16 %1
-}
-
-define bfloat @fcvt_bf16_s(float %a) nounwind {
-; CHECKIZFBFMIN-LABEL: fcvt_bf16_s:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fcvt.bf16.s fa0, fa0
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = fptrunc float %a to bfloat
-  ret bfloat %1
-}
-
-define float @fcvt_s_bf16(bfloat %a) nounwind {
-; CHECKIZFBFMIN-LABEL: fcvt_s_bf16:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fcvt.s.bf16 fa0, fa0
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = fpext bfloat %a to float
-  ret float %1
-}
-
-define bfloat @fcvt_bf16_d(double %a) nounwind {
-; CHECKIZFBFMIN-LABEL: fcvt_bf16_d:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fcvt.s.d fa5, fa0
-; CHECKIZFBFMIN-NEXT:    fcvt.bf16.s fa0, fa5
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = fptrunc double %a to bfloat
-  ret bfloat %1
-}
-
-define double @fcvt_d_bf16(bfloat %a) nounwind {
-; CHECKIZFBFMIN-LABEL: fcvt_d_bf16:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECKIZFBFMIN-NEXT:    fcvt.d.s fa0, fa5
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = fpext bfloat %a to double
-  ret double %1
-}
-
-define bfloat @bfloat_load(ptr %a) nounwind {
-; CHECKIZFBFMIN-LABEL: bfloat_load:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    flh fa0, 6(a0)
-; CHECKIZFBFMIN-NEXT:    ret
-  %1 = getelementptr bfloat, ptr %a, i32 3
-  %2 = load bfloat, ptr %1
-  ret bfloat %2
-}
-
-define bfloat @bfloat_imm() nounwind {
-; CHECKIZFBFMIN-LABEL: bfloat_imm:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    lui a0, %hi(.LCPI7_0)
-; CHECKIZFBFMIN-NEXT:    flh fa0, %lo(.LCPI7_0)(a0)
-; CHECKIZFBFMIN-NEXT:    ret
-  ret bfloat 3.0
-}
-
-define dso_local void @bfloat_store(ptr %a, bfloat %b) nounwind {
-; CHECKIZFBFMIN-LABEL: bfloat_store:
-; CHECKIZFBFMIN:       # %bb.0:
-; CHECKIZFBFMIN-NEXT:    fsh fa0, 0(a0)
-; CHECKIZFBFMIN-NEXT:    fsh fa0, 16(a0)
-; CHECKIZFBFMIN-NEXT:    ret
-  store bfloat %b, ptr %a
-  %1 = getelementptr bfloat, ptr %a, i32 8
-  store bfloat %b, ptr %1
-  ret void
-}

From cd3667d1dbc9c1db05aaf3cd5b39f33b143bd8b5 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 2 Sep 2024 00:19:19 -0700
Subject: [PATCH 11/33] [CodeGen] Update a few places that were passing
 Register to raw_ostream::operator<< (#106877)

These would implicitly cast the register to `unsigned`. Switching most
of them to use printReg will give a more readable output.
Change some others to use Register::id() so we can eventually remove the implicit cast to `unsigned`. --- llvm/lib/CodeGen/InitUndef.cpp | 2 +- llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 4 ++-- llvm/lib/CodeGen/LiveDebugVariables.cpp | 10 ++++------ llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 3 ++- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 5 +++-- llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp | 2 +- .../lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp | 7 ++++--- 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 51c50ff872ef21..7c1b90afd495e7 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -198,7 +198,7 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { LLVM_DEBUG( dbgs() << "Emitting PseudoInitUndef Instruction for implicit register " - << MO.getReg() << '\n'); + << printReg(MO.getReg()) << '\n'); const TargetRegisterClass *TargetRegClass = TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 2959d3261bea71..2d95ff9e05abe7 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -1789,14 +1789,14 @@ void VarLocBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI, if (isLocationSpill(MI, MF, Reg)) { TKind = TransferKind::TransferSpill; LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump();); - LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI) + LLVM_DEBUG(dbgs() << "Register: " << Reg.id() << " " << printReg(Reg, TRI) << "\n"); } else { if (!(Loc = isRestoreInstruction(MI, MF, Reg))) return; TKind = TransferKind::TransferRestore; LLVM_DEBUG(dbgs() << "Recognized as restore: "; MI.dump();); - LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI) + LLVM_DEBUG(dbgs() << "Register: " << Reg.id() << " " << printReg(Reg, TRI) << "\n"); } // Check if the register or spill location is the location of a debug value. diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 48bcc0a61e30c9..822a1beb489592 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -1873,12 +1873,10 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { Builder.addImm(regSizeInBits); } - LLVM_DEBUG( - if (SpillOffset != 0) { - dbgs() << "DBG_PHI for Vreg " << Reg << " subreg " << SubReg << - " has nonzero offset\n"; - } - ); + LLVM_DEBUG(if (SpillOffset != 0) { + dbgs() << "DBG_PHI for " << printReg(Reg, TRI, SubReg) + << " has nonzero offset\n"; + }); } // If there was no mapping for a value ID, it's optimized out. Create no // DBG_PHI, and any variables using this value will become optimized out. 
diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 0bb7953efd52f4..0e9f041f7bfdfe 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -407,7 +407,8 @@ bool LocalStackSlotImpl::insertFrameReferenceRegisters(MachineFunction &Fn) { if (BaseReg.isValid() && lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust, LocalOffset, MI, TRI)) { - LLVM_DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n"); + LLVM_DEBUG(dbgs() << " Reusing base register " << printReg(BaseReg) + << "\n"); // We found a register to reuse. Offset = FrameSizeAdjust + LocalOffset - BaseOffset; } else { diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 16dab974efacb2..ac9a3d6f0d1a60 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -120,7 +120,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI, OS << '%' << Register::virtReg2Index(Reg); } } else if (!TRI) - OS << '$' << "physreg" << Reg; + OS << '$' << "physreg" << Reg.id(); else if (Reg < TRI->getNumRegs()) { OS << '$'; printLowerCase(TRI->getName(Reg), OS); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 1e993aa4ad52a0..1b52a48d068ebc 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2187,8 +2187,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( return false; } - LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " - << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", " + << printReg(MAddr.Base.LoReg, TRI) + << "} Offset: " << MAddr.Offset << "\n\n";); // Step2: Traverse through MI's basic block and find an anchor(that has the // same base-registers) with the highest 13bit distance from MI's offset. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
index 4a75bab6b95ddc..283d93408575b5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
@@ -107,7 +107,7 @@ bool WebAssemblyDebugFixup::runOnMachineFunction(MachineFunction &MF) {
         for (auto &Elem : reverse(Stack)) {
           if (MO.getReg() == Elem.Reg) {
             auto Depth = static_cast<unsigned>(&Elem - &Stack[0]);
-            LLVM_DEBUG(dbgs() << "Debug Value VReg " << MO.getReg()
+            LLVM_DEBUG(dbgs() << "Debug Value VReg " << printReg(MO.getReg())
                               << " -> Stack Relative " << Depth << "\n");
             MO.ChangeToTargetIndex(WebAssembly::TI_OPERAND_STACK, Depth);
             // Save the DBG_VALUE instruction that defined this stackified
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 1203b343bf24bf..3dc9cdc11eb575 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -76,7 +76,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
       break;
     int64_t Imm = MI.getOperand(1).getImm();
-    LLVM_DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg()
+    LLVM_DEBUG(dbgs() << "Arg VReg " << printReg(MI.getOperand(0).getReg())
                       << " -> WAReg " << Imm << "\n");
     MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
   }
@@ -95,13 +95,14 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
       continue;
     // Handle stackified registers.
     if (MFI.isVRegStackified(VReg)) {
-      LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+      LLVM_DEBUG(dbgs() << "VReg " << printReg(VReg) << " -> WAReg "
                         << (INT32_MIN | NumStackRegs) << "\n");
       MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
       continue;
    }
    if (MFI.getWAReg(VReg) == WebAssembly::UnusedReg) {
-      LLVM_DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+      LLVM_DEBUG(dbgs() << "VReg " << printReg(VReg) << " -> WAReg " << CurReg
+                        << "\n");
      MFI.setWAReg(VReg, CurReg++);
    }
  }

From 08a72cbd6b12b5ccffb82c657bd668938f1b42e1 Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Mon, 2 Sep 2024 17:24:14 +1000
Subject: [PATCH 12/33] [clang] Bump up DIAG_SIZE_SEMA by 500 for downstream
 diagnostics.

Recently added HLSL diagnostics (89fb8490a99e) pushed the Swift compiler
over the existing limit.
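For context, a rough standalone sketch of why this constant is a hard
limit (names simplified and sizes illustrative, except for
DIAG_SIZE_SEMA, which is the value from this patch): each diagnostic
category owns a contiguous block of IDs, the start offset of each later
category is the running sum of the earlier block sizes, and a downstream
compiler that appends its own Sema diagnostics runs out of room once the
Sema block is full.

    #include <cstdio>

    enum DiagSizes {
      DIAG_SIZE_COMMON = 300,
      DIAG_SIZE_DRIVER = 400,
      DIAG_SIZE_SEMA = 5000, // bumped from 4500 by this patch
    };
    enum DiagStarts {
      // Each start offset is the running sum of the preceding blocks.
      DIAG_START_COMMON = 0,
      DIAG_START_DRIVER = DIAG_START_COMMON + DIAG_SIZE_COMMON,
      DIAG_START_SEMA = DIAG_START_DRIVER + DIAG_SIZE_DRIVER,
    };

    int main() {
      // Downstream Sema diagnostics must fit in this half-open window.
      std::printf("Sema IDs: [%d, %d)\n", DIAG_START_SEMA,
                  DIAG_START_SEMA + DIAG_SIZE_SEMA);
    }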
rdar://135126738
---
 clang/include/clang/Basic/DiagnosticIDs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index bce7605b95ba43..8b976bdac6dc51 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -39,7 +39,7 @@ namespace clang {
       DIAG_SIZE_AST = 300,
       DIAG_SIZE_COMMENT = 100,
       DIAG_SIZE_CROSSTU = 100,
-      DIAG_SIZE_SEMA = 4500,
+      DIAG_SIZE_SEMA = 5000,
       DIAG_SIZE_ANALYSIS = 100,
       DIAG_SIZE_REFACTORING = 1000,
       DIAG_SIZE_INSTALLAPI = 100,

From fe1006b7f25258742173304c7c32e891be31d14e Mon Sep 17 00:00:00 2001
From: pudge62 <70063806+pudge62@users.noreply.github.com>
Date: Mon, 2 Sep 2024 15:31:51 +0800
Subject: [PATCH 13/33] [TSan] fix crash when symbolize on darwin platforms
 (#99441)

The `dli_sname` field in `Dl_info` may be `NULL`, which could cause a
crash.
---
 compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_mac.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_mac.cpp
index f1cc0b5e1e8ac0..88536fc4e6222f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_mac.cpp
@@ -30,7 +30,7 @@ namespace __sanitizer {
 bool DlAddrSymbolizer::SymbolizePC(uptr addr, SymbolizedStack *stack) {
   Dl_info info;
   int result = dladdr((const void *)addr, &info);
-  if (!result) return false;
+  if (!result || !info.dli_sname) return false;

   // Compute offset if possible. `dladdr()` doesn't always ensure that `addr >=
   // sym_addr` so only compute the offset when this holds. Failure to find the
@@ -51,7 +51,7 @@ bool DlAddrSymbolizer::SymbolizePC(uptr addr, SymbolizedStack *stack) {
 bool DlAddrSymbolizer::SymbolizeData(uptr addr, DataInfo *datainfo) {
   Dl_info info;
   int result = dladdr((const void *)addr, &info);
-  if (!result) return false;
+  if (!result || !info.dli_sname) return false;
   const char *demangled = DemangleSwiftAndCXX(info.dli_sname);
   if (!demangled) demangled = info.dli_sname;

From ed6d9f6d2af7da90ac089cf648a1f8b2e8e4eb10 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto
Date: Mon, 26 Aug 2024 22:02:59 +0200
Subject: [PATCH 14/33] [CGP] Introduce test for PR102926 (NFC)

---
 ...evert-constant-ptr-propagation-on-calls.ll | 169 ++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll

diff --git a/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll
new file mode 100644
index 00000000000000..0e5bc79054d53b
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+%struct.S = type { i8 }
+%struct.X = type { i32 }
+
+@g_getS = internal global %struct.S zeroinitializer, align 1
+@g_getX = internal global %struct.X zeroinitializer, align 1
+@guard = internal global i64 0, align 8
+
+declare ptr @getS_dec()
+declare extern_weak dllimport ptr @getS_dllimport_function()
+
+define ptr @getS() personality ptr @__gxx_personality_v0 {
+entry:
+  %guard = load atomic i8, ptr @guard acquire, align 8
%mask = and i8 %guard, 1 + %cond = icmp eq i8 %mask, 0 + br i1 %cond, label %to_be_init, label %return + +to_be_init: ; preds = %entry + %is_init = call i32 @__cxa_guard_acquire(ptr @guard) + %cond.2 = icmp ne i32 %is_init, 0 + br i1 %cond.2, label %ctor, label %return + +ctor: ; preds = %to_be_init + invoke void @S_ctor(ptr @g_getS) + to label %continue unwind label %landing_pad + +continue: ; preds = %ctor + call void @__cxa_guard_release(ptr @guard) + br label %return + +return: ; preds = %continue, %to_be_init, %entry + ret ptr @g_getS + +landing_pad: ; preds = %ctor + %lp = landingpad { ptr, i32 } cleanup + call void @__cxa_guard_abort(ptr @guard) + resume { ptr, i32 } %lp +} + +define ptr @getS_or_getX(i1 %cond) { +entry: + %result = select i1 %cond, ptr @g_getS, ptr @g_getX + ret ptr %result +} + +define weak ptr @getS_weak_function() { +entry: + ret ptr @g_getS +} + +define linkonce_odr ptr @getS_linkonce_odr_function() { +entry: + ret ptr @g_getS +} + +; May revert propagation. +define i32 @caller_1() { +; CHECK-LABEL: @caller_1( +; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS() +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: ret i32 [[GETI]] +; + %getS_ptr = call ptr @getS() + %getI = call i32 @S_getI(ptr @g_getS) + ret i32 %getI +} + +; Cannot revert propagation due to use appearing in a different basic block. +define i32 @caller_2() { +; CHECK-LABEL: @caller_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS() +; CHECK-NEXT: br label [[USE:%.*]] +; CHECK: use: +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr [[GETS_PTR]]) +; CHECK-NEXT: ret i32 [[GETI]] +; +entry: + %getS_ptr = call ptr @getS() + br label %use + +use: ; preds = %entry + %getI = call i32 @S_getI(ptr %getS_ptr) + ret i32 %getI +} + +; Cannot revert propagation due to use before call. +define i32 @caller_3() { +; CHECK-LABEL: @caller_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS() +; CHECK-NEXT: ret i32 [[GETI]] +; +entry: + %getI = call i32 @S_getI(ptr @g_getS) + %getS_ptr = call ptr @getS() + ret i32 %getI +} + +; Cannot revert propagation due to non-uniform returned constant. +define i32 @caller_4(i1 %cond) { +; CHECK-LABEL: @caller_4( +; CHECK-NEXT: [[GETS_OR_GETX_PTR:%.*]] = call ptr @getS_or_getX(i1 [[COND:%.*]]) +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: ret i32 [[GETI]] +; + %getS_or_getX_ptr = call ptr @getS_or_getX(i1 %cond) + %getI = call i32 @S_getI(ptr @g_getS) + ret i32 %getI +} + +; Cannot revert propagation due to weak-linkage callee. +define i32 @caller_5() { +; CHECK-LABEL: @caller_5( +; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS_weak_function() +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: ret i32 [[GETI]] +; + %getS_ptr = call ptr @getS_weak_function() + %getI = call i32 @S_getI(ptr @g_getS) + ret i32 %getI +} + +; Cannot revert propagation due to callee with external function definition. +define i32 @caller_6() { +; CHECK-LABEL: @caller_6( +; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS_dec() +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: ret i32 [[GETI]] +; + %getS_ptr = call ptr @getS_dec() + %getI = call i32 @S_getI(ptr @g_getS) + ret i32 %getI +} + +; Cannot revert propagation due to callee with DLLImport storage class. 
+
+define i32 @caller_7() {
+; CHECK-LABEL: @caller_7(
+; CHECK-NEXT:    [[GETS_PTR:%.*]] = call ptr @getS_dllimport_function()
+; CHECK-NEXT:    [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS)
+; CHECK-NEXT:    ret i32 [[GETI]]
+;
+  %getS_ptr = call ptr @getS_dllimport_function()
+  %getI = call i32 @S_getI(ptr @g_getS)
+  ret i32 %getI
+}
+
+; Cannot revert propagation due to callee whose definition may be overridden.
+define i32 @caller_8() {
+; CHECK-LABEL: @caller_8(
+; CHECK-NEXT:    [[GETS_PTR:%.*]] = call ptr @getS_linkonce_odr_function()
+; CHECK-NEXT:    [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS)
+; CHECK-NEXT:    ret i32 [[GETI]]
+;
+  %getS_ptr = call ptr @getS_linkonce_odr_function()
+  %getI = call i32 @S_getI(ptr @g_getS)
+  ret i32 %getI
+}
+
+declare i32 @__cxa_guard_acquire(ptr)
+declare void @S_ctor(ptr)
+declare i32 @S_getI(ptr)
+declare void @__cxa_guard_abort(ptr)
+declare void @__cxa_guard_release(ptr)
+declare i32 @__gxx_personality_v0(...)

From e4e0dfb0c24c9bcd4bef835bd6a162967f097584 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto
Date: Mon, 26 Aug 2024 22:03:11 +0200
Subject: [PATCH 15/33] [CGP] Undo constant propagation of pointers across
 calls

It may be profitable to revert SCCP propagation of C++ static values
when such constants are pointers, in order to avoid redundant pointer
computation, since the method returning the constant is non-removable.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 48 ++++++++++++++++++-
 ...evert-constant-ptr-propagation-on-calls.ll |  2 +-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 271a047fc6a7b8..631cc26d6022fe 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2677,7 +2677,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
   }

   // From here on out we're working with named functions.
-  if (!CI->getCalledFunction())
+  auto *Callee = CI->getCalledFunction();
+  if (!Callee)
     return false;

   // Lower all default uses of _chk calls. This is very similar
@@ -2692,6 +2693,51 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
     return true;
   }

+  // SCCP may have propagated, among other things, C++ static variables across
+  // calls. If this happens to be the case, we may want to undo it in order to
+  // avoid redundant pointer computation of the constant, as the function
+  // returning the constant needs to be executed anyway.
+  auto GetUniformReturnValue = [](const Function *F) -> GlobalVariable * {
+    if (!F->getReturnType()->isPointerTy())
+      return nullptr;
+
+    GlobalVariable *UniformValue = nullptr;
+    for (auto &BB : *F) {
+      if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        if (auto *V = dyn_cast<GlobalVariable>(RI->getReturnValue())) {
+          if (!UniformValue)
+            UniformValue = V;
+          else if (V != UniformValue)
+            return nullptr;
+        } else {
+          return nullptr;
+        }
+      }
+    }
+
+    return UniformValue;
+  };
+
+  if (Callee->hasExactDefinition()) {
+    if (GlobalVariable *RV = GetUniformReturnValue(Callee)) {
+      bool MadeChange = false;
+      for (Use &U : make_early_inc_range(RV->uses())) {
+        auto *I = dyn_cast<Instruction>(U.getUser());
+        if (!I || I->getParent() != CI->getParent()) {
+          // Limit to the same basic block to avoid extending the call-site live
+          // range, which otherwise could increase register pressure.
+ continue; + } + if (CI->comesBefore(I)) { + U.set(CI); + MadeChange = true; + } + } + + return MadeChange; + } + } + return false; } diff --git a/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll index 0e5bc79054d53b..51f1283a20ab27 100644 --- a/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll +++ b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll @@ -60,7 +60,7 @@ entry: define i32 @caller_1() { ; CHECK-LABEL: @caller_1( ; CHECK-NEXT: [[GETS_PTR:%.*]] = call ptr @getS() -; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr @g_getS) +; CHECK-NEXT: [[GETI:%.*]] = call i32 @S_getI(ptr [[GETS_PTR]]) ; CHECK-NEXT: ret i32 [[GETI]] ; %getS_ptr = call ptr @getS() From 30cc198c2d4ad784f18cc10a03d45a19145357af Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 2 Sep 2024 09:48:54 +0200 Subject: [PATCH 16/33] [APInt] Add default-disabled assertion to APInt constructor (#106524) If the uint64_t constructor is used, assert that the value is actually a signed or unsigned N-bit integer depending on whether the isSigned flag is set. Provide an implicitTrunc flag to restore the previous behavior, where the argument is silently truncated instead. In this commit, implicitTrunc is enabled by default, which means that the new assertions are disabled and no actual change in behavior occurs. The plan is to flip the default once all places violating the assertion have been fixed. See #80309 for the scope of the necessary changes. The primary motivation for this change is to avoid incorrectly specified isSigned flags. A recurring problem we have is that people write something like `APInt(BW, -1)` and this works perfectly fine -- until the code path is hit with `BW > 64`. Most of our i128 specific miscompilations are caused by variants of this issue. The cost of the change is that we have to specify the correct isSigned flag (and make sure there are no excess bits) for uses where BW is always <= 64 as well. 
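To make the failure mode concrete, here is a standalone sketch (not part
of this patch; it assumes only the existing llvm::APInt API):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/raw_ostream.h"

    using llvm::APInt;

    int main() {
      // Fine today: at widths <= 64 the implicit truncation happens to
      // produce what the author meant.
      APInt A(32, -1); // 0xFFFFFFFF
      // Trap: at widths > 64 the uint64_t argument is zero-extended, so
      // only the low 64 bits end up set; this is not -1.
      APInt B(128, -1);
      // Correct: isSigned=true sign-extends across all 128 bits.
      APInt C(128, -1, /*isSigned=*/true);
      llvm::outs() << A.isAllOnes() << B.isAllOnes() << C.isAllOnes()
                   << '\n'; // prints 101
    }

Once the default flips, constructions whose argument does not fit the
width as an unsigned value, like A above, will assert and need
/*isSigned=*/true spelled out; wide cases like B stay silent and simply
produce the wrong value, which is why they need the explicit flag, as
tracked in #80309.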
--- llvm/include/llvm/ADT/APInt.h | 19 +++++++++++++++++-- llvm/lib/Support/APInt.cpp | 14 +++++++++----- llvm/unittests/ADT/APIntTest.cpp | 5 +++-- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 65ba3f15305c78..a42dae8887392d 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -106,11 +106,26 @@ class [[nodiscard]] APInt { /// \param numBits the bit width of the constructed APInt /// \param val the initial value of the APInt /// \param isSigned how to treat signedness of val - APInt(unsigned numBits, uint64_t val, bool isSigned = false) + /// \param implicitTrunc allow implicit truncation of non-zero/sign bits of + /// val beyond the range of numBits + APInt(unsigned numBits, uint64_t val, bool isSigned = false, + bool implicitTrunc = true) : BitWidth(numBits) { + if (!implicitTrunc) { + if (BitWidth == 0) { + assert(val == 0 && "Value must be zero for 0-bit APInt"); + } else if (isSigned) { + assert(llvm::isIntN(BitWidth, val) && + "Value is not an N-bit signed value"); + } else { + assert(llvm::isUIntN(BitWidth, val) && + "Value is not an N-bit unsigned value"); + } + } if (isSingleWord()) { U.VAL = val; - clearUnusedBits(); + if (implicitTrunc || isSigned) + clearUnusedBits(); } else { initSlowCase(val, isSigned); } diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index fe22e9ba04b6f5..78d573966c6c99 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -234,7 +234,8 @@ APInt& APInt::operator-=(uint64_t RHS) { APInt APInt::operator*(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be the same"); if (isSingleWord()) - return APInt(BitWidth, U.VAL * RHS.U.VAL); + return APInt(BitWidth, U.VAL * RHS.U.VAL, /*isSigned=*/false, + /*implicitTrunc=*/true); APInt Result(getMemory(getNumWords()), getBitWidth()); tcMultiply(Result.U.pVal, U.pVal, RHS.U.pVal, getNumWords()); @@ -455,7 +456,8 @@ APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const { "Illegal bit extraction"); if (isSingleWord()) - return APInt(numBits, U.VAL >> bitPosition); + return APInt(numBits, U.VAL >> bitPosition, /*isSigned=*/false, + /*implicitTrunc=*/true); unsigned loBit = whichBit(bitPosition); unsigned loWord = whichWord(bitPosition); @@ -463,7 +465,8 @@ APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const { // Single word result extracting bits from a single word source. if (loWord == hiWord) - return APInt(numBits, U.pVal[loWord] >> loBit); + return APInt(numBits, U.pVal[loWord] >> loBit, /*isSigned=*/false, + /*implicitTrunc=*/true); // Extracting bits that start on a source word boundary can be done // as a fast memory copy. 
@@ -907,7 +910,8 @@ APInt APInt::trunc(unsigned width) const {
   assert(width <= BitWidth && "Invalid APInt Truncate request");

   if (width <= APINT_BITS_PER_WORD)
-    return APInt(width, getRawData()[0]);
+    return APInt(width, getRawData()[0], /*isSigned=*/false,
+                 /*implicitTrunc=*/true);

   if (width == BitWidth)
     return *this;
@@ -955,7 +959,7 @@ APInt APInt::sext(unsigned Width) const {
   assert(Width >= BitWidth && "Invalid APInt SignExtend request");

   if (Width <= APINT_BITS_PER_WORD)
-    return APInt(Width, SignExtend64(U.VAL, BitWidth));
+    return APInt(Width, SignExtend64(U.VAL, BitWidth), /*isSigned=*/true);

   if (Width == BitWidth)
     return *this;
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index eb4b847185f53b..fff29d24a05299 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -220,11 +220,12 @@ TEST(APIntTest, i256) {
 }

 TEST(APIntTest, i1) {
-  const APInt neg_two(1, static_cast<uint64_t>(-2), true);
+  const APInt neg_two(1, static_cast<uint64_t>(-2), true,
+                      /*implicitTrunc=*/true);
   const APInt neg_one(1, static_cast<uint64_t>(-1), true);
   const APInt zero(1, 0);
   const APInt one(1, 1);
-  const APInt two(1, 2);
+  const APInt two(1, 2, false, /*implicitTrunc=*/true);
   EXPECT_EQ(0, neg_two.getSExtValue());
   EXPECT_EQ(-1, neg_one.getSExtValue());

From 9cf68679c4f45e79d67c94ef1f968c7c1213b610 Mon Sep 17 00:00:00 2001
From: Oliver Stannard
Date: Mon, 2 Sep 2024 08:54:10 +0100
Subject: [PATCH 17/33] [ARM] Fix failure to register-allocate CMP_SWAP_64
 pseudo-inst (#106721)

This test case was failing to compile with a "ran out of registers
during register allocation" error at -O0. This was because CMP_SWAP_64
has 3 operands, each of which must be an even-odd register pair, and
two other GPR operands. All of the def operands are also early-clobber,
so registers can't be shared between uses and defs.

Because the function has an over-aligned alloca it needs frame and base
pointers, so r6 and r11 are both reserved. That leaves r0/r1, r2/r3,
r4/r5 and r8/r9 as the only valid register pairs, and if the two
individual GPR operands happen to get allocated to registers in
different pairs then only 2 pairs will be available for the three
GPRPair operands.

To fix this, I've merged the two GPR operands into a single GPRPair
operand. This means that the instruction now has 4 GPRPair operands,
which can always be allocated without relying on luck. This does
constrain register allocation a bit more, but this pseudo instruction
is only used at -O0, so I don't think that's a problem.
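For reference, the failing function boils down to this shape (a
source-level approximation of my own; it mirrors the added
atomic-64bit-fast-regalloc.ll test rather than anything else in the
patch; compile for armv7 at -O0):

    #include <atomic>

    void test(std::atomic<long long> *ptr) {
      // The over-aligned local forces both a frame pointer and a base
      // pointer, reserving r11 and r6 and leaving only the four pairs
      // r0/r1, r2/r3, r4/r5 and r8/r9 for the CMP_SWAP_64 operands.
      alignas(16) char stuff[16];
      (void)stuff;
      // A seq_cst 64-bit store lowers to a CMP_SWAP_64 loop at -O0.
      ptr->store(0, std::memory_order_seq_cst);
    }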
---
 llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp  |    9 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   37 +-
 llvm/lib/Target/ARM/ARMInstrInfo.td           |   17 +-
 .../CodeGen/ARM/atomic-64bit-fast-regalloc.ll |   96 ++
 llvm/test/CodeGen/ARM/atomic-load-store.ll    |   96 +-
 .../ARM/atomicrmw_exclusive_monitor_ints.ll   | 1112 +++++++++--------
 llvm/test/CodeGen/ARM/cmpxchg-O0.ll           |    5 +-
 llvm/test/CodeGen/ARM/cmpxchg.mir             |   13 +-
 llvm/test/CodeGen/Thumb2/cmpxchg.mir          |   13 +-
 9 files changed, 832 insertions(+), 566 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll

diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index df10613fcc7c93..25dfacca956bb8 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1942,11 +1942,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
   MachineOperand &Dest = MI.getOperand(0);
-  Register TempReg = MI.getOperand(1).getReg();
   // Duplicating undef operands into 2 instructions does not guarantee the same
   // value on both; However undef should be replaced by xzr anyway.
-  assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
-  Register AddrReg = MI.getOperand(2).getReg();
+  assert(!MI.getOperand(1).isUndef() && "cannot handle undef");
+  Register AddrAndTempReg = MI.getOperand(1).getReg();
+  Register AddrReg = TRI->getSubReg(AddrAndTempReg, ARM::gsub_0);
+  Register TempReg = TRI->getSubReg(AddrAndTempReg, ARM::gsub_1);
+  assert(MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
+         "tied operands have different registers");
   Register DesiredReg = MI.getOperand(3).getReg();
   MachineOperand New = MI.getOperand(4);
   New.setIsKill(false);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ec6367a803506b..9096617a948557 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10479,33 +10479,42 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
   Results.push_back(Cycles32.getValue(1));
 }

-static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
-  SDLoc dl(V.getNode());
-  auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
-  bool isBigEndian = DAG.getDataLayout().isBigEndian();
-  if (isBigEndian)
-    std::swap (VLo, VHi);
+static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
+                                      SDValue V1) {
+  SDLoc dl(V0.getNode());
   SDValue RegClass =
       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
-  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+  const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
   return SDValue(
       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops),
       0);
 }

+static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
+  SDLoc dl(V.getNode());
+  auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
+  bool isBigEndian = DAG.getDataLayout().isBigEndian();
+  if (isBigEndian)
+    std::swap(VLo, VHi);
+  return createGPRPairNode2xi32(DAG, VLo, VHi);
+}
+
 static void ReplaceCMP_SWAP_64Results(SDNode *N,
-                                      SmallVectorImpl<SDValue> & Results,
-                                      SelectionDAG &DAG) {
+                                      SmallVectorImpl<SDValue> &Results,
+                                      SelectionDAG &DAG) {
   assert(N->getValueType(0) == MVT::i64 &&
          "AtomicCmpSwap on types less than 64 should be legal");
-  SDValue Ops[] = {N->getOperand(1),
-                   createGPRPairNode(DAG, N->getOperand(2)),
-                   createGPRPairNode(DAG, N->getOperand(3)),
-                   N->getOperand(0)};
+  SDValue Ops[] = {
+      createGPRPairNode2xi32(DAG, N->getOperand(1),
+                             DAG.getUNDEF(MVT::i32)), // pointer, temp
+      createGPRPairNodei64(DAG, N->getOperand(2)),    // expected
+      createGPRPairNodei64(DAG, N->getOperand(3)),    // new
+      N->getOperand(0),                               // chain in
+  };
   SDNode *CmpSwap = DAG.getMachineNode(
       ARM::CMP_SWAP_64, SDLoc(N),
-      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
+      DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);

   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 26f7d70b43b262..0fc561382084e3 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -6509,8 +6509,21 @@ def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
                              (ins GPR:$addr, GPR:$desired, GPR:$new),
                              NoItinerary, []>, Sched<[]>;

-def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
-                             (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
+// The addr_temp and addr_temp_out operands are logically a pair of GPR
+// operands:
+// * addr is an input, holding the address to swap.
+// * temp is an earlyclobber output, used internally in the expansion of the
+//   pseudo-inst.
+// These are combined into one GPRPair operand to ensure that register
+// allocation always succeeds. In the worst case there are only 4 GPRPair
+// registers available, of which this instruction needs 3 for the other
+// operands. If these operands weren't combined they would also use two GPR
+// registers, which could overlap with two different GPRPairs, causing
+// allocation to fail. With them combined, we need to allocate 4 GPRPairs,
+// which will always succeed.
+let Constraints = "@earlyclobber $Rd,$addr_temp_out = $addr_temp" in
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPRPair:$addr_temp_out),
+                             (ins GPRPair:$addr_temp, GPRPair:$desired, GPRPair:$new),
                              NoItinerary, []>, Sched<[]>;
 }

diff --git a/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll b/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll
new file mode 100644
index 00000000000000..bcaea3d0258b70
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/atomic-64bit-fast-regalloc.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -O0 | FileCheck %s --check-prefix=CHECK --check-prefix=LE
+; RUN: llc < %s -mtriple=armv7eb-none-eabi -O0 | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+
+;; Previously, this failed during register allocation because the CMP_SWAP_64
+;; pseudo-instruction has a lot of operands, many of which need to be even-odd
+;; register pairs, and the over-aligned alloca in this function causes both a
+;; frame pointer and a base pointer to be needed.
+ +define void @test(ptr %ptr) { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r8, r9, r10, r11, lr} +; CHECK-NEXT: .setfp r11, sp, #24 +; CHECK-NEXT: add r11, sp, #24 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: bfc sp, #0, #4 +; CHECK-NEXT: mov r6, sp +; CHECK-NEXT: str r0, [r6, #28] @ 4-byte Spill +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_1: @ %block1 +; CHECK-NEXT: ldr r0, [r6, #28] @ 4-byte Reload +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: sub r1, r1, #16 +; CHECK-NEXT: bic r1, r1, #15 +; CHECK-NEXT: mov sp, r1 +; CHECK-NEXT: dmb ish +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: ldr r0, [r0, #4] +; CHECK-NEXT: str r1, [r6, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [r6, #24] @ 4-byte Spill +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_2: @ %atomicrmw.start +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 +; CHECK-NEXT: ldr r2, [r6, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [r6, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r8, [r6, #28] @ 4-byte Reload +; LE-NEXT: str r2, [r6, #16] @ 4-byte Spill +; LE-NEXT: str r0, [r6, #12] @ 4-byte Spill +; BE-NEXT: str r2, [r6, #12] @ 4-byte Spill +; BE-NEXT: str r0, [r6, #16] @ 4-byte Spill +; CHECK-NEXT: @ implicit-def: $r1 +; CHECK-NEXT: @ implicit-def: $r3 +; CHECK-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 def $r0_r1 +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: .LBB0_3: @ %atomicrmw.start +; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldrexd r4, r5, [r8] +; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: cmpeq r5, r1 +; CHECK-NEXT: bne .LBB0_5 +; CHECK-NEXT: @ %bb.4: @ %atomicrmw.start +; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=2 +; CHECK-NEXT: strexd r9, r2, r3, [r8] +; CHECK-NEXT: cmp r9, #0 +; CHECK-NEXT: bne .LBB0_3 +; CHECK-NEXT: .LBB0_5: @ %atomicrmw.start +; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: ldr r2, [r6, #12] @ 4-byte Reload +; LE-NEXT: ldr r1, [r6, #16] @ 4-byte Reload +; LE-NEXT: mov r0, r5 +; LE-NEXT: eor r3, r0, r1 +; LE-NEXT: mov r1, r4 +; LE-NEXT: eor r2, r1, r2 +; BE-NEXT: ldr r0, [r6, #16] @ 4-byte Reload +; BE-NEXT: mov r1, r4 +; BE-NEXT: eor r3, r1, r0 +; BE-NEXT: mov r0, r5 +; BE-NEXT: eor r2, r0, r2 +; CHECK-NEXT: orr r2, r2, r3 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: str r1, [r6, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [r6, #24] @ 4-byte Spill +; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: .LBB0_6: @ %atomicrmw.end +; CHECK-NEXT: dmb ish +; CHECK-NEXT: sub sp, r11, #24 +; CHECK-NEXT: pop {r4, r5, r6, r8, r9, r10, r11, pc} +entry: + br label %block1 + +block1: + %stuff = alloca i8, i64 16, align 16 + store atomic i64 0, ptr %ptr seq_cst, align 8 + ret void +} diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index c53fb2f330a792..14e49bf3c9376a 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -327,50 +327,56 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { ; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} ; ARMOPTNONE-NEXT: add r7, sp, #8 ; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #20 -; ARMOPTNONE-NEXT: str r0, [sp] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r2, [sp, #4] @ 4-byte Spill -; 
ARMOPTNONE-NEXT: str r1, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: ldr r1, [r0] ; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB5_1 ; ARMOPTNONE-NEXT: LBB5_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 ; ARMOPTNONE-NEXT: @ Child Loop BB5_2 Depth 2 -; ARMOPTNONE-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r3, [sp] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r10, [sp, #8] @ 4-byte Reload -; ARMOPTNONE-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 -; ARMOPTNONE-NEXT: mov r11, r0 -; ARMOPTNONE-NEXT: mov r8, r2 +; ARMOPTNONE-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r12, [sp, #8] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r8, [sp, #4] @ 4-byte Reload +; ARMOPTNONE-NEXT: str r3, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: @ implicit-def: $r1 +; ARMOPTNONE-NEXT: @ implicit-def: $r9 +; ARMOPTNONE-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 ; ARMOPTNONE-NEXT: mov r9, r1 +; ARMOPTNONE-NEXT: @ kill: def $r0 killed $r0 def $r0_r1 +; ARMOPTNONE-NEXT: mov r1, r12 +; ARMOPTNONE-NEXT: mov r10, r2 +; ARMOPTNONE-NEXT: mov r11, r3 ; ARMOPTNONE-NEXT: LBB5_2: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ Parent Loop BB5_1 Depth=1 ; ARMOPTNONE-NEXT: @ => This Inner Loop Header: Depth=2 -; ARMOPTNONE-NEXT: ldrexd r4, r5, [r3] -; ARMOPTNONE-NEXT: cmp r4, r8 -; ARMOPTNONE-NEXT: cmpeq r5, r9 +; ARMOPTNONE-NEXT: ldrexd r4, r5, [r8] +; ARMOPTNONE-NEXT: cmp r4, r10 +; ARMOPTNONE-NEXT: cmpeq r5, r11 ; ARMOPTNONE-NEXT: bne LBB5_4 ; ARMOPTNONE-NEXT: @ %bb.3: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ in Loop: Header=BB5_2 Depth=2 -; ARMOPTNONE-NEXT: strexd r0, r10, r11, [r3] -; ARMOPTNONE-NEXT: cmp r0, #0 +; ARMOPTNONE-NEXT: strexd r9, r0, r1, [r8] +; ARMOPTNONE-NEXT: cmp r9, #0 ; ARMOPTNONE-NEXT: bne LBB5_2 ; ARMOPTNONE-NEXT: LBB5_4: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ in Loop: Header=BB5_1 Depth=1 +; ARMOPTNONE-NEXT: ldr r1, [sp] @ 4-byte Reload ; ARMOPTNONE-NEXT: mov r0, r5 ; ARMOPTNONE-NEXT: eor r3, r0, r1 ; ARMOPTNONE-NEXT: mov r1, r4 ; ARMOPTNONE-NEXT: eor r2, r1, r2 ; ARMOPTNONE-NEXT: orr r2, r2, r3 ; ARMOPTNONE-NEXT: cmp r2, #0 -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: bne LBB5_1 ; ARMOPTNONE-NEXT: b LBB5_5 ; ARMOPTNONE-NEXT: LBB5_5: @ %atomicrmw.end @@ -861,52 +867,58 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} ; ARMOPTNONE-NEXT: add r7, sp, #8 ; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #20 -; ARMOPTNONE-NEXT: str r0, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill ; ARMOPTNONE-NEXT: vmov d16, r1, r2 ; ARMOPTNONE-NEXT: vmov r1, r2, d16 -; ARMOPTNONE-NEXT: str r2, [sp, #4] @ 
4-byte Spill -; ARMOPTNONE-NEXT: str r1, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: ldr r1, [r0] ; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB13_1 ; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 ; ARMOPTNONE-NEXT: @ Child Loop BB13_2 Depth 2 -; ARMOPTNONE-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r3, [sp] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; ARMOPTNONE-NEXT: ldr r10, [sp, #8] @ 4-byte Reload -; ARMOPTNONE-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 -; ARMOPTNONE-NEXT: mov r11, r0 -; ARMOPTNONE-NEXT: mov r8, r2 +; ARMOPTNONE-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r12, [sp, #8] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; ARMOPTNONE-NEXT: ldr r8, [sp, #4] @ 4-byte Reload +; ARMOPTNONE-NEXT: str r3, [sp] @ 4-byte Spill +; ARMOPTNONE-NEXT: @ implicit-def: $r1 +; ARMOPTNONE-NEXT: @ implicit-def: $r9 +; ARMOPTNONE-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 ; ARMOPTNONE-NEXT: mov r9, r1 +; ARMOPTNONE-NEXT: @ kill: def $r0 killed $r0 def $r0_r1 +; ARMOPTNONE-NEXT: mov r1, r12 +; ARMOPTNONE-NEXT: mov r10, r2 +; ARMOPTNONE-NEXT: mov r11, r3 ; ARMOPTNONE-NEXT: LBB13_2: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ Parent Loop BB13_1 Depth=1 ; ARMOPTNONE-NEXT: @ => This Inner Loop Header: Depth=2 -; ARMOPTNONE-NEXT: ldrexd r4, r5, [r3] -; ARMOPTNONE-NEXT: cmp r4, r8 -; ARMOPTNONE-NEXT: cmpeq r5, r9 +; ARMOPTNONE-NEXT: ldrexd r4, r5, [r8] +; ARMOPTNONE-NEXT: cmp r4, r10 +; ARMOPTNONE-NEXT: cmpeq r5, r11 ; ARMOPTNONE-NEXT: bne LBB13_4 ; ARMOPTNONE-NEXT: @ %bb.3: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_2 Depth=2 -; ARMOPTNONE-NEXT: strexd r0, r10, r11, [r3] -; ARMOPTNONE-NEXT: cmp r0, #0 +; ARMOPTNONE-NEXT: strexd r9, r0, r1, [r8] +; ARMOPTNONE-NEXT: cmp r9, #0 ; ARMOPTNONE-NEXT: bne LBB13_2 ; ARMOPTNONE-NEXT: LBB13_4: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_1 Depth=1 +; ARMOPTNONE-NEXT: ldr r1, [sp] @ 4-byte Reload ; ARMOPTNONE-NEXT: mov r0, r5 ; ARMOPTNONE-NEXT: eor r3, r0, r1 ; ARMOPTNONE-NEXT: mov r1, r4 ; ARMOPTNONE-NEXT: eor r2, r1, r2 ; ARMOPTNONE-NEXT: orr r2, r2, r3 ; ARMOPTNONE-NEXT: cmp r2, #0 -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: bne LBB13_1 ; ARMOPTNONE-NEXT: b LBB13_5 ; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end diff --git a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll index 161692137fc30b..a38ade7cdbf06b 100644 --- a/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll +++ b/llvm/test/CodeGen/ARM/atomicrmw_exclusive_monitor_ints.ll @@ -6765,8 +6765,8 @@ entry: define i64 @test_xchg_i64() { ; CHECK-ARM8-LABEL: test_xchg_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, 
lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -6781,25 +6781,29 @@ define i64 @test_xchg_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB33_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: mov r0, #0 -; CHECK-ARM8-NEXT: mov r8, #1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 +; CHECK-ARM8-NEXT: mov r10, #1 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 ; CHECK-ARM8-NEXT: .LBB33_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB33_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB33_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB33_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB33_2 ; CHECK-ARM8-NEXT: .LBB33_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB33_1 Depth=1 @@ -6819,12 +6823,12 @@ define i64 @test_xchg_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_xchg_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI33_0 @@ -6838,24 +6842,28 @@ define i64 @test_xchg_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB33_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: ldr r3, .LCPI33_0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI33_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: mov r0, #0 -; CHECK-ARM6-NEXT: mov r8, #1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 +; CHECK-ARM6-NEXT: mov r10, #1 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; 
CHECK-ARM6-NEXT: mov r11, r0 ; CHECK-ARM6-NEXT: .LBB33_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB33_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB33_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB33_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB33_2 ; CHECK-ARM6-NEXT: .LBB33_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB33_1 Depth=1 @@ -6875,7 +6883,7 @@ define i64 @test_xchg_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI33_0: @@ -6883,8 +6891,8 @@ define i64 @test_xchg_i64() { ; ; CHECK-THUMB7-LABEL: test_xchg_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -6899,26 +6907,30 @@ define i64 @test_xchg_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB33_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 ; CHECK-THUMB7-NEXT: movs r0, #0 -; CHECK-THUMB7-NEXT: mov.w r8, #1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 +; CHECK-THUMB7-NEXT: mov.w r10, #1 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 ; CHECK-THUMB7-NEXT: .LBB33_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB33_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB33_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB33_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB33_2 ; CHECK-THUMB7-NEXT: .LBB33_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB33_1 Depth=1 @@ 
-6938,7 +6950,7 @@ define i64 @test_xchg_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_xchg_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -6975,8 +6987,8 @@ entry: define i64 @test_add_i64() { ; CHECK-ARM8-LABEL: test_add_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -6991,25 +7003,29 @@ define i64 @test_add_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB34_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: adds r8, r2, #1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 +; CHECK-ARM8-NEXT: adds r10, r2, #1 ; CHECK-ARM8-NEXT: adc r0, r1, #0 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB34_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB34_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB34_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB34_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB34_2 ; CHECK-ARM8-NEXT: .LBB34_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB34_1 Depth=1 @@ -7029,12 +7045,12 @@ define i64 @test_add_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_add_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI34_0 @@ -7048,24 +7064,28 @@ define i64 @test_add_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB34_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload 
-; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: adds r8, r2, #1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 +; CHECK-ARM6-NEXT: adds r10, r2, #1 ; CHECK-ARM6-NEXT: adc r0, r1, #0 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI34_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI34_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB34_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB34_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB34_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB34_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB34_2 ; CHECK-ARM6-NEXT: .LBB34_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB34_1 Depth=1 @@ -7085,7 +7105,7 @@ define i64 @test_add_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI34_0: @@ -7093,8 +7113,8 @@ define i64 @test_add_i64() { ; ; CHECK-THUMB7-LABEL: test_add_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -7109,26 +7129,30 @@ define i64 @test_add_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB34_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: adds.w r8, r2, #1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 +; CHECK-THUMB7-NEXT: adds.w r10, r2, #1 ; CHECK-THUMB7-NEXT: adc r0, r1, #0 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB34_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB34_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; 
CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB34_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB34_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB34_2 ; CHECK-THUMB7-NEXT: .LBB34_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB34_1 Depth=1 @@ -7148,7 +7172,7 @@ define i64 @test_add_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_add_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -7185,8 +7209,8 @@ entry: define i64 @test_sub_i64() { ; CHECK-ARM8-LABEL: test_sub_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -7201,25 +7225,29 @@ define i64 @test_sub_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB35_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: subs r8, r2, #1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 +; CHECK-ARM8-NEXT: subs r10, r2, #1 ; CHECK-ARM8-NEXT: sbc r0, r1, #0 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB35_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB35_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB35_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB35_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB35_2 ; CHECK-ARM8-NEXT: .LBB35_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB35_1 Depth=1 @@ -7239,12 +7267,12 @@ define i64 @test_sub_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, 
r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_sub_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI35_0 @@ -7258,24 +7286,28 @@ define i64 @test_sub_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB35_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: subs r8, r2, #1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 +; CHECK-ARM6-NEXT: subs r10, r2, #1 ; CHECK-ARM6-NEXT: sbc r0, r1, #0 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI35_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI35_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB35_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB35_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB35_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB35_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB35_2 ; CHECK-ARM6-NEXT: .LBB35_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB35_1 Depth=1 @@ -7295,7 +7327,7 @@ define i64 @test_sub_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI35_0: @@ -7303,8 +7335,8 @@ define i64 @test_sub_i64() { ; ; CHECK-THUMB7-LABEL: test_sub_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -7319,26 +7351,30 @@ define i64 @test_sub_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB35_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: subs.w r8, r2, #1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 +; CHECK-THUMB7-NEXT: subs.w r10, r2, #1 ; CHECK-THUMB7-NEXT: sbc r0, r1, #0 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, 
:lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB35_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB35_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB35_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB35_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB35_2 ; CHECK-THUMB7-NEXT: .LBB35_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB35_1 Depth=1 @@ -7358,7 +7394,7 @@ define i64 @test_sub_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_sub_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -7395,8 +7431,8 @@ entry: define i64 @test_and_i64() { ; CHECK-ARM8-LABEL: test_and_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -7411,25 +7447,29 @@ define i64 @test_and_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB36_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: and r8, r2, #1 +; CHECK-ARM8-NEXT: and r10, r2, #1 ; CHECK-ARM8-NEXT: mov r0, #0 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB36_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB36_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB36_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ 
%atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB36_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB36_2 ; CHECK-ARM8-NEXT: .LBB36_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB36_1 Depth=1 @@ -7449,12 +7489,12 @@ define i64 @test_and_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_and_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI36_0 @@ -7468,24 +7508,28 @@ define i64 @test_and_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB36_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: and r8, r2, #1 +; CHECK-ARM6-NEXT: and r10, r2, #1 ; CHECK-ARM6-NEXT: mov r0, #0 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: ldr r3, .LCPI36_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 +; CHECK-ARM6-NEXT: ldr r6, .LCPI36_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB36_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB36_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB36_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB36_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB36_2 ; CHECK-ARM6-NEXT: .LBB36_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB36_1 Depth=1 @@ -7505,7 +7549,7 @@ define i64 @test_and_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI36_0: @@ -7513,8 +7557,8 @@ define i64 @test_and_i64() { ; ; CHECK-THUMB7-LABEL: test_and_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; 
CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -7529,26 +7573,30 @@ define i64 @test_and_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB36_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: and r8, r2, #1 +; CHECK-THUMB7-NEXT: and r10, r2, #1 ; CHECK-THUMB7-NEXT: movs r0, #0 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB36_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB36_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB36_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB36_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB36_2 ; CHECK-THUMB7-NEXT: .LBB36_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB36_1 Depth=1 @@ -7568,7 +7616,7 @@ define i64 @test_and_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_and_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -7605,8 +7653,8 @@ entry: define i64 @test_nand_i64() { ; CHECK-ARM8-LABEL: test_nand_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -7621,27 +7669,31 @@ define i64 @test_nand_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB37_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: mvn r0, r2 ; CHECK-ARM8-NEXT: mvn r3, #1 -; CHECK-ARM8-NEXT: orr r8, r0, r3 +; CHECK-ARM8-NEXT: orr r10, r0, r3 ; CHECK-ARM8-NEXT: mvn r0, #0 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; 
CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB37_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB37_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB37_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB37_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB37_2 ; CHECK-ARM8-NEXT: .LBB37_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB37_1 Depth=1 @@ -7661,12 +7713,12 @@ define i64 @test_nand_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_nand_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI37_0 @@ -7680,26 +7732,30 @@ define i64 @test_nand_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB37_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: mvn r0, r2 ; CHECK-ARM6-NEXT: mvn r3, #1 -; CHECK-ARM6-NEXT: orr r8, r0, r3 +; CHECK-ARM6-NEXT: orr r10, r0, r3 ; CHECK-ARM6-NEXT: mvn r0, #0 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI37_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI37_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB37_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB37_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB37_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB37_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB37_2 ; CHECK-ARM6-NEXT: .LBB37_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB37_1 Depth=1 @@ -7719,7 +7775,7 @@ define i64 @test_nand_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte 
Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI37_0: @@ -7727,8 +7783,8 @@ define i64 @test_nand_i64() { ; ; CHECK-THUMB7-LABEL: test_nand_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -7743,27 +7799,31 @@ define i64 @test_nand_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB37_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 ; CHECK-THUMB7-NEXT: mvn r0, #1 -; CHECK-THUMB7-NEXT: orn r8, r0, r2 +; CHECK-THUMB7-NEXT: orn r10, r0, r2 ; CHECK-THUMB7-NEXT: mov.w r0, #-1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB37_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB37_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB37_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB37_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB37_2 ; CHECK-THUMB7-NEXT: .LBB37_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB37_1 Depth=1 @@ -7783,7 +7843,7 @@ define i64 @test_nand_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_nand_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -7820,8 +7880,8 @@ entry: define i64 @test_or_i64() { ; CHECK-ARM8-LABEL: test_or_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, 
:lower16:atomic_i64 @@ -7836,24 +7896,28 @@ define i64 @test_or_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB38_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: orr r8, r2, #1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-ARM8-NEXT: orr r10, r2, #1 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r1 +; CHECK-ARM8-NEXT: mov r8, r2 ; CHECK-ARM8-NEXT: mov r9, r1 -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB38_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB38_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB38_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB38_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB38_2 ; CHECK-ARM8-NEXT: .LBB38_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB38_1 Depth=1 @@ -7873,12 +7937,12 @@ define i64 @test_or_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_or_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI38_0 @@ -7892,23 +7956,27 @@ define i64 @test_or_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB38_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: orr r8, r2, #1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-ARM6-NEXT: orr r10, r2, #1 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r1 +; CHECK-ARM6-NEXT: mov r8, r2 ; CHECK-ARM6-NEXT: mov r9, r1 -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: ldr r3, .LCPI38_0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI38_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB38_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB38_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 
+; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB38_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB38_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB38_2 ; CHECK-ARM6-NEXT: .LBB38_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB38_1 Depth=1 @@ -7928,7 +7996,7 @@ define i64 @test_or_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI38_0: @@ -7936,8 +8004,8 @@ define i64 @test_or_i64() { ; ; CHECK-THUMB7-LABEL: test_or_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -7952,25 +8020,29 @@ define i64 @test_or_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB38_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: orr r8, r2, #1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-THUMB7-NEXT: orr r10, r2, #1 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r1 +; CHECK-THUMB7-NEXT: mov r8, r2 ; CHECK-THUMB7-NEXT: mov r9, r1 -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB38_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB38_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB38_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB38_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB38_2 ; CHECK-THUMB7-NEXT: .LBB38_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB38_1 Depth=1 @@ -7990,7 +8062,7 @@ define i64 @test_or_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_or_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -8027,8 +8099,8 @@ entry: define i64 
@test_xor_i64() { ; CHECK-ARM8-LABEL: test_xor_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -8043,24 +8115,28 @@ define i64 @test_xor_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB39_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: eor r8, r2, #1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-ARM8-NEXT: eor r10, r2, #1 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r1 +; CHECK-ARM8-NEXT: mov r8, r2 ; CHECK-ARM8-NEXT: mov r9, r1 -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB39_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB39_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB39_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB39_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB39_2 ; CHECK-ARM8-NEXT: .LBB39_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB39_1 Depth=1 @@ -8080,12 +8156,12 @@ define i64 @test_xor_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_xor_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI39_0 @@ -8099,23 +8175,27 @@ define i64 @test_xor_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB39_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: eor r8, r2, #1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-ARM6-NEXT: eor r10, r2, #1 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r1 +; CHECK-ARM6-NEXT: mov r8, r2 ; CHECK-ARM6-NEXT: mov r9, r1 -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 -; CHECK-ARM6-NEXT: ldr r3, .LCPI39_0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI39_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ 
implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB39_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB39_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB39_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB39_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB39_2 ; CHECK-ARM6-NEXT: .LBB39_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB39_1 Depth=1 @@ -8135,7 +8215,7 @@ define i64 @test_xor_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI39_0: @@ -8143,8 +8223,8 @@ define i64 @test_xor_i64() { ; ; CHECK-THUMB7-LABEL: test_xor_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -8159,25 +8239,29 @@ define i64 @test_xor_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB39_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: eor r8, r2, #1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 +; CHECK-THUMB7-NEXT: eor r10, r2, #1 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r1 +; CHECK-THUMB7-NEXT: mov r8, r2 ; CHECK-THUMB7-NEXT: mov r9, r1 -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB39_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB39_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB39_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB39_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB39_2 ; CHECK-THUMB7-NEXT: .LBB39_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB39_1 
Depth=1 @@ -8197,7 +8281,7 @@ define i64 @test_xor_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_xor_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -8235,8 +8319,8 @@ entry: define i64 @test_max_i64() { ; CHECK-ARM8-LABEL: test_max_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -8251,32 +8335,36 @@ define i64 @test_max_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB40_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: rsbs r0, r2, #1 ; CHECK-ARM8-NEXT: rscs r0, r1, #0 ; CHECK-ARM8-NEXT: mov r0, #0 ; CHECK-ARM8-NEXT: movwlt r0, #1 -; CHECK-ARM8-NEXT: mov r8, #1 +; CHECK-ARM8-NEXT: mov r10, #1 ; CHECK-ARM8-NEXT: cmp r0, #0 -; CHECK-ARM8-NEXT: movne r8, r2 +; CHECK-ARM8-NEXT: movne r10, r2 ; CHECK-ARM8-NEXT: cmp r0, #0 ; CHECK-ARM8-NEXT: movne r0, r1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB40_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB40_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB40_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB40_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB40_2 ; CHECK-ARM8-NEXT: .LBB40_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB40_1 Depth=1 @@ -8296,12 +8384,12 @@ define i64 @test_max_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_max_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: 
sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI40_0 @@ -8315,31 +8403,35 @@ define i64 @test_max_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB40_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: rsbs r0, r2, #1 ; CHECK-ARM6-NEXT: rscs r0, r1, #0 ; CHECK-ARM6-NEXT: mov r0, #0 ; CHECK-ARM6-NEXT: movlt r0, #1 -; CHECK-ARM6-NEXT: mov r8, #1 +; CHECK-ARM6-NEXT: mov r10, #1 ; CHECK-ARM6-NEXT: cmp r0, #0 -; CHECK-ARM6-NEXT: movne r8, r2 +; CHECK-ARM6-NEXT: movne r10, r2 ; CHECK-ARM6-NEXT: cmp r0, #0 ; CHECK-ARM6-NEXT: movne r0, r1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI40_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI40_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB40_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB40_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB40_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB40_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB40_2 ; CHECK-ARM6-NEXT: .LBB40_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB40_1 Depth=1 @@ -8359,7 +8451,7 @@ define i64 @test_max_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI40_0: @@ -8367,8 +8459,8 @@ define i64 @test_max_i64() { ; ; CHECK-THUMB7-LABEL: test_max_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -8388,31 +8480,35 @@ define i64 @test_max_i64() { ; CHECK-THUMB7-NEXT: sbcs.w r3, r0, r1 ; CHECK-THUMB7-NEXT: it lt ; CHECK-THUMB7-NEXT: movlt r0, #1 -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: mov.w r8, #1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 +; CHECK-THUMB7-NEXT: mov.w r10, #1 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne -; CHECK-THUMB7-NEXT: movne r8, r2 +; CHECK-THUMB7-NEXT: movne r10, r2 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne ; CHECK-THUMB7-NEXT: movne r0, r1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; 
CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB40_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB40_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB40_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB40_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB40_2 ; CHECK-THUMB7-NEXT: .LBB40_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB40_1 Depth=1 @@ -8432,7 +8528,7 @@ define i64 @test_max_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_max_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -8539,8 +8635,8 @@ entry: define i64 @test_min_i64() { ; CHECK-ARM8-LABEL: test_min_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -8555,32 +8651,36 @@ define i64 @test_min_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB41_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: subs r0, r2, #2 ; CHECK-ARM8-NEXT: sbcs r0, r1, #0 ; CHECK-ARM8-NEXT: mov r0, #0 ; CHECK-ARM8-NEXT: movwlt r0, #1 -; CHECK-ARM8-NEXT: mov r8, #1 +; CHECK-ARM8-NEXT: mov r10, #1 ; CHECK-ARM8-NEXT: cmp r0, #0 -; CHECK-ARM8-NEXT: movne r8, r2 +; CHECK-ARM8-NEXT: movne r10, r2 ; CHECK-ARM8-NEXT: cmp r0, #0 ; CHECK-ARM8-NEXT: movne r0, r1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB41_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB41_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, 
[r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB41_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB41_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB41_2 ; CHECK-ARM8-NEXT: .LBB41_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB41_1 Depth=1 @@ -8600,12 +8700,12 @@ define i64 @test_min_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_min_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI41_0 @@ -8619,31 +8719,35 @@ define i64 @test_min_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB41_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: subs r0, r2, #2 ; CHECK-ARM6-NEXT: sbcs r0, r1, #0 ; CHECK-ARM6-NEXT: mov r0, #0 ; CHECK-ARM6-NEXT: movlt r0, #1 -; CHECK-ARM6-NEXT: mov r8, #1 +; CHECK-ARM6-NEXT: mov r10, #1 ; CHECK-ARM6-NEXT: cmp r0, #0 -; CHECK-ARM6-NEXT: movne r8, r2 +; CHECK-ARM6-NEXT: movne r10, r2 ; CHECK-ARM6-NEXT: cmp r0, #0 ; CHECK-ARM6-NEXT: movne r0, r1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI41_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI41_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB41_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB41_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB41_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB41_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB41_2 ; CHECK-ARM6-NEXT: .LBB41_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB41_1 Depth=1 @@ -8663,7 +8767,7 @@ define i64 @test_min_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: 
@ %bb.6: ; CHECK-ARM6-NEXT: .LCPI41_0: @@ -8671,8 +8775,8 @@ define i64 @test_min_i64() { ; ; CHECK-THUMB7-LABEL: test_min_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -8687,36 +8791,40 @@ define i64 @test_min_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB41_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 ; CHECK-THUMB7-NEXT: subs r0, r2, #2 ; CHECK-THUMB7-NEXT: sbcs r0, r1, #0 ; CHECK-THUMB7-NEXT: mov.w r0, #0 ; CHECK-THUMB7-NEXT: it lt ; CHECK-THUMB7-NEXT: movlt r0, #1 -; CHECK-THUMB7-NEXT: mov.w r8, #1 +; CHECK-THUMB7-NEXT: mov.w r10, #1 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne -; CHECK-THUMB7-NEXT: movne r8, r2 +; CHECK-THUMB7-NEXT: movne r10, r2 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne ; CHECK-THUMB7-NEXT: movne r0, r1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB41_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB41_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB41_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB41_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB41_2 ; CHECK-THUMB7-NEXT: .LBB41_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB41_1 Depth=1 @@ -8736,7 +8844,7 @@ define i64 @test_min_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_min_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -8843,8 +8951,8 @@ entry: define i64 @test_umax_i64() { ; CHECK-ARM8-LABEL: test_umax_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub 
sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -8859,32 +8967,36 @@ define i64 @test_umax_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB42_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: rsbs r0, r2, #1 ; CHECK-ARM8-NEXT: rscs r0, r1, #0 ; CHECK-ARM8-NEXT: mov r0, #0 ; CHECK-ARM8-NEXT: movwlo r0, #1 -; CHECK-ARM8-NEXT: mov r8, #1 +; CHECK-ARM8-NEXT: mov r10, #1 ; CHECK-ARM8-NEXT: cmp r0, #0 -; CHECK-ARM8-NEXT: movne r8, r2 +; CHECK-ARM8-NEXT: movne r10, r2 ; CHECK-ARM8-NEXT: cmp r0, #0 ; CHECK-ARM8-NEXT: movne r0, r1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB42_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB42_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB42_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB42_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB42_2 ; CHECK-ARM8-NEXT: .LBB42_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB42_1 Depth=1 @@ -8904,12 +9016,12 @@ define i64 @test_umax_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_umax_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI42_0 @@ -8923,31 +9035,35 @@ define i64 @test_umax_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB42_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: rsbs r0, r2, #1 ; CHECK-ARM6-NEXT: rscs r0, r1, #0 ; CHECK-ARM6-NEXT: mov r0, #0 ; CHECK-ARM6-NEXT: movlo r0, #1 -; CHECK-ARM6-NEXT: mov r8, #1 +; CHECK-ARM6-NEXT: mov r10, #1 ; CHECK-ARM6-NEXT: cmp r0, #0 -; CHECK-ARM6-NEXT: movne r8, r2 +; CHECK-ARM6-NEXT: movne r10, r2 ; CHECK-ARM6-NEXT: cmp r0, #0 ; CHECK-ARM6-NEXT: movne r0, r1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; 
CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI42_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI42_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB42_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB42_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB42_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB42_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB42_2 ; CHECK-ARM6-NEXT: .LBB42_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB42_1 Depth=1 @@ -8967,7 +9083,7 @@ define i64 @test_umax_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI42_0: @@ -8975,8 +9091,8 @@ define i64 @test_umax_i64() { ; ; CHECK-THUMB7-LABEL: test_umax_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -8996,31 +9112,35 @@ define i64 @test_umax_i64() { ; CHECK-THUMB7-NEXT: sbcs.w r3, r0, r1 ; CHECK-THUMB7-NEXT: it lo ; CHECK-THUMB7-NEXT: movlo r0, #1 -; CHECK-THUMB7-NEXT: mov r6, r2 -; CHECK-THUMB7-NEXT: mov r7, r1 -; CHECK-THUMB7-NEXT: mov.w r8, #1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 +; CHECK-THUMB7-NEXT: mov.w r10, #1 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne -; CHECK-THUMB7-NEXT: movne r8, r2 +; CHECK-THUMB7-NEXT: movne r10, r2 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne ; CHECK-THUMB7-NEXT: movne r0, r1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB42_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB42_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; 
CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB42_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB42_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB42_2 ; CHECK-THUMB7-NEXT: .LBB42_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB42_1 Depth=1 @@ -9040,7 +9160,7 @@ define i64 @test_umax_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_umax_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry @@ -9147,8 +9267,8 @@ entry: define i64 @test_umin_i64() { ; CHECK-ARM8-LABEL: test_umin_i64: ; CHECK-ARM8: @ %bb.0: @ %entry -; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM8-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM8-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM8-NEXT: .pad #16 ; CHECK-ARM8-NEXT: sub sp, sp, #16 ; CHECK-ARM8-NEXT: movw r0, :lower16:atomic_i64 @@ -9163,32 +9283,36 @@ define i64 @test_umin_i64() { ; CHECK-ARM8-NEXT: @ Child Loop BB43_2 Depth 2 ; CHECK-ARM8-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM8-NEXT: mov r6, r2 -; CHECK-ARM8-NEXT: mov r7, r1 +; CHECK-ARM8-NEXT: mov r8, r2 +; CHECK-ARM8-NEXT: mov r9, r1 ; CHECK-ARM8-NEXT: subs r0, r2, #2 ; CHECK-ARM8-NEXT: sbcs r0, r1, #0 ; CHECK-ARM8-NEXT: mov r0, #0 ; CHECK-ARM8-NEXT: movwlo r0, #1 -; CHECK-ARM8-NEXT: mov r8, #1 +; CHECK-ARM8-NEXT: mov r10, #1 ; CHECK-ARM8-NEXT: cmp r0, #0 -; CHECK-ARM8-NEXT: movne r8, r2 +; CHECK-ARM8-NEXT: movne r10, r2 ; CHECK-ARM8-NEXT: cmp r0, #0 ; CHECK-ARM8-NEXT: movne r0, r1 -; CHECK-ARM8-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM8-NEXT: mov r9, r0 -; CHECK-ARM8-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-ARM8-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM8-NEXT: mov r11, r0 +; CHECK-ARM8-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-ARM8-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-ARM8-NEXT: @ implicit-def: $r0 +; CHECK-ARM8-NEXT: @ implicit-def: $r3 +; CHECK-ARM8-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM8-NEXT: mov r7, r0 ; CHECK-ARM8-NEXT: .LBB43_2: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ Parent Loop BB43_1 Depth=1 ; CHECK-ARM8-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM8-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM8-NEXT: cmp r4, r6 -; CHECK-ARM8-NEXT: cmpeq r5, r7 +; CHECK-ARM8-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM8-NEXT: cmp r4, r8 +; CHECK-ARM8-NEXT: cmpeq r5, r9 ; CHECK-ARM8-NEXT: bne .LBB43_4 ; CHECK-ARM8-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB43_2 Depth=2 -; CHECK-ARM8-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM8-NEXT: cmp r0, #0 +; CHECK-ARM8-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM8-NEXT: cmp r7, #0 ; CHECK-ARM8-NEXT: bne .LBB43_2 ; CHECK-ARM8-NEXT: .LBB43_4: @ %atomicrmw.start ; CHECK-ARM8-NEXT: @ in Loop: Header=BB43_1 Depth=1 @@ -9208,12 +9332,12 @@ define i64 @test_umin_i64() { ; CHECK-ARM8-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM8-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM8-NEXT: add sp, sp, #16 -; CHECK-ARM8-NEXT: 
pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM8-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-ARM6-LABEL: test_umin_i64: ; CHECK-ARM6: @ %bb.0: @ %entry -; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-ARM6-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-ARM6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-ARM6-NEXT: .pad #16 ; CHECK-ARM6-NEXT: sub sp, sp, #16 ; CHECK-ARM6-NEXT: ldr r0, .LCPI43_0 @@ -9227,31 +9351,35 @@ define i64 @test_umin_i64() { ; CHECK-ARM6-NEXT: @ Child Loop BB43_2 Depth 2 ; CHECK-ARM6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-ARM6-NEXT: mov r6, r2 -; CHECK-ARM6-NEXT: mov r7, r1 +; CHECK-ARM6-NEXT: mov r8, r2 +; CHECK-ARM6-NEXT: mov r9, r1 ; CHECK-ARM6-NEXT: subs r0, r2, #2 ; CHECK-ARM6-NEXT: sbcs r0, r1, #0 ; CHECK-ARM6-NEXT: mov r0, #0 ; CHECK-ARM6-NEXT: movlo r0, #1 -; CHECK-ARM6-NEXT: mov r8, #1 +; CHECK-ARM6-NEXT: mov r10, #1 ; CHECK-ARM6-NEXT: cmp r0, #0 -; CHECK-ARM6-NEXT: movne r8, r2 +; CHECK-ARM6-NEXT: movne r10, r2 ; CHECK-ARM6-NEXT: cmp r0, #0 ; CHECK-ARM6-NEXT: movne r0, r1 -; CHECK-ARM6-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-ARM6-NEXT: mov r9, r0 -; CHECK-ARM6-NEXT: ldr r3, .LCPI43_0 +; CHECK-ARM6-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-ARM6-NEXT: mov r11, r0 +; CHECK-ARM6-NEXT: ldr r6, .LCPI43_0 +; CHECK-ARM6-NEXT: @ implicit-def: $r0 +; CHECK-ARM6-NEXT: @ implicit-def: $r3 +; CHECK-ARM6-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-ARM6-NEXT: mov r7, r0 ; CHECK-ARM6-NEXT: .LBB43_2: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ Parent Loop BB43_1 Depth=1 ; CHECK-ARM6-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-ARM6-NEXT: ldrexd r4, r5, [r3] -; CHECK-ARM6-NEXT: cmp r4, r6 -; CHECK-ARM6-NEXT: cmpeq r5, r7 +; CHECK-ARM6-NEXT: ldrexd r4, r5, [r6] +; CHECK-ARM6-NEXT: cmp r4, r8 +; CHECK-ARM6-NEXT: cmpeq r5, r9 ; CHECK-ARM6-NEXT: bne .LBB43_4 ; CHECK-ARM6-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB43_2 Depth=2 -; CHECK-ARM6-NEXT: strexd r0, r8, r9, [r3] -; CHECK-ARM6-NEXT: cmp r0, #0 +; CHECK-ARM6-NEXT: strexd r7, r10, r11, [r6] +; CHECK-ARM6-NEXT: cmp r7, #0 ; CHECK-ARM6-NEXT: bne .LBB43_2 ; CHECK-ARM6-NEXT: .LBB43_4: @ %atomicrmw.start ; CHECK-ARM6-NEXT: @ in Loop: Header=BB43_1 Depth=1 @@ -9271,7 +9399,7 @@ define i64 @test_umin_i64() { ; CHECK-ARM6-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-ARM6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-ARM6-NEXT: add sp, sp, #16 -; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-ARM6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-ARM6-NEXT: .p2align 2 ; CHECK-ARM6-NEXT: @ %bb.6: ; CHECK-ARM6-NEXT: .LCPI43_0: @@ -9279,8 +9407,8 @@ define i64 @test_umin_i64() { ; ; CHECK-THUMB7-LABEL: test_umin_i64: ; CHECK-THUMB7: @ %bb.0: @ %entry -; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-THUMB7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-THUMB7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-THUMB7-NEXT: .pad #16 ; CHECK-THUMB7-NEXT: sub sp, #16 ; CHECK-THUMB7-NEXT: movw r0, :lower16:atomic_i64 @@ -9295,36 +9423,40 @@ define i64 @test_umin_i64() { ; CHECK-THUMB7-NEXT: @ Child Loop BB43_2 Depth 2 ; CHECK-THUMB7-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-THUMB7-NEXT: mov r6, r2 -; 
CHECK-THUMB7-NEXT: mov r7, r1 +; CHECK-THUMB7-NEXT: mov r8, r2 +; CHECK-THUMB7-NEXT: mov r9, r1 ; CHECK-THUMB7-NEXT: subs r0, r2, #2 ; CHECK-THUMB7-NEXT: sbcs r0, r1, #0 ; CHECK-THUMB7-NEXT: mov.w r0, #0 ; CHECK-THUMB7-NEXT: it lo ; CHECK-THUMB7-NEXT: movlo r0, #1 -; CHECK-THUMB7-NEXT: mov.w r8, #1 +; CHECK-THUMB7-NEXT: mov.w r10, #1 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne -; CHECK-THUMB7-NEXT: movne r8, r2 +; CHECK-THUMB7-NEXT: movne r10, r2 ; CHECK-THUMB7-NEXT: cmp r0, #0 ; CHECK-THUMB7-NEXT: it ne ; CHECK-THUMB7-NEXT: movne r0, r1 -; CHECK-THUMB7-NEXT: @ kill: def $r8 killed $r8 def $r8_r9 -; CHECK-THUMB7-NEXT: mov r9, r0 -; CHECK-THUMB7-NEXT: movw r3, :lower16:atomic_i64 -; CHECK-THUMB7-NEXT: movt r3, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ kill: def $r10 killed $r10 def $r10_r11 +; CHECK-THUMB7-NEXT: mov r11, r0 +; CHECK-THUMB7-NEXT: movw r6, :lower16:atomic_i64 +; CHECK-THUMB7-NEXT: movt r6, :upper16:atomic_i64 +; CHECK-THUMB7-NEXT: @ implicit-def: $r0 +; CHECK-THUMB7-NEXT: @ implicit-def: $r3 +; CHECK-THUMB7-NEXT: @ kill: def $r6 killed $r6 def $r6_r7 +; CHECK-THUMB7-NEXT: mov r7, r0 ; CHECK-THUMB7-NEXT: .LBB43_2: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ Parent Loop BB43_1 Depth=1 ; CHECK-THUMB7-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r3] -; CHECK-THUMB7-NEXT: cmp r4, r6 +; CHECK-THUMB7-NEXT: ldrexd r4, r5, [r6] +; CHECK-THUMB7-NEXT: cmp r4, r8 ; CHECK-THUMB7-NEXT: it eq -; CHECK-THUMB7-NEXT: cmpeq r5, r7 +; CHECK-THUMB7-NEXT: cmpeq r5, r9 ; CHECK-THUMB7-NEXT: bne .LBB43_4 ; CHECK-THUMB7-NEXT: @ %bb.3: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB43_2 Depth=2 -; CHECK-THUMB7-NEXT: strexd r0, r8, r9, [r3] -; CHECK-THUMB7-NEXT: cmp r0, #0 +; CHECK-THUMB7-NEXT: strexd r7, r10, r11, [r6] +; CHECK-THUMB7-NEXT: cmp r7, #0 ; CHECK-THUMB7-NEXT: bne .LBB43_2 ; CHECK-THUMB7-NEXT: .LBB43_4: @ %atomicrmw.start ; CHECK-THUMB7-NEXT: @ in Loop: Header=BB43_1 Depth=1 @@ -9344,7 +9476,7 @@ define i64 @test_umin_i64() { ; CHECK-THUMB7-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-THUMB7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-THUMB7-NEXT: add sp, #16 -; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-THUMB7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-THUMB6-LABEL: test_umin_i64: ; CHECK-THUMB6: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/cmpxchg-O0.ll b/llvm/test/CodeGen/ARM/cmpxchg-O0.ll index 28a64db1aeba4f..9158ae0c9fe159 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg-O0.ll +++ b/llvm/test/CodeGen/ARM/cmpxchg-O0.ll @@ -78,15 +78,14 @@ define { i32, i1 } @test_cmpxchg_32(ptr %addr, i32 %desired, i32 %new) nounwind define { i64, i1 } @test_cmpxchg_64(ptr %addr, i64 %desired, i64 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_64: -; CHECK: mov [[ADDR:r[0-9]+]], r0 ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [[[ADDR]]] +; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0] ; CHECK: cmp [[OLDLO]], r6 ; CHECK: cmpeq [[OLDHI]], r7 ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexd [[STATUS:[lr0-9]+]], r8, r9, [r1] +; CHECK: strexd [[STATUS:[lr0-9]+]], r8, r9, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: diff --git a/llvm/test/CodeGen/ARM/cmpxchg.mir b/llvm/test/CodeGen/ARM/cmpxchg.mir index 20ab787fb4575b..2ef3281ca733e7 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg.mir +++ b/llvm/test/CodeGen/ARM/cmpxchg.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -o - %s -mtriple=armv7-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
+# RUN: llc -o - %s -mtriple=armv7eb-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
 ---
 name: func
 tracksRegLiveness: true
@@ -12,23 +13,23 @@ body: |
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .1:
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT:   liveins: $r4_r5, $r3
+  ; CHECK-NEXT:   liveins: $r4_r5, $r2
   ; CHECK-NEXT: {{ $}}
-  ; CHECK-NEXT:   $r0_r1 = LDREXD $r3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r0_r1 = LDREXD $r2, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   CMPrr killed $r0, $r4, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   CMPrr killed $r1, $r5, 0 /* CC::eq */, killed $cpsr, implicit-def $cpsr
   ; CHECK-NEXT:   Bcc %bb.3, 1 /* CC::ne */, killed $cpsr
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .2:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $r4_r5, $r3
+  ; CHECK-NEXT:   liveins: $r4_r5, $r2
   ; CHECK-NEXT: {{ $}}
-  ; CHECK-NEXT:   early-clobber $r2 = STREXD $r4_r5, $r3, 14 /* CC::al */, $noreg
-  ; CHECK-NEXT:   CMPri killed $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   early-clobber $r3 = STREXD $r4_r5, $r2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   CMPri killed $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   Bcc %bb.1, 1 /* CC::ne */, killed $cpsr
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .3:
-    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2 = CMP_SWAP_64 killed renamable $r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic (s64))
+    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2_r3 = CMP_SWAP_64 killed renamable $r2_r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic (s64))
 ...
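# Reader's note, not part of the upstream test: the expansion above now models
# the CMP_SWAP_64 temporary as the even/odd pair $r2_r3, the pointer moves into
# $r2, and the STREXD status result lands in $r3. The newly added armv7eb RUN
# line suggests the point of the pairing is to keep this expansion correct on
# big-endian targets as well.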
 ---
 name: func2
diff --git a/llvm/test/CodeGen/Thumb2/cmpxchg.mir b/llvm/test/CodeGen/Thumb2/cmpxchg.mir
index 33de25d469a757..c1adb465380f8e 100644
--- a/llvm/test/CodeGen/Thumb2/cmpxchg.mir
+++ b/llvm/test/CodeGen/Thumb2/cmpxchg.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -o - %s -mtriple=thumbv7-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
+# RUN: llc -o - %s -mtriple=thumbv7eb-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
 ---
 name: func
 tracksRegLiveness: true
@@ -12,23 +13,23 @@ body: |
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .1:
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT:   liveins: $r4, $r5, $r3
+  ; CHECK-NEXT:   liveins: $r4, $r5, $r2
   ; CHECK-NEXT: {{ $}}
-  ; CHECK-NEXT:   $r0, $r1 = t2LDREXD $r3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r0, $r1 = t2LDREXD $r2, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   tCMPhir killed $r0, $r4, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   tCMPhir killed $r1, $r5, 0 /* CC::eq */, killed $cpsr, implicit-def $cpsr
   ; CHECK-NEXT:   tBcc %bb.3, 1 /* CC::ne */, killed $cpsr
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .2:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT:   liveins: $r4, $r5, $r3
+  ; CHECK-NEXT:   liveins: $r4, $r5, $r2
   ; CHECK-NEXT: {{ $}}
-  ; CHECK-NEXT:   early-clobber $r2 = t2STREXD $r4, $r5, $r3, 14 /* CC::al */, $noreg
-  ; CHECK-NEXT:   t2CMPri killed $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   early-clobber $r3 = t2STREXD $r4, $r5, $r2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2CMPri killed $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   tBcc %bb.1, 1 /* CC::ne */, killed $cpsr
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: .3:
-    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2 = CMP_SWAP_64 killed renamable $r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic (s64))
+    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2_r3 = CMP_SWAP_64 killed renamable $r2_r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic (s64))
 ...
 ---
 name: func2

From d79c4c111952990062173f30bb83084cb2993f39 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto
Date: Mon, 2 Sep 2024 09:53:44 +0200
Subject: [PATCH 18/33] [CGP] Regenerate `revert-constant-ptr-propagation-on-calls.ll` test (NFC)

Multiple buildbots were previously failing.
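For reference, the gate being added is the standard lit feature check for
target-specific opt tests; a minimal sketch of the pattern (the exact pass
string here is illustrative, the real RUN line lives in the test below):

  ; REQUIRES: aarch64-registered-target
  ; RUN: opt -S -passes=codegenprepare -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s

Without the REQUIRES line, builds that do not register the AArch64 backend
cannot honour the -mtriple and the RUN line fails on those bots.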
---
 .../CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll
index 51f1283a20ab27..8b9e60214057d6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/revert-constant-ptr-propagation-on-calls.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; REQUIRES: aarch64-registered-target

 %struct.S = type { i8 }
 %struct.X = type { i32 }

From 5bd3ee0ac02880df0c4d7e89026ee8b9d8f1039e Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Mon, 2 Sep 2024 09:06:14 +0100
Subject: [PATCH 19/33] [libcxx][test] Use long double test macro in strong_order.pass.cpp (#106742)

---
 .../language.support/cmp/cmp.alg/strong_order.pass.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order.pass.cpp b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order.pass.cpp
index ac6b6879f77309..e384ea289bb5bf 100644
--- a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order.pass.cpp
+++ b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order.pass.cpp
@@ -454,12 +454,16 @@ int main(int, char**)
     test_1_2();
     test_1_3<float>();
     test_1_3<double>();
-    // test_1_3<long double>(); // UNIMPLEMENTED
+#ifdef TEST_LONG_DOUBLE_IS_DOUBLE
+    test_1_3<long double>(); // UNIMPLEMENTED when long double is a distinct type
+#endif
     test_1_4();

     static_assert(test_1_3<float>());
     static_assert(test_1_3<double>());
-    // static_assert(test_1_3<long double>()); // UNIMPLEMENTED
+#ifdef TEST_LONG_DOUBLE_IS_DOUBLE
+    static_assert(test_1_3<long double>()); // UNIMPLEMENTED when long double is a distinct type
+#endif
     static_assert(test_1_4());

     return 0;

From 34b10e165d809bb133d37dfe934859800f21a100 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 2 Sep 2024 09:43:56 +0200
Subject: [PATCH 20/33] [InstCombine] Remove optional LoopInfo dependency

https://github.com/llvm/llvm-project/pull/106075 has removed the last
dependency on LoopInfo in InstCombine, so don't fetch the analysis
anymore and remove the use-loop-info pass option.

---
 .../llvm/Transforms/InstCombine/InstCombine.h |  6 ------
 .../Transforms/InstCombine/InstCombiner.h     |  9 ++-------
 llvm/lib/Passes/PassBuilder.cpp               |  4 +---
 .../InstCombine/InstCombineInternal.h         |  5 ++---
 .../InstCombine/InstructionCombining.cpp      | 19 ++++---------------
 llvm/test/Other/new-pm-print-pipeline.ll      |  4 ++--
 .../InstCombine/gep-combine-loop-invariant.ll |  2 +-
 7 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
index f38ec2debb1813..b4f0166239520a 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
@@ -28,18 +28,12 @@ namespace llvm {
 static constexpr unsigned InstCombineDefaultMaxIterations = 1;

 struct InstCombineOptions {
-  bool UseLoopInfo = false;
   // Verify that a fix point has been reached after MaxIterations.
bool VerifyFixpoint = false; unsigned MaxIterations = InstCombineDefaultMaxIterations; InstCombineOptions() = default; - InstCombineOptions &setUseLoopInfo(bool Value) { - UseLoopInfo = Value; - return *this; - } - InstCombineOptions &setVerifyFixpoint(bool Value) { VerifyFixpoint = Value; return *this; diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h index 05322f7650efc7..f5f16037bef893 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -80,10 +80,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { ProfileSummaryInfo *PSI; DomConditionCache DC; - // Optional analyses. When non-null, these can both be used to do better - // combining and will be updated to reflect any changes. - LoopInfo *LI; - ReversePostOrderTraversal &RPOT; bool MadeIRChange = false; @@ -106,13 +102,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, - ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI, + ProfileSummaryInfo *PSI, const DataLayout &DL, ReversePostOrderTraversal &RPOT) : TTI(TTI), Builder(Builder), Worklist(Worklist), MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true, /*CanUseUndef*/ true, &DC), - ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), LI(LI), RPOT(RPOT) {} + ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), RPOT(RPOT) {} virtual ~InstCombiner() = default; @@ -351,7 +347,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { } BlockFrequencyInfo *getBlockFrequencyInfo() const { return BFI; } ProfileSummaryInfo *getProfileSummaryInfo() const { return PSI; } - LoopInfo *getLoopInfo() const { return LI; } // Call target specific combiners std::optional targetInstCombineIntrinsic(IntrinsicInst &II); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 63173c4abb8191..1df1449fce597c 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -882,9 +882,7 @@ Expected parseInstCombineOptions(StringRef Params) { std::tie(ParamName, Params) = Params.split(';'); bool Enable = !ParamName.consume_front("no-"); - if (ParamName == "use-loop-info") { - Result.setUseLoopInfo(Enable); - } else if (ParamName == "verify-fixpoint") { + if (ParamName == "verify-fixpoint") { Result.setVerifyFixpoint(Enable); } else if (Enable && ParamName.consume_front("max-iterations=")) { APInt MaxIterations; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 57f27e6a3b7fa5..a051a568bfd62e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -51,7 +51,6 @@ class DataLayout; class DominatorTree; class GEPOperator; class GlobalVariable; -class LoopInfo; class OptimizationRemarkEmitter; class ProfileSummaryInfo; class TargetLibraryInfo; @@ -66,10 +65,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, - ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI, + ProfileSummaryInfo *PSI, const DataLayout &DL, ReversePostOrderTraversal &RPOT) : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, 
TTI, DT, ORE, - BFI, BPI, PSI, DL, LI, RPOT) {} + BFI, BPI, PSI, DL, RPOT) {} virtual ~InstCombinerImpl() = default; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 9ee1f0bb7d3577..ad2a620081bcd9 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -48,7 +48,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -5404,7 +5403,7 @@ static bool combineInstructionsOverFunction( Function &F, InstructionWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, LoopInfo *LI, + BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, const InstCombineOptions &Opts) { auto &DL = F.getDataLayout(); @@ -5443,7 +5442,7 @@ static bool combineInstructionsOverFunction( << F.getName() << "\n"); InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, - ORE, BFI, BPI, PSI, DL, LI, RPOT); + ORE, BFI, BPI, PSI, DL, RPOT); IC.MaxArraySizeForCombine = MaxArraySize; bool MadeChangeInThisIteration = IC.prepareWorklist(F); MadeChangeInThisIteration |= IC.run(); @@ -5480,7 +5479,6 @@ void InstCombinePass::printPipeline( OS, MapClassName2PassName); OS << '<'; OS << "max-iterations=" << Options.MaxIterations << ";"; - OS << (Options.UseLoopInfo ? "" : "no-") << "use-loop-info;"; OS << (Options.VerifyFixpoint ? "" : "no-") << "verify-fixpoint"; OS << '>'; } @@ -5493,12 +5491,6 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto &ORE = AM.getResult(F); auto &TTI = AM.getResult(F); - // TODO: Only use LoopInfo when the option is set. This requires that the - // callers in the pass pipeline explicitly set the option. - auto *LI = AM.getCachedResult(F); - if (!LI && Options.UseLoopInfo) - LI = &AM.getResult(F); - auto *AA = &AM.getResult(F); auto &MAMProxy = AM.getResult(F); ProfileSummaryInfo *PSI = @@ -5508,7 +5500,7 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto *BPI = AM.getCachedResult(F); if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, - BFI, BPI, PSI, LI, Options)) + BFI, BPI, PSI, Options)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -5547,8 +5539,6 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { auto &ORE = getAnalysis().getORE(); // Optional analyses. - auto *LIWP = getAnalysisIfAvailable(); - auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; ProfileSummaryInfo *PSI = &getAnalysis().getPSI(); BlockFrequencyInfo *BFI = @@ -5561,8 +5551,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { BPI = &WrapperPass->getBPI(); return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, - BFI, BPI, PSI, LI, - InstCombineOptions()); + BFI, BPI, PSI, InstCombineOptions()); } char InstructionCombiningPass::ID = 0; diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 12f88d60d66cec..9016473b36ba44 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -92,8 +92,8 @@ ; CHECK-27: function(separate-const-offset-from-gep) ;; Test InstCombine options - the first pass checks default settings, and the second checks customized options. -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(instcombine,instcombine)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-28 -; CHECK-28: function(instcombine,instcombine) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(instcombine,instcombine)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-28 +; CHECK-28: function(instcombine,instcombine) ;; Test function-attrs ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='cgscc(function-attrs)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-29 diff --git a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll index 99cdb6bc760b46..b2bc1abeaba568 100644 --- a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll +++ b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes='instcombine' -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" From 0fa78b6c7bd43c2498700a98c47a02cf4fd06388 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Mon, 2 Sep 2024 01:40:13 -0700 Subject: [PATCH 21/33] [clang-format] Correctly annotate braces in macro definition (#106662) Fixes #106418. --- clang/lib/Format/UnwrappedLineParser.cpp | 3 +-- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 246b29d308bfaf..0d42a6c2bfb5c6 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -609,9 +609,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { ProbablyBracedList = NextTok->isNot(tok::l_square); } - // Cpp macro definition body that is a nonempty braced list or block: + // Cpp macro definition body containing nonempty braced list or block: if (IsCpp && Line->InMacroBody && PrevTok != FormatTok && - !FormatTok->Previous && NextTok->is(tok::eof) && // A statement can end with only `;` (simple statement), a block // closing brace (compound statement), or `:` (label statement). // If PrevTok is a block opening brace, Tok ends an empty block. 
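To make the hunk above concrete: in a macro definition body such as

  #define MEMBER(NAME) NAME{""}

the braced list no longer has to open the macro body (nor be the last thing
in it) to be recognized, so the braces around `""` are annotated as a braced
initializer rather than a block. The snippet is only an illustration of the
case from #106418; it mirrors the input of the new unit test below.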
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 497b911f4efbba..5d37a65250d0b1 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3256,6 +3256,11 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[10], BK_Block); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_StructRBrace); EXPECT_BRACE_KIND(Tokens[11], BK_Block); + + Tokens = annotate("#define MEMBER(NAME) NAME{\"\"}"); + ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) { From a156b5a47df58a1ac75cf67e26f557b1a4d26dc9 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 2 Sep 2024 17:06:07 +0800 Subject: [PATCH 22/33] [SLP] Add vectorization support for [u|s]cmp (#106747) This patch adds vectorization support for [u|s]cmp intrinsic calls. --- llvm/lib/Analysis/VectorUtils.cpp | 4 + .../SLPVectorizer/X86/arith-scmp.ll | 646 +++++++++++++++++ .../SLPVectorizer/X86/arith-ucmp.ll | 681 ++++++++++++++++++ 3 files changed, 1331 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/arith-scmp.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/arith-ucmp.ll diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 32ce34114b2f50..d45d3bbefe4fd3 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -103,6 +103,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::fptoui_sat: case Intrinsic::lrint: case Intrinsic::llrint: + case Intrinsic::ucmp: + case Intrinsic::scmp: return true; default: return false; @@ -138,6 +140,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, case Intrinsic::fptoui_sat: case Intrinsic::lrint: case Intrinsic::llrint: + case Intrinsic::ucmp: + case Intrinsic::scmp: return OpdIdx == -1 || OpdIdx == 0; case Intrinsic::is_fpclass: return OpdIdx == 0; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-scmp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-scmp.ll new file mode 100644 index 00000000000000..6e6bd8e526c605 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-scmp.ll @@ -0,0 +1,646 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX + +@a64 = common global [8 x i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = 
common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @scmp_v8i64() { +; AVX-LABEL: @scmp_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], ptr @c64, align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) +; AVX-NEXT: store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @scmp_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; + %a0 = load i64, ptr @a64, align 8 + %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 + %a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 + %a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 + %a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 + %a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 + %a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 + %a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 + %b0 = load i64, ptr @b64, align 8 + %b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 + %b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 + %b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 + %b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 + %b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 + %b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 + %b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 + %r0 = call i64 @llvm.scmp.i64.i64(i64 %a0, i64 %b0) + %r1 = call i64 @llvm.scmp.i64.i64(i64 %a1, i64 %b1) + %r2 = call i64 @llvm.scmp.i64.i64(i64 %a2, i64 %b2) + %r3 = call i64 @llvm.scmp.i64.i64(i64 %a3, i64 %b3) + %r4 = call i64 @llvm.scmp.i64.i64(i64 %a4, i64 %b4) + %r5 = call i64 @llvm.scmp.i64.i64(i64 %a5, i64 %b5) + %r6 = call i64 @llvm.scmp.i64.i64(i64 %a6, i64 %b6) + %r7 = call i64 @llvm.scmp.i64.i64(i64 %a7, i64 %b7) + store i64 %r0, ptr @c64, align 8 + store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 + store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 + 
store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 + store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 + store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 + store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 + store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 + ret void +} + +define void @scmp_v16i32() { +; SSE-LABEL: @scmp_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], ptr @c32, align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @scmp_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @c32, align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @scmp_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, 
ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 + %a11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 + %a12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 + %a13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 + %a14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 + %a15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 + %b0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 + %b11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 + %b12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 + %b13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 + %b14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 + %b15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 + %r0 = call i32 @llvm.scmp.i32.i32(i32 %a0 , i32 %b0 ) + %r1 = call i32 @llvm.scmp.i32.i32(i32 %a1 , i32 %b1 ) + %r2 = call i32 @llvm.scmp.i32.i32(i32 %a2 , i32 %b2 ) + %r3 = call i32 @llvm.scmp.i32.i32(i32 %a3 , i32 %b3 ) + %r4 = call i32 @llvm.scmp.i32.i32(i32 %a4 , i32 %b4 ) + %r5 = call i32 @llvm.scmp.i32.i32(i32 %a5 , i32 %b5 ) + %r6 = call i32 @llvm.scmp.i32.i32(i32 %a6 , i32 %b6 ) + %r7 = call i32 @llvm.scmp.i32.i32(i32 %a7 , i32 %b7 ) + %r8 = call i32 @llvm.scmp.i32.i32(i32 %a8 , i32 %b8 ) + %r9 = call i32 @llvm.scmp.i32.i32(i32 %a9 , i32 %b9 ) + %r10 = call i32 @llvm.scmp.i32.i32(i32 %a10, i32 %b10) + %r11 = call i32 @llvm.scmp.i32.i32(i32 %a11, i32 %b11) + %r12 = call i32 @llvm.scmp.i32.i32(i32 %a12, i32 %b12) + %r13 = call i32 @llvm.scmp.i32.i32(i32 %a13, i32 %b13) + %r14 = 
call i32 @llvm.scmp.i32.i32(i32 %a14, i32 %b14) + %r15 = call i32 @llvm.scmp.i32.i32(i32 %a15, i32 %b15) + store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9 ), align 4 + store i32 %r10, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 + store i32 %r11, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 + store i32 %r12, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 + store i32 %r13, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 + store i32 %r14, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 + store i32 %r15, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 + ret void +} + +define void @scmp_v32i16() { +; SSE-LABEL: @scmp_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], ptr @c16, align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @scmp_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> 
@llvm.scmp.v16i16.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], ptr @c16, align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; AVX-NEXT: store <16 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @scmp_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; + %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 + %a11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 + %a12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 + %a13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 + %a14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 + %a15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 + %a16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 + %a17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 + %a18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 + %a19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 + %a20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 + %a21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 + %a22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 + %a23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 + %a24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 + %a25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 + %a26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 + %a27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 
0, i64 27), align 2 + %a28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 + %a29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 + %a30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 + %a31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 + %b0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 + %b11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 + %b12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 + %b13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 + %b14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 + %b15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 + %b16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 + %b17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 + %b18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 + %b19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 + %b20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 + %b21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 + %b22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 + %b23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 + %b24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 + %b25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 + %b26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 + %b27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 + %b28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 + %b29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 + %b30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 + %b31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 + %r0 = call i16 @llvm.scmp.i16.i16(i16 %a0 , i16 %b0 ) + %r1 = call i16 @llvm.scmp.i16.i16(i16 %a1 , i16 %b1 ) + %r2 = call i16 @llvm.scmp.i16.i16(i16 %a2 , i16 %b2 ) + %r3 = call i16 
@llvm.scmp.i16.i16(i16 %a3 , i16 %b3 ) + %r4 = call i16 @llvm.scmp.i16.i16(i16 %a4 , i16 %b4 ) + %r5 = call i16 @llvm.scmp.i16.i16(i16 %a5 , i16 %b5 ) + %r6 = call i16 @llvm.scmp.i16.i16(i16 %a6 , i16 %b6 ) + %r7 = call i16 @llvm.scmp.i16.i16(i16 %a7 , i16 %b7 ) + %r8 = call i16 @llvm.scmp.i16.i16(i16 %a8 , i16 %b8 ) + %r9 = call i16 @llvm.scmp.i16.i16(i16 %a9 , i16 %b9 ) + %r10 = call i16 @llvm.scmp.i16.i16(i16 %a10, i16 %b10) + %r11 = call i16 @llvm.scmp.i16.i16(i16 %a11, i16 %b11) + %r12 = call i16 @llvm.scmp.i16.i16(i16 %a12, i16 %b12) + %r13 = call i16 @llvm.scmp.i16.i16(i16 %a13, i16 %b13) + %r14 = call i16 @llvm.scmp.i16.i16(i16 %a14, i16 %b14) + %r15 = call i16 @llvm.scmp.i16.i16(i16 %a15, i16 %b15) + %r16 = call i16 @llvm.scmp.i16.i16(i16 %a16, i16 %b16) + %r17 = call i16 @llvm.scmp.i16.i16(i16 %a17, i16 %b17) + %r18 = call i16 @llvm.scmp.i16.i16(i16 %a18, i16 %b18) + %r19 = call i16 @llvm.scmp.i16.i16(i16 %a19, i16 %b19) + %r20 = call i16 @llvm.scmp.i16.i16(i16 %a20, i16 %b20) + %r21 = call i16 @llvm.scmp.i16.i16(i16 %a21, i16 %b21) + %r22 = call i16 @llvm.scmp.i16.i16(i16 %a22, i16 %b22) + %r23 = call i16 @llvm.scmp.i16.i16(i16 %a23, i16 %b23) + %r24 = call i16 @llvm.scmp.i16.i16(i16 %a24, i16 %b24) + %r25 = call i16 @llvm.scmp.i16.i16(i16 %a25, i16 %b25) + %r26 = call i16 @llvm.scmp.i16.i16(i16 %a26, i16 %b26) + %r27 = call i16 @llvm.scmp.i16.i16(i16 %a27, i16 %b27) + %r28 = call i16 @llvm.scmp.i16.i16(i16 %a28, i16 %b28) + %r29 = call i16 @llvm.scmp.i16.i16(i16 %a29, i16 %b29) + %r30 = call i16 @llvm.scmp.i16.i16(i16 %a30, i16 %b30) + %r31 = call i16 @llvm.scmp.i16.i16(i16 %a31, i16 %b31) + store i16 %r0 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9 ), align 2 + store i16 %r10, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 + store i16 %r11, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 + store i16 %r12, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 + store i16 %r13, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 + store i16 %r14, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 + store i16 %r15, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 + store i16 %r16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 + store i16 %r17, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 + store i16 %r18, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 + store i16 %r19, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 + store i16 %r20, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), 
align 2 + store i16 %r21, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 + store i16 %r22, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 + store i16 %r23, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 + store i16 %r24, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 + store i16 %r25, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 + store i16 %r26, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 + store i16 %r27, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 + store i16 %r28, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 + store i16 %r29, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 + store i16 %r30, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 + store i16 %r31, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 + ret void +} + +define void @scmp_v64i8() { +; SSE-LABEL: @scmp_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], ptr @c8, align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; AVX-LABEL: @scmp_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], ptr @c8, align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; AVX-NEXT: store <32 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; 
AVX512-LABEL: @scmp_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; + %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 + %a11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 + %a12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 + %a13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 + %a14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 + %a15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 + %a16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 + %a17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 + %a18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 + %a19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 + %a20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 + %a21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 + %a22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 + %a23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 + %a24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 + %a25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 + %a26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 + %a27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 + %a28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 + %a29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1 + %a30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 + %a31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 + %a32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 + %a33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 + %a34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 + %a35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr 
@a8, i32 0, i64 35), align 1 + %a36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 + %a37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 + %a38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 + %a39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 + %a40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 + %a41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 + %a42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 + %a43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 + %a44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 + %a45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 + %a46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 + %a47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 + %a48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 + %a49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 + %a50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 + %a51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 + %a52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 + %a53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 + %a54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 + %a55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 + %a56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 + %a57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 + %a58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 + %a59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 + %a60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 + %a61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 + %a62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 + %a63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 + %b0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 + %b11 = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 + %b12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 + %b13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 + %b14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 + %b15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 + %b16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 + %b17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 + %b18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 + %b19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 + %b20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 + %b21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 + %b22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 + %b23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 + %b24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 + %b25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 + %b26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 + %b27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 + %b28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 + %b29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 + %b30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 + %b31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 + %b32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 + %b33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 + %b34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 + %b35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 + %b36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 + %b37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 + %b38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 + %b39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 + %b40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 + %b41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 + %b42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 + %b43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 + %b44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1 + %b45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 + %b46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 + %b47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 + %b48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 + %b49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 + %b50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, 
i32 0, i64 50), align 1 + %b51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 + %b52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 + %b53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 + %b54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 + %b55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 + %b56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 + %b57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 + %b58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 + %b59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 + %b60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 + %b61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 + %b62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 + %b63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 + %r0 = call i8 @llvm.scmp.i8.i8(i8 %a0 , i8 %b0 ) + %r1 = call i8 @llvm.scmp.i8.i8(i8 %a1 , i8 %b1 ) + %r2 = call i8 @llvm.scmp.i8.i8(i8 %a2 , i8 %b2 ) + %r3 = call i8 @llvm.scmp.i8.i8(i8 %a3 , i8 %b3 ) + %r4 = call i8 @llvm.scmp.i8.i8(i8 %a4 , i8 %b4 ) + %r5 = call i8 @llvm.scmp.i8.i8(i8 %a5 , i8 %b5 ) + %r6 = call i8 @llvm.scmp.i8.i8(i8 %a6 , i8 %b6 ) + %r7 = call i8 @llvm.scmp.i8.i8(i8 %a7 , i8 %b7 ) + %r8 = call i8 @llvm.scmp.i8.i8(i8 %a8 , i8 %b8 ) + %r9 = call i8 @llvm.scmp.i8.i8(i8 %a9 , i8 %b9 ) + %r10 = call i8 @llvm.scmp.i8.i8(i8 %a10, i8 %b10) + %r11 = call i8 @llvm.scmp.i8.i8(i8 %a11, i8 %b11) + %r12 = call i8 @llvm.scmp.i8.i8(i8 %a12, i8 %b12) + %r13 = call i8 @llvm.scmp.i8.i8(i8 %a13, i8 %b13) + %r14 = call i8 @llvm.scmp.i8.i8(i8 %a14, i8 %b14) + %r15 = call i8 @llvm.scmp.i8.i8(i8 %a15, i8 %b15) + %r16 = call i8 @llvm.scmp.i8.i8(i8 %a16, i8 %b16) + %r17 = call i8 @llvm.scmp.i8.i8(i8 %a17, i8 %b17) + %r18 = call i8 @llvm.scmp.i8.i8(i8 %a18, i8 %b18) + %r19 = call i8 @llvm.scmp.i8.i8(i8 %a19, i8 %b19) + %r20 = call i8 @llvm.scmp.i8.i8(i8 %a20, i8 %b20) + %r21 = call i8 @llvm.scmp.i8.i8(i8 %a21, i8 %b21) + %r22 = call i8 @llvm.scmp.i8.i8(i8 %a22, i8 %b22) + %r23 = call i8 @llvm.scmp.i8.i8(i8 %a23, i8 %b23) + %r24 = call i8 @llvm.scmp.i8.i8(i8 %a24, i8 %b24) + %r25 = call i8 @llvm.scmp.i8.i8(i8 %a25, i8 %b25) + %r26 = call i8 @llvm.scmp.i8.i8(i8 %a26, i8 %b26) + %r27 = call i8 @llvm.scmp.i8.i8(i8 %a27, i8 %b27) + %r28 = call i8 @llvm.scmp.i8.i8(i8 %a28, i8 %b28) + %r29 = call i8 @llvm.scmp.i8.i8(i8 %a29, i8 %b29) + %r30 = call i8 @llvm.scmp.i8.i8(i8 %a30, i8 %b30) + %r31 = call i8 @llvm.scmp.i8.i8(i8 %a31, i8 %b31) + %r32 = call i8 @llvm.scmp.i8.i8(i8 %a32, i8 %b32) + %r33 = call i8 @llvm.scmp.i8.i8(i8 %a33, i8 %b33) + %r34 = call i8 @llvm.scmp.i8.i8(i8 %a34, i8 %b34) + %r35 = call i8 @llvm.scmp.i8.i8(i8 %a35, i8 %b35) + %r36 = call i8 @llvm.scmp.i8.i8(i8 %a36, i8 %b36) + %r37 = call i8 @llvm.scmp.i8.i8(i8 %a37, i8 %b37) + %r38 = call i8 @llvm.scmp.i8.i8(i8 %a38, i8 %b38) + %r39 = call i8 @llvm.scmp.i8.i8(i8 %a39, i8 %b39) + %r40 = call i8 @llvm.scmp.i8.i8(i8 %a40, i8 %b40) + %r41 = call i8 @llvm.scmp.i8.i8(i8 %a41, i8 %b41) + %r42 = call i8 @llvm.scmp.i8.i8(i8 %a42, i8 %b42) + %r43 = call i8 @llvm.scmp.i8.i8(i8 %a43, i8 %b43) + %r44 = call i8 @llvm.scmp.i8.i8(i8 %a44, i8 %b44) + %r45 = call i8 
@llvm.scmp.i8.i8(i8 %a45, i8 %b45) + %r46 = call i8 @llvm.scmp.i8.i8(i8 %a46, i8 %b46) + %r47 = call i8 @llvm.scmp.i8.i8(i8 %a47, i8 %b47) + %r48 = call i8 @llvm.scmp.i8.i8(i8 %a48, i8 %b48) + %r49 = call i8 @llvm.scmp.i8.i8(i8 %a49, i8 %b49) + %r50 = call i8 @llvm.scmp.i8.i8(i8 %a50, i8 %b50) + %r51 = call i8 @llvm.scmp.i8.i8(i8 %a51, i8 %b51) + %r52 = call i8 @llvm.scmp.i8.i8(i8 %a52, i8 %b52) + %r53 = call i8 @llvm.scmp.i8.i8(i8 %a53, i8 %b53) + %r54 = call i8 @llvm.scmp.i8.i8(i8 %a54, i8 %b54) + %r55 = call i8 @llvm.scmp.i8.i8(i8 %a55, i8 %b55) + %r56 = call i8 @llvm.scmp.i8.i8(i8 %a56, i8 %b56) + %r57 = call i8 @llvm.scmp.i8.i8(i8 %a57, i8 %b57) + %r58 = call i8 @llvm.scmp.i8.i8(i8 %a58, i8 %b58) + %r59 = call i8 @llvm.scmp.i8.i8(i8 %a59, i8 %b59) + %r60 = call i8 @llvm.scmp.i8.i8(i8 %a60, i8 %b60) + %r61 = call i8 @llvm.scmp.i8.i8(i8 %a61, i8 %b61) + %r62 = call i8 @llvm.scmp.i8.i8(i8 %a62, i8 %b62) + %r63 = call i8 @llvm.scmp.i8.i8(i8 %a63, i8 %b63) + store i8 %r0 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9 ), align 1 + store i8 %r10, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 + store i8 %r11, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 + store i8 %r12, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 + store i8 %r13, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 + store i8 %r14, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 + store i8 %r15, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 + store i8 %r16, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 + store i8 %r17, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 + store i8 %r18, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 + store i8 %r19, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 + store i8 %r20, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 + store i8 %r21, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 + store i8 %r22, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 + store i8 %r23, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 + store i8 %r24, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 + store i8 %r25, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 + store i8 %r26, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 + store i8 %r27, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 + store i8 %r28, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 + 
store i8 %r29, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 + store i8 %r30, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 + store i8 %r31, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 + store i8 %r32, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 + store i8 %r33, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 + store i8 %r34, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 + store i8 %r35, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 + store i8 %r36, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 + store i8 %r37, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 + store i8 %r38, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 + store i8 %r39, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 + store i8 %r40, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 + store i8 %r41, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 + store i8 %r42, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 + store i8 %r43, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 + store i8 %r44, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 + store i8 %r45, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 + store i8 %r46, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 + store i8 %r47, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 + store i8 %r48, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 + store i8 %r49, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 + store i8 %r50, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 + store i8 %r51, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 + store i8 %r52, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 + store i8 %r53, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 + store i8 %r54, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 + store i8 %r55, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 + store i8 %r56, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 + store i8 %r57, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 + store i8 %r58, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 + store i8 %r59, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 + store i8 %r60, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 + store i8 %r61, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 + store i8 %r62, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 + store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-ucmp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-ucmp.ll new file mode 100644 index 00000000000000..8413238d969c59 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-ucmp.ll @@ -0,0 +1,681 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s 
--check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX + +@a64 = common global [8 x i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @ucmp_v8i64() { +; SSE-LABEL: @ucmp_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A0]], i64 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A1]], i64 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A2]], i64 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A3]], i64 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A4]], i64 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i64 
@llvm.ucmp.i64.i64(i64 [[A5]], i64 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A6]], i64 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[A7]], i64 [[B7]]) +; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8 +; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ucmp_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], ptr @c64, align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) +; AVX-NEXT: store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ucmp_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @c64, align 8 +; AVX512-NEXT: ret void +; + %a0 = load i64, ptr @a64, align 8 + %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8 + %a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8 + %a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8 + %a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8 + %a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8 + %a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8 + %a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8 + %b0 = load i64, ptr @b64, align 8 + %b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8 + %b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8 + %b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8 + %b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8 + %b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8 + %b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8 + %b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8 + %r0 = call i64 @llvm.ucmp.i64.i64(i64 %a0, i64 %b0) + %r1 = call i64 
@llvm.ucmp.i64.i64(i64 %a1, i64 %b1) + %r2 = call i64 @llvm.ucmp.i64.i64(i64 %a2, i64 %b2) + %r3 = call i64 @llvm.ucmp.i64.i64(i64 %a3, i64 %b3) + %r4 = call i64 @llvm.ucmp.i64.i64(i64 %a4, i64 %b4) + %r5 = call i64 @llvm.ucmp.i64.i64(i64 %a5, i64 %b5) + %r6 = call i64 @llvm.ucmp.i64.i64(i64 %a6, i64 %b6) + %r7 = call i64 @llvm.ucmp.i64.i64(i64 %a7, i64 %b7) + store i64 %r0, ptr @c64, align 8 + store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8 + store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8 + store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8 + store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8 + store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8 + store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8 + store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8 + ret void +} + +define void @ucmp_v16i32() { +; SSE-LABEL: @ucmp_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], ptr @c32, align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ucmp_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @c32, align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4 +; AVX-NEXT: ret 
void +; +; AVX512-LABEL: @ucmp_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4 +; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]]) +; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @c32, align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4 + %a11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4 + %a12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 + %a13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4 + %a14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4 + %a15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4 + %b0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4 + %b11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4 + %b12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 + %b13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4 + %b14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4 + %b15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4 + %r0 = call i32 @llvm.ucmp.i32.i32(i32 %a0 , i32 %b0 ) + %r1 = call i32 @llvm.ucmp.i32.i32(i32 %a1 , i32 %b1 ) + %r2 = call i32 @llvm.ucmp.i32.i32(i32 %a2 , i32 %b2 ) + %r3 = call i32 @llvm.ucmp.i32.i32(i32 %a3 , i32 %b3 ) 
+ %r4 = call i32 @llvm.ucmp.i32.i32(i32 %a4 , i32 %b4 ) + %r5 = call i32 @llvm.ucmp.i32.i32(i32 %a5 , i32 %b5 ) + %r6 = call i32 @llvm.ucmp.i32.i32(i32 %a6 , i32 %b6 ) + %r7 = call i32 @llvm.ucmp.i32.i32(i32 %a7 , i32 %b7 ) + %r8 = call i32 @llvm.ucmp.i32.i32(i32 %a8 , i32 %b8 ) + %r9 = call i32 @llvm.ucmp.i32.i32(i32 %a9 , i32 %b9 ) + %r10 = call i32 @llvm.ucmp.i32.i32(i32 %a10, i32 %b10) + %r11 = call i32 @llvm.ucmp.i32.i32(i32 %a11, i32 %b11) + %r12 = call i32 @llvm.ucmp.i32.i32(i32 %a12, i32 %b12) + %r13 = call i32 @llvm.ucmp.i32.i32(i32 %a13, i32 %b13) + %r14 = call i32 @llvm.ucmp.i32.i32(i32 %a14, i32 %b14) + %r15 = call i32 @llvm.ucmp.i32.i32(i32 %a15, i32 %b15) + store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9 ), align 4 + store i32 %r10, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4 + store i32 %r11, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4 + store i32 %r12, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4 + store i32 %r13, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4 + store i32 %r14, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4 + store i32 %r15, ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4 + ret void +} + +define void @ucmp_v32i16() { +; SSE-LABEL: @ucmp_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], ptr @c16, align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: 
[[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) +; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ucmp_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], ptr @c16, align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) +; AVX-NEXT: store <16 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ucmp_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2 +; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]]) +; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @c16, align 2 +; AVX512-NEXT: ret void +; + %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 + %a11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 + %a12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 + %a13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 + %a14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 + %a15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 + %a16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 + %a17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 + %a18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 + %a19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 + %a20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 + %a21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 
21), align 2 + %a22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 + %a23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 + %a24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 + %a25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 + %a26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 + %a27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 + %a28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 + %a29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 + %a30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 + %a31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 + %b0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 + %b11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 + %b12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 + %b13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 + %b14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 + %b15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 + %b16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 + %b17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 + %b18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 + %b19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 + %b20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 + %b21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 + %b22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 + %b23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 + %b24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 + %b25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 + %b26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 + %b27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 + %b28 = load 
i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 + %b29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 + %b30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 + %b31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 + %r0 = call i16 @llvm.ucmp.i16.i16(i16 %a0 , i16 %b0 ) + %r1 = call i16 @llvm.ucmp.i16.i16(i16 %a1 , i16 %b1 ) + %r2 = call i16 @llvm.ucmp.i16.i16(i16 %a2 , i16 %b2 ) + %r3 = call i16 @llvm.ucmp.i16.i16(i16 %a3 , i16 %b3 ) + %r4 = call i16 @llvm.ucmp.i16.i16(i16 %a4 , i16 %b4 ) + %r5 = call i16 @llvm.ucmp.i16.i16(i16 %a5 , i16 %b5 ) + %r6 = call i16 @llvm.ucmp.i16.i16(i16 %a6 , i16 %b6 ) + %r7 = call i16 @llvm.ucmp.i16.i16(i16 %a7 , i16 %b7 ) + %r8 = call i16 @llvm.ucmp.i16.i16(i16 %a8 , i16 %b8 ) + %r9 = call i16 @llvm.ucmp.i16.i16(i16 %a9 , i16 %b9 ) + %r10 = call i16 @llvm.ucmp.i16.i16(i16 %a10, i16 %b10) + %r11 = call i16 @llvm.ucmp.i16.i16(i16 %a11, i16 %b11) + %r12 = call i16 @llvm.ucmp.i16.i16(i16 %a12, i16 %b12) + %r13 = call i16 @llvm.ucmp.i16.i16(i16 %a13, i16 %b13) + %r14 = call i16 @llvm.ucmp.i16.i16(i16 %a14, i16 %b14) + %r15 = call i16 @llvm.ucmp.i16.i16(i16 %a15, i16 %b15) + %r16 = call i16 @llvm.ucmp.i16.i16(i16 %a16, i16 %b16) + %r17 = call i16 @llvm.ucmp.i16.i16(i16 %a17, i16 %b17) + %r18 = call i16 @llvm.ucmp.i16.i16(i16 %a18, i16 %b18) + %r19 = call i16 @llvm.ucmp.i16.i16(i16 %a19, i16 %b19) + %r20 = call i16 @llvm.ucmp.i16.i16(i16 %a20, i16 %b20) + %r21 = call i16 @llvm.ucmp.i16.i16(i16 %a21, i16 %b21) + %r22 = call i16 @llvm.ucmp.i16.i16(i16 %a22, i16 %b22) + %r23 = call i16 @llvm.ucmp.i16.i16(i16 %a23, i16 %b23) + %r24 = call i16 @llvm.ucmp.i16.i16(i16 %a24, i16 %b24) + %r25 = call i16 @llvm.ucmp.i16.i16(i16 %a25, i16 %b25) + %r26 = call i16 @llvm.ucmp.i16.i16(i16 %a26, i16 %b26) + %r27 = call i16 @llvm.ucmp.i16.i16(i16 %a27, i16 %b27) + %r28 = call i16 @llvm.ucmp.i16.i16(i16 %a28, i16 %b28) + %r29 = call i16 @llvm.ucmp.i16.i16(i16 %a29, i16 %b29) + %r30 = call i16 @llvm.ucmp.i16.i16(i16 %a30, i16 %b30) + %r31 = call i16 @llvm.ucmp.i16.i16(i16 %a31, i16 %b31) + store i16 %r0 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9 ), align 2 + store i16 %r10, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2 + store i16 %r11, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2 + store i16 %r12, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2 + store i16 %r13, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2 + store i16 %r14, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2 + 
store i16 %r15, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2 + store i16 %r16, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2 + store i16 %r17, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2 + store i16 %r18, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2 + store i16 %r19, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2 + store i16 %r20, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2 + store i16 %r21, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2 + store i16 %r22, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2 + store i16 %r23, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2 + store i16 %r24, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2 + store i16 %r25, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2 + store i16 %r26, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2 + store i16 %r27, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2 + store i16 %r28, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2 + store i16 %r29, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2 + store i16 %r30, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2 + store i16 %r31, ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2 + ret void +} + +define void @ucmp_v64i8() { +; SSE-LABEL: @ucmp_v64i8( +; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], ptr @c8, align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ucmp_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; 
AVX-NEXT: store <32 x i8> [[TMP3]], ptr @c8, align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) +; AVX-NEXT: store <32 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ucmp_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1 +; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]]) +; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @c8, align 1 +; AVX512-NEXT: ret void +; + %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1 + %a11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1 + %a12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1 + %a13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1 + %a14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1 + %a15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1 + %a16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 + %a17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1 + %a18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1 + %a19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1 + %a20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1 + %a21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1 + %a22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1 + %a23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1 + %a24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1 + %a25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1 + %a26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1 + %a27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1 + %a28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1 + %a29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, 
i64 29), align 1 + %a30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1 + %a31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1 + %a32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 + %a33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1 + %a34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1 + %a35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1 + %a36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1 + %a37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1 + %a38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1 + %a39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1 + %a40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1 + %a41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1 + %a42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1 + %a43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1 + %a44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1 + %a45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1 + %a46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1 + %a47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1 + %a48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 + %a49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1 + %a50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1 + %a51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1 + %a52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1 + %a53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1 + %a54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1 + %a55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1 + %a56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1 + %a57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1 + %a58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1 + %a59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1 + %a60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1 + %a61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1 + %a62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1 + %a63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1 + %b0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, ptr 
getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1 + %b11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1 + %b12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1 + %b13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1 + %b14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1 + %b15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1 + %b16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 + %b17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1 + %b18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1 + %b19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1 + %b20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1 + %b21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1 + %b22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1 + %b23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1 + %b24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1 + %b25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1 + %b26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1 + %b27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1 + %b28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1 + %b29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1 + %b30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1 + %b31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1 + %b32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 + %b33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1 + %b34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1 + %b35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1 + %b36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1 + %b37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1 + %b38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1 + %b39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1 + %b40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1 + %b41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1 + %b42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1 + %b43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1 + %b44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 
0, i64 44), align 1 + %b45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1 + %b46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1 + %b47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1 + %b48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 + %b49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1 + %b50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1 + %b51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1 + %b52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1 + %b53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1 + %b54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1 + %b55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1 + %b56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1 + %b57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1 + %b58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1 + %b59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1 + %b60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1 + %b61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1 + %b62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1 + %b63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1 + %r0 = call i8 @llvm.ucmp.i8.i8(i8 %a0 , i8 %b0 ) + %r1 = call i8 @llvm.ucmp.i8.i8(i8 %a1 , i8 %b1 ) + %r2 = call i8 @llvm.ucmp.i8.i8(i8 %a2 , i8 %b2 ) + %r3 = call i8 @llvm.ucmp.i8.i8(i8 %a3 , i8 %b3 ) + %r4 = call i8 @llvm.ucmp.i8.i8(i8 %a4 , i8 %b4 ) + %r5 = call i8 @llvm.ucmp.i8.i8(i8 %a5 , i8 %b5 ) + %r6 = call i8 @llvm.ucmp.i8.i8(i8 %a6 , i8 %b6 ) + %r7 = call i8 @llvm.ucmp.i8.i8(i8 %a7 , i8 %b7 ) + %r8 = call i8 @llvm.ucmp.i8.i8(i8 %a8 , i8 %b8 ) + %r9 = call i8 @llvm.ucmp.i8.i8(i8 %a9 , i8 %b9 ) + %r10 = call i8 @llvm.ucmp.i8.i8(i8 %a10, i8 %b10) + %r11 = call i8 @llvm.ucmp.i8.i8(i8 %a11, i8 %b11) + %r12 = call i8 @llvm.ucmp.i8.i8(i8 %a12, i8 %b12) + %r13 = call i8 @llvm.ucmp.i8.i8(i8 %a13, i8 %b13) + %r14 = call i8 @llvm.ucmp.i8.i8(i8 %a14, i8 %b14) + %r15 = call i8 @llvm.ucmp.i8.i8(i8 %a15, i8 %b15) + %r16 = call i8 @llvm.ucmp.i8.i8(i8 %a16, i8 %b16) + %r17 = call i8 @llvm.ucmp.i8.i8(i8 %a17, i8 %b17) + %r18 = call i8 @llvm.ucmp.i8.i8(i8 %a18, i8 %b18) + %r19 = call i8 @llvm.ucmp.i8.i8(i8 %a19, i8 %b19) + %r20 = call i8 @llvm.ucmp.i8.i8(i8 %a20, i8 %b20) + %r21 = call i8 @llvm.ucmp.i8.i8(i8 %a21, i8 %b21) + %r22 = call i8 @llvm.ucmp.i8.i8(i8 %a22, i8 %b22) + %r23 = call i8 @llvm.ucmp.i8.i8(i8 %a23, i8 %b23) + %r24 = call i8 @llvm.ucmp.i8.i8(i8 %a24, i8 %b24) + %r25 = call i8 @llvm.ucmp.i8.i8(i8 %a25, i8 %b25) + %r26 = call i8 @llvm.ucmp.i8.i8(i8 %a26, i8 %b26) + %r27 = call i8 @llvm.ucmp.i8.i8(i8 %a27, i8 %b27) + %r28 = call i8 @llvm.ucmp.i8.i8(i8 %a28, i8 %b28) + %r29 = call i8 @llvm.ucmp.i8.i8(i8 %a29, i8 %b29) + %r30 = call i8 @llvm.ucmp.i8.i8(i8 %a30, i8 %b30) + %r31 = call i8 @llvm.ucmp.i8.i8(i8 %a31, i8 %b31) + %r32 = call i8 @llvm.ucmp.i8.i8(i8 %a32, i8 %b32) + %r33 = call i8 @llvm.ucmp.i8.i8(i8 %a33, i8 %b33) + %r34 = call i8 @llvm.ucmp.i8.i8(i8 %a34, i8 %b34) + %r35 = call 
i8 @llvm.ucmp.i8.i8(i8 %a35, i8 %b35) + %r36 = call i8 @llvm.ucmp.i8.i8(i8 %a36, i8 %b36) + %r37 = call i8 @llvm.ucmp.i8.i8(i8 %a37, i8 %b37) + %r38 = call i8 @llvm.ucmp.i8.i8(i8 %a38, i8 %b38) + %r39 = call i8 @llvm.ucmp.i8.i8(i8 %a39, i8 %b39) + %r40 = call i8 @llvm.ucmp.i8.i8(i8 %a40, i8 %b40) + %r41 = call i8 @llvm.ucmp.i8.i8(i8 %a41, i8 %b41) + %r42 = call i8 @llvm.ucmp.i8.i8(i8 %a42, i8 %b42) + %r43 = call i8 @llvm.ucmp.i8.i8(i8 %a43, i8 %b43) + %r44 = call i8 @llvm.ucmp.i8.i8(i8 %a44, i8 %b44) + %r45 = call i8 @llvm.ucmp.i8.i8(i8 %a45, i8 %b45) + %r46 = call i8 @llvm.ucmp.i8.i8(i8 %a46, i8 %b46) + %r47 = call i8 @llvm.ucmp.i8.i8(i8 %a47, i8 %b47) + %r48 = call i8 @llvm.ucmp.i8.i8(i8 %a48, i8 %b48) + %r49 = call i8 @llvm.ucmp.i8.i8(i8 %a49, i8 %b49) + %r50 = call i8 @llvm.ucmp.i8.i8(i8 %a50, i8 %b50) + %r51 = call i8 @llvm.ucmp.i8.i8(i8 %a51, i8 %b51) + %r52 = call i8 @llvm.ucmp.i8.i8(i8 %a52, i8 %b52) + %r53 = call i8 @llvm.ucmp.i8.i8(i8 %a53, i8 %b53) + %r54 = call i8 @llvm.ucmp.i8.i8(i8 %a54, i8 %b54) + %r55 = call i8 @llvm.ucmp.i8.i8(i8 %a55, i8 %b55) + %r56 = call i8 @llvm.ucmp.i8.i8(i8 %a56, i8 %b56) + %r57 = call i8 @llvm.ucmp.i8.i8(i8 %a57, i8 %b57) + %r58 = call i8 @llvm.ucmp.i8.i8(i8 %a58, i8 %b58) + %r59 = call i8 @llvm.ucmp.i8.i8(i8 %a59, i8 %b59) + %r60 = call i8 @llvm.ucmp.i8.i8(i8 %a60, i8 %b60) + %r61 = call i8 @llvm.ucmp.i8.i8(i8 %a61, i8 %b61) + %r62 = call i8 @llvm.ucmp.i8.i8(i8 %a62, i8 %b62) + %r63 = call i8 @llvm.ucmp.i8.i8(i8 %a63, i8 %b63) + store i8 %r0 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9 ), align 1 + store i8 %r10, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1 + store i8 %r11, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1 + store i8 %r12, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1 + store i8 %r13, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1 + store i8 %r14, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1 + store i8 %r15, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1 + store i8 %r16, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 + store i8 %r17, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1 + store i8 %r18, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1 + store i8 %r19, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1 + store i8 %r20, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1 + store i8 %r21, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1 + store i8 %r22, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1 + store i8 
%r23, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1 + store i8 %r24, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1 + store i8 %r25, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1 + store i8 %r26, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1 + store i8 %r27, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1 + store i8 %r28, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1 + store i8 %r29, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1 + store i8 %r30, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1 + store i8 %r31, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1 + store i8 %r32, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 + store i8 %r33, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1 + store i8 %r34, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1 + store i8 %r35, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1 + store i8 %r36, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1 + store i8 %r37, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1 + store i8 %r38, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1 + store i8 %r39, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1 + store i8 %r40, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1 + store i8 %r41, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1 + store i8 %r42, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1 + store i8 %r43, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1 + store i8 %r44, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1 + store i8 %r45, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1 + store i8 %r46, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1 + store i8 %r47, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1 + store i8 %r48, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 + store i8 %r49, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1 + store i8 %r50, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1 + store i8 %r51, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1 + store i8 %r52, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1 + store i8 %r53, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1 + store i8 %r54, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1 + store i8 %r55, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1 + store i8 %r56, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1 + store i8 %r57, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1 + store i8 %r58, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1 + store i8 %r59, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1 + store i8 %r60, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1 + store i8 %r61, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1 + store i8 %r62, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1 + 
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1 + ret void +} From a0a253181e3eb2e7173a37b043b82325c7cddd67 Mon Sep 17 00:00:00 2001 From: Alastair Houghton Date: Mon, 2 Sep 2024 10:07:11 +0100 Subject: [PATCH 23/33] [RuntimeDyld][Windows] Allocate space for dllimport things. (#102586) We weren't taking account of the space we require in the stubs for things that are dllimported, and as a result we could hit the assertion failure for running out of stub space. Fix that. rdar://133473673 --------- Co-authored-by: Saleem Abdulrasool Co-authored-by: Lang Hames Co-authored-by: Ben Barham --- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 5 ++++- .../ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp | 10 ++++++++++ llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h | 7 +++++++ llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 10 ++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 7eb7da0138c972..5ac5532705dc49 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -690,9 +690,12 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, if (!(RelSecI == Section)) continue; - for (const RelocationRef &Reloc : SI->relocations()) + for (const RelocationRef &Reloc : SI->relocations()) { if (relocationNeedsStub(Reloc)) StubBufSize += StubSize; + if (relocationNeedsDLLImportStub(Reloc)) + StubBufSize = sizeAfterAddingDLLImportStub(StubBufSize); + } } // Get section data size and alignment diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index 25a2d8780fb56c..73b37ee0ff3311 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -119,4 +119,14 @@ bool RuntimeDyldCOFF::isCompatibleFile(const object::ObjectFile &Obj) const { return Obj.isCOFF(); } +bool RuntimeDyldCOFF::relocationNeedsDLLImportStub( + const RelocationRef &R) const { + object::symbol_iterator Symbol = R.getSymbol(); + Expected TargetNameOrErr = Symbol->getName(); + if (!TargetNameOrErr) + return false; + + return TargetNameOrErr->starts_with(getImportSymbolPrefix()); +} + } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h index 25e3783cf160b2..51d177c7bb8bec 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h @@ -14,6 +14,7 @@ #define LLVM_RUNTIME_DYLD_COFF_H #include "RuntimeDyldImpl.h" +#include "llvm/Support/MathExtras.h" namespace llvm { @@ -45,6 +46,12 @@ class RuntimeDyldCOFF : public RuntimeDyldImpl { static constexpr StringRef getImportSymbolPrefix() { return "__imp_"; } + bool relocationNeedsDLLImportStub(const RelocationRef &R) const; + + unsigned sizeAfterAddingDLLImportStub(unsigned Size) const { + return alignTo(Size, PointerSize) + PointerSize; + } + private: unsigned PointerSize; uint32_t PointerReloc; diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index e09c632842d6e9..de7630b9747ea4 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -455,6 +455,16 @@ class 
RuntimeDyldImpl { return true; // Conservative answer } + // Return true if the relocation R may require allocating a DLL import stub. + virtual bool relocationNeedsDLLImportStub(const RelocationRef &R) const { + return false; + } + + // Add the size of a DLL import stub to the buffer size + virtual unsigned sizeAfterAddingDLLImportStub(unsigned Size) const { + return Size; + } + public: RuntimeDyldImpl(RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver) From cde3838c430502620cb4c1663e843e465c6e67b5 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 2 Sep 2024 10:12:43 +0100 Subject: [PATCH 24/33] [flang][runtime] long double isn't always f80 (#106746) f80 is only a thing on x86, and even then the size of long double can be changed with compiler flags. Instead set the size according to the host system (this is what is already done for integer types). --- .../flang/Optimizer/Builder/Runtime/RTBuilder.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h index 845ba385918d0d..a103861f1510b8 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h @@ -341,7 +341,18 @@ constexpr TypeBuilderFunc getModel() { template <> constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { - return mlir::FloatType::getF80(context); + // See TODO at the top of the file. This is configuring for the host system + // - it might be incorrect when cross-compiling! + constexpr size_t size = sizeof(long double); + static_assert(size == 16 || size == 10 || size == 8, + "unsupported long double size"); + if constexpr (size == 16) + return mlir::FloatType::getF128(context); + if constexpr (size == 10) + return mlir::FloatType::getF80(context); + if constexpr (size == 8) + return mlir::FloatType::getF64(context); + llvm_unreachable("failed static assert"); }; } template <> From eaea4d15acd4cab92e6287d692d2652066c3368a Mon Sep 17 00:00:00 2001 From: c8ef Date: Mon, 2 Sep 2024 17:15:44 +0800 Subject: [PATCH 25/33] [clang] The ms-extension __noop should return zero in a constexpr context. (#106849) Fixes #106713. --- clang/lib/AST/ExprConstant.cpp | 4 ++-- clang/test/SemaCXX/builtins.cpp | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b5dfd4dd32b63c..3dc13c14c00343 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12720,8 +12720,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, } case Builtin::BI__noop: - // __noop always evaluates successfully - return true; + // __noop always evaluates successfully and returns 0. 
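+    // For example (mirroring the new SemaCXX tests below):
+    // `__noop(not_called())` folds to 0 in a constant expression and does
+    // not evaluate the call to not_called().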
+ return Success(0, E); case Builtin::BI__builtin_is_constant_evaluated: { const auto *Callee = Info.CurrentCall->getCallee(); diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp index c6fbb8b514d671..f47ed3a1f7ebfc 100644 --- a/clang/test/SemaCXX/builtins.cpp +++ b/clang/test/SemaCXX/builtins.cpp @@ -177,5 +177,21 @@ static void __builtin_cpu_init(); // expected-error {{static declaration of '__b #endif #ifdef _MSC_VER -constexpr int x = []{ __noop; return 0; }(); // expected-no-diagnostics +constexpr int x = [] { + __noop; + return 0; +}(); // expected-no-diagnostics +static_assert([] { return __noop; }() == 0); +static_assert([] { return __noop(4); }() == 0); +extern int not_accessed; +void not_called(); +static_assert([] { return __noop(not_accessed *= 6); }() == 0); +static_assert([] { return __noop(not_called()); }() == 0); +static_assert([] { return __noop(throw ""); }() == 0); +static_assert([] { return __noop(throw "", throw ""); }() == 0); +static_assert([] { + int a = 5; + __noop(++a); + return a; +}() == 5); #endif From 87d904871fe96a01dfa1f254ca2a7639de67960c Mon Sep 17 00:00:00 2001 From: Alastair Houghton Date: Mon, 2 Sep 2024 10:27:28 +0100 Subject: [PATCH 26/33] Revert "[RuntimeDyld][Windows] Allocate space for dllimport things." (#106954) Looks like I missed an `override` (maybe that warning was enabled recently?). Will revert and fix. Reverts llvm/llvm-project#102586 --- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 5 +---- .../ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp | 10 ---------- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h | 7 ------- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 10 ---------- 4 files changed, 1 insertion(+), 31 deletions(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 5ac5532705dc49..7eb7da0138c972 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -690,12 +690,9 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, if (!(RelSecI == Section)) continue; - for (const RelocationRef &Reloc : SI->relocations()) { + for (const RelocationRef &Reloc : SI->relocations()) if (relocationNeedsStub(Reloc)) StubBufSize += StubSize; - if (relocationNeedsDLLImportStub(Reloc)) - StubBufSize = sizeAfterAddingDLLImportStub(StubBufSize); - } } // Get section data size and alignment diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index 73b37ee0ff3311..25a2d8780fb56c 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -119,14 +119,4 @@ bool RuntimeDyldCOFF::isCompatibleFile(const object::ObjectFile &Obj) const { return Obj.isCOFF(); } -bool RuntimeDyldCOFF::relocationNeedsDLLImportStub( - const RelocationRef &R) const { - object::symbol_iterator Symbol = R.getSymbol(); - Expected TargetNameOrErr = Symbol->getName(); - if (!TargetNameOrErr) - return false; - - return TargetNameOrErr->starts_with(getImportSymbolPrefix()); -} - } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h index 51d177c7bb8bec..25e3783cf160b2 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h @@ -14,7 +14,6 @@ #define 
LLVM_RUNTIME_DYLD_COFF_H #include "RuntimeDyldImpl.h" -#include "llvm/Support/MathExtras.h" namespace llvm { @@ -46,12 +45,6 @@ class RuntimeDyldCOFF : public RuntimeDyldImpl { static constexpr StringRef getImportSymbolPrefix() { return "__imp_"; } - bool relocationNeedsDLLImportStub(const RelocationRef &R) const; - - unsigned sizeAfterAddingDLLImportStub(unsigned Size) const { - return alignTo(Size, PointerSize) + PointerSize; - } - private: unsigned PointerSize; uint32_t PointerReloc; diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index de7630b9747ea4..e09c632842d6e9 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -455,16 +455,6 @@ class RuntimeDyldImpl { return true; // Conservative answer } - // Return true if the relocation R may require allocating a DLL import stub. - virtual bool relocationNeedsDLLImportStub(const RelocationRef &R) const { - return false; - } - - // Add the size of a DLL import stub to the buffer size - virtual unsigned sizeAfterAddingDLLImportStub(unsigned Size) const { - return Size; - } - public: RuntimeDyldImpl(RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver) From 24fe1d4fd61983277c8061ce591970bc775a0fb5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 2 Sep 2024 11:44:37 +0200 Subject: [PATCH 27/33] [SCCP] Infer return attributes in SCCP as well (#106732) We can infer the range/nonnull attributes in non-interprocedural SCCP as well. The results may be better after the function has been simplified. --- clang/test/CodeGen/attr-counted-by.c | 8 ++--- .../llvm/Transforms/Utils/SCCPSolver.h | 4 ++- llvm/lib/Transforms/IPO/SCCP.cpp | 32 +++---------------- llvm/lib/Transforms/Scalar/SCCP.cpp | 8 +++++ llvm/lib/Transforms/Utils/SCCPSolver.cpp | 30 ++++++++++++++++- .../icmp-ashr-breaking-select-idiom.ll | 4 +-- llvm/test/Transforms/SCCP/exact-flags.ll | 4 +-- llvm/test/Transforms/SCCP/phis.ll | 4 +-- llvm/test/Transforms/SCCP/pointer-nonnull.ll | 10 ++---- 9 files changed, 57 insertions(+), 47 deletions(-) diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 3ed8b6f0c71861..ab36b6e7720ba3 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -639,7 +639,7 @@ void test6(struct anon_struct *p, int index) { p->array[index] = __builtin_dynamic_object_size(p->array, 1); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test6_bdos( +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 @@ -649,7 +649,7 @@ void test6(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP0]] // SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test6_bdos( +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, -3) i64 @test6_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 @@ -955,7 +955,7 @@ void test10(struct union_of_fams *p, int index) { 
p->bytes[index] = (unsigned char)__builtin_dynamic_object_size(p->bytes, 1); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos( +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 @@ -964,7 +964,7 @@ void test10(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64 // SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos( +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h index 1f959311295258..61a500b82875fb 100644 --- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h +++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h @@ -137,7 +137,7 @@ class SCCPSolver { const ValueLatticeElement &getLatticeValueFor(Value *V) const; /// getTrackedRetVals - Get the inferred return value map. - const MapVector &getTrackedRetVals(); + const MapVector &getTrackedRetVals() const; /// getTrackedGlobals - Get and return the set of inferred initializers for /// global variables. @@ -190,6 +190,8 @@ class SCCPSolver { bool removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, BasicBlock *&NewUnreachableBB) const; + void inferReturnAttributes() const; + bool tryToReplaceWithConstant(Value *V); // Helper to check if \p LV is either a constant or a constant diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index 5ef08c4a2d725d..f0d75a2016363a 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -277,34 +277,10 @@ static bool runIPSCCP( // whether other functions are optimizable. SmallVector ReturnsToZap; - for (const auto &I : Solver.getTrackedRetVals()) { - Function *F = I.first; - const ValueLatticeElement &ReturnValue = I.second; - - // If there is a known constant range for the return value, add range - // attribute to the return value. - if (ReturnValue.isConstantRange() && - !ReturnValue.getConstantRange().isSingleElement()) { - // Do not add range metadata if the return value may include undef. - if (ReturnValue.isConstantRangeIncludingUndef()) - continue; - - // Take the intersection of the existing attribute and the inferred range. - ConstantRange CR = ReturnValue.getConstantRange(); - if (F->hasRetAttribute(Attribute::Range)) - CR = CR.intersectWith(F->getRetAttribute(Attribute::Range).getRange()); - F->addRangeRetAttr(CR); - continue; - } - // Infer nonnull return attribute. 
- if (F->getReturnType()->isPointerTy() && ReturnValue.isNotConstant() && - ReturnValue.getNotConstant()->isNullValue() && - !F->hasRetAttribute(Attribute::NonNull)) { - F->addRetAttr(Attribute::NonNull); - continue; - } - if (F->getReturnType()->isVoidTy()) - continue; + Solver.inferReturnAttributes(); + for (const auto &[F, ReturnValue] : Solver.getTrackedRetVals()) { + assert(!F->getReturnType()->isVoidTy() && + "should not track void functions"); if (SCCPSolver::isConstant(ReturnValue) || ReturnValue.isUnknownOrUndef()) findReturnsToZap(*F, ReturnsToZap, Solver); } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index caf9f890418e29..0330460e7df8ab 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -66,6 +67,11 @@ static bool runSCCP(Function &F, const DataLayout &DL, DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; }, F.getContext()); + // While we don't do any actual inter-procedural analysis, still track + // return values so we can infer attributes. + if (canTrackReturnsInterprocedurally(&F)) + Solver.addTrackedFunction(&F); + // Mark the first block of the function as being executable. Solver.markBlockExecutable(&F.front()); @@ -115,6 +121,8 @@ static bool runSCCP(Function &F, const DataLayout &DL, if (!DeadBB->hasAddressTaken()) DTU.deleteBB(DeadBB); + Solver.inferReturnAttributes(); + return MadeChanges; } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 59775d2199ca61..56e1f90f46cfd1 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -354,6 +354,34 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, return true; } +void SCCPSolver::inferReturnAttributes() const { + for (const auto &[F, ReturnValue] : getTrackedRetVals()) { + + // If there is a known constant range for the return value, add range + // attribute to the return value. + if (ReturnValue.isConstantRange() && + !ReturnValue.getConstantRange().isSingleElement()) { + // Do not add range metadata if the return value may include undef. + if (ReturnValue.isConstantRangeIncludingUndef()) + continue; + + // Take the intersection of the existing attribute and the inferred range. + ConstantRange CR = ReturnValue.getConstantRange(); + if (F->hasRetAttribute(Attribute::Range)) + CR = CR.intersectWith(F->getRetAttribute(Attribute::Range).getRange()); + F->addRangeRetAttr(CR); + continue; + } + // Infer nonnull return attribute. + if (F->getReturnType()->isPointerTy() && ReturnValue.isNotConstant() && + ReturnValue.getNotConstant()->isNullValue() && + !F->hasRetAttribute(Attribute::NonNull)) { + F->addRetAttr(Attribute::NonNull); + continue; + } + } +} + /// Helper class for SCCPSolver. This implements the instruction visitor and /// holds all the state. 
class SCCPInstVisitor : public InstVisitor { @@ -2168,7 +2196,7 @@ const ValueLatticeElement &SCCPSolver::getLatticeValueFor(Value *V) const { } const MapVector & -SCCPSolver::getTrackedRetVals() { +SCCPSolver::getTrackedRetVals() const { return Visitor->getTrackedRetVals(); } diff --git a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll index 35d5ceeb91950f..871615dbd62852 100644 --- a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll +++ b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll @@ -2,7 +2,7 @@ ; RUN: opt -O1 -S < %s | FileCheck %s define i32 @testa(i32 %mul) { -; CHECK-LABEL: define range(i32 -65536, 65536) i32 @testa( +; CHECK-LABEL: define range(i32 -65536, 32768) i32 @testa( ; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15 ; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767) @@ -16,7 +16,7 @@ define i32 @testa(i32 %mul) { } define i32 @testb(i32 %mul) { -; CHECK-LABEL: define range(i32 -16777216, 16777216) i32 @testb( +; CHECK-LABEL: define range(i32 -128, 128) i32 @testb( ; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: [[SHR102:%.*]] = ashr i32 [[MUL]], 7 ; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[SHR102]], i32 -128) diff --git a/llvm/test/Transforms/SCCP/exact-flags.ll b/llvm/test/Transforms/SCCP/exact-flags.ll index a5e3bf111bbd9d..f860ddb6fe9cfb 100644 --- a/llvm/test/Transforms/SCCP/exact-flags.ll +++ b/llvm/test/Transforms/SCCP/exact-flags.ll @@ -2,7 +2,7 @@ ; RUN: opt -passes=sccp < %s -S | FileCheck %s define i8 @ashr_to_lshr(i8 %x, i8 %y) { -; CHECK-LABEL: define i8 @ashr_to_lshr( +; CHECK-LABEL: define range(i8 0, -128) i8 @ashr_to_lshr( ; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { ; CHECK-NEXT: [[P:%.*]] = and i8 [[X]], 127 ; CHECK-NEXT: [[R:%.*]] = lshr exact i8 [[P]], [[Y]] @@ -14,7 +14,7 @@ define i8 @ashr_to_lshr(i8 %x, i8 %y) { } define i8 @sdiv_to_udiv(i8 %x, i8 %y) { -; CHECK-LABEL: define i8 @sdiv_to_udiv( +; CHECK-LABEL: define range(i8 0, -128) i8 @sdiv_to_udiv( ; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) { ; CHECK-NEXT: [[X1:%.*]] = and i8 [[X]], 127 ; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y]], 127 diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll index 9264a6eaefb85d..dae843ca955955 100644 --- a/llvm/test/Transforms/SCCP/phis.ll +++ b/llvm/test/Transforms/SCCP/phis.ll @@ -100,7 +100,7 @@ end: } define <2 x i16> @phi_vector_merge1(i1 %c, <2 x i8> %a) { -; CHECK-LABEL: define <2 x i16> @phi_vector_merge1( +; CHECK-LABEL: define range(i16 2, 259) <2 x i16> @phi_vector_merge1( ; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> @@ -126,7 +126,7 @@ join: } define <2 x i16> @phi_vector_merge2(i1 %c, <2 x i8> %a) { -; CHECK-LABEL: define <2 x i16> @phi_vector_merge2( +; CHECK-LABEL: define range(i16 2, 259) <2 x i16> @phi_vector_merge2( ; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> diff --git a/llvm/test/Transforms/SCCP/pointer-nonnull.ll b/llvm/test/Transforms/SCCP/pointer-nonnull.ll index 08d4a76345bb63..c3a6a762e31744 100644 --- a/llvm/test/Transforms/SCCP/pointer-nonnull.ll +++ b/llvm/test/Transforms/SCCP/pointer-nonnull.ll @@ -232,13 +232,9 @@ define i1 
@ip_test_nonnull_caller(ptr %p) {
 }

 define ptr @ret_nonnull_pointer(ptr nonnull %p) {
-; SCCP-LABEL: define ptr @ret_nonnull_pointer(
-; SCCP-SAME: ptr nonnull [[P:%.*]]) {
-; SCCP-NEXT: ret ptr [[P]]
-;
-; IPSCCP-LABEL: define nonnull ptr @ret_nonnull_pointer(
-; IPSCCP-SAME: ptr nonnull [[P:%.*]]) {
-; IPSCCP-NEXT: ret ptr [[P]]
+; CHECK-LABEL: define nonnull ptr @ret_nonnull_pointer(
+; CHECK-SAME: ptr nonnull [[P:%.*]]) {
+; CHECK-NEXT: ret ptr [[P]]
 ;
   ret ptr %p
 }

From d7100111f41ca314c094987d880d1648b78256af Mon Sep 17 00:00:00 2001
From: Brad Smith
Date: Mon, 2 Sep 2024 06:02:24 -0400
Subject: [PATCH 28/33] [llvm][Support] Adjust maximum thread name length to
 the right value for OpenBSD (#106956)

The thread name length is derived from _MAXCOMLEN, which is 24.

---
 llvm/lib/Support/Unix/Threading.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index 1812d990f21ac1..acfd4ad51902bb 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -146,7 +146,7 @@ static constexpr uint32_t get_max_thread_name_length_impl() {
 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
   return 16;
 #elif defined(__OpenBSD__)
-  return 32;
+  return 24;
 #else
   return 0;
 #endif

From b9bba6ca9fc62c5ae3ee402196b11a523a500fdc Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 2 Sep 2024 12:11:03 +0200
Subject: [PATCH 29/33] [BasicAA] Track nuw through decomposed expressions
 (#106512)

When we decompose the GEP offset expression and the arithmetic is not
performed using nuw operations, we cannot retain the nuw flag on the
decomposed GEP. For example, if we have `gep nuw p, (a-1)`, this is not
at all the same as `gep nuw (gep nuw p, a), -1`.

Fix this by tracking NUW through linear expression decomposition,
similarly to what we already do for the NSW flag.

This fixes the miscompilation reported in
https://github.com/llvm/llvm-project/pull/105496#issuecomment-2315322220.

---
 llvm/lib/Analysis/BasicAliasAnalysis.cpp    |  31 ++++--
 llvm/test/Analysis/BasicAA/gep-nuw-alias.ll | 103 ++++++++++++++++++++
 2 files changed, 124 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 72db28929c0c37..a00ed7530ebc4c 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -375,24 +375,28 @@ struct LinearExpression {
   APInt Scale;
   APInt Offset;

+  /// True if all operations in this expression are NUW.
+  bool IsNUW;
   /// True if all operations in this expression are NSW.
   bool IsNSW;

   LinearExpression(const CastedValue &Val, const APInt &Scale,
-                   const APInt &Offset, bool IsNSW)
-      : Val(Val), Scale(Scale), Offset(Offset), IsNSW(IsNSW) {}
+                   const APInt &Offset, bool IsNUW, bool IsNSW)
+      : Val(Val), Scale(Scale), Offset(Offset), IsNUW(IsNUW), IsNSW(IsNSW) {}

-  LinearExpression(const CastedValue &Val) : Val(Val), IsNSW(true) {
+  LinearExpression(const CastedValue &Val)
+      : Val(Val), IsNUW(true), IsNSW(true) {
     unsigned BitWidth = Val.getBitWidth();
     Scale = APInt(BitWidth, 1);
     Offset = APInt(BitWidth, 0);
   }

-  LinearExpression mul(const APInt &Other, bool MulIsNSW) const {
+  LinearExpression mul(const APInt &Other, bool MulIsNUW, bool MulIsNSW) const {
     // The check for zero offset is necessary, because generally
     // (X +nsw Y) *nsw Z does not imply (X *nsw Z) +nsw (Y *nsw Z).
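    // For example, with i8 values X = 100, Y = -50 and Z = 2: X +nsw Y
    // gives 50 and 50 *nsw Z gives 100 without wrapping, but the
    // distributed term X *nsw Z = 200 already wraps.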
bool NSW = IsNSW && (Other.isOne() || (MulIsNSW && Offset.isZero())); - return LinearExpression(Val, Scale * Other, Offset * Other, NSW); + bool NUW = IsNUW && (Other.isOne() || MulIsNUW); + return LinearExpression(Val, Scale * Other, Offset * Other, NUW, NSW); } }; } @@ -408,7 +412,7 @@ static LinearExpression GetLinearExpression( if (const ConstantInt *Const = dyn_cast(Val.V)) return LinearExpression(Val, APInt(Val.getBitWidth(), 0), - Val.evaluateWith(Const->getValue()), true); + Val.evaluateWith(Const->getValue()), true, true); if (const BinaryOperator *BOp = dyn_cast(Val.V)) { if (ConstantInt *RHSC = dyn_cast(BOp->getOperand(1))) { @@ -444,6 +448,7 @@ static LinearExpression GetLinearExpression( E = GetLinearExpression(Val.withValue(BOp->getOperand(0), false), DL, Depth + 1, AC, DT); E.Offset += RHS; + E.IsNUW &= NUW; E.IsNSW &= NSW; break; } @@ -451,13 +456,14 @@ static LinearExpression GetLinearExpression( E = GetLinearExpression(Val.withValue(BOp->getOperand(0), false), DL, Depth + 1, AC, DT); E.Offset -= RHS; + E.IsNUW = false; // sub nuw x, y is not add nuw x, -y. E.IsNSW &= NSW; break; } case Instruction::Mul: E = GetLinearExpression(Val.withValue(BOp->getOperand(0), false), DL, Depth + 1, AC, DT) - .mul(RHS, NSW); + .mul(RHS, NUW, NSW); break; case Instruction::Shl: // We're trying to linearize an expression of the kind: @@ -472,6 +478,7 @@ static LinearExpression GetLinearExpression( Depth + 1, AC, DT); E.Offset <<= RHS.getLimitedValue(); E.Scale <<= RHS.getLimitedValue(); + E.IsNUW &= NUW; E.IsNSW &= NSW; break; } @@ -697,7 +704,8 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, // If the integer type is smaller than the index size, it is implicitly // sign extended or truncated to index size. bool NUSW = GEPOp->hasNoUnsignedSignedWrap(); - bool NonNeg = NUSW && GEPOp->hasNoUnsignedWrap(); + bool NUW = GEPOp->hasNoUnsignedWrap(); + bool NonNeg = NUSW && NUW; unsigned Width = Index->getType()->getIntegerBitWidth(); unsigned SExtBits = IndexSize > Width ? IndexSize - Width : 0; unsigned TruncBits = IndexSize < Width ? Width - IndexSize : 0; @@ -706,9 +714,11 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, // Scale by the type size. unsigned TypeSize = AllocTypeSize.getFixedValue(); - LE = LE.mul(APInt(IndexSize, TypeSize), NUSW); + LE = LE.mul(APInt(IndexSize, TypeSize), NUW, NUSW); Decomposed.Offset += LE.Offset.sext(MaxIndexSize); APInt Scale = LE.Scale.sext(MaxIndexSize); + if (!LE.IsNUW) + Decomposed.NWFlags = Decomposed.NWFlags.withoutNoUnsignedWrap(); // If we already had an occurrence of this index variable, merge this // scale into it. For example, we want to handle: @@ -719,7 +729,8 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL, areBothVScale(Decomposed.VarIndices[i].Val.V, LE.Val.V)) && Decomposed.VarIndices[i].Val.hasSameCastsAs(LE.Val)) { Scale += Decomposed.VarIndices[i].Scale; - LE.IsNSW = false; // We cannot guarantee nsw for the merge. + // We cannot guarantee no-wrap for the merge. 
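+        // (Even if each contribution is individually non-wrapping, the
+        // combined scale may make the product wrap, so drop both flags.)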
+ LE.IsNSW = LE.IsNUW = false; Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i); break; } diff --git a/llvm/test/Analysis/BasicAA/gep-nuw-alias.ll b/llvm/test/Analysis/BasicAA/gep-nuw-alias.ll index b80a457f85176c..a5f1c1c747cc3f 100644 --- a/llvm/test/Analysis/BasicAA/gep-nuw-alias.ll +++ b/llvm/test/Analysis/BasicAA/gep-nuw-alias.ll @@ -212,3 +212,106 @@ define void @both_var_idx(ptr %p, i64 %i, i64 %j) { ret void } + +; CHECK-LABEL: add_no_nuw +; CHECK: MayAlias: i8* %gep, i8* %p +define i8 @add_no_nuw(ptr %p, i64 %n) { + store i8 3, ptr %p + + %add = add i64 %n, 1 + %gep = getelementptr nuw i8, ptr %p, i64 %add + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: add_nuw +; CHECK: NoAlias: i8* %gep, i8* %p +define i8 @add_nuw(ptr %p, i64 %n) { + store i8 3, ptr %p + + %add = add nuw i64 %n, 1 + %gep = getelementptr nuw i8, ptr %p, i64 %add + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: add_no_nuw +; CHECK: MayAlias: i8* %gep, i16* %p +define i8 @add_no_nuw_scale(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add i64 %n, 1 + %gep = getelementptr nuw i16, ptr %p, i64 %add + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: add_nuw +; CHECK: NoAlias: i8* %gep, i16* %p +define i8 @add_nuw_scale(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add nuw i64 %n, 1 + %gep = getelementptr nuw i16, ptr %p, i64 %add + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: sub_nuw +; CHECK: MayAlias: i8* %gep, i8* %p +define i8 @sub_nuw(ptr %p, i64 %n) { + store i8 3, ptr %p + + %add = sub nuw i64 %n, 1 + %gep = getelementptr nuw i8, ptr %p, i64 %add + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: mul_no_nuw +; CHECK: MayAlias: i8* %gep, i16* %p +define i8 @mul_no_nuw(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add nuw i64 %n, 1 + %mul = mul i64 %add, 2 + %gep = getelementptr nuw i8, ptr %p, i64 %mul + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: mul_nuw +; CHECK: NoAlias: i8* %gep, i16* %p +define i8 @mul_nuw(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add nuw i64 %n, 1 + %mul = mul nuw i64 %add, 2 + %gep = getelementptr nuw i8, ptr %p, i64 %mul + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: shl_no_nuw +; CHECK: MayAlias: i8* %gep, i16* %p +define i8 @shl_no_nuw(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add nuw i64 %n, 1 + %shl = shl i64 %add, 1 + %gep = getelementptr nuw i8, ptr %p, i64 %shl + %val = load i8, ptr %gep + ret i8 %val +} + +; CHECK-LABEL: shl_nuw +; CHECK: NoAlias: i8* %gep, i16* %p +define i8 @shl_nuw(ptr %p, i64 %n) { + store i16 3, ptr %p + + %add = add nuw i64 %n, 1 + %shl = shl nuw i64 %add, 1 + %gep = getelementptr nuw i8, ptr %p, i64 %shl + %val = load i8, ptr %gep + ret i8 %val +} From c42512436b23ab50e7637f239abe8371407104a1 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 2 Sep 2024 11:12:40 +0100 Subject: [PATCH 30/33] [mlir][ArmSME] Rename slice move operations to insert/extract_tile_slice (#106755) This renames: - `arm_sme.move_tile_slice_to_vector` to `arm_sme.extract_tile_slice` - `arm_sme.move_vector_to_tile_slice` to `arm_sme.insert_tile_slice` The new names are more consistent with the rest of MLIR and should be easier to understand. The current names (to me personally) are hard to parse and easy to mix up when skimming through code. 
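For the extract direction the rename is purely mechanical. A minimal
before/after sketch (the tile and index operands are placeholders):

```mlir
// Before:
%slice = arm_sme.move_tile_slice_to_vector %tile[%idx] : vector<[4]xf32> from vector<[4]x[4]xf32>
// After:
%slice = arm_sme.extract_tile_slice %tile[%idx] : vector<[4]xf32> from vector<[4]x[4]xf32>
```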
Additionally, the syntax for `insert_tile_slice` has changed from: ```mlir %4 = arm_sme.insert_tile_slice %0, %1, %2 : vector<[16]xi8> into vector<[16]x[16]xi8> ``` To: ```mlir %4 = arm_sme.insert_tile_slice %0, %1[%2] : vector<[16]xi8> into vector<[16]x[16]xi8> ``` This is for consistency with `extract_tile_slice`, but also helps with readability as it makes it clear which operand is the index. --- .../mlir/Dialect/ArmSME/IR/ArmSMEOps.td | 38 +++--- .../ArithToArmSME/ArithToArmSME.cpp | 6 +- .../Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp | 60 ++++----- .../Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp | 10 +- .../VectorToArmSME/VectorToArmSME.cpp | 65 +++++---- .../ArithToArmSME/arith-to-arm-sme.mlir | 4 +- .../ArmSMEToLLVM/arm-sme-to-llvm.mlir | 76 +++++------ .../ArmSMEToSCF/arm-sme-to-scf.mlir | 2 +- .../VectorToArmSME/vector-to-arm-sme.mlir | 106 +++++++-------- mlir/test/Dialect/ArmSME/invalid.mlir | 16 +-- mlir/test/Dialect/ArmSME/roundtrip.mlir | 124 +++++++++--------- .../ArmSME/tile-allocation-copies.mlir | 4 +- .../ArmSME/tile-allocation-liveness.mlir | 56 ++++---- 13 files changed, 282 insertions(+), 285 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td index 3f1776f57e4c71..d847dda5ae9f9b 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td @@ -592,7 +592,7 @@ def StoreTileSliceOp : ArmSME_Op<"store_tile_slice", [ }]; } -def MoveVectorToTileSliceOp : ArmSME_Op<"move_vector_to_tile_slice", [ +def InsertTileSliceOp : ArmSME_Op<"insert_tile_slice", [ ArmSMETileOpInterface, Pure, AllTypesMatch<["tile", "result"]>, TypesMatchWith< @@ -603,25 +603,25 @@ def MoveVectorToTileSliceOp : ArmSME_Op<"move_vector_to_tile_slice", [ "::llvm::cast($_self).getElementType()," "/*scalableDims=*/{true})">, ]> { - let summary = "Move 1-D scalable vector to slice of 2-D tile"; + let summary = "Insert 1-D scalable vector into slice of 2-D tile"; let description = [{ - The vector to tile slice operation moves a 1-D scalable vector to a slice - of a 2-D scalable vector tile at the given index. The type of the 1-D - scalable vector to be moved must match the type of the tile slice. A tile - slice is a 1-D vector of horizontally or vertically contiguous elements - within a ZA tile. The updated tile is returned as the result. + Inserts a 1-D scalable vector to a slice of a 2-D scalable vector tile at + the given index. The type of the 1-D scalable vector to be inserted must + match the type of the tile slice. A tile slice is a 1-D vector of + horizontally or vertically contiguous elements within a ZA tile. The updated + tile is returned as the result. An optional tile slice layout attribute specifies whether the tile slice is horizontal (default) or vertical. - Example 1: Move a vector<[16]xi8> into tile horizontally (default) at given index. + Example 1: Insert `vector<[16]xi8>` into tile horizontally at the given index. ```mlir - %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[16]xi8> into vector<[16]x[16]xi8> + %tile_update = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[16]xi8> into vector<[16]x[16]xi8> ``` - Example 2: Move a vector<[2]xf64> into tile vertically at given index. + Example 2: Insert `vector<[2]xf64>` into tile vertically at the given index. 
```mlir - %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index layout : vector<[2]xf64> into vector<[2]x[2]xf64> + %tile_update = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] layout : vector<[2]xf64> into vector<[2]x[2]xf64> ``` }]; let arguments = (ins @@ -636,35 +636,35 @@ def MoveVectorToTileSliceOp : ArmSME_Op<"move_vector_to_tile_slice", [ }]; let assemblyFormat = [{ - $vector `,` $tile `,` $tile_slice_index (`layout` `` $layout^)? + $vector `,` $tile `[` $tile_slice_index `]` (`layout` `` $layout^)? attr-dict `:` type($vector) `into` type($result) }]; } -def MoveTileSliceToVectorOp : ArmSME_Op<"move_tile_slice_to_vector", [ +def ExtractTileSliceOp : ArmSME_Op<"extract_tile_slice", [ ArmSMETileOpInterface, Pure, TypesMatchWith< "type of 'result' matches type of 'tile' slice", "tile", "result", "VectorType(VectorType::Builder(::llvm::cast($_self)).dropDim(0))">, ]> { - let summary = "Move slice of a 2-D tile to a 1-D scalable vector"; + let summary = "Extract 1-D scalable vector from slice of 2-D tile"; let description = [{ - The tile slice to vector operation extracts a 1-D scalable slice from a 2-D - scalable tile at the given index. A tile slice is a 1-D vector of - horizontally or vertically contiguous elements within a ZA tile. + Extracts a 1-D scalable slice from a 2-D scalable tile at the given index. + A tile slice is a 1-D vector of horizontally or vertically contiguous + elements within a ZA tile. An optional tile slice layout attribute specifies whether the tile slice is horizontal (default) or vertical. Example 1: Extract `vector<[16]xi8>` from tile horizontally at the given index. ```mlir - %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8> + %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8> ``` Example 2: Extract `vector<[2]xf64>` from tile vertically at the given index. ```mlir - %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] layout : vector<[2]xf64> from vector<[2]x[2]xf64> + %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] layout : vector<[2]xf64> from vector<[2]x[2]xf64> ``` }]; diff --git a/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp b/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp index b12aa92001ff29..5aa2a098b17621 100644 --- a/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp +++ b/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp @@ -64,7 +64,7 @@ struct ConstantOpToArmSMELowering : public OpRewritePattern { return success(); } - // Lower non-zero constants to a loop of 'arm_sme.move_vector_to_tile_slice' + // Lower non-zero constants to a loop of 'arm_sme.insert_tile_slice' // ops that broadcast the constant to each tile slice. auto loc = constantOp.getLoc(); @@ -79,9 +79,9 @@ struct ConstantOpToArmSMELowering : public OpRewritePattern { auto initTile = rewriter.create(loc, tileType); auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, Value currentTile) { - // Create 'arm_sme.move_vector_to_tile_slice' to write vector to tile + // Create 'arm_sme.insert_tile_slice' to write vector to tile // slice. 
- auto nextTile = b.create( + auto nextTile = b.create( loc, tileType, constantOp1D, currentTile, tileSliceIndex); return nextTile.getResult(); }; diff --git a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp index 1ad2ec6cee7f8c..f1fa411b82914a 100644 --- a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp +++ b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp @@ -575,23 +575,23 @@ struct StoreTileSliceConversion } }; -/// Lower `arm_sme.move_vector_to_tile_slice` to SME intrinsics. -struct MoveVectorToTileSliceConversion - : public ConvertArmSMEOpToLLVMPattern { +/// Lower `arm_sme.insert_tile_slice` to SME intrinsics. +struct InsertTileSliceConversion + : public ConvertArmSMEOpToLLVMPattern { using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; LogicalResult - matchAndRewrite(arm_sme::MoveVectorToTileSliceOp moveVectorToTileSliceOp, - arm_sme::MoveVectorToTileSliceOp::Adaptor adaptor, + matchAndRewrite(arm_sme::InsertTileSliceOp insertTileSliceOp, + arm_sme::InsertTileSliceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto loc = moveVectorToTileSliceOp.getLoc(); - auto tileType = moveVectorToTileSliceOp.getTileType(); + auto loc = insertTileSliceOp.getLoc(); + auto tileType = insertTileSliceOp.getTileType(); - auto tileId = getTileIdOrError(moveVectorToTileSliceOp); + auto tileId = getTileIdOrError(insertTileSliceOp); if (!tileId) return failure(); - auto tileSlice = moveVectorToTileSliceOp.getTileSliceIndex(); + auto tileSlice = insertTileSliceOp.getTileSliceIndex(); // Cast tile slice from index to i32 for intrinsic. auto tileSliceI32 = rewriter.create( @@ -606,42 +606,40 @@ struct MoveVectorToTileSliceConversion auto allActiveMask = rewriter.create(loc, predTy, one); // Create 'arm_sme.intr.write.(horiz|vert)' to write vector to tile slice. - switch (moveVectorToTileSliceOp.getLayout()) { + switch (insertTileSliceOp.getLayout()) { case arm_sme::TileSliceLayout::Horizontal: rewriter.create( loc, tileId, tileSliceI32, allActiveMask, - moveVectorToTileSliceOp.getVector()); + insertTileSliceOp.getVector()); break; case arm_sme::TileSliceLayout::Vertical: rewriter.create( loc, tileId, tileSliceI32, allActiveMask, - moveVectorToTileSliceOp.getVector()); + insertTileSliceOp.getVector()); break; } - // Intrinsic has no result, replace 'arm_sme.move_vector_to_tile_slice' with + // Intrinsic has no result, replace 'arm_sme.insert_tile_slice' with // the input tile to preserve dataflow. - rewriter.replaceOp(moveVectorToTileSliceOp, - moveVectorToTileSliceOp.getTile()); + rewriter.replaceOp(insertTileSliceOp, insertTileSliceOp.getTile()); return success(); } }; -/// Lower `arm_sme.move_tile_slice_to_vector` to SME intrinsics. -struct MoveTileSliceToVectorConversion - : public ConvertArmSMEOpToLLVMPattern { +/// Lower `arm_sme.extract_tile_slice` to SME intrinsics. 
+struct ExtractTileSliceConversion + : public ConvertArmSMEOpToLLVMPattern { using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; LogicalResult - matchAndRewrite(arm_sme::MoveTileSliceToVectorOp moveTileSliceToVector, - OpAdaptor, + matchAndRewrite(arm_sme::ExtractTileSliceOp extractTileSlice, OpAdaptor, ConversionPatternRewriter &rewriter) const override { - auto loc = moveTileSliceToVector.getLoc(); - auto sliceType = moveTileSliceToVector.getSliceType(); - auto sliceIndex = moveTileSliceToVector.getTileSliceIndex(); + auto loc = extractTileSlice.getLoc(); + auto sliceType = extractTileSlice.getSliceType(); + auto sliceIndex = extractTileSlice.getTileSliceIndex(); - auto tileId = getTileIdOrError(moveTileSliceToVector); + auto tileId = getTileIdOrError(extractTileSlice); if (!tileId) return failure(); @@ -659,16 +657,16 @@ struct MoveTileSliceToVectorConversion loc, rewriter.getI32Type(), sliceIndex); // Create 'arm_sme.intr.read.(horiz|vert)' to extract the tile slice. - switch (moveTileSliceToVector.getLayout()) { + switch (extractTileSlice.getLayout()) { case arm_sme::TileSliceLayout::Horizontal: rewriter.replaceOpWithNewOp( - moveTileSliceToVector, sliceType, zeroVector, allTruePredicate, - tileId, sliceIndexI32); + extractTileSlice, sliceType, zeroVector, allTruePredicate, tileId, + sliceIndexI32); break; case arm_sme::TileSliceLayout::Vertical: rewriter.replaceOpWithNewOp( - moveTileSliceToVector, sliceType, zeroVector, allTruePredicate, - tileId, sliceIndexI32); + extractTileSlice, sliceType, zeroVector, allTruePredicate, tileId, + sliceIndexI32); break; } @@ -985,8 +983,8 @@ void mlir::populateArmSMEToLLVMConversionPatterns(LLVMTypeConverter &converter, }); addArmSMEConversionPatterns< - LoadTileSliceConversion, MoveTileSliceToVectorConversion, - MoveVectorToTileSliceConversion, StoreTileSliceConversion, + LoadTileSliceConversion, ExtractTileSliceConversion, + InsertTileSliceConversion, StoreTileSliceConversion, StreamingVLOpConversion, OuterProductOpConversion, OuterProductWideningOpConversion, diff --git a/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp b/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp index 9f55932c33af66..411c9d2ebd8386 100644 --- a/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp +++ b/mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp @@ -245,8 +245,8 @@ struct TileLoadOpConversion : public OpRewritePattern { /// : memref, vector<[4]xi1>, /// vector<[4]xi32> into vector<[4]xi32> /// // Insert slice into tile -/// %tile_update = arm_sme.move_vector_to_tile_slice -/// %slice, %iter_tile, %tile_slice_idx : +/// %tile_update = arm_sme.insert_tile_slice +/// %slice, %iter_tile[%tile_slice_idx] : /// vector<[4]xi32> into vector<[4]x[4]xi32> /// scf.yield %tile_update : vector<[4]x[4]xi32> /// } @@ -332,11 +332,11 @@ struct TileLoadOpWithMaskAndPadNonZeroConversion loc, tileSliceType, tileLoadOp.getBase(), memrefIndices, maskOp1D, /*passthru=*/pad1DOp); - // Create 'arm_sme.move_vector_to_tile_slice' to move slice into tile. - auto moveSlice = rewriter.create( + // Create 'arm_sme.insert_tile_slice' to insert slice into tile. 
+ auto insertSlice = rewriter.create( loc, tileType, loadSlice->getResult(0), currentTile, tileSliceIndex, tileLoadOp.getLayout()); - rewriter.create(loc, moveSlice.getResult()); + rewriter.create(loc, insertSlice.getResult()); rewriter.setInsertionPointAfter(forOp); diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index ee52b9ef6a6f6b..55965d9c2a531d 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -199,8 +199,8 @@ struct VectorStoreToArmSMELowering : public OpRewritePattern { /// %broadcast_to_tile = scf.for %tile_slice_index = %c0 to %num_tile_slices /// step %c1 iter_args(%iter_tile = %init_tile) -> (vector<[4]x[4]xi32>) /// { -/// %tile_update = arm_sme.move_vector_to_tile_slice -/// %broadcast_to_1d, %iter_tile, %tile_slice_index : +/// %tile_update = arm_sme.insert_tile_slice +/// %broadcast_to_1d, %iter_tile[%tile_slice_index] : /// vector<[4]xi32> into vector<[4]x[4]xi32> /// scf.yield %tile_update : vector<[4]x[4]xi32> /// } @@ -238,9 +238,9 @@ struct BroadcastOpToArmSMELowering auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, Value currentTile) { - // Create 'arm_sme.move_vector_to_tile_slice' to broadcast the value + // Create 'arm_sme.insert_tile_slice' to broadcast the value // to each tile slice. - auto nextTile = b.create( + auto nextTile = b.create( loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); return nextTile.getResult(); }; @@ -267,8 +267,8 @@ struct BroadcastOpToArmSMELowering /// %broadcast_to_tile = scf.for %tile_slice_index = %c0 to %num_tile_slices /// step %c1 iter_args(%iter_tile = %init_tile) -> (vector<[4]x[4]xi32>) /// { -/// %tile_update = arm_sme.move_vector_to_tile_slice -/// %broadcast_to_1d, %iter_tile, %tile_slice_index : +/// %tile_update = arm_sme.insert_tile_slice +/// %broadcast_to_1d, %iter_tile[%tile_slice_index] : /// vector<[4]xi32> into vector<[4]x[4]xi32> /// scf.yield %tile_update : vector<[4]x[4]xi32> /// } @@ -299,7 +299,7 @@ struct SplatOpToArmSMELowering : public OpRewritePattern { auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, Value currentTile) { - auto nextTile = b.create( + auto nextTile = b.create( loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); return nextTile.getResult(); }; @@ -497,7 +497,7 @@ struct VectorOuterProductToArmSMELowering } }; -/// Lower `vector.extract` using `arm_sme.move_tile_slice_to_vector`. +/// Lower `vector.extract` using `arm_sme.extract_tile_slice`. /// /// Example: /// ``` @@ -505,7 +505,7 @@ struct VectorOuterProductToArmSMELowering /// ``` /// Becomes: /// ``` -/// %slice = arm_sme.move_tile_slice_to_vector %tile[%row] +/// %slice = arm_sme.extract_tile_slice %tile[%row] /// : vector<[4]xi32> from vector<[4]x[4]xi32> /// %el = vector.extract %slice[%col] : i32 from vector<[4]xi32> /// ``` @@ -531,27 +531,26 @@ struct VectorExtractToArmSMELowering } Value sliceIndex = vector::getAsValues(rewriter, loc, position[0]).front(); - auto moveTileSliceToVector = - rewriter.create(loc, sourceVector, - sliceIndex); + auto extractTileSlice = rewriter.create( + loc, sourceVector, sliceIndex); if (position.size() == 1) { // Single index case: Extracts a 1D slice. - rewriter.replaceOp(extractOp, moveTileSliceToVector); + rewriter.replaceOp(extractOp, extractTileSlice); return success(); } // Two indices case: Extracts a single element. 
     assert(position.size() == 2);
-    rewriter.replaceOpWithNewOp<vector::ExtractOp>(
-        extractOp, moveTileSliceToVector, position[1]);
+    rewriter.replaceOpWithNewOp<vector::ExtractOp>(extractOp, extractTileSlice,
+                                                   position[1]);
     return success();
   }
 };

-/// Lower `vector.insert` using `arm_sme.move_vector_to_tile_slice` and
-/// `arm_sme.move_tile_slice_to_vector`.
+/// Lower `vector.insert` using `arm_sme.insert_tile_slice` and
+/// `arm_sme.extract_tile_slice`.
 ///
 /// Example:
 /// ```
@@ -560,10 +559,10 @@
 /// ```
 /// Becomes:
 /// ```
-/// %slice = arm_sme.move_tile_slice_to_vector %tile[%row]
+/// %slice = arm_sme.extract_tile_slice %tile[%row]
 ///            : vector<[4]xi32> from vector<[4]x[4]xi32>
 /// %new_slice = vector.insert %el, %slice[%col] : i32 into vector<[4]xi32>
-/// %new_tile = arm_sme.move_vector_to_tile_slice %new_slice, %tile, %row
+/// %new_tile = arm_sme.insert_tile_slice %new_slice, %tile[%row]
 ///     : vector<[4]xi32> into vector<[4]x[4]xi32>
 /// ```
 struct VectorInsertToArmSMELowering
@@ -594,21 +593,21 @@ struct VectorInsertToArmSMELowering

     if (position.size() == 2) {
       // Two indices case: Insert single element into tile.
       // We need to first extract the existing slice and update the element.
-      tileSlice = rewriter.create<arm_sme::MoveTileSliceToVectorOp>(
+      tileSlice = rewriter.create<arm_sme::ExtractTileSliceOp>(
           loc, insertOp.getDest(), sliceIndex);
       tileSlice = rewriter.create<vector::InsertOp>(loc, source, tileSlice,
                                                     position[1]);
     }

     // Insert the slice into the destination tile.
-    rewriter.replaceOpWithNewOp<arm_sme::MoveVectorToTileSliceOp>(
+    rewriter.replaceOpWithNewOp<arm_sme::InsertTileSliceOp>(
         insertOp, tileSlice, insertOp.getDest(), sliceIndex);
     return success();
   }
 };

 /// Lowers `vector.print` of a tile into a loop over the rows of the tile,
-/// extracting them via `arm_sme.move_tile_slice_to_vector`, then printing with
+/// extracting them via `arm_sme.extract_tile_slice`, then printing with
 /// a 1D `vector.print`.
 ///
 /// BEFORE:
@@ -623,7 +622,7 @@ struct VectorInsertToArmSMELowering
 ///   %vscale = vector.vscale
 ///   %svl_s = arith.muli %c4, %vscale : index
 ///   scf.for %i = %c0 to %svl_s step %c1 {
-///     %tile_slice = arm_sme.move_tile_slice_to_vector %tile[%i]
+///     %tile_slice = arm_sme.extract_tile_slice %tile[%i]
 ///       : vector<[4]xf32> from vector<[4]x[4]xf32>
 ///     vector.print %tile_slice : vector<[4]xf32>
 ///   }
@@ -655,7 +654,7 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> {
     rewriter.setInsertionPointToStart(forOp.getBody());
     // Extract the current row from the tile.
     Value rowIndex = forOp.getInductionVar();
-    auto tileSlice = rewriter.create<arm_sme::MoveTileSliceToVectorOp>(
+    auto tileSlice = rewriter.create<arm_sme::ExtractTileSliceOp>(
         loc, printOp.getSource(), rowIndex);
     // Print the row with a 1D vector.print.
     rewriter.create<vector::PrintOp>(loc, tileSlice,
@@ -667,11 +666,11 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern<vector::PrintOp> {
   }
 };

-/// Folds a MoveTileSliceToVectorOp + TransferWriteOp to a StoreTileSliceOp.
+/// Folds an ExtractTileSliceOp + TransferWriteOp to a StoreTileSliceOp.
 ///
 /// BEFORE:
 /// ```mlir
-/// %slice = arm_sme.move_tile_slice_to_vector %tile[%index]
+/// %slice = arm_sme.extract_tile_slice %tile[%index]
 ///   : vector<[4]xf32> from vector<[4]x[4]xf32>
 /// vector.transfer_write %slice, %memref[%i, %j], %mask {in_bounds = [true]}
 ///   : vector<[4]xf32>, memref<?x?xf32>
@@ -694,11 +693,11 @@ struct FoldTransferWriteOfExtractTileSlice
       return rewriter.notifyMatchFailure(writeOp,
                                          "not inbounds transfer write");

-    auto moveTileSlice =
-        writeOp.getVector().getDefiningOp<arm_sme::MoveTileSliceToVectorOp>();
-    if (!moveTileSlice)
+    auto extractTileSlice =
+        writeOp.getVector().getDefiningOp<arm_sme::ExtractTileSliceOp>();
+    if (!extractTileSlice)
       return rewriter.notifyMatchFailure(
-          writeOp, "vector to store not from MoveTileSliceToVectorOp");
+          writeOp, "vector to store not from ExtractTileSliceOp");

     AffineMap map = writeOp.getPermutationMap();
     if (!map.isMinorIdentity())
@@ -713,9 +712,9 @@
     }

     rewriter.replaceOpWithNewOp<arm_sme::StoreTileSliceOp>(
-        writeOp, moveTileSlice.getTile(), moveTileSlice.getTileSliceIndex(),
-        mask, writeOp.getSource(), writeOp.getIndices(),
-        moveTileSlice.getLayout());
+        writeOp, extractTileSlice.getTile(),
+        extractTileSlice.getTileSliceIndex(), mask, writeOp.getSource(),
+        writeOp.getIndices(), extractTileSlice.getLayout());
     return success();
   }
 };
diff --git a/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir b/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir
index 49d2e2f3c182b9..0faac9c847f5ff 100644
--- a/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir
+++ b/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir
@@ -99,7 +99,7 @@ func.func @arith_constant_dense_2d_zero_f64() {
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C16]] : index
 // CHECK: %[[TILE:.*]] = scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] iter_args(%[[CURRENT_TILE:.*]] = %[[INIT_TILE]]) -> (vector<[16]x[16]xi8>) {
-// CHECK:   %[[TILE_UPDATE:.*]] = arm_sme.move_vector_to_tile_slice %[[C2_SPLAT]], %[[CURRENT_TILE]], %[[TILE_SLICE_INDEX]] : vector<[16]xi8> into vector<[16]x[16]xi8>
+// CHECK:   %[[TILE_UPDATE:.*]] = arm_sme.insert_tile_slice %[[C2_SPLAT]], %[[CURRENT_TILE]][%[[TILE_SLICE_INDEX]]] : vector<[16]xi8> into vector<[16]x[16]xi8>
 // CHECK:   scf.yield %[[TILE_UPDATE]] : vector<[16]x[16]xi8>
 // CHECK: "prevent.dce"(%[[TILE]]) : (vector<[16]x[16]xi8>) -> ()
 func.func @arith_constant_dense_2d_nonzero_i8() {
@@ -119,7 +119,7 @@
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C2]] : index
 // CHECK: %[[TILE:.*]] = scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] iter_args(%[[CURRENT_TILE:.*]] = %[[INIT_TILE]]) -> (vector<[2]x[2]xf64>) {
-// CHECK:   %[[TILE_UPDATE:.*]] = arm_sme.move_vector_to_tile_slice %[[C2_SPLAT]], %[[CURRENT_TILE]], %[[TILE_SLICE_INDEX]] : vector<[2]xf64> into vector<[2]x[2]xf64>
+// CHECK:   %[[TILE_UPDATE:.*]] = arm_sme.insert_tile_slice %[[C2_SPLAT]], %[[CURRENT_TILE]][%[[TILE_SLICE_INDEX]]] : vector<[2]xf64> into vector<[2]x[2]xf64>
 // CHECK:   scf.yield %[[TILE_UPDATE]] : vector<[2]x[2]xf64>
 // CHECK: "prevent.dce"(%[[TILE]]) : (vector<[2]x[2]xf64>) -> ()
 func.func @arith_constant_dense_2d_nonzero_f64() {
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
index ef85f3d069d743..6a4d77e86ab583 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
@@ -448,134 +448,134 @@ func.func @arm_sme_store_tile_slice_ver_f64(%tile_slice_index : index, %mask : v
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_vector_to_tile_slice
+// arm_sme.insert_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-// CHECK-LABEL: @arm_sme_move_vector_to_tile_slice_hor_i32
+// CHECK-LABEL: @arm_sme_insert_tile_slice_hor_i32
 // CHECK: "arm_sme.intr.write.horiz"({{.*}}) <{tile_id = 0 : i32}> : (i32, vector<[4]xi1>, vector<[4]xi32>) -> ()
-func.func @arm_sme_move_vector_to_tile_slice_hor_i32(%vector : vector<[4]xi32>, %tile_slice_index : index) -> () {
+func.func @arm_sme_insert_tile_slice_hor_i32(%vector : vector<[4]xi32>, %tile_slice_index : index) -> () {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
-  %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[4]xi32> into vector<[4]x[4]xi32>
+  %tile_update = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[4]xi32> into vector<[4]x[4]xi32>
   "test.some_use" (%tile_update) : (vector<[4]x[4]xi32>) -> ()
   return
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_vector_to_tile_slice_ver_bf16
+// CHECK-LABEL: @arm_sme_insert_tile_slice_ver_bf16
 // CHECK: "arm_sme.intr.write.vert"({{.*}}) <{tile_id = 0 : i32}> : (i32, vector<[8]xi1>, vector<[8]xbf16>) -> ()
-func.func @arm_sme_move_vector_to_tile_slice_ver_bf16(%vector : vector<[8]xbf16>, %tile_slice_index : index) -> () {
+func.func @arm_sme_insert_tile_slice_ver_bf16(%vector : vector<[8]xbf16>, %tile_slice_index : index) -> () {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
-  %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index layout<vertical> : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  %tile_update = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] layout<vertical> : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   "test.some_use" (%tile_update) : (vector<[8]x[8]xbf16>) -> ()
   return
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_tile_slice_to_vector
+// arm_sme.extract_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_i8
+// CHECK-LABEL: @arm_sme_extract_tile_slice_i8
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[16]xi8>, vector<[16]xi1>, i32) -> vector<[16]xi8>
-func.func @arm_sme_move_tile_slice_to_vector_i8(%tile_slice_index : index) -> vector<[16]xi8> {
+func.func @arm_sme_extract_tile_slice_i8(%tile_slice_index : index) -> vector<[16]xi8> {
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
   return %slice : vector<[16]xi8>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_i16
+// CHECK-LABEL: @arm_sme_extract_tile_slice_i16
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[8]xi16>, vector<[8]xi1>, i32) -> vector<[8]xi16>
-func.func @arm_sme_move_tile_slice_to_vector_i16(%tile_slice_index : index) -> vector<[8]xi16> {
+func.func @arm_sme_extract_tile_slice_i16(%tile_slice_index : index) -> vector<[8]xi16> {
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xi16> from vector<[8]x[8]xi16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xi16> from vector<[8]x[8]xi16>
   return %slice : vector<[8]xi16>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_i32
+// CHECK-LABEL: @arm_sme_extract_tile_slice_i32
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[4]xi32>, vector<[4]xi1>, i32) -> vector<[4]xi32>
-func.func @arm_sme_move_tile_slice_to_vector_i32(%tile_slice_index : index) -> vector<[4]xi32> {
+func.func @arm_sme_extract_tile_slice_i32(%tile_slice_index : index) -> vector<[4]xi32> {
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[4]xi32> from vector<[4]x[4]xi32>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[4]xi32> from vector<[4]x[4]xi32>
   return %slice : vector<[4]xi32>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_i64
+// CHECK-LABEL: @arm_sme_extract_tile_slice_i64
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[2]xi64>, vector<[2]xi1>, i32) -> vector<[2]xi64>
-func.func @arm_sme_move_tile_slice_to_vector_i64(%tile_slice_index : index) -> vector<[2]xi64> {
+func.func @arm_sme_extract_tile_slice_i64(%tile_slice_index : index) -> vector<[2]xi64> {
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xi64> from vector<[2]x[2]xi64>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[2]xi64> from vector<[2]x[2]xi64>
   return %slice : vector<[2]xi64>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_i128
+// CHECK-LABEL: @arm_sme_extract_tile_slice_i128
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[1]xi128>, vector<[1]xi1>, i32) -> vector<[1]xi128>
-func.func @arm_sme_move_tile_slice_to_vector_i128(%tile_slice_index : index) -> vector<[1]xi128> {
+func.func @arm_sme_extract_tile_slice_i128(%tile_slice_index : index) -> vector<[1]xi128> {
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[1]xi128> from vector<[1]x[1]xi128>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[1]xi128> from vector<[1]x[1]xi128>
   return %slice : vector<[1]xi128>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_f16
+// CHECK-LABEL: @arm_sme_extract_tile_slice_f16
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[8]xf16>, vector<[8]xi1>, i32) -> vector<[8]xf16>
-func.func @arm_sme_move_tile_slice_to_vector_f16(%tile_slice_index : index) -> vector<[8]xf16> {
+func.func @arm_sme_extract_tile_slice_f16(%tile_slice_index : index) -> vector<[8]xf16> {
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xf16> from vector<[8]x[8]xf16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xf16> from vector<[8]x[8]xf16>
   return %slice : vector<[8]xf16>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_bf16
+// CHECK-LABEL: @arm_sme_extract_tile_slice_bf16
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[8]xbf16>, vector<[8]xi1>, i32) -> vector<[8]xbf16>
-func.func @arm_sme_move_tile_slice_to_vector_bf16(%tile_slice_index : index) -> vector<[8]xbf16> {
+func.func @arm_sme_extract_tile_slice_bf16(%tile_slice_index : index) -> vector<[8]xbf16> {
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
   return %slice : vector<[8]xbf16>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_f32
+// CHECK-LABEL: @arm_sme_extract_tile_slice_f32
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[4]xf32>, vector<[4]xi1>, i32) -> vector<[4]xf32>
-func.func @arm_sme_move_tile_slice_to_vector_f32(%tile_slice_index : index) -> vector<[4]xf32> {
+func.func @arm_sme_extract_tile_slice_f32(%tile_slice_index : index) -> vector<[4]xf32> {
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[4]xf32> from vector<[4]x[4]xf32>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[4]xf32> from vector<[4]x[4]xf32>
   return %slice : vector<[4]xf32>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_f64
+// CHECK-LABEL: @arm_sme_extract_tile_slice_f64
 // CHECK: "arm_sme.intr.read.horiz"({{.*}}) <{tile_id = 0 : i32}> : (vector<[2]xf64>, vector<[2]xi1>, i32) -> vector<[2]xf64>
-func.func @arm_sme_move_tile_slice_to_vector_f64(%tile_slice_index : index) -> vector<[2]xf64> {
+func.func @arm_sme_extract_tile_slice_f64(%tile_slice_index : index) -> vector<[2]xf64> {
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
   return %slice : vector<[2]xf64>
 }

 // -----

-// CHECK-LABEL: @arm_sme_move_tile_slice_to_vector_ver_i128
+// CHECK-LABEL: @arm_sme_extract_tile_slice_ver_i128
 // CHECK: "arm_sme.intr.read.vert"({{.*}}) <{tile_id = 0 : i32}> : (vector<[1]xi128>, vector<[1]xi1>, i32) -> vector<[1]xi128>
-func.func @arm_sme_move_tile_slice_to_vector_ver_i128(%tile_slice_index : index) -> vector<[1]xi128> {
+func.func @arm_sme_extract_tile_slice_ver_i128(%tile_slice_index : index) -> vector<[1]xi128> {
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] layout<vertical> : vector<[1]xi128> from vector<[1]x[1]xi128>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] layout<vertical> : vector<[1]xi128> from vector<[1]x[1]xi128>
   return %slice : vector<[1]xi128>
 }
diff --git a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
index a2f2beff78c409..4ae710aa291137 100644
--- a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
+++ b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
@@ -89,7 +89,7 @@ func.func @arm_sme_tile_load_hor_with_mask_and_pad_zero(%src : memref<?x?xi32>)
 // CHECK-NEXT:    %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
 // CHECK:         %[[PAD_1D:.*]] = vector.splat %[[PAD]] : vector<[4]xi32>
 // CHECK:         %[[LOAD_SLICE:.*]] = vector.maskedload %[[SRC]]{{\[}}%[[OFFSET]], %[[C0]]], %[[MASK_1D]], %[[PAD_1D]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32>
-// CHECK:         %[[TILE_UPDATE:.*]] = arm_sme.move_vector_to_tile_slice %[[LOAD_SLICE]], %[[CURRENT_TILE]], %[[TILE_SLICE_INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32>
+// CHECK:         %[[TILE_UPDATE:.*]] = arm_sme.insert_tile_slice %[[LOAD_SLICE]], %[[CURRENT_TILE]][%[[TILE_SLICE_INDEX]]] : vector<[4]xi32> into vector<[4]x[4]xi32>
 // CHECK-NEXT:    scf.yield %[[TILE_UPDATE]] : vector<[4]x[4]xi32>
 func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad(%src : memref<?x?xi32>, %pad : i32) {
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
index 068fd0d04f1bc1..0f973af799634c 100644
--- a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
+++ b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
@@ -372,7 +372,7 @@ func.func @transfer_write_slice_with_mask(%vector: vector<[4]x[4]xf32>, %dest :
 // CHECK: arm_sme.store_tile_slice {{.*}} layout<vertical>
 func.func @transfer_write_vertical_slice(%vector: vector<[4]x[4]xf32>, %dest : memref<?x?xf32>, %slice_index: index) {
   %c0 = arith.constant 0 : index
-  %slice = arm_sme.move_tile_slice_to_vector %vector[%slice_index] layout<vertical>
+  %slice = arm_sme.extract_tile_slice %vector[%slice_index] layout<vertical>
     : vector<[4]xf32> from vector<[4]x[4]xf32>
   vector.transfer_write %slice, %dest[%slice_index, %c0] { in_bounds = [true] }: vector<[4]xf32>, memref<?x?xf32>
   return
@@ -394,7 +394,7 @@ func.func @transfer_write_vertical_slice(%vector: vector<[4]x[4]xf32>, %dest : m
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
 // CHECK: %[[TILE:.*]] = scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] iter_args(%[[CURRENT_TILE:.*]] = %[[INIT_TILE]]) -> (vector<[4]x[4]xi32>) {
-// CHECK:   %[[NEW_TILE:.*]] = arm_sme.move_vector_to_tile_slice %[[SRC_1D]], %[[CURRENT_TILE]], %[[TILE_SLICE_INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32>
+// CHECK:   %[[NEW_TILE:.*]] = arm_sme.insert_tile_slice %[[SRC_1D]], %[[CURRENT_TILE]][%[[TILE_SLICE_INDEX]]] : vector<[4]xi32> into vector<[4]x[4]xi32>
 // CHECK:   scf.yield %[[NEW_TILE]] : vector<[4]x[4]xi32>
 // CHECK: "prevent.dce"(%[[TILE]]) : (vector<[4]x[4]xi32>) -> ()
 func.func @broadcast_vec2d_from_i32(%arg0: i32) {
@@ -409,7 +409,7 @@ func.func @broadcast_vec2d_from_i32(%arg0: i32) {
 // CHECK-SAME:  %[[SRC:.*]]: vector<f32>) {
 // CHECK: %[[SRC_1D:.*]] = vector.broadcast %[[SRC]] : vector<f32> to vector<[4]xf32>
 // CHECK: scf.for
-// CHECK:   arm_sme.move_vector_to_tile_slice %[[SRC_1D]], {{.*}}
+// CHECK:   arm_sme.insert_tile_slice %[[SRC_1D]], {{.*}}
 func.func @broadcast_vec2d_from_vec0d(%arg0: vector<f32>) {
   %0 = vector.broadcast %arg0 : vector<f32> to vector<[4]x[4]xf32>
   "prevent.dce"(%0) : (vector<[4]x[4]xf32>) -> ()
@@ -422,7 +422,7 @@ func.func @broadcast_vec2d_from_vec0d(%arg0: vector<f32>) {
 // CHECK-SAME:  %[[SRC:.*]]: vector<[8]xi16>) {
 // CHECK-NOT: vector.broadcast
 // CHECK: scf.for
-// CHECK:   arm_sme.move_vector_to_tile_slice %[[SRC]], {{.*}}
+// CHECK:   arm_sme.insert_tile_slice %[[SRC]], {{.*}}
 func.func @broadcast_vec2d_from_vec1d(%arg0: vector<[8]xi16>) {
   %0 = vector.broadcast %arg0 : vector<[8]xi16> to vector<[8]x[8]xi16>
   "prevent.dce"(%0) : (vector<[8]x[8]xi16>) -> ()
@@ -442,7 +442,7 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<[8]xi16>) {
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %{{.*}} : index
 // CHECK: scf.for {{.*}} to %[[NUM_TILE_SLICES]] {{.*}} {
-// CHECK:   arm_sme.move_vector_to_tile_slice %[[BCST]], {{.*}} : vector<[4]xi32> into vector<[4]x[4]xi32>
+// CHECK:   arm_sme.insert_tile_slice %[[BCST]], {{.*}} : vector<[4]xi32> into vector<[4]x[4]xi32>
 func.func @splat_vec2d_from_i32(%arg0: i32) {
   %0 = vector.splat %arg0 : vector<[4]x[4]xi32>
   "prevent.dce"(%0) : (vector<[4]x[4]xi32>) -> ()
@@ -455,7 +455,7 @@ func.func @splat_vec2d_from_i32(%arg0: i32) {
 // CHECK-SAME:  %[[SRC:.*]]: f16) {
 // CHECK: %[[BCST:.*]] = vector.broadcast %[[SRC]] : f16 to vector<[8]xf16>
 // CHECK: scf.for
-// CHECK:   arm_sme.move_vector_to_tile_slice %[[BCST]], {{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
+// CHECK:   arm_sme.insert_tile_slice %[[BCST]], {{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
 func.func @splat_vec2d_from_f16(%arg0: f16) {
   %0 = vector.splat %arg0 : vector<[8]x[8]xf16>
   "prevent.dce"(%0) : (vector<[8]x[8]xf16>) -> ()
@@ -695,7 +695,7 @@ func.func @vector_print_tile(%tile: vector<[4]x[4]xf32>)
 // CHECK-DAG:  %[[VSCALE:.*]] = vector.vscale
 // CHECK-DAG:  %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
 // CHECK-NEXT: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] {
-// CHECK-NEXT:   %[[TILE_SLICE:.*]] = arm_sme.move_tile_slice_to_vector %[[TILE]][%[[TILE_SLICE_INDEX]]] : vector<[4]xf32> from vector<[4]x[4]xf32>
+// CHECK-NEXT:   %[[TILE_SLICE:.*]] = arm_sme.extract_tile_slice %[[TILE]][%[[TILE_SLICE_INDEX]]] : vector<[4]xf32> from vector<[4]x[4]xf32>
 // CHECK-NEXT:   vector.print %[[TILE_SLICE]] : vector<[4]xf32>

 //===----------------------------------------------------------------------===//
@@ -925,7 +925,7 @@ func.func @vector_store_i128(%arg0 : memref<?x?xi128>) {
 // CHECK-SAME:                                 %[[INDEX:.*]]: index)
 func.func @vector_insert_slice_i32(%slice: vector<[4]xi32>, %row: index) -> vector<[4]x[4]xi32>{
   // CHECK-NEXT: %[[TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xi32>
-  // CHECK-NEXT: arm_sme.move_vector_to_tile_slice %[[SLICE]], %[[TILE]], %[[INDEX]] : vector<[4]xi32> into vector<[4]x[4]xi32>
+  // CHECK-NEXT: arm_sme.insert_tile_slice %[[SLICE]], %[[TILE]][%[[INDEX]]] : vector<[4]xi32> into vector<[4]x[4]xi32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[4]xi32> into vector<[4]x[4]xi32>
   return %new_tile : vector<[4]x[4]xi32>
@@ -935,7 +935,7 @@ func.func @vector_insert_slice_i32(%slice: vector<[4]xi32>, %row: index) -> vect

 // CHECK-LABEL: @vector_insert_slice_i8
 func.func @vector_insert_slice_i8(%slice: vector<[16]xi8>, %row: index) -> vector<[16]x[16]xi8> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[16]xi8> into vector<[16]x[16]xi8>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[16]xi8> into vector<[16]x[16]xi8>
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[16]xi8> into vector<[16]x[16]xi8>
   return %new_tile : vector<[16]x[16]xi8>
@@ -945,7 +945,7 @@ func.func @vector_insert_slice_i8(%slice: vector<[16]xi8>, %row: index) -> vecto

 // CHECK-LABEL: @vector_insert_slice_i16
 func.func @vector_insert_slice_i16(%slice: vector<[8]xi16>, %row: index) -> vector<[8]x[8]xi16> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[8]xi16> into vector<[8]x[8]xi16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[8]xi16> into vector<[8]x[8]xi16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xi16> into vector<[8]x[8]xi16>
   return %new_tile : vector<[8]x[8]xi16>
@@ -955,7 +955,7 @@ func.func @vector_insert_slice_i16(%slice: vector<[8]xi16>, %row: index) -> vect

 // CHECK-LABEL: @vector_insert_slice_i64
 func.func @vector_insert_slice_i64(%slice: vector<[2]xi64>, %row: index) -> vector<[2]x[2]xi64> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[2]xi64> into vector<[2]x[2]xi64>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[2]xi64> into vector<[2]x[2]xi64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[2]xi64> into vector<[2]x[2]xi64>
   return %new_tile : vector<[2]x[2]xi64>
@@ -965,7 +965,7 @@ func.func @vector_insert_slice_i64(%slice: vector<[2]xi64>, %row: index) -> vect

 // CHECK-LABEL: @vector_insert_slice_i128
 func.func @vector_insert_slice_i128(%slice: vector<[1]xi128>, %row: index) -> vector<[1]x[1]xi128> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[1]xi128> into vector<[1]x[1]xi128>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[1]xi128> into vector<[1]x[1]xi128>
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[1]xi128> into vector<[1]x[1]xi128>
   return %new_tile : vector<[1]x[1]xi128>
@@ -975,7 +975,7 @@ func.func @vector_insert_slice_i128(%slice: vector<[1]xi128>, %row: index) -> ve

 // CHECK-LABEL: @vector_insert_slice_f16
 func.func @vector_insert_slice_f16(%slice: vector<[8]xf16>, %row: index) -> vector<[8]x[8]xf16> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xf16> into vector<[8]x[8]xf16>
   return %new_tile : vector<[8]x[8]xf16>
@@ -985,7 +985,7 @@ func.func @vector_insert_slice_f16(%slice: vector<[8]xf16>, %row: index) -> vect

 // CHECK-LABEL: @vector_insert_slice_bf16
 func.func @vector_insert_slice_bf16(%slice: vector<[8]xbf16>, %row: index) -> vector<[8]x[8]xbf16> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   return %new_tile : vector<[8]x[8]xbf16>
@@ -995,7 +995,7 @@ func.func @vector_insert_slice_bf16(%slice: vector<[8]xbf16>, %row: index) -> ve

 // CHECK-LABEL: @vector_insert_slice_f32
 func.func @vector_insert_slice_f32(%slice: vector<[4]xf32>, %row: index) -> vector<[4]x[4]xf32> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[4]xf32> into vector<[4]x[4]xf32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[4]xf32> into vector<[4]x[4]xf32>
   return %new_tile : vector<[4]x[4]xf32>
@@ -1005,7 +1005,7 @@ func.func @vector_insert_slice_f32(%slice: vector<[4]xf32>, %row: index) -> vect

 // CHECK-LABEL: @vector_insert_slice_f64
 func.func @vector_insert_slice_f64(%slice: vector<[2]xf64>, %row: index) -> vector<[2]x[2]xf64> {
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}} : vector<[2]xf64> into vector<[2]x[2]xf64>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}} : vector<[2]xf64> into vector<[2]x[2]xf64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %new_tile = vector.insert %slice, %tile[%row] : vector<[2]xf64> into vector<[2]x[2]xf64>
   return %new_tile : vector<[2]x[2]xf64>
@@ -1019,9 +1019,9 @@ func.func @vector_insert_slice_f64(%slice: vector<[2]xf64>, %row: index) -> vect
 // CHECK-SAME:                                  %[[COL:.*]]: index)
 func.func @vector_insert_element_i32(%el: i32, %row: index, %col: index) -> vector<[4]x[4]xi32> {
   // CHECK-NEXT: %[[TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xi32>
-  // CHECK-NEXT: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %[[TILE]][%[[ROW]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
+  // CHECK-NEXT: %[[SLICE:.*]] = arm_sme.extract_tile_slice %[[TILE]][%[[ROW]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
   // CHECK-NEXT: %[[NEW_SLICE:.*]] = vector.insert %[[EL]], %[[SLICE]] [%[[COL]]] : i32 into vector<[4]xi32>
-  // CHECK-NEXT: arm_sme.move_vector_to_tile_slice %[[NEW_SLICE]], %[[TILE]], %[[ROW]] : vector<[4]xi32> into vector<[4]x[4]xi32>
+  // CHECK-NEXT: arm_sme.insert_tile_slice %[[NEW_SLICE]], %[[TILE]][%[[ROW]]] : vector<[4]xi32> into vector<[4]x[4]xi32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %new_tile = vector.insert %el, %tile[%row, %col] : i32 into vector<[4]x[4]xi32>
   return %new_tile : vector<[4]x[4]xi32>
@@ -1032,8 +1032,8 @@ func.func @vector_insert_element_i32(%el: i32, %row: index, %col: index) -> vect

 // CHECK-LABEL: @vector_insert_element_i8
 func.func @vector_insert_element_i8(%el: i8, %row: index, %col: index) -> vector<[16]x[16]xi8> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[16]x[16]xi8>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[16]xi8> into vector<[16]x[16]xi8>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[16]xi8> into vector<[16]x[16]xi8>
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %new_tile = vector.insert %el, %tile[%row, %col] : i8 into vector<[16]x[16]xi8>
   return %new_tile : vector<[16]x[16]xi8>
@@ -1044,8 +1044,8 @@ func.func @vector_insert_element_i8(%el: i8, %row: index, %col: index) -> vector

 // CHECK-LABEL: @vector_insert_element_i16
 func.func @vector_insert_element_i16(%el: i16, %row: index, %col: index) -> vector<[8]x[8]xi16> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[8]x[8]xi16>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[8]xi16> into vector<[8]x[8]xi16>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[8]xi16> into vector<[8]x[8]xi16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %new_tile = vector.insert %el, %tile[%row, %col] : i16 into vector<[8]x[8]xi16>
   return %new_tile : vector<[8]x[8]xi16>
@@ -1056,8 +1056,8 @@ func.func @vector_insert_element_i16(%el: i16, %row: index, %col: index) -> vect

 // CHECK-LABEL: @vector_insert_element_i64
 func.func @vector_insert_element_i64(%el: i64, %row: index, %col: index) -> vector<[2]x[2]xi64> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[2]x[2]xi64>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[2]xi64> into vector<[2]x[2]xi64>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[2]xi64> into vector<[2]x[2]xi64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %new_tile = vector.insert %el, %tile[%row, %col] : i64 into vector<[2]x[2]xi64>
   return %new_tile : vector<[2]x[2]xi64>
@@ -1068,8 +1068,8 @@ func.func @vector_insert_element_i64(%el: i64, %row: index, %col: index) -> vect

 // CHECK-LABEL: @vector_insert_element_i128
 func.func @vector_insert_element_i128(%el: i128, %row: index, %col: index) -> vector<[1]x[1]xi128> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[1]x[1]xi128>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[1]xi128> into vector<[1]x[1]xi128>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[1]xi128> into vector<[1]x[1]xi128>
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %new_tile = vector.insert %el, %tile[%row, %col] : i128 into vector<[1]x[1]xi128>
   return %new_tile : vector<[1]x[1]xi128>
@@ -1080,8 +1080,8 @@ func.func @vector_insert_element_i128(%el: i128, %row: index, %col: index) -> ve

 // CHECK-LABEL: @vector_insert_element_f16
 func.func @vector_insert_element_f16(%el: f16, %row: index, %col: index) -> vector<[8]x[8]xf16> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[8]x[8]xf16>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[8]xf16> into vector<[8]x[8]xf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %new_tile = vector.insert %el, %tile[%row, %col] : f16 into vector<[8]x[8]xf16>
   return %new_tile : vector<[8]x[8]xf16>
@@ -1092,8 +1092,8 @@ func.func @vector_insert_element_f16(%el: f16, %row: index, %col: index) -> vect

 // CHECK-LABEL: @vector_insert_element_bf16
 func.func @vector_insert_element_bf16(%el: bf16, %row: index, %col: index) -> vector<[8]x[8]xbf16> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[8]x[8]xbf16>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %new_tile = vector.insert %el, %tile[%row, %col] : bf16 into vector<[8]x[8]xbf16>
   return %new_tile : vector<[8]x[8]xbf16>
@@ -1104,8 +1104,8 @@ func.func @vector_insert_element_bf16(%el: bf16, %row: index, %col: index) -> ve

 // CHECK-LABEL: @vector_insert_element_f32
 func.func @vector_insert_element_f32(%el: f32, %row: index, %col: index) -> vector<[4]x[4]xf32> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xf32>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[4]xf32> into vector<[4]x[4]xf32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %new_tile = vector.insert %el, %tile[%row, %col] : f32 into vector<[4]x[4]xf32>
   return %new_tile : vector<[4]x[4]xf32>
@@ -1116,15 +1116,15 @@ func.func @vector_insert_element_f32(%el: f32, %row: index, %col: index) -> vect

 // CHECK-LABEL: @vector_insert_element_f64
 func.func @vector_insert_element_f64(%el: f64, %row: index, %col: index) -> vector<[2]x[2]xf64> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[2]x[2]xf64>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]]{{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
-  // CHECK: arm_sme.move_vector_to_tile_slice %{{.*}}, %[[TILE]], %{{.*}} : vector<[2]xf64> into vector<[2]x[2]xf64>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]]{{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
+  // CHECK: arm_sme.insert_tile_slice %{{.*}}, %[[TILE]][%{{.*}}] : vector<[2]xf64> into vector<[2]x[2]xf64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %new_tile = vector.insert %el, %tile[%row, %col] : f64 into vector<[2]x[2]xf64>
   return %new_tile : vector<[2]x[2]xf64>
 }

 //===----------------------------------------------------------------------===//
-// vector.extract --> arm_sme.move_tile_slice_to_vector
+// vector.extract --> arm_sme.extract_tile_slice
 //===----------------------------------------------------------------------===//

 // -----
@@ -1133,7 +1133,7 @@ func.func @vector_insert_element_f64(%el: f64, %row: index, %col: index) -> vect
 // CHECK-SAME:                              %[[INDEX:.*]]: index)
 func.func @vector_extract_slice_i32(%row: index) -> vector<[4]xi32> {
   // CHECK: %[[TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xi32>
-  // CHECK: arm_sme.move_tile_slice_to_vector %[[TILE]][%[[INDEX]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
+  // CHECK: arm_sme.extract_tile_slice %[[TILE]][%[[INDEX]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %slice = vector.extract %tile[%row] : vector<[4]xi32> from vector<[4]x[4]xi32>
   return %slice : vector<[4]xi32>
@@ -1143,7 +1143,7 @@ func.func @vector_extract_slice_i32(%row: index) -> vector<[4]xi32> {

 // CHECK-LABEL: @vector_extract_slice_i8
 func.func @vector_extract_slice_i8(%row: index) -> vector<[16]xi8> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %slice = vector.extract %tile[%row] : vector<[16]xi8> from vector<[16]x[16]xi8>
   return %slice : vector<[16]xi8>
@@ -1153,7 +1153,7 @@ func.func @vector_extract_slice_i8(%row: index) -> vector<[16]xi8> {

 // CHECK-LABEL: @vector_extract_slice_i16
 func.func @vector_extract_slice_i16(%row: index) -> vector<[8]xi16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %slice = vector.extract %tile[%row] : vector<[8]xi16> from vector<[8]x[8]xi16>
   return %slice : vector<[8]xi16>
@@ -1163,7 +1163,7 @@ func.func @vector_extract_slice_i16(%row: index) -> vector<[8]xi16> {

 // CHECK-LABEL: @vector_extract_slice_i64
 func.func @vector_extract_slice_i64(%row: index) -> vector<[2]xi64> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %slice = vector.extract %tile[%row] : vector<[2]xi64> from vector<[2]x[2]xi64>
   return %slice : vector<[2]xi64>
@@ -1173,7 +1173,7 @@ func.func @vector_extract_slice_i64(%row: index) -> vector<[2]xi64> {

 // CHECK-LABEL: @vector_extract_slice_i128
 func.func @vector_extract_slice_i128(%row: index) -> vector<[1]xi128> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %slice = vector.extract %tile[%row] : vector<[1]xi128> from vector<[1]x[1]xi128>
   return %slice : vector<[1]xi128>
@@ -1183,7 +1183,7 @@ func.func @vector_extract_slice_i128(%row: index) -> vector<[1]xi128> {

 // CHECK-LABEL: @vector_extract_slice_f16
 func.func @vector_extract_slice_f16(%row: index) -> vector<[8]xf16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %slice = vector.extract %tile[%row] : vector<[8]xf16> from vector<[8]x[8]xf16>
   return %slice : vector<[8]xf16>
@@ -1193,7 +1193,7 @@ func.func @vector_extract_slice_f16(%row: index) -> vector<[8]xf16> {

 // CHECK-LABEL: @vector_extract_slice_bf16
 func.func @vector_extract_slice_bf16(%row: index) -> vector<[8]xbf16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %slice = vector.extract %tile[%row] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
   return %slice : vector<[8]xbf16>
@@ -1203,7 +1203,7 @@ func.func @vector_extract_slice_bf16(%row: index) -> vector<[8]xbf16> {

 // CHECK-LABEL: @vector_extract_slice_f32
 func.func @vector_extract_slice_f32(%row: index) -> vector<[4]xf32> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %slice = vector.extract %tile[%row] : vector<[4]xf32> from vector<[4]x[4]xf32>
   return %slice : vector<[4]xf32>
@@ -1213,7 +1213,7 @@ func.func @vector_extract_slice_f32(%row: index) -> vector<[4]xf32> {

 // CHECK-LABEL: @vector_extract_slice_f64
 func.func @vector_extract_slice_f64(%row: index) -> vector<[2]xf64> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %slice = vector.extract %tile[%row] : vector<[2]xf64> from vector<[2]x[2]xf64>
   return %slice : vector<[2]xf64>
@@ -1226,7 +1226,7 @@ func.func @vector_extract_slice_f64(%row: index) -> vector<[2]xf64> {
 // CHECK-SAME:                         %[[COL:.*]]: index)
 func.func @vector_extract_element(%row: index, %col: index) -> i32 {
   // CHECK-NEXT: %[[TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xi32>
-  // CHECK-NEXT: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %[[TILE]][%[[ROW]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
+  // CHECK-NEXT: %[[SLICE:.*]] = arm_sme.extract_tile_slice %[[TILE]][%[[ROW]]] : vector<[4]xi32> from vector<[4]x[4]xi32>
   // CHECK-NEXT: %[[EL:.*]] = vector.extract %[[SLICE]]{{\[}}%[[COL]]] : i32 from vector<[4]xi32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %el = vector.extract %tile[%row, %col] : i32 from vector<[4]x[4]xi32>
@@ -1237,7 +1237,7 @@ func.func @vector_extract_element(%row: index, %col: index) -> i32 {

 // CHECK-LABEL: @vector_extract_element_i8
 func.func @vector_extract_element_i8(%row: index, %col: index) -> i8 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : i8 from vector<[16]xi8>
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %el = vector.extract %tile[%row, %col] : i8 from vector<[16]x[16]xi8>
@@ -1248,7 +1248,7 @@ func.func @vector_extract_element_i8(%row: index, %col: index) -> i8 {

 // CHECK-LABEL: @vector_extract_element_i16
 func.func @vector_extract_element_i16(%row: index, %col: index) -> i16 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : i16 from vector<[8]xi16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %el = vector.extract %tile[%row, %col] : i16 from vector<[8]x[8]xi16>
@@ -1259,7 +1259,7 @@ func.func @vector_extract_element_i16(%row: index, %col: index) -> i16 {

 // CHECK-LABEL: @vector_extract_element_i64
 func.func @vector_extract_element_i64(%row: index, %col: index) -> i64 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : i64 from vector<[2]xi64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %el = vector.extract %tile[%row, %col] : i64 from vector<[2]x[2]xi64>
@@ -1270,7 +1270,7 @@ func.func @vector_extract_element_i64(%row: index, %col: index) -> i64 {

 // CHECK-LABEL: @vector_extract_element_i128
 func.func @vector_extract_element_i128(%row: index, %col: index) -> i128 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : i128 from vector<[1]xi128>
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %el = vector.extract %tile[%row, %col] : i128 from vector<[1]x[1]xi128>
@@ -1281,7 +1281,7 @@ func.func @vector_extract_element_i128(%row: index, %col: index) -> i128 {

 // CHECK-LABEL: @vector_extract_element_f16
 func.func @vector_extract_element_f16(%row: index, %col: index) -> f16 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : f16 from vector<[8]xf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %el = vector.extract %tile[%row, %col] : f16 from vector<[8]x[8]xf16>
@@ -1292,7 +1292,7 @@ func.func @vector_extract_element_f16(%row: index, %col: index) -> f16 {

 // CHECK-LABEL: @vector_extract_element_bf16
 func.func @vector_extract_element_bf16(%row: index, %col: index) -> bf16 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : bf16 from vector<[8]xbf16>
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %el = vector.extract %tile[%row, %col] : bf16 from vector<[8]x[8]xbf16>
@@ -1303,7 +1303,7 @@ func.func @vector_extract_element_bf16(%row: index, %col: index) -> bf16 {

 // CHECK-LABEL: @vector_extract_element_f32
 func.func @vector_extract_element_f32(%row: index, %col: index) -> f32 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : f32 from vector<[4]xf32>
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %el = vector.extract %tile[%row, %col] : f32 from vector<[4]x[4]xf32>
@@ -1314,7 +1314,7 @@ func.func @vector_extract_element_f32(%row: index, %col: index) -> f32 {

 // CHECK-LABEL: @vector_extract_element_f64
 func.func @vector_extract_element_f64(%row: index, %col: index) -> f64 {
-  // CHECK: %[[SLICE:.*]] = arm_sme.move_tile_slice_to_vector %{{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
+  // CHECK: %[[SLICE:.*]] = arm_sme.extract_tile_slice %{{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
   // CHECK-NEXT: %{{.*}} = vector.extract %[[SLICE]]{{\[}}%{{.*}}] : f64 from vector<[2]xf64>
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %el = vector.extract %tile[%row, %col] : f64 from vector<[2]x[2]xf64>
diff --git a/mlir/test/Dialect/ArmSME/invalid.mlir b/mlir/test/Dialect/ArmSME/invalid.mlir
index cc052fac0d9dc9..700b2412ff7a7c 100644
--- a/mlir/test/Dialect/ArmSME/invalid.mlir
+++ b/mlir/test/Dialect/ArmSME/invalid.mlir
@@ -45,36 +45,36 @@ func.func @arm_sme_get_tile__bad_shape(%tile_id : i8) -> vector<[4]x[16]xi8> {
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_vector_to_tile_slice
+// arm_sme.insert_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i8__bad_vector_type(%vector : vector<[8]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> vector<[16]x[16]xi8> {
+func.func @arm_sme_insert_tile_slice_i8__bad_vector_type(%vector : vector<[8]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> vector<[16]x[16]xi8> {
   %c0 = arith.constant 0 : index
   // expected-error@+1 {{op failed to verify that type of 'vector' matches type of 'tile' slice}}
-  %0 = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[8]xi8> into vector<[16]x[16]xi8>
+  %0 = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[8]xi8> into vector<[16]x[16]xi8>
   return %0 : vector<[16]x[16]xi8>
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_f32__bad_vector_type(%vector : vector<[8]xf32>, %tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[4]x[4]xf32> {
+func.func @arm_sme_insert_tile_slice_f32__bad_vector_type(%vector : vector<[8]xf32>, %tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[4]x[4]xf32> {
   %c0 = arith.constant 0 : index
   // expected-error@+1 {{op failed to verify that type of 'vector' matches type of 'tile' slice}}
-  %0 = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[8]xf32> into vector<[4]x[4]xf32>
+  %0 = arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[8]xf32> into vector<[4]x[4]xf32>
   return %0 : vector<[4]x[4]xf32>
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_tile_slice_to_vector
+// arm_sme.extract_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-func.func @arm_sme_move_tile_slice_to_vector__bad_result_type(%tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[2]xf64> {
+func.func @arm_sme_extract_tile_slice__bad_result_type(%tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[2]xf64> {
   // expected-error@+1 {{op failed to verify that type of 'result' matches type of 'tile' slice}}
-  %0 = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xf64> from vector<[4]x[4]xf32>
+  %0 = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[2]xf64> from vector<[4]x[4]xf32>
   return %0 : vector<[2]xf64>
 }
diff --git a/mlir/test/Dialect/ArmSME/roundtrip.mlir b/mlir/test/Dialect/ArmSME/roundtrip.mlir
index 6095fdc11ead8f..c326895aad698e 100644
--- a/mlir/test/Dialect/ArmSME/roundtrip.mlir
+++ b/mlir/test/Dialect/ArmSME/roundtrip.mlir
@@ -875,180 +875,180 @@ func.func @arm_sme_store_tile_slice_hor_i8(%tile : vector<[16]x[16]xi8>, %tile_s
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_vector_to_tile_slice
+// arm_sme.insert_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i8(%vector : vector<[16]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[16]xi8> into vector<[16]x[16]xi8>
+func.func @arm_sme_insert_tile_slice_i8(%vector : vector<[16]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[16]xi8> into vector<[16]x[16]xi8>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[16]xi8> into vector<[16]x[16]xi8>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[16]xi8> into vector<[16]x[16]xi8>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i16(%vector : vector<[8]xi16>, %tile : vector<[8]x[8]xi16>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[8]xi16> into vector<[8]x[8]xi16>
+func.func @arm_sme_insert_tile_slice_i16(%vector : vector<[8]xi16>, %tile : vector<[8]x[8]xi16>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[8]xi16> into vector<[8]x[8]xi16>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[8]xi16> into vector<[8]x[8]xi16>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[8]xi16> into vector<[8]x[8]xi16>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i32(%vector : vector<[4]xi32>, %tile : vector<[4]x[4]xi32>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[4]xi32> into vector<[4]x[4]xi32>
+func.func @arm_sme_insert_tile_slice_i32(%vector : vector<[4]xi32>, %tile : vector<[4]x[4]xi32>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[4]xi32> into vector<[4]x[4]xi32>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[4]xi32> into vector<[4]x[4]xi32>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[4]xi32> into vector<[4]x[4]xi32>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i64(%vector : vector<[2]xi64>, %tile : vector<[2]x[2]xi64>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[2]xi64> into vector<[2]x[2]xi64>
+func.func @arm_sme_insert_tile_slice_i64(%vector : vector<[2]xi64>, %tile : vector<[2]x[2]xi64>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[2]xi64> into vector<[2]x[2]xi64>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[2]xi64> into vector<[2]x[2]xi64>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[2]xi64> into vector<[2]x[2]xi64>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_i128(%vector : vector<[1]xi128>, %tile : vector<[1]x[1]xi128>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[1]xi128> into vector<[1]x[1]xi128>
+func.func @arm_sme_insert_tile_slice_i128(%vector : vector<[1]xi128>, %tile : vector<[1]x[1]xi128>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[1]xi128> into vector<[1]x[1]xi128>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[1]xi128> into vector<[1]x[1]xi128>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[1]xi128> into vector<[1]x[1]xi128>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_f16(%vector : vector<[8]xf16>, %tile : vector<[8]x[8]xf16>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
+func.func @arm_sme_insert_tile_slice_f16(%vector : vector<[8]xf16>, %tile : vector<[8]x[8]xf16>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[8]xf16> into vector<[8]x[8]xf16>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[8]xf16> into vector<[8]x[8]xf16>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[8]xf16> into vector<[8]x[8]xf16>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_bf16(%vector : vector<[8]xbf16>, %tile : vector<[8]x[8]xbf16>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+func.func @arm_sme_insert_tile_slice_bf16(%vector : vector<[8]xbf16>, %tile : vector<[8]x[8]xbf16>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[8]xbf16> into vector<[8]x[8]xbf16>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_f32(%vector : vector<[4]xf32>, %tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[4]xf32> into vector<[4]x[4]xf32>
+func.func @arm_sme_insert_tile_slice_f32(%vector : vector<[4]xf32>, %tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[4]xf32> into vector<[4]x[4]xf32>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[4]xf32> into vector<[4]x[4]xf32>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[4]xf32> into vector<[4]x[4]xf32>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_f64(%vector : vector<[2]xf64>, %tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} : vector<[2]xf64> into vector<[2]x[2]xf64>
+func.func @arm_sme_insert_tile_slice_f64(%vector : vector<[2]xf64>, %tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} : vector<[2]xf64> into vector<[2]x[2]xf64>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[2]xf64> into vector<[2]x[2]xf64>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] : vector<[2]xf64> into vector<[2]x[2]xf64>
   return
 }

 // -----

-func.func @arm_sme_move_vector_to_tile_slice_ver_i8(%vector : vector<[16]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> () {
-  // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} layout<vertical> : vector<[16]xi8> into vector<[16]x[16]xi8>
+func.func @arm_sme_insert_tile_slice_ver_i8(%vector : vector<[16]xi8>, %tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> () {
+  // CHECK: arm_sme.insert_tile_slice {{.*}} layout<vertical> : vector<[16]xi8> into vector<[16]x[16]xi8>
   %c0 = arith.constant 0 : index
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index layout<vertical> : vector<[16]xi8> into vector<[16]x[16]xi8>
+  arm_sme.insert_tile_slice %vector, %tile[%tile_slice_index] layout<vertical> : vector<[16]xi8> into vector<[16]x[16]xi8>
   return
 }

 //===----------------------------------------------------------------------===//
-// arm_sme.move_tile_slice_to_vector
+// arm_sme.extract_tile_slice
 //===----------------------------------------------------------------------===//

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_i8(%tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> vector<[16]xi8> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
+func.func @arm_sme_extract_tile_slice_i8(%tile : vector<[16]x[16]xi8>, %tile_slice_index : index) -> vector<[16]xi8> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[16]xi8> from vector<[16]x[16]xi8>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[16]xi8> from vector<[16]x[16]xi8>
   return %slice : vector<[16]xi8>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_i16(%tile : vector<[8]x[8]xi16>, %tile_slice_index : index) -> vector<[8]xi16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xi16> from vector<[8]x[8]xi16>
+func.func @arm_sme_extract_tile_slice_i16(%tile : vector<[8]x[8]xi16>, %tile_slice_index : index) -> vector<[8]xi16> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xi16> from vector<[8]x[8]xi16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xi16> from vector<[8]x[8]xi16>
   return %slice : vector<[8]xi16>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_i32(%tile : vector<[4]x[4]xi32>, %tile_slice_index : index) -> vector<[4]xi32> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[4]xi32> from vector<[4]x[4]xi32>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[4]xi32> from vector<[4]x[4]xi32>
+func.func @arm_sme_extract_tile_slice_i32(%tile : vector<[4]x[4]xi32>, %tile_slice_index : index) -> vector<[4]xi32> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[4]xi32> from vector<[4]x[4]xi32>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[4]xi32> from vector<[4]x[4]xi32>
   return %slice : vector<[4]xi32>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_i64(%tile : vector<[2]x[2]xi64>, %tile_slice_index : index) -> vector<[2]xi64> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xi64> from vector<[2]x[2]xi64>
+func.func @arm_sme_extract_tile_slice_i64(%tile : vector<[2]x[2]xi64>, %tile_slice_index : index) -> vector<[2]xi64> {
  +  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[2]xi64> from vector<[2]x[2]xi64>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[2]xi64> from vector<[2]x[2]xi64>
   return %slice : vector<[2]xi64>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_i128(%tile : vector<[1]x[1]xi128>, %tile_slice_index : index) -> vector<[1]xi128> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[1]xi128> from vector<[1]x[1]xi128>
+func.func @arm_sme_extract_tile_slice_i128(%tile : vector<[1]x[1]xi128>, %tile_slice_index : index) -> vector<[1]xi128> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[1]xi128> from vector<[1]x[1]xi128>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[1]xi128> from vector<[1]x[1]xi128>
   return %slice : vector<[1]xi128>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_f16(%tile : vector<[8]x[8]xf16>, %tile_slice_index : index) -> vector<[8]xf16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xf16> from vector<[8]x[8]xf16>
+func.func @arm_sme_extract_tile_slice_f16(%tile : vector<[8]x[8]xf16>, %tile_slice_index : index) -> vector<[8]xf16> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xf16> from vector<[8]x[8]xf16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xf16> from vector<[8]x[8]xf16>
   return %slice : vector<[8]xf16>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_bf16(%tile : vector<[8]x[8]xbf16>, %tile_slice_index : index) -> vector<[8]xbf16> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+func.func @arm_sme_extract_tile_slice_bf16(%tile : vector<[8]x[8]xbf16>, %tile_slice_index : index) -> vector<[8]xbf16> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[8]xbf16> from vector<[8]x[8]xbf16>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[8]xbf16> from vector<[8]x[8]xbf16>
   return %slice : vector<[8]xbf16>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_f32(%tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[4]xf32> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[4]xf32> from vector<[4]x[4]xf32>
+func.func @arm_sme_extract_tile_slice_f32(%tile : vector<[4]x[4]xf32>, %tile_slice_index : index) -> vector<[4]xf32> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[4]xf32> from vector<[4]x[4]xf32>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[4]xf32> from vector<[4]x[4]xf32>
   return %slice : vector<[4]xf32>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_f64(%tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> vector<[2]xf64> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
+func.func @arm_sme_extract_tile_slice_f64(%tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> vector<[2]xf64> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} : vector<[2]xf64> from vector<[2]x[2]xf64>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] : vector<[2]xf64> from vector<[2]x[2]xf64>
   return %slice : vector<[2]xf64>
 }

 // -----

-func.func @arm_sme_move_tile_slice_to_vector_ver_f64(%tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> vector<[2]xf64> {
-  // CHECK: arm_sme.move_tile_slice_to_vector {{.*}} layout<vertical> : vector<[2]xf64> from vector<[2]x[2]xf64>
-  %slice = arm_sme.move_tile_slice_to_vector %tile[%tile_slice_index] layout<vertical> : vector<[2]xf64> from vector<[2]x[2]xf64>
+func.func @arm_sme_extract_tile_slice_ver_f64(%tile : vector<[2]x[2]xf64>, %tile_slice_index : index) -> vector<[2]xf64> {
+  // CHECK: arm_sme.extract_tile_slice {{.*}} layout<vertical> : vector<[2]xf64> from vector<[2]x[2]xf64>
+  %slice = arm_sme.extract_tile_slice %tile[%tile_slice_index] layout<vertical> : vector<[2]xf64> from vector<[2]x[2]xf64>
   return %slice : vector<[2]xf64>
 }
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-copies.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-copies.mlir
index 6d9cbf36a162ff..2327f74e7a5f93 100644
--- a/mlir/test/Dialect/ArmSME/tile-allocation-copies.mlir
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-copies.mlir
@@ -59,7 +59,7 @@ func.func @cond_branch(%cond: i1, %tile: vector<[4]x[4]xf32>) {
 // CHECK:      ^[[BB2_COPIES]]:
 // CHECK-NEXT:   cf.br ^[[BB2:[[:alnum:]]+]]
 // CHECK:      ^[[BB2]]:
-// CHECK-NEXT:   %[[NEXT_TILE:.*]] = arm_sme.move_vector_to_tile_slice %{{.*}}, %[[ITER_TILE]]
+// CHECK-NEXT:   %[[NEXT_TILE:.*]] = arm_sme.insert_tile_slice %{{.*}}, %[[ITER_TILE]]
 // CHECK:        %[[BB1_COPY_1:.*]] = arm_sme.copy_tile %[[NEXT_TILE]] : vector<[4]x[4]xf32>
 // CHECK:        cf.br ^bb1(%{{[[:alnum:]]+}}, %[[BB1_COPY_1]]
 // CHECK:      ^[[BB3]](%{{.*}}: vector<[4]x[4]xf32>):
@@ -80,7 +80,7 @@ func.func @cond_branch_with_backedge(%tileA: vector<[4]x[4]xf32>, %tileB: vector
   cf.cond_br %continueLoop, ^bb2, ^bb3(%iterTile, %tileB, %tileC, %tileD : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>)
 ^bb2: // Live here: %iterTile, %tileB, %tileC, %tileD
-  %nextTile = arm_sme.move_vector_to_tile_slice %slice, %iterTile, %currentIndex : vector<[4]xf32> into vector<[4]x[4]xf32>
+  %nextTile = arm_sme.insert_tile_slice %slice, %iterTile[%currentIndex] : vector<[4]xf32> into vector<[4]x[4]xf32>
   %nextIndex = arith.addi %currentIndex, %c1 : index
   cf.br ^bb1(%nextIndex, %nextTile : index, vector<[4]x[4]xf32>)
 ^bb3(%finalTileA: vector<[4]x[4]xf32>, %finalTileB: vector<[4]x[4]xf32>, %finalTileC: vector<[4]x[4]xf32>, %finalTileD: vector<[4]x[4]xf32>):
diff --git
a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir index 2e1f3d1ee10a9b..0b739feaf019d7 100644 --- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir +++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -9,8 +9,8 @@ // CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: // CHECK-LIVE-RANGE: ^bb0: // CHECK-LIVE-RANGE: S arm_sme.zero -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.insert_tile_slice +// CHECK-LIVE-RANGE-NEXT: || arm_sme.insert_tile_slice // CHECK-LIVE-RANGE-NEXT: |E test.some_use // CHECK-LIVE-RANGE-NEXT: E test.some_use @@ -19,11 +19,11 @@ func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { // CHECK-NEXT: %[[ZERO_TILE_0:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32> // CHECK-NEXT: %[[ZERO_TILE_1:.*]] = arm_sme.zero {tile_id = 1 : i32} : vector<[4]x[4]xf32> - // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]], %{{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]], %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-NEXT: %[[INSERT_TILE_1:.*]] = arm_sme.insert_tile_slice %[[VECTOR_A]], %[[ZERO_TILE_1]][%{{.*}}] {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK-NEXT: %[[INSERT_TILE_0:.*]] = arm_sme.insert_tile_slice %[[VECTOR_B]], %[[ZERO_TILE_0]][%{{.*}}] {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> %zero = arm_sme.zero : vector<[4]x[4]xf32> - %tile_a = arm_sme.move_vector_to_tile_slice %a, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> - %tile_b = arm_sme.move_vector_to_tile_slice %b, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_a = arm_sme.insert_tile_slice %a, %zero[%index] : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.insert_tile_slice %b, %zero[%index] : vector<[4]xf32> into vector<[4]x[4]xf32> "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () return @@ -34,16 +34,16 @@ func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32> // CHECK-LIVE-RANGE-LABEL: @value_with_multiple_users // CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: // CHECK-LIVE-RANGE: ^bb0: -// CHECK-LIVE-RANGE-NEXT: |S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: || arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.insert_tile_slice +// CHECK-LIVE-RANGE-NEXT: || arm_sme.insert_tile_slice // CHECK-LIVE-RANGE-NEXT: |E test.some_use // CHECK-LIVE-RANGE-NEXT: E test.some_use // expected-note@below {{tile operand is: of type 'vector<[4]x[4]xf32>'}} func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) { // expected-error@below {{op tile operand allocated to different SME virtial tile (move required)}} - %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> - %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_a = arm_sme.insert_tile_slice %a, %tile[%index] : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.insert_tile_slice %b, %tile[%index] : 
vector<[4]xf32> into vector<[4]x[4]xf32> "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () return @@ -286,14 +286,14 @@ func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]x iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init) -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) { // ^bb2: - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - // CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> - %new_a = arm_sme.move_vector_to_tile_slice %a, %iter_a, %i : vector<[4]xf32> into vector<[4]x[4]xf32> - %new_b = arm_sme.move_vector_to_tile_slice %b, %iter_b, %i : vector<[4]xf32> into vector<[4]x[4]xf32> - %new_c = arm_sme.move_vector_to_tile_slice %c, %iter_c, %i : vector<[4]xf32> into vector<[4]x[4]xf32> - %new_d = arm_sme.move_vector_to_tile_slice %d, %iter_d, %i : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 1 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 2 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 3 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + // CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_a = arm_sme.insert_tile_slice %a, %iter_a[%i] : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_b = arm_sme.insert_tile_slice %b, %iter_b[%i] : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_c = arm_sme.insert_tile_slice %c, %iter_c[%i] : vector<[4]xf32> into vector<[4]x[4]xf32> + %new_d = arm_sme.insert_tile_slice %d, %iter_d[%i] : vector<[4]xf32> into vector<[4]x[4]xf32> scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32> } // Live = %init, %tile_a, %tile_b, %tile_c, %tile_d (out of tiles!) @@ -316,10 +316,10 @@ func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]x // CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: // CHECK-LIVE-RANGE: ^bb2: // CHECK-LIVE-RANGE-NEXT: || test.some_use -// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.move_vector_to_tile_slice -// CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.move_vector_to_tile_slice +// CHECK-LIVE-RANGE-NEXT: ||S arm_sme.insert_tile_slice +// CHECK-LIVE-RANGE-NEXT: |||S arm_sme.insert_tile_slice +// CHECK-LIVE-RANGE-NEXT: ||||S arm_sme.insert_tile_slice +// CHECK-LIVE-RANGE-NEXT: |||||S arm_sme.insert_tile_slice // CHECK-LIVE-RANGE-NEXT: ||E||| test.some_use // CHECK-LIVE-RANGE-NEXT: || E|| test.some_use // CHECK-LIVE-RANGE-NEXT: || E| test.some_use @@ -346,10 +346,10 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector< // So spilled here (unnecessarily). // The arm_sme.zero op could be moved into the loop to avoid this. 
"test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> () - %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> - %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> - %tile_c = arm_sme.move_vector_to_tile_slice %c, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> - %tile_d = arm_sme.move_vector_to_tile_slice %d, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_a = arm_sme.insert_tile_slice %a, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_b = arm_sme.insert_tile_slice %b, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_c = arm_sme.insert_tile_slice %c, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32> + %tile_d = arm_sme.insert_tile_slice %d, %tile[%c0] : vector<[4]xf32> into vector<[4]x[4]xf32> // %zero is still live here (due the the backedge) "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> () "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> () @@ -405,7 +405,7 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector< // CHECK: arm_sme.get_tile {tile_id = 1 : i32} : vector<[4]x[4]xf32> // CHECK: arm_sme.get_tile {tile_id = 2 : i32} : vector<[4]x[4]xf32> // CHECK: arm_sme.get_tile {tile_id = 3 : i32} : vector<[4]x[4]xf32> -// CHECK: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> +// CHECK: arm_sme.insert_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32> // CHECK-NOT: tile_id = 16 func.func @cond_branch_with_backedge(%slice: vector<[4]xf32>) { %tileA = arm_sme.get_tile : vector<[4]x[4]xf32> @@ -423,7 +423,7 @@ func.func @cond_branch_with_backedge(%slice: vector<[4]xf32>) { cf.cond_br %continueLoop, ^bb2, ^bb3(%iterTile, %tileB, %tileC, %tileD : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>) ^bb2: // Live here: %iterTile, %tileB, %tileC, %tileD - %nextTile = arm_sme.move_vector_to_tile_slice %slice, %iterTile, %currentIndex : vector<[4]xf32> into vector<[4]x[4]xf32> + %nextTile = arm_sme.insert_tile_slice %slice, %iterTile[%currentIndex] : vector<[4]xf32> into vector<[4]x[4]xf32> %nextIndex = arith.addi %currentIndex, %c1 : index cf.br ^bb1(%nextIndex, %nextTile : index, vector<[4]x[4]xf32>) ^bb3(%finalTileA: vector<[4]x[4]xf32>, %finalTileB: vector<[4]x[4]xf32>, %finalTileC: vector<[4]x[4]xf32>, %finalTileD: vector<[4]x[4]xf32>): From 1e65b765879fb39214b28d96e3305fa3599581db Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 2 Sep 2024 06:17:40 -0400 Subject: [PATCH 31/33] [llvm][Support] Add support for thread naming under DragonFly BSD and Solaris/illumos (#106944) --- llvm/cmake/config-ix.cmake | 2 ++ llvm/include/llvm/Config/config.h.cmake | 6 +++++ llvm/lib/Support/Unix/Threading.inc | 36 ++++++++++++++----------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index f76eacb9d51366..3707ca824f6e9c 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -356,6 +356,8 @@ if (NOT PURE_WINDOWS) endif() check_symbol_exists(pthread_getname_np pthread.h HAVE_PTHREAD_GETNAME_NP) check_symbol_exists(pthread_setname_np pthread.h HAVE_PTHREAD_SETNAME_NP) + check_symbol_exists(pthread_get_name_np "pthread.h;pthread_np.h" HAVE_PTHREAD_GET_NAME_NP) + check_symbol_exists(pthread_set_name_np "pthread.h;pthread_np.h" HAVE_PTHREAD_SET_NAME_NP) if (LLVM_PTHREAD_LIB) list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES 
   endif()

diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index f39d2d56d61e89..d71ff40144c097 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -125,6 +125,12 @@
 /* Define to 1 if you have the `pthread_setname_np' function. */
 #cmakedefine HAVE_PTHREAD_SETNAME_NP ${HAVE_PTHREAD_SETNAME_NP}

+/* Define to 1 if you have the `pthread_get_name_np' function. */
+#cmakedefine HAVE_PTHREAD_GET_NAME_NP ${HAVE_PTHREAD_GET_NAME_NP}
+
+/* Define to 1 if you have the `pthread_set_name_np' function. */
+#cmakedefine HAVE_PTHREAD_SET_NAME_NP ${HAVE_PTHREAD_SET_NAME_NP}
+
 /* Define to 1 if you have the <mach/mach.h> header file. */
 #cmakedefine HAVE_MACH_MACH_H ${HAVE_MACH_MACH_H}

diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index acfd4ad51902bb..43e18c3a963abf 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -137,13 +137,16 @@ uint64_t llvm::get_threadid() {
 }

 static constexpr uint32_t get_max_thread_name_length_impl() {
-#if defined(__NetBSD__)
+#if defined(PTHREAD_MAX_NAMELEN_NP)
   return PTHREAD_MAX_NAMELEN_NP;
 #elif defined(__APPLE__)
   return 64;
+#elif defined(__sun__) && defined(__svr4__)
+  return 31;
 #elif defined(__linux__) && HAVE_PTHREAD_SETNAME_NP
   return 16;
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || \
+    defined(__DragonFly__)
   return 16;
 #elif defined(__OpenBSD__)
   return 24;
@@ -170,15 +173,17 @@ void llvm::set_thread_name(const Twine &Name) {
   if (get_max_thread_name_length() > 0)
     NameStr = NameStr.take_back(get_max_thread_name_length() - 1);
   (void)NameStr;
-#if defined(__linux__) && HAVE_PTHREAD_SETNAME_NP
-  ::pthread_setname_np(::pthread_self(), NameStr.data());
-#elif defined(__FreeBSD__) || defined(__OpenBSD__)
+#if defined(HAVE_PTHREAD_SET_NAME_NP)
   ::pthread_set_name_np(::pthread_self(), NameStr.data());
-#elif defined(__NetBSD__)
+#elif defined(HAVE_PTHREAD_SETNAME_NP)
+#if defined(__NetBSD__)
   ::pthread_setname_np(::pthread_self(), "%s",
                        const_cast<char *>(NameStr.data()));
 #elif defined(__APPLE__)
   ::pthread_setname_np(NameStr.data());
+#else
+  ::pthread_setname_np(::pthread_self(), NameStr.data());
+#endif
 #endif
 }

@@ -221,23 +226,24 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
   }
   free(kp);
   return;
-#elif defined(__NetBSD__)
+#elif defined(__linux__) && HAVE_PTHREAD_GETNAME_NP
+  constexpr uint32_t len = get_max_thread_name_length_impl();
+  char Buffer[len] = {'\0'}; // FIXME: working around MSan false positive.
+  if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len))
+    Name.append(Buffer, Buffer + strlen(Buffer));
+#elif defined(HAVE_PTHREAD_GET_NAME_NP)
   constexpr uint32_t len = get_max_thread_name_length_impl();
   char buf[len];
-  ::pthread_getname_np(::pthread_self(), buf, len);
+  ::pthread_get_name_np(::pthread_self(), buf, len);
   Name.append(buf, buf + strlen(buf));
-#elif defined(__OpenBSD__)
+
+#elif defined(HAVE_PTHREAD_GETNAME_NP)
   constexpr uint32_t len = get_max_thread_name_length_impl();
   char buf[len];
-  ::pthread_get_name_np(::pthread_self(), buf, len);
+  ::pthread_getname_np(::pthread_self(), buf, len);
   Name.append(buf, buf + strlen(buf));
-#elif defined(__linux__) && HAVE_PTHREAD_GETNAME_NP
-  constexpr uint32_t len = get_max_thread_name_length_impl();
-  char Buffer[len] = {'\0'}; // FIXME: working around MSan false positive.
- if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len)) - Name.append(Buffer, Buffer + strlen(Buffer)); #endif } From 751975530e1041e5a8fb12cf57d5378c058d6d93 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Mon, 2 Sep 2024 12:26:15 +0200 Subject: [PATCH 32/33] Reapply "[MLIR][LLVM] Make DISubprogramAttr cyclic" (#106571) with fixes (#106947) This reverts commit fa93be4, restoring commit d884b77, with fixes that ensure the CAPI declarations are exported properly. This commit implements LLVM_DIRecursiveTypeAttrInterface for the DISubprogramAttr to ensure cyclic subprograms can be imported properly. In the process multiple shortcuts around the recently introduced DIImportedEntityAttr can be removed. --- .../Transforms/DebugTypeGenerator.cpp | 10 +- mlir/include/mlir-c/Dialect/LLVM.h | 28 ++++-- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 68 +++++++++---- .../mlir/Dialect/LLVMIR/LLVMInterfaces.td | 2 +- mlir/lib/CAPI/Dialect/LLVM.cpp | 39 +++++--- mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 29 ++++-- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 6 +- .../Transforms/DIScopeForLLVMFuncOp.cpp | 8 +- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 18 ++-- mlir/lib/Target/LLVMIR/DebugTranslation.cpp | 98 ++++++++++--------- mlir/lib/Target/LLVMIR/DebugTranslation.h | 18 ++-- mlir/test/CAPI/llvm.c | 23 +++-- mlir/test/Target/LLVMIR/Import/debug-info.ll | 53 +++++----- mlir/test/Target/LLVMIR/llvmir-debug.mlir | 41 ++++---- 14 files changed, 265 insertions(+), 176 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 54f2a12d800085..029d3776bcc0b8 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -146,8 +146,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( elements.push_back(subrangeTy); } return mlir::LLVM::DICompositeTypeAttr::get( - context, llvm::dwarf::DW_TAG_array_type, /*recursive_id=*/{}, - /*name=*/nullptr, /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy, + context, llvm::dwarf::DW_TAG_array_type, /*name=*/nullptr, + /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy, mlir::LLVM::DIFlags::Zero, /*sizeInBits=*/0, /*alignInBits=*/0, elements, dataLocation, /*rank=*/nullptr, allocated, associated); } @@ -188,7 +188,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( } return mlir::LLVM::DICompositeTypeAttr::get( - context, llvm::dwarf::DW_TAG_structure_type, /*recursive_id=*/{}, + context, llvm::dwarf::DW_TAG_structure_type, mlir::StringAttr::get(context, result.second.name), fileAttr, line, scope, /*baseType=*/nullptr, mlir::LLVM::DIFlags::Zero, offset * 8, /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, @@ -236,8 +236,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( // have been set to some valid default values. 
return mlir::LLVM::DICompositeTypeAttr::get( - context, llvm::dwarf::DW_TAG_array_type, /*recursive_id=*/{}, - /*name=*/nullptr, /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy, + context, llvm::dwarf::DW_TAG_array_type, /*name=*/nullptr, + /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy, mlir::LLVM::DIFlags::Zero, /*sizeInBits=*/0, /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, /*allocated=*/nullptr, /*associated=*/nullptr); diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h index 5eb96a86e472d6..d6062bed5c0c0f 100644 --- a/mlir/include/mlir-c/Dialect/LLVM.h +++ b/mlir/include/mlir-c/Dialect/LLVM.h @@ -234,10 +234,14 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIBasicTypeAttrGet( MlirContext ctx, unsigned int tag, MlirAttribute name, uint64_t sizeInBits, MlirLLVMTypeEncoding encoding); +/// Creates a self-referencing LLVM DICompositeType attribute. +MLIR_CAPI_EXPORTED MlirAttribute +mlirLLVMDICompositeTypeAttrGetRecSelf(MlirAttribute recId); + /// Creates a LLVM DICompositeType attribute. MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDICompositeTypeAttrGet( - MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name, - MlirAttribute file, uint32_t line, MlirAttribute scope, + MlirContext ctx, MlirAttribute recId, bool isRecSelf, unsigned int tag, + MlirAttribute name, MlirAttribute file, uint32_t line, MlirAttribute scope, MlirAttribute baseType, int64_t flags, uint64_t sizeInBits, uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements, MlirAttribute dataLocation, MlirAttribute rank, MlirAttribute allocated, @@ -311,13 +315,17 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDILocalVariableAttrGet( MlirAttribute diFile, unsigned int line, unsigned int arg, unsigned int alignInBits, MlirAttribute diType, int64_t flags); +/// Creates a self-referencing LLVM DISubprogramAttr attribute. +MLIR_CAPI_EXPORTED MlirAttribute +mlirLLVMDISubprogramAttrGetRecSelf(MlirAttribute recId); + /// Creates a LLVM DISubprogramAttr attribute. MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDISubprogramAttrGet( - MlirContext ctx, MlirAttribute id, MlirAttribute compileUnit, - MlirAttribute scope, MlirAttribute name, MlirAttribute linkageName, - MlirAttribute file, unsigned int line, unsigned int scopeLine, - uint64_t subprogramFlags, MlirAttribute type, intptr_t nRetainedNodes, - MlirAttribute const *retainedNodes); + MlirContext ctx, MlirAttribute recId, bool isRecSelf, MlirAttribute id, + MlirAttribute compileUnit, MlirAttribute scope, MlirAttribute name, + MlirAttribute linkageName, MlirAttribute file, unsigned int line, + unsigned int scopeLine, uint64_t subprogramFlags, MlirAttribute type, + intptr_t nRetainedNodes, MlirAttribute const *retainedNodes); /// Gets the scope from this DISubprogramAttr. MLIR_CAPI_EXPORTED MlirAttribute @@ -356,9 +364,9 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIModuleAttrGet( /// Creates a LLVM DIImportedEntityAttr attribute. MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIImportedEntityAttrGet( - MlirContext ctx, unsigned int tag, MlirAttribute entity, MlirAttribute file, - unsigned int line, MlirAttribute name, intptr_t nElements, - MlirAttribute const *elements); + MlirContext ctx, unsigned int tag, MlirAttribute scope, + MlirAttribute entity, MlirAttribute file, unsigned int line, + MlirAttribute name, intptr_t nElements, MlirAttribute const *elements); /// Gets the scope of this DIModuleAttr. 
MLIR_CAPI_EXPORTED MlirAttribute diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index e57be7f760d380..49e54df3436ff3 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -271,7 +271,7 @@ def LLVM_DILanguageParameter : LLVM_DIParameter< >; def LLVM_DITagParameter : LLVM_DIParameter< - "tag", /*default=*/"", "Tag", /*errorCase=*/"llvm::dwarf::DW_TAG_invalid" + "tag", /*default=*/"0", "Tag", /*errorCase=*/"llvm::dwarf::DW_TAG_invalid" >; def LLVM_DIOperationEncodingParameter : LLVM_DIParameter< @@ -375,14 +375,17 @@ def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type", [LLVM_DIRecursiveTypeAttrInterface], "DITypeAttr"> { let parameters = (ins - LLVM_DITagParameter:$tag, + // DIRecursiveTypeAttrInterface specific parameters. OptionalParameter<"DistinctAttr">:$recId, + OptionalParameter<"bool">:$isRecSelf, + // DICompositeType specific parameters. + LLVM_DITagParameter:$tag, OptionalParameter<"StringAttr">:$name, OptionalParameter<"DIFileAttr">:$file, OptionalParameter<"uint32_t">:$line, OptionalParameter<"DIScopeAttr">:$scope, OptionalParameter<"DITypeAttr">:$baseType, - OptionalParameter<"DIFlags", "DIFlags::Zero">:$flags, + OptionalParameter<"DIFlags">:$flags, OptionalParameter<"uint64_t">:$sizeInBits, OptionalParameter<"uint64_t">:$alignInBits, OptionalArrayRefParameter<"DINodeAttr">:$elements, @@ -391,14 +394,26 @@ def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type", OptionalParameter<"DIExpressionAttr">:$allocated, OptionalParameter<"DIExpressionAttr">:$associated ); + let builders = [ + AttrBuilder<(ins + "unsigned":$tag, "StringAttr":$name, "DIFileAttr":$file, + "uint32_t":$line, "DIScopeAttr":$scope, "DITypeAttr":$baseType, + "DIFlags":$flags, "uint64_t":$sizeInBits, "uint64_t":$alignInBits, + "ArrayRef":$elements, "DIExpressionAttr":$dataLocation, + "DIExpressionAttr":$rank, "DIExpressionAttr":$allocated, + "DIExpressionAttr":$associated + ), [{ + return $_get($_ctxt, /*recId=*/nullptr, /*isRecSelf=*/nullptr, + tag, name, file, line, scope, baseType, flags, sizeInBits, + alignInBits, elements, dataLocation, rank, allocated, + associated); + }]> + ]; let assemblyFormat = "`<` struct(params) `>`"; let extraClassDeclaration = [{ /// Requirements of DIRecursiveTypeAttrInterface. /// @{ - /// Get whether this attr describes a recursive self reference. - bool isRecSelf() { return getTag() == 0; } - /// Get a copy of this type attr but with the recursive ID set to `recId`. DIRecursiveTypeAttrInterface withRecId(DistinctAttr recId); @@ -554,14 +569,19 @@ def LLVM_DILocalVariableAttr : LLVM_Attr<"DILocalVariable", "di_local_variable", //===----------------------------------------------------------------------===// def LLVM_DISubprogramAttr : LLVM_Attr<"DISubprogram", "di_subprogram", - /*traits=*/[], "DIScopeAttr"> { + [LLVM_DIRecursiveTypeAttrInterface], + "DIScopeAttr"> { let parameters = (ins + // DIRecursiveTypeAttrInterface specific parameters. + OptionalParameter<"DistinctAttr">:$recId, + OptionalParameter<"bool">:$isRecSelf, + // DISubprogramAttr specific parameters. 
OptionalParameter<"DistinctAttr">:$id, OptionalParameter<"DICompileUnitAttr">:$compileUnit, - "DIScopeAttr":$scope, + OptionalParameter<"DIScopeAttr">:$scope, OptionalParameter<"StringAttr">:$name, OptionalParameter<"StringAttr">:$linkageName, - "DIFileAttr":$file, + OptionalParameter<"DIFileAttr">:$file, OptionalParameter<"unsigned">:$line, OptionalParameter<"unsigned">:$scopeLine, OptionalParameter<"DISubprogramFlags">:$subprogramFlags, @@ -569,21 +589,31 @@ def LLVM_DISubprogramAttr : LLVM_Attr<"DISubprogram", "di_subprogram", OptionalArrayRefParameter<"DINodeAttr">:$retainedNodes ); let builders = [ - AttrBuilderWithInferredContext<(ins + AttrBuilder<(ins "DistinctAttr":$id, "DICompileUnitAttr":$compileUnit, - "DIScopeAttr":$scope, "StringRef":$name, "StringRef":$linkageName, + "DIScopeAttr":$scope, "StringAttr":$name, "StringAttr":$linkageName, "DIFileAttr":$file, "unsigned":$line, "unsigned":$scopeLine, "DISubprogramFlags":$subprogramFlags, "DISubroutineTypeAttr":$type, "ArrayRef":$retainedNodes ), [{ - MLIRContext *ctx = file.getContext(); - return $_get(ctx, id, compileUnit, scope, StringAttr::get(ctx, name), - StringAttr::get(ctx, linkageName), file, line, - scopeLine, subprogramFlags, type, retainedNodes); + return $_get($_ctxt, /*recId=*/nullptr, /*isRecSelf=*/false, id, compileUnit, + scope, name, linkageName, file, line, scopeLine, + subprogramFlags, type, retainedNodes); }]> ]; - let assemblyFormat = "`<` struct(params) `>`"; + let extraClassDeclaration = [{ + /// Requirements of DIRecursiveTypeAttrInterface. + /// @{ + + /// Get a copy of this type attr but with the recursive ID set to `recId`. + DIRecursiveTypeAttrInterface withRecId(DistinctAttr recId); + + /// Build a rec-self instance using the provided `recId`. + static DIRecursiveTypeAttrInterface getRecSelf(DistinctAttr recId); + + /// @} + }]; } //===----------------------------------------------------------------------===// @@ -627,13 +657,9 @@ def LLVM_DINamespaceAttr : LLVM_Attr<"DINamespace", "di_namespace", def LLVM_DIImportedEntityAttr : LLVM_Attr<"DIImportedEntity", "di_imported_entity", /*traits=*/[], "DINodeAttr"> { - /// TODO: DIImportedEntity has a 'scope' field which represents the scope where - /// this entity is imported. Currently, we are not adding a 'scope' field in - /// DIImportedEntityAttr to avoid cyclic dependency. As DIImportedEntityAttr - /// entries will be contained inside a scope entity (e.g. DISubprogramAttr), - /// the scope can easily be inferred. let parameters = (ins LLVM_DITagParameter:$tag, + "DIScopeAttr":$scope, "DINodeAttr":$entity, OptionalParameter<"DIFileAttr">:$file, OptionalParameter<"unsigned">:$line, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td index 7085f81e203a1e..e2180410a8f04e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td @@ -406,7 +406,7 @@ def LLVM_DIRecursiveTypeAttrInterface let methods = [ InterfaceMethod<[{ Get whether this attr describes a recursive self reference. - }], "bool", "isRecSelf", (ins)>, + }], "bool", "getIsRecSelf", (ins)>, InterfaceMethod<[{ Get the recursive ID used for matching "rec-decl" with "rec-self". If this attr instance is not recursive, return a null attribute. 
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 13341f0c4de881..03b536d7aad98f 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -159,9 +159,14 @@ MlirAttribute mlirLLVMDIBasicTypeAttrGet(MlirContext ctx, unsigned int tag,
       unwrap(ctx), tag, cast<StringAttr>(unwrap(name)), sizeInBits, encoding));
 }

+MlirAttribute mlirLLVMDICompositeTypeAttrGetRecSelf(MlirAttribute recId) {
+  return wrap(
+      DICompositeTypeAttr::getRecSelf(cast<DistinctAttr>(unwrap(recId))));
+}
+
 MlirAttribute mlirLLVMDICompositeTypeAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name,
-    MlirAttribute file, uint32_t line, MlirAttribute scope,
+    MlirContext ctx, MlirAttribute recId, bool isRecSelf, unsigned int tag,
+    MlirAttribute name, MlirAttribute file, uint32_t line, MlirAttribute scope,
     MlirAttribute baseType, int64_t flags, uint64_t sizeInBits,
     uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements,
     MlirAttribute dataLocation, MlirAttribute rank, MlirAttribute allocated,
@@ -170,7 +175,7 @@ MlirAttribute mlirLLVMDICompositeTypeAttrGet(
   elementsStorage.reserve(nElements);

   return wrap(DICompositeTypeAttr::get(
-      unwrap(ctx), tag, cast<DistinctAttr>(unwrap(recId)),
+      unwrap(ctx), cast<DistinctAttr>(unwrap(recId)), isRecSelf, tag,
       cast<StringAttr>(unwrap(name)), cast<DIFileAttr>(unwrap(file)), line,
       cast<DIScopeAttr>(unwrap(scope)), cast<DITypeAttr>(unwrap(baseType)),
       DIFlags(flags), sizeInBits, alignInBits,
@@ -289,16 +294,21 @@ MlirAttribute mlirLLVMDISubroutineTypeAttrGet(MlirContext ctx,
                          [](Attribute a) { return cast<DITypeAttr>(a); })));
 }

+MlirAttribute mlirLLVMDISubprogramAttrGetRecSelf(MlirAttribute recId) {
+  return wrap(DISubprogramAttr::getRecSelf(cast<DistinctAttr>(unwrap(recId))));
+}
+
 MlirAttribute mlirLLVMDISubprogramAttrGet(
-    MlirContext ctx, MlirAttribute id, MlirAttribute compileUnit,
-    MlirAttribute scope, MlirAttribute name, MlirAttribute linkageName,
-    MlirAttribute file, unsigned int line, unsigned int scopeLine,
-    uint64_t subprogramFlags, MlirAttribute type, intptr_t nRetainedNodes,
-    MlirAttribute const *retainedNodes) {
+    MlirContext ctx, MlirAttribute recId, bool isRecSelf, MlirAttribute id,
+    MlirAttribute compileUnit, MlirAttribute scope, MlirAttribute name,
+    MlirAttribute linkageName, MlirAttribute file, unsigned int line,
+    unsigned int scopeLine, uint64_t subprogramFlags, MlirAttribute type,
+    intptr_t nRetainedNodes, MlirAttribute const *retainedNodes) {
   SmallVector<Attribute> nodesStorage;
   nodesStorage.reserve(nRetainedNodes);

   return wrap(DISubprogramAttr::get(
-      unwrap(ctx), cast<DistinctAttr>(unwrap(id)),
+      unwrap(ctx), cast<DistinctAttr>(unwrap(recId)), isRecSelf,
+      cast<DistinctAttr>(unwrap(id)),
       cast<DICompileUnitAttr>(unwrap(compileUnit)),
       cast<DIScopeAttr>(unwrap(scope)), cast<StringAttr>(unwrap(name)),
       cast<StringAttr>(unwrap(linkageName)), cast<DIFileAttr>(unwrap(file)),
@@ -353,14 +363,15 @@ MlirAttribute mlirLLVMDIModuleAttrGetScope(MlirAttribute diModule) {
 }

 MlirAttribute mlirLLVMDIImportedEntityAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute entity, MlirAttribute file,
-    unsigned int line, MlirAttribute name, intptr_t nElements,
-    MlirAttribute const *elements) {
+    MlirContext ctx, unsigned int tag, MlirAttribute scope,
+    MlirAttribute entity, MlirAttribute file, unsigned int line,
+    MlirAttribute name, intptr_t nElements, MlirAttribute const *elements) {
   SmallVector<Attribute> elementsStorage;
   elementsStorage.reserve(nElements);
   return wrap(DIImportedEntityAttr::get(
-      unwrap(ctx), tag, cast<DINodeAttr>(unwrap(entity)),
-      cast<DIFileAttr>(unwrap(file)), line, cast<StringAttr>(unwrap(name)),
+      unwrap(ctx), tag, cast<DIScopeAttr>(unwrap(scope)),
+      cast<DINodeAttr>(unwrap(entity)), cast<DIFileAttr>(unwrap(file)), line,
+      cast<StringAttr>(unwrap(name)),
llvm::map_to_vector(unwrapList(nElements, elements, elementsStorage), [](Attribute a) { return cast(a); }))); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 98a9659735e7e6..491dcc7f01e73d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -203,16 +203,33 @@ void printExpressionArg(AsmPrinter &printer, uint64_t opcode, DIRecursiveTypeAttrInterface DICompositeTypeAttr::withRecId(DistinctAttr recId) { return DICompositeTypeAttr::get( - getContext(), getTag(), recId, getName(), getFile(), getLine(), - getScope(), getBaseType(), getFlags(), getSizeInBits(), getAlignInBits(), - getElements(), getDataLocation(), getRank(), getAllocated(), - getAssociated()); + getContext(), recId, getIsRecSelf(), getTag(), getName(), getFile(), + getLine(), getScope(), getBaseType(), getFlags(), getSizeInBits(), + getAlignInBits(), getElements(), getDataLocation(), getRank(), + getAllocated(), getAssociated()); } DIRecursiveTypeAttrInterface DICompositeTypeAttr::getRecSelf(DistinctAttr recId) { - return DICompositeTypeAttr::get(recId.getContext(), 0, recId, {}, {}, 0, {}, - {}, DIFlags(), 0, 0, {}, {}, {}, {}, {}); + return DICompositeTypeAttr::get(recId.getContext(), recId, /*isRecSelf=*/true, + 0, {}, {}, 0, {}, {}, DIFlags(), 0, 0, {}, {}, + {}, {}, {}); +} + +//===----------------------------------------------------------------------===// +// DISubprogramAttr +//===----------------------------------------------------------------------===// + +DIRecursiveTypeAttrInterface DISubprogramAttr::withRecId(DistinctAttr recId) { + return DISubprogramAttr::get( + getContext(), recId, getIsRecSelf(), getId(), getCompileUnit(), + getScope(), getName(), getLinkageName(), getFile(), getLine(), + getScopeLine(), getSubprogramFlags(), getType(), getRetainedNodes()); +} + +DIRecursiveTypeAttrInterface DISubprogramAttr::getRecSelf(DistinctAttr recId) { + return DISubprogramAttr::get(recId.getContext(), recId, /*isRecSelf=*/true, + {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 3870aab52f199d..6e4a964f1fc93c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3155,9 +3155,9 @@ struct LLVMOpAsmDialectInterface : public OpAsmDialectInterface { .CasesetLoc(FusedLoc::get(context, {loc}, subprogramAttr)); } diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index ce3643f513d34a..8c6f32f6bb0cd0 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -89,10 +89,9 @@ DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) { if (node->getTag() == llvm::dwarf::DW_TAG_array_type && !baseType) return nullptr; return DICompositeTypeAttr::get( - context, node->getTag(), /*recId=*/{}, - getStringAttrOrNull(node->getRawName()), translate(node->getFile()), - node->getLine(), translate(node->getScope()), baseType, - flags.value_or(DIFlags::Zero), node->getSizeInBits(), + context, node->getTag(), getStringAttrOrNull(node->getRawName()), + translate(node->getFile()), node->getLine(), translate(node->getScope()), + baseType, flags.value_or(DIFlags::Zero), node->getSizeInBits(), node->getAlignInBits(), elements, translateExpression(node->getDataLocationExp()), 
      translateExpression(node->getRankExp()),
@@ -217,8 +216,8 @@ DebugImporter::translateImpl(llvm::DIImportedEntity *node) {
   }

   return DIImportedEntityAttr::get(
-      context, node->getTag(), translate(node->getEntity()),
-      translate(node->getFile()), node->getLine(),
+      context, node->getTag(), translate(node->getScope()),
+      translate(node->getEntity()), translate(node->getFile()), node->getLine(),
       getStringAttrOrNull(node->getRawName()), elements);
 }

@@ -227,6 +226,7 @@ DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) {
   mlir::DistinctAttr id;
   if (node->isDistinct())
     id = getOrCreateDistinctID(node);
+  // Return nullptr if the scope or type is invalid.
   DIScopeAttr scope = translate(node->getScope());
   if (node->getScope() && !scope)
     return nullptr;
@@ -238,9 +238,12 @@ DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) {
   if (node->getType() && !type)
     return nullptr;

+  // Convert the retained nodes but drop all of them if one of them is invalid.
   SmallVector<DINodeAttr> retainedNodes;
   for (llvm::DINode *retainedNode : node->getRetainedNodes())
     retainedNodes.push_back(translate(retainedNode));
+  if (llvm::is_contained(retainedNodes, nullptr))
+    retainedNodes.clear();

   return DISubprogramAttr::get(context, id, translate(node->getUnit()), scope,
                                getStringAttrOrNull(node->getRawName()),
@@ -374,6 +377,9 @@ getRecSelfConstructor(llvm::DINode *node) {
       .Case([&](llvm::DICompositeType *) {
         return CtorType(DICompositeTypeAttr::getRecSelf);
       })
+      .Case([&](llvm::DISubprogram *) {
+        return CtorType(DISubprogramAttr::getRecSelf);
+      })
       .Default(CtorType());
 }

diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index 042e015f107fea..8ca3beca6b66f7 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -96,6 +96,17 @@ llvm::MDString *DebugTranslation::getMDStringOrNull(StringAttr stringAttr) {
   return llvm::MDString::get(llvmCtx, stringAttr);
 }

+llvm::MDTuple *
+DebugTranslation::getMDTupleOrNull(ArrayRef<DINodeAttr> elements) {
+  if (elements.empty())
+    return nullptr;
+  SmallVector<llvm::Metadata *> llvmElements = llvm::to_vector(
+      llvm::map_range(elements, [&](DINodeAttr attr) -> llvm::Metadata * {
+        return translate(attr);
+      }));
+  return llvm::MDNode::get(llvmCtx, llvmElements);
+}
+
 llvm::DIBasicType *DebugTranslation::translateImpl(DIBasicTypeAttr attr) {
   return llvm::DIBasicType::get(
       llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()),
@@ -138,6 +149,17 @@ DebugTranslation::translateTemporaryImpl(DICompositeTypeAttr attr) {
       /*VTableHolder=*/nullptr);
 }

+llvm::TempDISubprogram
+DebugTranslation::translateTemporaryImpl(DISubprogramAttr attr) {
+  return llvm::DISubprogram::getTemporary(
+      llvmCtx, /*Scope=*/nullptr, /*Name=*/{}, /*LinkageName=*/{},
+      /*File=*/nullptr, attr.getLine(), /*Type=*/nullptr,
+      /*ScopeLine=*/0, /*ContainingType=*/nullptr, /*VirtualIndex=*/0,
+      /*ThisAdjustment=*/0, llvm::DINode::FlagZero,
+      static_cast<llvm::DISubprogram::DISPFlags>(attr.getSubprogramFlags()),
+      /*Unit=*/nullptr);
+}
+
 llvm::DICompositeType *
 DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
   // TODO: Use distinct attributes to model this, once they have landed.
@@ -151,10 +173,6 @@ DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
     isDistinct = true;
   }

-  SmallVector<llvm::Metadata *> elements;
-  for (DINodeAttr member : attr.getElements())
-    elements.push_back(translate(member));
-
   return getDistinctOrUnique<llvm::DICompositeType>(
       isDistinct, llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()),
       translate(attr.getFile()), attr.getLine(), translate(attr.getScope()),
@@ -162,7 +180,7 @@ DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
       attr.getAlignInBits(),
       /*OffsetInBits=*/0,
       /*Flags=*/static_cast<llvm::DINode::DIFlags>(attr.getFlags()),
-      llvm::MDNode::get(llvmCtx, elements),
+      getMDTupleOrNull(attr.getElements()),
       /*RuntimeLang=*/0, /*VTableHolder=*/nullptr,
       /*TemplateParams=*/nullptr, /*Identifier=*/nullptr,
       /*Discriminator=*/nullptr,
@@ -242,22 +260,21 @@ DebugTranslation::translateImpl(DIGlobalVariableAttr attr) {
       attr.getIsDefined(), nullptr, nullptr, attr.getAlignInBits(), nullptr);
 }

-llvm::DIType *
+llvm::DINode *
 DebugTranslation::translateRecursive(DIRecursiveTypeAttrInterface attr) {
   DistinctAttr recursiveId = attr.getRecId();
-  if (auto *iter = recursiveTypeMap.find(recursiveId);
-      iter != recursiveTypeMap.end()) {
+  if (auto *iter = recursiveNodeMap.find(recursiveId);
+      iter != recursiveNodeMap.end()) {
     return iter->second;
-  } else {
-    assert(!attr.isRecSelf() && "unbound DI recursive self type");
   }
+  assert(!attr.getIsRecSelf() && "unbound DI recursive self reference");

-  auto setRecursivePlaceholder = [&](llvm::DIType *placeholder) {
-    recursiveTypeMap.try_emplace(recursiveId, placeholder);
+  auto setRecursivePlaceholder = [&](llvm::DINode *placeholder) {
+    recursiveNodeMap.try_emplace(recursiveId, placeholder);
   };

-  llvm::DIType *result =
-      TypeSwitch<DIRecursiveTypeAttrInterface, llvm::DIType *>(attr)
+  llvm::DINode *result =
+      TypeSwitch<DIRecursiveTypeAttrInterface, llvm::DINode *>(attr)
          .Case<DICompositeTypeAttr>([&](auto attr) {
            auto temporary = translateTemporaryImpl(attr);
            setRecursivePlaceholder(temporary.get());
@@ -266,11 +283,20 @@ DebugTranslation::translateRecursive(DIRecursiveTypeAttrInterface attr) {
            auto *concrete = translateImpl(attr);
            temporary->replaceAllUsesWith(concrete);
            return concrete;
+          })
+          .Case<DISubprogramAttr>([&](auto attr) {
+            auto temporary = translateTemporaryImpl(attr);
+            setRecursivePlaceholder(temporary.get());
+            // Must call `translateImpl` directly instead of `translate` to
+            // avoid handling the recursive interface again.
+            auto *concrete = translateImpl(attr);
+            temporary->replaceAllUsesWith(concrete);
+            return concrete;
           });

-  assert(recursiveTypeMap.back().first == recursiveId &&
+  assert(recursiveNodeMap.back().first == recursiveId &&
          "internal inconsistency: unexpected recursive translation stack");
-  recursiveTypeMap.pop_back();
+  recursiveNodeMap.pop_back();

   return result;
 }
@@ -297,6 +323,7 @@ llvm::DISubprogram *DebugTranslation::translateImpl(DISubprogramAttr attr) {
   bool isDefinition = static_cast<bool>(attr.getSubprogramFlags() &
                                         LLVM::DISubprogramFlags::Definition);
+
   llvm::DISubprogram *node = getDistinctOrUnique<llvm::DISubprogram>(
       isDefinition, llvmCtx, scope, getMDStringOrNull(attr.getName()),
       getMDStringOrNull(attr.getLinkageName()), file, attr.getLine(), type,
       attr.getScopeLine(), /*ContainingType=*/nullptr, /*VirtualIndex=*/0,
       /*ThisAdjustment=*/0, llvm::DINode::FlagZero,
       static_cast<llvm::DISubprogram::DISPFlags>(attr.getSubprogramFlags()),
-      compileUnit);
-
-  // DIImportedEntity requires scope information which DIImportedEntityAttr does
-  // not have. This is why we translate DIImportedEntityAttr after we have
-  // created DISubprogram as we can use it as the scope.
-  SmallVector<llvm::Metadata *> retainedNodes;
-  for (DINodeAttr nodeAttr : attr.getRetainedNodes()) {
-    if (auto importedAttr = dyn_cast<DIImportedEntityAttr>(nodeAttr)) {
-      llvm::DINode *dn = translate(importedAttr, node);
-      retainedNodes.push_back(dn);
-    }
-  }
-  if (!retainedNodes.empty())
-    node->replaceRetainedNodes(llvm::MDTuple::get(llvmCtx, retainedNodes));
-
+      compileUnit, /*TemplateParams=*/nullptr, /*Declaration=*/nullptr,
+      getMDTupleOrNull(attr.getRetainedNodes()));
   if (attr.getId())
     distinctAttrToNode.try_emplace(attr.getId(), node);
   return node;
@@ -339,16 +353,12 @@ llvm::DINamespace *DebugTranslation::translateImpl(DINamespaceAttr attr) {
                                 attr.getExportSymbols());
 }

-llvm::DIImportedEntity *DebugTranslation::translate(DIImportedEntityAttr attr,
-                                                    llvm::DIScope *scope) {
-  SmallVector<llvm::Metadata *> elements;
-  for (DINodeAttr member : attr.getElements())
-    elements.push_back(translate(member));
-
+llvm::DIImportedEntity *
+DebugTranslation::translateImpl(DIImportedEntityAttr attr) {
   return llvm::DIImportedEntity::get(
-      llvmCtx, attr.getTag(), scope, translate(attr.getEntity()),
-      translate(attr.getFile()), attr.getLine(),
-      getMDStringOrNull(attr.getName()), llvm::MDNode::get(llvmCtx, elements));
+      llvmCtx, attr.getTag(), translate(attr.getScope()),
+      translate(attr.getEntity()), translate(attr.getFile()), attr.getLine(),
+      getMDStringOrNull(attr.getName()), getMDTupleOrNull(attr.getElements()));
 }

 llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) {
@@ -413,10 +423,10 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) {
   node = TypeSwitch<DINodeAttr, llvm::DINode *>(attr)
              .Case<DIBasicTypeAttr, DICompileUnitAttr, DICompositeTypeAttr,
                    DIDerivedTypeAttr, DIFileAttr, DIGlobalVariableAttr,
-                   DILabelAttr, DILexicalBlockAttr, DILexicalBlockFileAttr,
-                   DILocalVariableAttr, DIModuleAttr, DINamespaceAttr,
-                   DINullTypeAttr, DIStringTypeAttr, DISubprogramAttr,
-                   DISubrangeAttr, DISubroutineTypeAttr>(
+                   DIImportedEntityAttr, DILabelAttr, DILexicalBlockAttr,
+                   DILexicalBlockFileAttr, DILocalVariableAttr, DIModuleAttr,
+                   DINamespaceAttr, DINullTypeAttr, DIStringTypeAttr,
+                   DISubprogramAttr, DISubrangeAttr, DISubroutineTypeAttr>(
                    [&](auto attr) { return translateImpl(attr); });

 if (node && !node->isTemporary())
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h
index 37b985acf8541e..422aa34e28f3c9 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.h
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h
@@ -75,6 +75,7 @@ class DebugTranslation {
   llvm::DIDerivedType *translateImpl(DIDerivedTypeAttr attr);
   llvm::DIStringType *translateImpl(DIStringTypeAttr attr);
   llvm::DIFile *translateImpl(DIFileAttr attr);
+  llvm::DIImportedEntity *translateImpl(DIImportedEntityAttr attr);
   llvm::DILabel *translateImpl(DILabelAttr attr);
   llvm::DILexicalBlock *translateImpl(DILexicalBlockAttr attr);
   llvm::DILexicalBlockFile *translateImpl(DILexicalBlockFileAttr attr);
@@ -90,27 +91,26 @@ class DebugTranslation {
   llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr);
   llvm::DIType *translateImpl(DITypeAttr attr);

-  /// Currently, DIImportedEntityAttr does not have a scope field to avoid a
-  /// cyclic dependency. The scope information is obtained from the entity
-  /// which holds the list of DIImportedEntityAttr. This requires that scope
-  /// information be passed to translate function.
-  llvm::DIImportedEntity *translate(DIImportedEntityAttr attr, llvm::DIScope *);
-
   /// Attributes that support self recursion need to implement an additional
   /// method to hook into `translateRecursive`.
   /// - `<temporary-type> translateTemporaryImpl(<attr-type>)`:
   ///   Create a temporary translation of the DI attr without recursively
   ///   translating any nested DI attrs.
-  llvm::DIType *translateRecursive(DIRecursiveTypeAttrInterface attr);
+  llvm::DINode *translateRecursive(DIRecursiveTypeAttrInterface attr);

   /// Translate the given attribute to a temporary llvm debug metadata of the
   /// corresponding type.
   llvm::TempDICompositeType translateTemporaryImpl(DICompositeTypeAttr attr);
+  llvm::TempDISubprogram translateTemporaryImpl(DISubprogramAttr attr);

   /// Constructs a string metadata node from the string attribute. Returns
   /// nullptr if `stringAttr` is null or contains an empty string.
   llvm::MDString *getMDStringOrNull(StringAttr stringAttr);

+  /// Constructs a tuple metadata node from the `elements`. Returns nullptr if
+  /// `elements` is empty.
+  llvm::MDTuple *getMDTupleOrNull(ArrayRef<DINodeAttr> elements);
+
   /// Constructs a DIExpression metadata node from the DIExpressionAttr. Returns
   /// nullptr if `DIExpressionAttr` is null.
   llvm::DIExpression *getExpressionAttrOrNull(DIExpressionAttr attr);
@@ -125,8 +125,8 @@ class DebugTranslation {
   /// metadata.
   DenseMap<Attribute, llvm::DINode *> attrToNode;

-  /// A mapping between recursive ID and the translated DIType.
-  llvm::MapVector<DistinctAttr, llvm::DIType *> recursiveTypeMap;
+  /// A mapping between recursive ID and the translated DINode.
+  llvm::MapVector<DistinctAttr, llvm::DINode *> recursiveNodeMap;

   /// A mapping between a distinct ID and the translated LLVM metadata node.
   /// This helps identify attrs that should translate into the same LLVM debug
diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c
index da28a96f89691d..36277122801de4 100644
--- a/mlir/test/CAPI/llvm.c
+++ b/mlir/test/CAPI/llvm.c
@@ -248,12 +248,16 @@ static void testDebugInfoAttributes(MlirContext ctx) {
       mlirStringAttrGet(ctx, mlirStringRefCreateFromCString("foo"));
   MlirAttribute bar =
       mlirStringAttrGet(ctx, mlirStringRefCreateFromCString("bar"));
-  MlirAttribute id = mlirDisctinctAttrCreate(foo);
+
+  MlirAttribute none = mlirUnitAttrGet(ctx);
+  MlirAttribute id = mlirDisctinctAttrCreate(none);
+  MlirAttribute recId0 = mlirDisctinctAttrCreate(none);
+  MlirAttribute recId1 = mlirDisctinctAttrCreate(none);

   // CHECK: #llvm.di_null_type
   mlirAttributeDump(mlirLLVMDINullTypeAttrGet(ctx));
-
   // CHECK: #llvm.di_basic_type
   MlirAttribute di_type = mlirLLVMDIBasicTypeAttrGet(
       ctx, 0, foo, 64, MlirLLVMTypeEncodingSigned);
@@ -312,15 +316,17 @@ static void testDebugInfoAttributes(MlirContext ctx) {
   // CHECK: #llvm.di_subroutine_type<{{.*}}>
   mlirAttributeDump(subroutine_type);

+  MlirAttribute di_subprogram_self_rec =
+      mlirLLVMDISubprogramAttrGetRecSelf(recId0);
   MlirAttribute di_imported_entity = mlirLLVMDIImportedEntityAttrGet(
-      ctx, 0, di_module, file, 1, foo, 1, &local_var);
+      ctx, 0, di_subprogram_self_rec, di_module, file, 1, foo, 1, &local_var);

   mlirAttributeDump(di_imported_entity);
   // CHECK: #llvm.di_imported_entity<{{.*}}>

   MlirAttribute di_subprogram = mlirLLVMDISubprogramAttrGet(
-      ctx, id, compile_unit, compile_unit, foo, bar, file, 1, 2, 0,
-      subroutine_type, 1, &di_imported_entity);
+      ctx, recId0, false, id, compile_unit, compile_unit, foo, bar, file, 1, 2,
+      0, subroutine_type, 1, &di_imported_entity);

   // CHECK: #llvm.di_subprogram<{{.*}}>
   mlirAttributeDump(di_subprogram);
@@ -350,10 +356,13 @@ static void testDebugInfoAttributes(MlirContext ctx) {
   // CHECK: #llvm.di_string_type<{{.*}}>
   mlirAttributeDump(string_type);

+  // CHECK: #llvm.di_composite_type
+  mlirAttributeDump(mlirLLVMDICompositeTypeAttrGetRecSelf(recId1));
+
   // CHECK: #llvm.di_composite_type<{{.*}}>
   mlirAttributeDump(mlirLLVMDICompositeTypeAttrGet(
-      ctx, 0, id, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, &di_type,
-      expression, expression,
expression, expression)); + ctx, recId1, false, 0, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, + &di_type, expression, expression, expression, expression)); } int main(void) { diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index bb03da37c0d097..02e35ae7f0ee9d 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -307,17 +307,13 @@ define void @class_method() { ret void, !dbg !9 } -; Verify the cyclic composite type is identified, even though conversion begins from the subprogram type. -; CHECK-DAG: #[[COMP_SELF:.+]] = #llvm.di_composite_type -; CHECK-DAG: #[[COMP_PTR:.+]] = #llvm.di_derived_type +; Verify the cyclic subprogram is handled correctly. +; CHECK-DAG: #[[SP_SELF:.+]] = #llvm.di_subprogram +; CHECK-DAG: #[[COMP:.+]] = #llvm.di_composite_type +; CHECK-DAG: #[[COMP_PTR:.+]] = #llvm.di_derived_type ; CHECK-DAG: #[[SP_TYPE:.+]] = #llvm.di_subroutine_type -; CHECK-DAG: #[[SP_INNER:.+]] = #llvm.di_subprogram -; CHECK-DAG: #[[COMP:.+]] = #llvm.di_composite_type - -; CHECK-DAG: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type -; CHECK-DAG: #[[SP_TYPE_OUTER:.+]] = #llvm.di_subroutine_type -; CHECK-DAG: #[[SP_OUTER:.+]] = #llvm.di_subprogram -; CHECK-DAG: #[[LOC]] = loc(fused<#[[SP_OUTER]]> +; CHECK-DAG: #[[SP:.+]] = #llvm.di_subprogram +; CHECK-DAG: #[[LOC]] = loc(fused<#[[SP]]> !llvm.dbg.cu = !{!1} !llvm.module.flags = !{!0} @@ -335,10 +331,10 @@ define void @class_method() { ; // ----- ; Verify the cyclic composite type is handled correctly. -; CHECK-DAG: #[[COMP_SELF:.+]] = #llvm.di_composite_type +; CHECK-DAG: #[[COMP_SELF:.+]] = #llvm.di_composite_type ; CHECK-DAG: #[[COMP_PTR_INNER:.+]] = #llvm.di_derived_type ; CHECK-DAG: #[[FIELD:.+]] = #llvm.di_derived_type -; CHECK-DAG: #[[COMP:.+]] = #llvm.di_composite_type +; CHECK-DAG: #[[COMP:.+]] = #llvm.di_composite_type ; CHECK-DAG: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type ; CHECK-DAG: #[[VAR0:.+]] = #llvm.di_local_variable @@ -610,9 +606,10 @@ define void @distinct_cu_func1() !dbg !5 { ; CHECK-LABEL: @declaration declare !dbg !1 void @declaration() -; CHECK: #di_subprogram = #llvm.di_subprogram< +; CHECK: #[[SP:.+]] = #llvm.di_subprogram< ; CHECK-NOT: id = distinct ; CHECK-NOT: subprogramFlags = +; CHECK: loc(fused<#[[SP]]> !llvm.module.flags = !{!0} !0 = !{i32 2, !"Debug Info Version", i32 3} @@ -633,14 +630,14 @@ declare !dbg !1 void @declaration() ; CHECK-DAG: #[[B1_INNER:.+]] = #llvm.di_derived_type<{{.*}}name = "B:B1", baseType = #[[B_SELF:.+]]> ; CHECK-DAG: #[[B2_INNER:.+]] = #llvm.di_derived_type<{{.*}}name = "B:B2", baseType = #[[B_SELF]]> -; CHECK-DAG: #[[B_INNER:.+]] = #llvm.di_composite_type<{{.*}}recId = [[B_RECID:.+]], {{.*}}name = "B", {{.*}}elements = #[[B1_INNER]], #[[B2_INNER]] +; CHECK-DAG: #[[B_INNER:.+]] = #llvm.di_composite_type ; CHECK-DAG: #[[B2_OUTER:.+]] = #llvm.di_derived_type<{{.*}}name = "B:B2", baseType = #[[B_INNER]]> -; CHECK-DAG: #[[A_OUTER:.+]] = #llvm.di_composite_type<{{.*}}recId = [[A_RECID:.+]], {{.*}}name = "A", {{.*}}elements = #[[B1_OUTER]], #[[B2_OUTER]] +; CHECK-DAG: #[[A_OUTER:.+]] = #llvm.di_composite_typeB", {{.*}}baseType = #[[B_OUTER:.+]]> -; CHECK-DAG: #[[B_OUTER]] = #llvm.di_composite_type<{{.*}}recId = [[B_RECID:.+]], {{.*}}name = "B", {{.*}}elements = #[[TO_C_INNER:.+]]> +; CHECK-DAG: #[[B_OUTER]] = #llvm.di_composite_type ; CHECK-DAG: #[[TO_C_INNER]] = #llvm.di_derived_type<{{.*}}name = "->C", {{.*}}baseType = #[[C_INNER:.+]]> ; CHECK-DAG: #[[C_INNER]] = 
#llvm.di_composite_type<{{.*}}name = "C", {{.*}}elements = #[[TO_B_SELF:.+]]> ; CHECK-DAG: #[[TO_B_SELF]] = #llvm.di_derived_type<{{.*}}name = "->B", {{.*}}baseType = #[[B_SELF:.+]]> -; CHECK-DAG: #[[B_SELF]] = #llvm.di_composite_type<{{.*}}recId = [[B_RECID]]> +; CHECK-DAG: #[[B_SELF]] = #llvm.di_composite_type ; CHECK-DAG: #[[TO_C_OUTER]] = #llvm.di_derived_type<{{.*}}name = "->C", {{.*}}baseType = #[[C_OUTER:.+]]> ; CHECK-DAG: #[[C_OUTER]] = #llvm.di_composite_type<{{.*}}name = "C", {{.*}}elements = #[[TO_B_OUTER]]> @@ -718,23 +715,23 @@ define void @class_field(ptr %arg1) !dbg !18 { ; ^ ^ ; +-------------+ -; CHECK-DAG: #[[A:.+]] = #llvm.di_composite_type<{{.*}}recId = [[A_RECID:.+]], {{.*}}name = "A", {{.*}}elements = #[[A_TO_B:.+]], #[[A_TO_C:.+]]> +; CHECK-DAG: #[[A:.+]] = #llvm.di_composite_type ; CHECK-DAG: #llvm.di_subprogram<{{.*}}scope = #[[A]], ; CHECK-DAG: #[[A_TO_B]] = #llvm.di_derived_type<{{.*}}name = "->B", {{.*}}baseType = #[[B_FROM_A:.+]]> ; CHECK-DAG: #[[A_TO_C]] = #llvm.di_derived_type<{{.*}}name = "->C", {{.*}}baseType = #[[C_FROM_A:.+]]> -; CHECK-DAG: #[[B_FROM_A]] = #llvm.di_composite_type<{{.*}}recId = [[B_RECID:.+]], {{.*}}name = "B", {{.*}}elements = #[[B_TO_C:.+]]> +; CHECK-DAG: #[[B_FROM_A]] = #llvm.di_composite_type ; CHECK-DAG: #[[B_TO_C]] = #llvm.di_derived_type<{{.*}}name = "->C", {{.*}}baseType = #[[C_FROM_B:.+]]> -; CHECK-DAG: #[[C_FROM_B]] = #llvm.di_composite_type<{{.*}}recId = [[C_RECID:.+]], {{.*}}name = "C", {{.*}}elements = #[[TO_A_SELF:.+]], #[[TO_B_SELF:.+]], #[[TO_C_SELF:.+]]> +; CHECK-DAG: #[[C_FROM_B]] = #llvm.di_composite_type -; CHECK-DAG: #[[C_FROM_A]] = #llvm.di_composite_type<{{.*}}recId = [[C_RECID]], {{.*}}name = "C", {{.*}}elements = #[[TO_A_SELF]], #[[A_TO_B:.+]], #[[TO_C_SELF]] +; CHECK-DAG: #[[C_FROM_A]] = #llvm.di_composite_typeA", {{.*}}baseType = #[[A_SELF:.+]]> ; CHECK-DAG: #[[TO_B_SELF]] = #llvm.di_derived_type<{{.*}}name = "->B", {{.*}}baseType = #[[B_SELF:.+]]> ; CHECK-DAG: #[[TO_C_SELF]] = #llvm.di_derived_type<{{.*}}name = "->C", {{.*}}baseType = #[[C_SELF:.+]]> -; CHECK-DAG: #[[A_SELF]] = #llvm.di_composite_type<{{.*}}recId = [[A_RECID]]> -; CHECK-DAG: #[[B_SELF]] = #llvm.di_composite_type<{{.*}}recId = [[B_RECID]]> -; CHECK-DAG: #[[C_SELF]] = #llvm.di_composite_type<{{.*}}recId = [[C_RECID]]> +; CHECK-DAG: #[[A_SELF]] = #llvm.di_composite_type +; CHECK-DAG: #[[B_SELF]] = #llvm.di_composite_type +; CHECK-DAG: #[[C_SELF]] = #llvm.di_composite_type define void @class_field(ptr %arg1) !dbg !18 { ret void @@ -816,4 +813,6 @@ define void @imp_fn() !dbg !12 { !17 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !8, file: !3, line: 1, elements: !15) ; CHECK-DAG: #[[M:.+]] = #llvm.di_module<{{.*}}name = "mod1"{{.*}}> -; CHECK-DAG: #[[SP:.+]] = #llvm.di_subprogram<{{.*}}name = "imp_fn"{{.*}}retainedNodes = #llvm.di_imported_entity> +; CHECK-DAG: #[[SP_REC:.+]] = #llvm.di_subprogram, isRecSelf = true> +; CHECK-DAG: #[[IE:.+]] = #llvm.di_imported_entity +; CHECK-DAG: #[[SP:.+]] = #llvm.di_subprogram<{{.*}}name = "imp_fn"{{.*}}retainedNodes = #[[IE]]> diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index 30b2ba5e9bad1f..01194df5047742 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -372,23 +372,28 @@ llvm.func @fn_with_gl() { llvm.func @imp_fn() { llvm.return } loc(#loc2) -#file = #llvm.di_file<"test.f90" in ""> -#SP_TY = #llvm.di_subroutine_type -#CU = #llvm.di_compile_unit, - sourceLanguage = 
-  sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false,
-  emissionKind = Full>
+
+#di_file = #llvm.di_file<"test.f90" in "">
+#di_subroutine_type = #llvm.di_subroutine_type
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>,
+  sourceLanguage = DW_LANG_Fortran95, file = #di_file, isOptimized = false,
+  emissionKind = Full>
-#MOD = #llvm.di_module
-#MOD1 = #llvm.di_module
-#SP = #llvm.di_subprogram, compileUnit = #CU, scope = #file,
-  name = "imp_fn", file = #file, subprogramFlags = Definition, type = #SP_TY,
-  retainedNodes = #llvm.di_imported_entity, #llvm.di_imported_entity>
+#di_module_1 = #llvm.di_module
+#di_module_2 = #llvm.di_module
+#di_subprogram_self_rec = #llvm.di_subprogram<recId = distinct[1]<>>
+#di_imported_entity_1 = #llvm.di_imported_entity
+#di_imported_entity_2 = #llvm.di_imported_entity
+#di_subprogram = #llvm.di_subprogram<id = distinct[2]<>, recId = distinct[1]<>,
+  compileUnit = #di_compile_unit, scope = #di_file, name = "imp_fn",
+  file = #di_file, subprogramFlags = Definition, type = #di_subroutine_type,
+  retainedNodes = #di_imported_entity_1, #di_imported_entity_2>
 #loc1 = loc("test.f90":12:14)
-#loc2 = loc(fused<#SP>[#loc1])
+#loc2 = loc(fused<#di_subprogram>[#loc1])
 
 // CHECK-DAG: ![[SP:[0-9]+]] = {{.*}}!DISubprogram(name: "imp_fn"{{.*}}retainedNodes: ![[NODES:[0-9]+]])
-// CHECK-DAG: ![[NODES]] = !{![[NODE2:[0-9]+]], ![[NODE1:[0-9]+]]}
+// CHECK-DAG: ![[NODES]] = !{![[NODE1:[0-9]+]], ![[NODE2:[0-9]+]]}
 // CHECK-DAG: ![[NODE1]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: ![[SP]], entity: ![[MOD1:[0-9]+]]{{.*}})
 // CHECK-DAG: ![[NODE2]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: ![[SP]], entity: ![[MOD2:[0-9]+]]{{.*}})
 // CHECK-DAG: ![[MOD1]] = !DIModule({{.*}}name: "mod1"{{.*}})
@@ -443,7 +448,7 @@ llvm.func @func_debug_directives() {
 #di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>,
   sourceLanguage = DW_LANG_C, file = #di_file, isOptimized = false, emissionKind = None>
 // Recursive type itself.
-#di_struct_self = #llvm.di_composite_type<recId = distinct[0]<>>
+#di_struct_self = #llvm.di_composite_type<recId = distinct[0]<>, isRecSelf = true>
 #di_ptr_inner = #llvm.di_derived_type
 #di_subroutine_inner = #llvm.di_subroutine_type
 #di_subprogram_inner = #llvm.di_subprogram<
@@ -497,7 +502,7 @@ llvm.func @class_method() {
 
 // Ensures composite types with a recursive scope work.
-#di_composite_type_self = #llvm.di_composite_type<recId = distinct[0]<>>
+#di_composite_type_self = #llvm.di_composite_type<recId = distinct[0]<>, isRecSelf = true>
 #di_file = #llvm.di_file<"test.mlir" in "/">
 #di_subroutine_type = #llvm.di_subroutine_type
 #di_subprogram = #llvm.di_subprogram
@@ -508,7 +513,7 @@ llvm.func @class_method() {
 llvm.mlir.global @global_variable() {dbg_expr = #di_global_variable_expression} : !llvm.struct<()>
 
 // CHECK: distinct !DIGlobalVariable({{.*}}type: ![[COMP:[0-9]+]],
-// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]],
+// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]]
 // CHECK: ![[SCOPE]] = !DISubprogram({{.*}}type: ![[SUBROUTINE:[0-9]+]],
 // CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SR_TYPES:[0-9]+]])
 // CHECK: ![[SR_TYPES]] = !{![[COMP]]}
@@ -520,7 +525,7 @@ llvm.mlir.global @global_variable() {dbg_expr = #di_global_variable_expression}
 // replaced with the recursive self reference.
#di_file = #llvm.di_file<"test.mlir" in "/"> -#di_composite_type_self = #llvm.di_composite_type> +#di_composite_type_self = #llvm.di_composite_type, isRecSelf = true> #di_subroutine_type_inner = #llvm.di_subroutine_type #di_subprogram_inner = #llvm.di_subprogram @@ -540,7 +545,7 @@ llvm.mlir.global @global_variable() {dbg_expr = #di_global_variable_expression} // CHECK: distinct !DIGlobalVariable({{.*}}type: ![[VAR:[0-9]+]], // CHECK: ![[VAR]] = !DISubroutineType(types: ![[COMPS:[0-9]+]]) // CHECK: ![[COMPS]] = !{![[COMP:[0-9]+]], -// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]], +// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]] // CHECK: ![[SCOPE]] = !DISubprogram({{.*}}type: ![[SUBROUTINE:[0-9]+]], // CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SR_TYPES:[0-9]+]]) // CHECK: ![[SR_TYPES]] = !{![[COMP]]} From 5dcea4628d7206d4351101850655356d4a8fc24a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 2 Sep 2024 12:36:51 +0200 Subject: [PATCH 33/33] [AutoUpgrade] Preserve attributes when upgrading named struct return For example, if the argument has an alignment attribute, preserve it. --- llvm/lib/IR/AutoUpgrade.cpp | 3 ++- .../intrinsics-struct-upgrade-attributes.ll | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Bitcode/intrinsics-struct-upgrade-attributes.ll diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 50fc2e728fcc01..69dae5e32dbbe8 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4330,7 +4330,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { "Must have same number of elements"); SmallVector Args(CI->args()); - Value *NewCI = Builder.CreateCall(NewFn, Args); + CallInst *NewCI = Builder.CreateCall(NewFn, Args); + NewCI->setAttributes(CI->getAttributes()); Value *Res = PoisonValue::get(OldST); for (unsigned Idx = 0; Idx < OldST->getNumElements(); ++Idx) { Value *Elem = Builder.CreateExtractValue(NewCI, Idx); diff --git a/llvm/test/Bitcode/intrinsics-struct-upgrade-attributes.ll b/llvm/test/Bitcode/intrinsics-struct-upgrade-attributes.ll new file mode 100644 index 00000000000000..4962144899ae4b --- /dev/null +++ b/llvm/test/Bitcode/intrinsics-struct-upgrade-attributes.ll @@ -0,0 +1,18 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> } + +declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) + +; CHECK-LABEL: define %struct.__neon_int8x8x2_t @test_named_struct_return(ptr %A) { +; CHECK: %1 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr align 16 %A) +; CHECK: %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0 +; CHECK: %3 = insertvalue %struct.__neon_int8x8x2_t poison, <8 x i8> %2, 0 +; CHECK: %4 = extractvalue { <8 x i8>, <8 x i8> } %1, 1 +; CHECK: %5 = insertvalue %struct.__neon_int8x8x2_t %3, <8 x i8> %4, 1 +; CHECK: ret %struct.__neon_int8x8x2_t %5 + +define %struct.__neon_int8x8x2_t @test_named_struct_return(ptr %A) { + %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(ptr align 16 %A) + ret %struct.__neon_int8x8x2_t %val +}