From ec4c75515ad6b5478a1d3579d5cd58e9f45ae51b Mon Sep 17 00:00:00 2001 From: Gergely Meszaros Date: Wed, 21 Aug 2024 09:12:40 +0000 Subject: [PATCH] Allow toggling of selective scalarization for igc_opt Before this change, when directly calling the ScalarizeFunction pass, it would always have selective scalarization enabled. The registry flag EnableSelectiveScalarizer was only used by callers of createScalarizerPass. Change this so that the pass itself looks at the registry flag. Callers can still override this behavior by explicitly enabling or disabling selective scalarization. All internal users of createScalarizerPass are updated to match their existing behavior. --- IGC/AdaptorOCL/UnifyIROCL.cpp | 3 +- IGC/Compiler/Optimizer/Scalarizer.cpp | 56 +++- IGC/Compiler/Optimizer/Scalarizer.h | 15 +- .../tests/ScalarizeFunction/selective.ll | 256 ++++++++++++------ 4 files changed, 232 insertions(+), 98 deletions(-) diff --git a/IGC/AdaptorOCL/UnifyIROCL.cpp b/IGC/AdaptorOCL/UnifyIROCL.cpp index cabf2d4bffcb..d3abea6a722b 100644 --- a/IGC/AdaptorOCL/UnifyIROCL.cpp +++ b/IGC/AdaptorOCL/UnifyIROCL.cpp @@ -641,8 +641,7 @@ static void CommonOCLBasedPasses( mpm.add(new ScalarArgAsPointerAnalysis()); - // true means selective scalarization - mpm.add(createScalarizerPass(IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer))); + mpm.add(createScalarizerPass(SelectiveScalarizer::Auto)); // Create a dummy kernel to attach the symbol table if necessary // Only needed if function pointers, externally linked functions, or relocatable global variables are present diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp index 1f54d64271b2..b54e02b5d1fb 100644 --- a/IGC/Compiler/Optimizer/Scalarizer.cpp +++ b/IGC/Compiler/Optimizer/Scalarizer.cpp @@ -57,24 +57,34 @@ IGC_INITIALIZE_PASS_END(ScalarizeFunction, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG char ScalarizeFunction::ID = 0; -ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : 
FunctionPass(ID) +ScalarizeFunction::ScalarizeFunction(IGC::SelectiveScalarizer selectiveMode) : FunctionPass(ID) { initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry()); for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0; - // Needs IGC_EnableSelectiveScalarizer = 1 - m_SelectiveScalarization = selectiveScalarization; + V_PRINT(scalarizer, "ScalarizeFunction constructor\n"); + switch(selectiveMode) { + case IGC::SelectiveScalarizer::Off: + V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer forced off"); + m_SelectiveScalarization = false; + break; + case IGC::SelectiveScalarizer::On: + V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer forced on"); + m_SelectiveScalarization = true; + break; + case IGC::SelectiveScalarizer::Auto: + V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = "); + V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer)); + m_SelectiveScalarization = IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer); + break; + } + V_PRINT(scalarizer, "\n"); // Initialize SCM buffers and allocation m_SCMAllocationArray = new SCMEntry[ESTIMATED_INST_NUM]; m_SCMArrays.push_back(m_SCMAllocationArray); m_SCMArrayLocation = 0; - - V_PRINT(scalarizer, "ScalarizeFunction constructor\n"); - V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = "); - V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer)); - V_PRINT(scalarizer, "\n"); } bool ScalarizeFunction::doFinalization(llvm::Module& M) { @@ -244,20 +254,40 @@ void ScalarizeFunction::buildExclusiveSet() } else if (BitCastInst* BCI = dyn_cast(currInst)) { + auto isBitcastSink = [](BitCastInst *BCI) -> bool { + auto *SrcVTy = dyn_cast( + BCI->getOperand(0)->getType()); + + // If source is not a vector, we don't care about this bitcast + if (!SrcVTy) + return false; + + // If destination is a vector then we scalarize if the number of + // elements are the same (elementwise bitcast) + if (auto *DestVTy = + dyn_cast(BCI->getType())) + return 
DestVTy->getNumElements() != SrcVTy->getNumElements(); + + // If destination is not a vector, we don't want to scalarize + return true; + }; + + if (isBitcastSink(BCI)) { workset.push_back(BCI->getOperand(0)); + } } // try to find a web from the seed std::set defweb; while (!workset.empty()) { - auto Def = workset.back(); + auto* Def = workset.back(); workset.pop_back(); if (m_Excludes.count(Def) || defweb.count(Def)) { continue; } - // The web grows "up" through BitCasts and PHI nodes + // The web grows "up" (towards producers) through BitCasts and PHI nodes // but insert/extract elements and vector shuffles should be scalarized if (!isAddToWeb(Def)) continue; @@ -285,7 +315,7 @@ void ScalarizeFunction::buildExclusiveSet() continue; } - // The web grows "down" through BitCasts and PHI nodes as well + // The web grows "down" (towards users) through BitCasts and PHI nodes as well for (auto U : Def->users()) { if (!defweb.count(U) && isAddToWeb(U)) @@ -1458,8 +1488,8 @@ void ScalarizeFunction::resolveDeferredInstructions() m_DRL.clear(); } -extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization) +extern "C" FunctionPass * createScalarizerPass(IGC::SelectiveScalarizer selectiveMode) { - return new ScalarizeFunction(selectiveScalarization); + return new ScalarizeFunction(selectiveMode); } diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h index 67887eab8332..d06ac0fc8f51 100644 --- a/IGC/Compiler/Optimizer/Scalarizer.h +++ b/IGC/Compiler/Optimizer/Scalarizer.h @@ -39,6 +39,12 @@ namespace IGC // Define estimated amount of instructions in function #define ESTIMATED_INST_NUM 128 + enum class SelectiveScalarizer { + Off, + On, + Auto ///< Based on IGC_EnableSelectiveScalarizer (0 = off, 1 = on) + }; + /// @brief Scalarization pass used for converting code in functions /// which operate on vector types, to work on scalar types (by breaking /// data elements to scalars, and breaking each vector operation @@ -51,7 
+57,10 @@ namespace IGC public: static char ID; // Pass identification, replacement for typeid - ScalarizeFunction(bool selectiveScalarization = true); + // Default value differs from createScalarizerPass to allow control over selective + // scalarization when pass is directly called from the command line (via igc_opt). + ScalarizeFunction( + SelectiveScalarizer selectiveMode = IGC::SelectiveScalarizer::Auto); ScalarizeFunction(const ScalarizeFunction&) = delete; ScalarizeFunction& operator=(const ScalarizeFunction&) = delete; @@ -271,5 +280,5 @@ namespace IGC /// The ending legs of the web consist of vectorial instructions such as insert and extract elements, /// vector shuffles, GenISA intrinsics and function calls. /// The vectorial instructions inside the web consist of bitcasts and PHI nodes. -extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false); - +extern "C" llvm::FunctionPass *createScalarizerPass( + IGC::SelectiveScalarizer selectiveMode = IGC::SelectiveScalarizer::Off); diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll index e20d3511b2f2..7c39694a61ce 100644 --- a/IGC/Compiler/tests/ScalarizeFunction/selective.ll +++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ;=========================== begin_copyright_notice ============================ ; ; Copyright (C) 2022 Intel Corporation @@ -16,67 +17,67 @@ define spir_kernel void @test_selective_1(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_1( -; CHECK: [[VECT_INT:%.*]] = add <8 x i32> , zeroinitializer -; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float> -; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> -; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 
false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]]) -; CHECK: ret void +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: ret void ; ; define a vector and do some bitcasts ; nothing should get scalarized here - %vectint = add <8 x i32> , zeroinitializer - %vectfloat = bitcast <8 x i32> %vectint to <8 x float> - %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> - call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) - ret void + ret void } define spir_kernel void @test_selective_2(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_2( -; CHECK: [[VECT_INT:%.*]] = add <8 x i32> , zeroinitializer -; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float> -; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> -; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]]) -; CHECK: [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> -; CHECK: [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0 -; CHECK: [[SCALAR_1:%.*]] 
= extractelement <8 x i32> [[CAST]], i32 1 -; CHECK: [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2 -; CHECK: [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3 -; CHECK: [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4 -; CHECK: [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5 -; CHECK: [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6 -; CHECK: [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7 -; CHECK: [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]] -; CHECK: ret void +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR2:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 2 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR3:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 3 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR4:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 4 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR5:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 5 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR6:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 6 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR7:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 7 +; CHECK-NEXT: [[V3:%.*]] = add i32 [[ANOTHERCAST_SCALAR3]], [[ANOTHERCAST_SCALAR5]] +; CHECK-NEXT: ret void ; ; same as before, but %vectfloat is used in another branch of the code - %vectint = add 
<8 x i32> , zeroinitializer - %vectfloat = bitcast <8 x i32> %vectint to <8 x float> - %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> - call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) ; so scalarization should happen here - %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> - %v1 = extractelement <8 x i32> %anothercast, i32 3 - %v2 = extractelement <8 x i32> %anothercast, i32 5 - %v3 = add i32 %v1, %v2 - ret void + %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> + %v1 = extractelement <8 x i32> %anothercast, i32 3 + %v2 = extractelement <8 x i32> %anothercast, i32 5 + %v3 = add i32 %v1, %v2 + ret void } define spir_kernel void @test_selective_3() { ; CHECK-LABEL: @test_selective_3( -; CHECK: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] -; CHECK: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ] -; CHECK: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) -; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 -; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] -; CHECK: [[END]]: -; CHECK: ret void +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[NEWDATA]] 
= call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void ; ; no scalarization happens here because the vectors %data and %newdata are used as whole br label %loop @@ -97,17 +98,17 @@ end: define spir_kernel void @test_selective_4(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_4( -; CHECK: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] -; CHECK: [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ] -; CHECK: [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) -; CHECK: [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> , <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false) -; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 -; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 -; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] -; CHECK: [[END]]: -; CHECK: ret void +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> 
[[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void ; ; same here: no scalarization br label %loop @@ -128,31 +129,31 @@ end: define spir_kernel void @test_selective_5() { ; CHECK-LABEL: @test_selective_5( -; CHECK: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] -; CHECK: [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ] -; CHECK: [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ] -; CHECK: [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ] -; CHECK: [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ] -; CHECK: [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] -; CHECK: [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] -; CHECK: [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] -; CHECK: [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] -; CHECK: [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 -; CHECK: [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1 -; CHECK: [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2 -; CHECK: [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3 -; CHECK: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]]) -; CHECK: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 -; CHECK: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 -; CHECK: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 -; CHECK: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 -; CHECK: 
[[NEWOFFSET]] = add i32 [[OFFSET]], 1 -; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] -; CHECK: [[END]]: -; CHECK: ret void +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT15:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT14]], i32 [[DATA5]], i32 3 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[DATA2_ASSEMBLED_VECT15]]) +; CHECK-NEXT: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 +; CHECK-NEXT: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 +; CHECK-NEXT: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 +; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp 
eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void ; ; here shufflevectors break vectorial nature of the arguments ; scalarization should be done @@ -174,6 +175,101 @@ end: ret void } +define spir_kernel void @test_selective_6() { +; CHECK-LABEL: @test_selective_6( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 +; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 +; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 +; CHECK-NEXT: [[VECTINT12:%.*]] = bitcast float [[VECTFLOAT4]] to i32 +; CHECK-NEXT: [[VECTINT13:%.*]] = bitcast float [[VECTFLOAT5]] to i32 +; CHECK-NEXT: [[VECTINT14:%.*]] = bitcast float [[VECTFLOAT6]] to i32 +; CHECK-NEXT: [[VECTINT15:%.*]] = bitcast float [[VECTFLOAT7]] to i32 +; CHECK-NEXT: [[VECTINT16:%.*]] = bitcast float [[VECTFLOAT8]] to i32 +; CHECK-NEXT: [[VECTADD17:%.*]] = add 
i32 [[VECTINT9]], 1 +; CHECK-NEXT: [[VECTADD18:%.*]] = add i32 [[VECTINT10]], 2 +; CHECK-NEXT: [[VECTADD19:%.*]] = add i32 [[VECTINT11]], 3 +; CHECK-NEXT: [[VECTADD20:%.*]] = add i32 [[VECTINT12]], 4 +; CHECK-NEXT: [[VECTADD21:%.*]] = add i32 [[VECTINT13]], 5 +; CHECK-NEXT: [[VECTADD22:%.*]] = add i32 [[VECTINT14]], 6 +; CHECK-NEXT: [[VECTADD23:%.*]] = add i32 [[VECTINT15]], 7 +; CHECK-NEXT: [[VECTADD24:%.*]] = add i32 [[VECTINT16]], 8 +; CHECK-NEXT: [[VECTFLOAT_NEXT25]] = bitcast i32 [[VECTADD17]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT26]] = bitcast i32 [[VECTADD18]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT27]] = bitcast i32 [[VECTADD19]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT28]] = bitcast i32 [[VECTADD20]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT29]] = bitcast i32 [[VECTADD21]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT30]] = bitcast i32 [[VECTADD22]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT31]] = bitcast i32 [[VECTADD23]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; scalarization should not be prevented due to elementwise bitcasts +; such bitcasts can be part of a chain of vector instructions, but +; should not be at the end of it + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + %vectfloat = phi <8 x float> [ zeroinitializer, %0 ], [ %vectfloat.next, %loop ] + + %vectint = bitcast <8 x float> %vectfloat to <8 x i32> + %vectadd = add <8 x i32> %vectint, + %vectfloat.next = bitcast <8 x i32> %vectadd to <8 x float> + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop +end: + ret void +} + +define spir_kernel void @test_selective_7() { +; CHECK-LABEL: @test_selective_7( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , 
zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result type is scalar) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to i64 + + ret void +} + +define spir_kernel void @test_selective_8() { +; CHECK-LABEL: @test_selective_8( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to <2 x i32> + + ret void +} + declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1