diff --git a/IGC/AdaptorOCL/UnifyIROCL.cpp b/IGC/AdaptorOCL/UnifyIROCL.cpp index 5b5697f09669..63e4524659da 100644 --- a/IGC/AdaptorOCL/UnifyIROCL.cpp +++ b/IGC/AdaptorOCL/UnifyIROCL.cpp @@ -642,7 +642,8 @@ static void CommonOCLBasedPasses( mpm.add(new ScalarArgAsPointerAnalysis()); - mpm.add(createScalarizerPass(SelectiveScalarizer::Auto)); + // true means selective scalarization + mpm.add(createScalarizerPass(IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer))); // Create a dummy kernel to attach the symbol table if necessary // Only needed if function pointers, externally linked functions, or relocatable global variables are present diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp index b54e02b5d1fb..1f54d64271b2 100644 --- a/IGC/Compiler/Optimizer/Scalarizer.cpp +++ b/IGC/Compiler/Optimizer/Scalarizer.cpp @@ -57,34 +57,24 @@ IGC_INITIALIZE_PASS_END(ScalarizeFunction, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG char ScalarizeFunction::ID = 0; -ScalarizeFunction::ScalarizeFunction(IGC::SelectiveScalarizer selectiveMode) : FunctionPass(ID) +ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass(ID) { initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry()); for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0; - V_PRINT(scalarizer, "ScalarizeFunction constructor\n"); - switch(selectiveMode) { - case IGC::SelectiveScalarizer::Off: - V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer forced off"); - m_SelectiveScalarization = false; - break; - case IGC::SelectiveScalarizer::On: - V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer forced on"); - m_SelectiveScalarization = true; - break; - case IGC::SelectiveScalarizer::Auto: - V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = "); - V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer)); - m_SelectiveScalarization = IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer); - break; - } - V_PRINT(scalarizer, "\n"); + // Needs IGC_EnableSelectiveScalarizer = 1 + m_SelectiveScalarization = selectiveScalarization; // Initialize SCM buffers and allocation m_SCMAllocationArray = new SCMEntry[ESTIMATED_INST_NUM]; m_SCMArrays.push_back(m_SCMAllocationArray); m_SCMArrayLocation = 0; + + V_PRINT(scalarizer, "ScalarizeFunction constructor\n"); + V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = "); + V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer)); + V_PRINT(scalarizer, "\n"); } bool ScalarizeFunction::doFinalization(llvm::Module& M) { @@ -254,40 +244,20 @@ void ScalarizeFunction::buildExclusiveSet() } else if (BitCastInst* BCI = dyn_cast(currInst)) { - auto isBitcastSink = [](BitCastInst *BCI) -> bool { - auto *SrcVTy = dyn_cast( - BCI->getOperand(0)->getType()); - - // If source is not a vector, we don't care about this bitcast - if (!SrcVTy) - return false; - - // If destination is a vector then we scalarize if the number of - // elements are the same (elementwise bitcast) - if (auto *DestVTy = - dyn_cast(BCI->getType())) - return DestVTy->getNumElements() != SrcVTy->getNumElements(); - - // If destination is not a vector, we don't want to scalarize - return true; - }; - - if (isBitcastSink(BCI)) { workset.push_back(BCI->getOperand(0)); - } } // try to find a web from the seed std::set defweb; while (!workset.empty()) { - auto* Def = workset.back(); + auto Def = workset.back(); workset.pop_back(); if (m_Excludes.count(Def) || defweb.count(Def)) { continue; } - // The web grows "up" (towards producers) through BitCasts and PHI nodes + // The web grows "up" through BitCasts and PHI nodes // but insert/extract elements and vector shuffles should be scalarized if (!isAddToWeb(Def)) continue; @@ -315,7 +285,7 @@ void ScalarizeFunction::buildExclusiveSet() continue; } - // The web grows "down" (towards users) through BitCasts and PHI nodes as well + // The web grows "down" through BitCasts and PHI nodes as well for (auto U : Def->users()) { if (!defweb.count(U) && isAddToWeb(U)) @@ -1488,8 +1458,8 @@ void ScalarizeFunction::resolveDeferredInstructions() m_DRL.clear(); } -extern "C" FunctionPass * createScalarizerPass(IGC::SelectiveScalarizer selectiveMode) +extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization) { - return new ScalarizeFunction(selectiveMode); + return new ScalarizeFunction(selectiveScalarization); } diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h index d06ac0fc8f51..67887eab8332 100644 --- a/IGC/Compiler/Optimizer/Scalarizer.h +++ b/IGC/Compiler/Optimizer/Scalarizer.h @@ -39,12 +39,6 @@ namespace IGC // Define estimated amount of instructions in function #define ESTIMATED_INST_NUM 128 - enum class SelectiveScalarizer { - Off, - On, - Auto ///< Based on IGC_EnableSelectiveScalarizer (0 = off, 1 = on) - }; - /// @brief Scalarization pass used for converting code in functions /// which operate on vector types, to work on scalar types (by breaking /// data elements to scalars, and breaking each vector operation @@ -57,10 +51,7 @@ namespace IGC public: static char ID; // Pass identification, replacement for typeid - // Default value differs from createScalarizerPass to allow control over selective - // scalarization when pass is directly called from the command line (via igc_opt). - ScalarizeFunction( - SelectiveScalarizer selectiveMode = IGC::SelectiveScalarizer::Auto); + ScalarizeFunction(bool selectiveScalarization = true); ScalarizeFunction(const ScalarizeFunction&) = delete; ScalarizeFunction& operator=(const ScalarizeFunction&) = delete; @@ -280,5 +271,5 @@ namespace IGC /// The ending legs of the web consist of vectorial instructions such as insert and extract elements, /// vector shuffles, GenISA intrinsics and function calls. /// The vectorial instructions inside the web consist of bitcasts and PHI nodes. -extern "C" llvm::FunctionPass *createScalarizerPass( - IGC::SelectiveScalarizer selectiveMode = IGC::SelectiveScalarizer::Off); +extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false); + diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll index 7c39694a61ce..e20d3511b2f2 100644 --- a/IGC/Compiler/tests/ScalarizeFunction/selective.ll +++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ;=========================== begin_copyright_notice ============================ ; ; Copyright (C) 2022 Intel Corporation @@ -17,67 +16,67 @@ define spir_kernel void @test_selective_1(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_1( -; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer -; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> -; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) -; CHECK-NEXT: ret void +; CHECK: [[VECT_INT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float> +; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> +; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]]) +; CHECK: ret void ; ; define a vector and do some bitcasts ; nothing should get scalarized here - %vectint = add <8 x i32> , zeroinitializer - %vectfloat = bitcast <8 x i32> %vectint to <8 x float> - %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> - call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) - ret void + ret void } define spir_kernel void @test_selective_2(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_2( -; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer -; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> -; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) -; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR2:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 2 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR3:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 3 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR4:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 4 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR5:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 5 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR6:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 6 -; CHECK-NEXT: [[ANOTHERCAST_SCALAR7:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 7 -; CHECK-NEXT: [[V3:%.*]] = add i32 [[ANOTHERCAST_SCALAR3]], [[ANOTHERCAST_SCALAR5]] -; CHECK-NEXT: ret void +; CHECK: [[VECT_INT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float> +; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> +; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]]) +; CHECK: [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32> +; CHECK: [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0 +; CHECK: [[SCALAR_1:%.*]] = extractelement <8 x i32> [[CAST]], i32 1 +; CHECK: [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2 +; CHECK: [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3 +; CHECK: [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4 +; CHECK: [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5 +; CHECK: [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6 +; CHECK: [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7 +; CHECK: [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]] +; CHECK: ret void ; ; same as before, but %vectfloat is used in another branch of the code - %vectint = add <8 x i32> , zeroinitializer - %vectfloat = bitcast <8 x i32> %vectint to <8 x float> - %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> - call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) ; so scalarization should happen here - %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> - %v1 = extractelement <8 x i32> %anothercast, i32 3 - %v2 = extractelement <8 x i32> %anothercast, i32 5 - %v3 = add i32 %v1, %v2 - ret void + %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> + %v1 = extractelement <8 x i32> %anothercast, i32 3 + %v2 = extractelement <8 x i32> %anothercast, i32 5 + %v3 = add i32 %v1, %v2 + ret void } define spir_kernel void @test_selective_3() { ; CHECK-LABEL: @test_selective_3( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) -; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: -; CHECK-NEXT: ret void +; CHECK: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ] +; CHECK: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) +; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: +; CHECK: ret void ; ; no scalarization happens here because the vectors %data and %newdata are used as whole br label %loop @@ -98,17 +97,17 @@ end: define spir_kernel void @test_selective_4(i64 %addr) #0 { ; CHECK-LABEL: @test_selective_4( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) -; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) -; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: -; CHECK-NEXT: ret void +; CHECK: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK: [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ] +; CHECK: [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK: [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> , <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 +; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 +; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: +; CHECK: ret void ; ; same here: no scalarization br label %loop @@ -129,31 +128,31 @@ end: define spir_kernel void @test_selective_5() { ; CHECK-LABEL: @test_selective_5( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 -; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 -; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 -; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT15:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT14]], i32 [[DATA5]], i32 3 -; CHECK-NEXT: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[DATA2_ASSEMBLED_VECT15]]) -; CHECK-NEXT: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 -; CHECK-NEXT: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 -; CHECK-NEXT: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 -; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 -; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: -; CHECK-NEXT: ret void +; CHECK: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK: [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ] +; CHECK: [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ] +; CHECK: [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ] +; CHECK: [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ] +; CHECK: [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] +; CHECK: [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] +; CHECK: [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] +; CHECK: [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ] +; CHECK: [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 +; CHECK: [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1 +; CHECK: [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2 +; CHECK: [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3 +; CHECK: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]]) +; CHECK: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 +; CHECK: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 +; CHECK: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 +; CHECK: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 +; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: +; CHECK: ret void ; ; here shufflevectors break vectorial nature of the arguments ; scalarization should be done @@ -175,101 +174,6 @@ end: ret void } -define spir_kernel void @test_selective_6() { -; CHECK-LABEL: @test_selective_6( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 -; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 -; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 -; CHECK-NEXT: [[VECTINT12:%.*]] = bitcast float [[VECTFLOAT4]] to i32 -; CHECK-NEXT: [[VECTINT13:%.*]] = bitcast float [[VECTFLOAT5]] to i32 -; CHECK-NEXT: [[VECTINT14:%.*]] = bitcast float [[VECTFLOAT6]] to i32 -; CHECK-NEXT: [[VECTINT15:%.*]] = bitcast float [[VECTFLOAT7]] to i32 -; CHECK-NEXT: [[VECTINT16:%.*]] = bitcast float [[VECTFLOAT8]] to i32 -; CHECK-NEXT: [[VECTADD17:%.*]] = add i32 [[VECTINT9]], 1 -; CHECK-NEXT: [[VECTADD18:%.*]] = add i32 [[VECTINT10]], 2 -; CHECK-NEXT: [[VECTADD19:%.*]] = add i32 [[VECTINT11]], 3 -; CHECK-NEXT: [[VECTADD20:%.*]] = add i32 [[VECTINT12]], 4 -; CHECK-NEXT: [[VECTADD21:%.*]] = add i32 [[VECTINT13]], 5 -; CHECK-NEXT: [[VECTADD22:%.*]] = add i32 [[VECTINT14]], 6 -; CHECK-NEXT: [[VECTADD23:%.*]] = add i32 [[VECTINT15]], 7 -; CHECK-NEXT: [[VECTADD24:%.*]] = add i32 [[VECTINT16]], 8 -; CHECK-NEXT: [[VECTFLOAT_NEXT25]] = bitcast i32 [[VECTADD17]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT26]] = bitcast i32 [[VECTADD18]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT27]] = bitcast i32 [[VECTADD19]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT28]] = bitcast i32 [[VECTADD20]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT29]] = bitcast i32 [[VECTADD21]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT30]] = bitcast i32 [[VECTADD22]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT31]] = bitcast i32 [[VECTADD23]] to float -; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float -; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: -; CHECK-NEXT: ret void -; -; scalarization should not be prevented due to elementwise bitcasts -; such bitcasts can be part of a chain of vector instructions, but -; should not be at the end of it - br label %loop - -loop: - %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] - %vectfloat = phi <8 x float> [ zeroinitializer, %0 ], [ %vectfloat.next, %loop ] - - %vectint = bitcast <8 x float> %vectfloat to <8 x i32> - %vectadd = add <8 x i32> %vectint, - %vectfloat.next = bitcast <8 x i32> %vectadd to <8 x float> - - %newoffset = add i32 %offset, 1 - %1 = icmp eq i32 %newoffset, 10 - br i1 %1, label %end, label %loop -end: - ret void -} - -define spir_kernel void @test_selective_7() { -; CHECK-LABEL: @test_selective_7( -; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer -; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> -; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 -; CHECK-NEXT: ret void -; -; non-elementwise bitcasts (result type is scalar) should prevent scalarization, -; thus no scalarization should happen here - %vectint = add <4 x i16> , zeroinitializer - %vectfloat = bitcast <4 x i16> %vectint to <4 x half> - %vectcast = bitcast <4 x half> %vectfloat to i64 - - ret void -} - -define spir_kernel void @test_selective_8() { -; CHECK-LABEL: @test_selective_8( -; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer -; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> -; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> -; CHECK-NEXT: ret void -; -; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, -; thus no scalarization should happen here - %vectint = add <4 x i16> , zeroinitializer - %vectfloat = bitcast <4 x i16> %vectint to <4 x half> - %vectcast = bitcast <4 x half> %vectfloat to <2 x i32> - - ret void -} - declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1