diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index b6183f79c102d..a59fb82fbd898 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -168,47 +168,47 @@ double rsqrt(double); // CHECK-F2: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] // // CHECK-F3-LABEL: define dso_local void @f1 -// CHECK-F3: call double @llvm.fpbuiltin.acos.f64(double %conv) #[[ATTR_F3_HIGH:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.acosh.f64(double %conv2) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.asin.f64(double %conv4) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.asinh.f64(double %conv6) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atan.f64(double %conv8) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atan2.f64(double %conv10, double %conv11) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double %conv13) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double %conv15) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double %conv17) #[[ATTR_F3_HIGH]] -// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double %conv19) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double %conv21) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double %conv23) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double %conv25) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.exp2.f64(double %conv27) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.expm1.f64(double %conv29) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fadd.f64(double %conv31, double %conv32) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fdiv.f64(double %conv34, double %conv35) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fmul.f64(double %conv37, double %conv38) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.frem.f64(double %conv40, double %conv41) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.fsub.f64(double %conv43, double %conv44) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.hypot.f64(double %conv46, double %conv47) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.ldexp.f64(double %conv49, i32 %conv50) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log.f64(double %conv52) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double %conv54) #[[ATTR_F3_MEDIUM:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.log1p.f64(double %conv56) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.log2.f64(double %conv58) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.pow.f64(double %conv60, double %conv61) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.rsqrt.f64(double %conv63) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.sin.f64(double %conv65) #[[ATTR_F3_HIGH]] -// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double %conv67, ptr %p1, ptr %p2) #[[ATTR_F3_MEDIUM]] -// CHECK-F3: call double @llvm.fpbuiltin.sinh.f64(double %conv68) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.sqrt.f64(double %conv70) #[[ATTR_F3_HIGH]] -// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double %conv72) #[[ATTR_F3_LOW:[0-9]+]] -// CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double %conv74) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F3_HIGH:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp2.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fdiv.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fmul.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.frem.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fsub.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log2.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" // // CHECK-LABEL-F4: define dso_local void @f1 -// CHECK-F4: call double @llvm.fpbuiltin.acos.f64(double %conv) #[[ATTR_F4_MEDIUM:[0-9]+]] +// CHECK-F4: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM:[0-9]+]] // CHECK-F4: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -237,50 +237,50 @@ double rsqrt(double); // CHECK-F4: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] -// CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr %p1, ptr %p2) +// CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) // CHECK-F4: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] // -// CHECK-F5-LABEL: define dso_local void @f1( -// CHECK-F5: call double @acos(double noundef {{.*}}) -// CHECK-F5: call double @acosh(double noundef {{.*}}) -// CHECK-F5: call double @asin(double noundef {{.*}}) -// CHECK-F5: call double @asinh(double noundef {{.*}}) -// CHECK-F5: call double @atan(double noundef {{.*}}) -// CHECK-F5: call double @atan2(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @atanh(double noundef {{.*}}) +// CHECK-F5-LABEL: define dso_local void @f1 +// CHECK-F5: call double @acos(double {{.*}}) +// CHECK-F5: call double @acosh(double {{.*}}) +// CHECK-F5: call double @asin(double {{.*}}) +// CHECK-F5: call double @asinh(double {{.*}}) +// CHECK-F5: call double @atan(double {{.*}}) +// CHECK-F5: call double @atan2(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @atanh(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM:[0-9]+]] -// CHECK-F5: call double @cosh(double noundef {{.*}}) -// CHECK-F5: call double @erf(double noundef {{.*}}) -// CHECK-F5: call double @erfc(double noundef {{.*}}) +// CHECK-F5: call double @cosh(double {{.*}}) +// CHECK-F5: call double @erf(double {{.*}}) +// CHECK-F5: call double @erfc(double {{.*}}) // CHECK-F5: call double @llvm.exp.f64(double {{.*}}) -// CHECK-F5: call i32 (double, ...) @exp10(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ...) @exp10(double {{.*}}) // CHECK-F5: call double @llvm.exp2.f64(double {{.*}}) -// CHECK-F5: call double @expm1(double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fadd(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fdiv(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fmul(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @frem(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call i32 (double, double, ...) @fsub(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @hypot(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-F5: call double @ldexp(double noundef {{.*}}, i32 noundef {{.*}}) +// CHECK-F5: call double @expm1(double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fadd(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fdiv(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fmul(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @frem(double {{.*}}, double {{.*}}) +// CHECK-F5: call i32 (double, double, ...) @fsub(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @hypot(double {{.*}}, double {{.*}}) +// CHECK-F5: call double @ldexp(double {{.*}}, i32 {{.*}}) // CHECK-F5: call double @llvm.log.f64(double {{.*}}) // CHECK-F5: call double @llvm.log10.f64(double {{.*}}) -// CHECK-F5: call double @log1p(double noundef {{.*}}) +// CHECK-F5: call double @log1p(double {{.*}}) // CHECK-F5: call double @llvm.log2.f64(double {{.*}}) // CHECK-F5: call double @llvm.pow.f64(double {{.*}}, double {{.*}}) -// CHECK-F5: call i32 (double, ...) @rsqrt(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ...) @rsqrt(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F5_HIGH:[0-9]+]] -// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-F5: call double @sinh(double noundef {{.*}}) +// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-F5: call double @sinh(double {{.*}}) // CHECK-F5: call double @llvm.sqrt.f64(double {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F5_HIGH]] -// CHECK-F5: call double @tanh(double noundef {{.*}}) +// CHECK-F5: call double @tanh(double {{.*}}) // // -// CHECK-F6-LABEL: define dso_local void @f1( +// CHECK-F6-LABEL: define dso_local void @f1 // CHECK-F6: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM:[0-9]+]] // CHECK-F6: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] @@ -404,7 +404,7 @@ void f1(float a, float b) { // CHECK-F1: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F1_LOW]] // CHECK-F1: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F1_MEDIUM]] // CHECK-F1: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F1_MEDIUM]] -// CHECK-F1: call float @tanf(float noundef {{.*}}) +// CHECK-F1: call float @tanf(float {{.*}}) // // CHECK-F2-LABEL: define dso_local void @f2 // CHECK-F2: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F2_MEDIUM]] @@ -412,7 +412,7 @@ void f1(float a, float b) { // CHECK-F2: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F2_HIGH]] // CHECK-F2: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] -// CHECK-F2: call float @tanf(float noundef {{.*}}) +// CHECK-F2: call float @tanf(float {{.*}}) // // CHECK-LABEL-F4: define dso_local void @f2 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -422,24 +422,24 @@ void f1(float a, float b) { // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @tanf(float {{.*}}) // -// CHECK-F5-LABEL: define dso_local void @f2( +// CHECK-F5-LABEL: define dso_local void @f2 // CHECK-F5: call float @llvm.cos.f32(float {{.*}}) // CHECK-F5: call float @llvm.sin.f32(float {{.*}}) // CHECK-F5: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F5_HIGH]] // CHECK-F5: call double @llvm.log10.f64(double {{.*}}) -// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-F5: call float @tanf(float noundef {{.*}}) +// CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-F5: call float @tanf(float {{.*}}) // // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // -// CHECK-F6-LABEL: define dso_local void @f2( +// CHECK-F6-LABEL: define dso_local void @f2 // CHECK-F6: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]] -// CHECK-F6: call float @tanf(float noundef {{.*}}) #[[ATTR8:[0-9]+]] +// CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]] // // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" @@ -450,7 +450,7 @@ void f1(float a, float b) { // CHECK-SPIR: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_SYCL2]] // CHECK-SPIR: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_SYCL5]] // CHECK-SPIR: call void @llvm.fpbuiltin.sincos.f32(float {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_SYCL1]] -// CHECK-SPIR: call spir_func float @tanf(float noundef {{.*}}) +// CHECK-SPIR: call spir_func float @tanf(float {{.*}}) // CHECK-LABEL: define dso_local void @f3 // CHECK: call float @fake_exp10(float {{.*}}) @@ -480,48 +480,48 @@ void f1(float a, float b) { // CHECK-SPIR: attributes #[[ATTR_SYCL8]] = {{.*}}"fpbuiltin-max-error"="2.0" // CHECK-DEFAULT-LABEL: define dso_local void @f1 -// CHECK-DEFAULT: call double @acos(double noundef {{.*}}) -// CHECK-DEFAULT: call double @acosh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @asin(double noundef {{.*}}) -// CHECK-DEFAULT: call double @asinh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @atan(double noundef {{.*}}) -// CHECK-DEFAULT: call double @atan2(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @atanh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @acos(double {{.*}}) +// CHECK-DEFAULT: call double @acosh(double {{.*}}) +// CHECK-DEFAULT: call double @asin(double {{.*}}) +// CHECK-DEFAULT: call double @asinh(double {{.*}}) +// CHECK-DEFAULT: call double @atan(double {{.*}}) +// CHECK-DEFAULT: call double @atan2(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @atanh(double {{.*}}) // CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) -// CHECK-DEFAULT: call double @cosh(double noundef {{.*}}) -// CHECK-DEFAULT: call double @erf(double noundef {{.*}}) -// CHECK-DEFAULT: call double @erfc(double noundef {{.*}}) +// CHECK-DEFAULT: call double @cosh(double {{.*}}) +// CHECK-DEFAULT: call double @erf(double {{.*}}) +// CHECK-DEFAULT: call double @erfc(double {{.*}}) // CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ...) @exp10(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @exp10(double {{.*}}) // CHECK-DEFAULT: call double @llvm.exp2.f64(double {{.*}}) -// CHECK-DEFAULT: call double @expm1(double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fadd(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fdiv(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fmul(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @frem(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call i32 (double, double, ...) @fsub(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @hypot(double noundef {{.*}}, double noundef {{.*}}) -// CHECK-DEFAULT: call double @ldexp(double noundef {{.*}}, i32 noundef {{.*}}) +// CHECK-DEFAULT: call double @expm1(double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fadd(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fdiv(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fmul(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @frem(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fsub(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @hypot(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call double @ldexp(double {{.*}}, i32 {{.*}}) // CHECK-DEFAULT: call double @llvm.log.f64(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) -// CHECK-DEFAULT: call double @log1p(double noundef {{.*}}) +// CHECK-DEFAULT: call double @log1p(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log2.f64(double {{.*}}) // CHECK-DEFAULT: call double @llvm.pow.f64(double {{.*}}, double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double {{.*}}) // CHECK-DEFAULT: call double @llvm.sin.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-DEFAULT: call double @sinh(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-DEFAULT: call double @sinh(double {{.*}}) // CHECK-DEFAULT: call double @llvm.sqrt.f64(double {{.*}}) -// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) -// CHECK-DEFAULT: call double @tanh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @tan(double {{.*}}) +// CHECK-DEFAULT: call double @tanh(double {{.*}}) // // CHECK-DEFAULT-LABEL: define dso_local void @f2 // CHECK-DEFAULT: call float @llvm.cos.f32(float {{.*}}) // CHECK-DEFAULT: call float @llvm.sin.f32(float {{.*}}) -// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) +// CHECK-DEFAULT: call double @tan(double {{.*}}) // CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) -// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) -// CHECK-DEFAULT: call float @tanf(float noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) +// CHECK-DEFAULT: call float @tanf(float {{.*}}) // CHECK-DEFAULT-LABEL: define dso_local void @f3 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) diff --git a/libdevice/cmath_wrapper.cpp b/libdevice/cmath_wrapper.cpp index 79d8fbf12ce25..5d9c8f0a77d13 100644 --- a/libdevice/cmath_wrapper.cpp +++ b/libdevice/cmath_wrapper.cpp @@ -19,6 +19,9 @@ long int labs(long int x) { return __devicelib_labs(x); } DEVICE_EXTERN_C_INLINE long long int llabs(long long int x) { return __devicelib_llabs(x); } +DEVICE_EXTERN_C_INLINE +float fabsf(float x) { return __devicelib_fabsf(x); } + DEVICE_EXTERN_C_INLINE div_t div(int x, int y) { return __devicelib_div(x, y); } diff --git a/libdevice/cmath_wrapper_fp64.cpp b/libdevice/cmath_wrapper_fp64.cpp index bb5b1986a5a8c..e7b0815ae6526 100644 --- a/libdevice/cmath_wrapper_fp64.cpp +++ b/libdevice/cmath_wrapper_fp64.cpp @@ -15,6 +15,10 @@ // reference. If users provide their own math or complex functions(with // the prototype), functions in device libraries will be ignored and // overrided by users' version. + +DEVICE_EXTERN_C_INLINE +double fabs(double x) { return __devicelib_fabs(x); } + DEVICE_EXTERN_C_INLINE double log(double x) { return __devicelib_log(x); } diff --git a/libdevice/device_math.h b/libdevice/device_math.h index 930bcae7d7967..a402c748299d2 100644 --- a/libdevice/device_math.h +++ b/libdevice/device_math.h @@ -40,6 +40,12 @@ long int __devicelib_labs(long int x); DEVICE_EXTERN_C long long int __devicelib_llabs(long long int x); +DEVICE_EXTERN_C +float __devicelib_fabsf(float x); + +DEVICE_EXTERN_C +double __devicelib_fabs(double x); + DEVICE_EXTERN_C div_t __devicelib_div(int x, int y); diff --git a/libdevice/fallback-cmath-fp64.cpp b/libdevice/fallback-cmath-fp64.cpp index c42855699811a..9656f229c4fd1 100644 --- a/libdevice/fallback-cmath-fp64.cpp +++ b/libdevice/fallback-cmath-fp64.cpp @@ -14,6 +14,10 @@ // To support fallback device libraries on-demand loading, please update the // DeviceLibFuncMap in llvm/tools/sycl-post-link/sycl-post-link.cpp if you add // or remove any item in this file. + +DEVICE_EXTERN_C_INLINE +double __devicelib_fabs(double x) { return x < 0 ? -x : x; } + DEVICE_EXTERN_C_INLINE double __devicelib_log(double x) { return __spirv_ocl_log(x); } diff --git a/libdevice/fallback-cmath.cpp b/libdevice/fallback-cmath.cpp index 28a1463489b17..dc9e2806111f5 100644 --- a/libdevice/fallback-cmath.cpp +++ b/libdevice/fallback-cmath.cpp @@ -25,6 +25,9 @@ long int __devicelib_labs(long int x) { return x < 0 ? -x : x; } DEVICE_EXTERN_C_INLINE long long int __devicelib_llabs(long long int x) { return x < 0 ? -x : x; } +DEVICE_EXTERN_C_INLINE +float __devicelib_fabsf(float x) { return x < 0 ? -x : x; } + DEVICE_EXTERN_C_INLINE div_t __devicelib_div(int x, int y) { return {x / y, x % y}; } diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 2dc2abf704642..0f36f0d38eeb4 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -261,6 +261,17 @@ variable `SYCL_BUILD_PI_HIP_ROCM_DIR` which can be passed using the python $DPCPP_HOME/llvm/buildbot/configure.py --hip \ --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/usr/local/rocm ``` +If further customization is required — for instance when the layout of +individual directories can not be inferred from `SYCL_BUILD_PI_HIP_ROCM_DIR` — +it is possible to specify the location of HIP include, HSA include and HIP +library directories, using the following CMake variables: +* `SYCL_BUILD_PI_HIP_INCLUDE_DIR`, +* `SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR`, +* `SYCL_BUILD_PI_HIP_LIB_DIR`. +Please note that a similar customization would also be required for Unified +Runtime, see [the list of options provided by its +CMake](https://github.com/oneapi-src/unified-runtime#cmake-standard-options) +for details. [LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMDGPU compilation chain. The AMDGPU backend generates a standard ELF relocatable code diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 7c405d5ca791b..fed89532ff7ce 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -37,12 +37,14 @@ with the following entry-points: | `urCommandBufferFinalizeExp` | No more commands can be appended, makes command-buffer ready to enqueue on a command-queue. | | `urCommandBufferAppendKernelLaunchExp` | Append a kernel execution command to command-buffer. | | `urCommandBufferAppendUSMMemcpyExp` | Append a USM memcpy command to the command-buffer. | +| `urCommandBufferAppendUSMFillExp` | Append a USM fill command to the command-buffer. | | `urCommandBufferAppendMemBufferCopyExp` | Append a mem buffer copy command to the command-buffer. | | `urCommandBufferAppendMemBufferWriteExp` | Append a memory write command to a command-buffer object. | | `urCommandBufferAppendMemBufferReadExp` | Append a memory read command to a command-buffer object. | | `urCommandBufferAppendMemBufferCopyRectExp` | Append a rectangular memory copy command to a command-buffer object. | | `urCommandBufferAppendMemBufferWriteRectExp` | Append a rectangular memory write command to a command-buffer object. | | `urCommandBufferAppendMemBufferReadRectExp` | Append a rectangular memory read command to a command-buffer object. | +| `urCommandBufferAppendMemBufferFillExp` | Append a memory fill command to a command-buffer object. | | `urCommandBufferEnqueueExp` | Submit command-buffer to a command-queue for execution. | See the [UR EXP-COMMAND-BUFFER](https://oneapi-src.github.io/unified-runtime/core/EXP-COMMAND-BUFFER.html) @@ -347,6 +349,8 @@ The types of commands which are unsupported, and lead to this exception are: This corresponds to a memory buffer write command. * `handler::copy(src, dest)` or `handler::memcpy(dest, src)` - Where both `src` and `dest` are USM pointers. This corresponds to a USM copy command. +* `handler::memset(ptr, value, numBytes)` - This corresponds to a USM memory + fill command. Note that `handler::copy(src, dest)` where both `src` and `dest` are an accessor is supported, as a memory buffer copy command exists in the OpenCL extension. diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md index c9e2f1ee42635..ee60eb5a59d70 100644 --- a/sycl/doc/developer/ContributeToDPCPP.md +++ b/sycl/doc/developer/ContributeToDPCPP.md @@ -10,6 +10,8 @@ All changes made to the DPC++ compiler and runtime library should generally preserve existing ABI/API and contributors should avoid making incompatible changes. One of the exceptions is experimental APIs, clearly marked so by namespace or related specification. +If you wish to propose a new experimental DPC++ extension then read +[README-process.md](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/README-process.md). Another exceptional case is the transition from SYCL 1.2.1 to SYCL 2020 standard. diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index d267bcbf23138..eb9221cb194b7 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -314,6 +314,19 @@ enum class graph_support_level { emulated }; +enum class node_type { + empty, + subgraph, + kernel, + memcpy, + memset, + memfill, + prefetch, + memadvise, + ext_oneapi_barrier, + host_task, +}; + namespace property { namespace graph { @@ -354,7 +367,18 @@ struct graphs_support; } // namespace device } // namespace info -class node {}; +class node { +public: + node() = delete; + + node_type get_type() const; + + std::vector get_predecessors() const; + + std::vector get_successors() const; + + static node get_node_from_event(event nodeEvent); +}; // State of a graph enum class graph_state { @@ -390,6 +414,9 @@ public: void make_edge(node& src, node& dest); void print_graph(std::string path, bool verbose = false) const; + + std::vector get_nodes() const; + std::vector get_root_nodes() const; }; template<> @@ -460,12 +487,56 @@ edges. The `node` class provides the {crs}[common reference semantics]. +==== Node Member Functions + +Table {counter: tableNumber}. Member functions of the `node` class. +[cols="2a,a"] +|=== +|Member Function|Description + +| [source,c++] ---- -namespace sycl::ext::oneapi::experimental { - class node {}; -} +node_type get_type() const; +---- +|Returns a value representing the type of command this node represents. + +| +[source,c++] +---- +std::vector get_predecessors() const; +---- +|Returns a list of the predecessor nodes which this node directly depends on. + +| +[source,c++] +---- +std::vector get_successors() const; ---- +|Returns a list of the successor nodes which directly depend on this node. + +| +[source,c++] +---- +static node get_node_from_event(event nodeEvent); +---- +|Finds the node associated with an event created from a submission to a queue + in the recording state. + +Parameters: + +* `nodeEvent` - Event returned from a submission to a queue in the recording + state. + +Returns: Graph node that was created when the command that returned +`nodeEvent` was submitted. + +Exceptions: + +* Throws with error code `invalid` if `nodeEvent` is not associated with a + graph node. + +|=== ==== Depends-On Property @@ -776,6 +847,21 @@ Exceptions: * Throws synchronously with error code `invalid` if the path is invalid or the file extension is not supported or if the write operation failed. +| +[source,c++] +---- +std::vector get_nodes() const; +---- +|Returns a list of all the nodes present in the graph in the order that they +were added. + +| +[source,c++] +---- +std::vector get_root_nodes() const; +---- +|Returns a list of all nodes in the graph which have no dependencies. + |=== Table {counter: tableNumber}. Member functions of the `command_graph` class for queue recording. diff --git a/sycl/doc/extensions/template.asciidoc b/sycl/doc/extensions/template.asciidoc index 166ccd2174b43..31d430ed56049 100644 --- a/sycl/doc/extensions/template.asciidoc +++ b/sycl/doc/extensions/template.asciidoc @@ -135,16 +135,16 @@ _It is also appropriate to give an indication of who the target audience is for the extension. For example, if the extension is intended only for ninja programmers, we might say something like:_ -> The properties described in this extension are advanced features that most -> applications should not need to use. In most cases, applications get the -> best performance without using these properties. +The properties described in this extension are advanced features that most +applications should not need to use. In most cases, applications get the best +performance without using these properties. _Occasionally, we might add an extension as a stopgap measure for a limited audience. When this happens, it's best to discourage general usage with a statement like:_ -> This extension exists to solve a specific problem, and a general solution is -> still being evaluated. It is not recommended for general usage. +This extension exists to solve a specific problem, and a general solution is +still being evaluated. It is not recommended for general usage. _Note that text should be wrapped at 80 columns as shown in this template. Extensions use AsciiDoc markup language (like this template). If you need help diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 69513335bf191..d963cfb860f4e 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -176,6 +176,8 @@ _PI_API(piextCommandBufferMemBufferWrite) _PI_API(piextCommandBufferMemBufferWriteRect) _PI_API(piextCommandBufferMemBufferRead) _PI_API(piextCommandBufferMemBufferReadRect) +_PI_API(piextCommandBufferMemBufferFill) +_PI_API(piextCommandBufferFillUSM) _PI_API(piextEnqueueCommandBuffer) _PI_API(piextUSMPitchedAlloc) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 010c59dd3c9d6..9860906e0f847 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -147,9 +147,10 @@ // 14.38 Change PI_MEM_ADVICE_* values to flags for use in bitwise operations. // 14.39 Added PI_EXT_INTEL_DEVICE_INFO_ESIMD_SUPPORT device info query. // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones. +// 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM #define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 40 +#define _PI_H_VERSION_MINOR 41 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -2441,7 +2442,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect( pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, + void *ptr, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); @@ -2458,7 +2459,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferReadRect( /// \param sync_point The sync_point associated with this memory operation. __SYCL_EXPORT pi_result piextCommandBufferMemBufferWrite( pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset, - size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, + size_t size, const void *ptr, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); @@ -2483,7 +2484,43 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferWriteRect( pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, + const void *ptr, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a mem buffer fill command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param buffer is the location to fill the data. +/// \param pattern pointer to the pattern to fill the buffer with. +/// \param pattern_size size of the pattern in bytes. +/// \param offset Offset into the buffer to fill from. +/// \param size fill size in bytes. +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer command_buffer, pi_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point); + +/// API to append a USM fill command to the command-buffer. +/// \param command_buffer The command-buffer to append onto. +/// \param ptr pointer to the USM allocation to fill. +/// \param pattern pointer to the pattern to fill ptr with. +/// \param pattern_size size of the pattern in bytes. +/// \param size fill size in bytes. +/// \param num_sync_points_in_wait_list The number of sync points in the +/// provided wait list. +/// \param sync_point_wait_list A list of sync points that this command must +/// wait on. +/// \param sync_point The sync_point associated with this memory operation. +__SYCL_EXPORT pi_result piextCommandBufferFillUSM( + pi_ext_command_buffer command_buffer, void *ptr, const void *pattern, + size_t pattern_size, size_t size, pi_uint32 num_sync_points_in_wait_list, const pi_ext_sync_point *sync_point_wait_list, pi_ext_sync_point *sync_point); diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 15645d9884499..ed13a3422c3a2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -82,8 +82,37 @@ enum class graph_state { executable, ///< In executable state, the graph is ready to execute. }; +enum class node_type { + empty = 0, + subgraph, + kernel, + memcpy, + memset, + memfill, + prefetch, + memadvise, + ext_oneapi_barrier, + host_task +}; + /// Class representing a node in the graph, returned by command_graph::add(). class __SYCL_EXPORT node { +public: + node() = delete; + + /// Get the type of command associated with this node. + node_type get_type() const; + + /// Get a list of all the node dependencies of this node. + std::vector get_predecessors() const; + + /// Get a list of all nodes which depend on this node. + std::vector get_successors() const; + + /// Get the node associated with a SYCL event returned from a queue recording + /// submission. + static node get_node_from_event(event nodeEvent); + private: node(const std::shared_ptr &Impl) : impl(Impl) {} @@ -253,6 +282,12 @@ class __SYCL_EXPORT modifiable_command_graph { /// as kernel args or memory access where applicable. void print_graph(const std::string path, bool verbose = false) const; + /// Get a list of all nodes contained in this graph. + std::vector get_nodes() const; + + /// Get a list of all root nodes (nodes without dependencies) in this graph. + std::vector get_root_nodes() const; + protected: /// Constructor used internally by the runtime. /// @param Impl Detail implementation class to construct object with. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index af776662687d0..75ee911174faa 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1783,6 +1783,14 @@ class __SYCL_EXPORT handler { std::shared_ptr getCommandGraph() const; + /// Sets the user facing node type of this operation, used for operations + /// which are recorded to a graph. Since some operations may actually be a + /// different type than the user submitted, e.g. a fill() which is performed + /// as a kernel submission. + /// @param Type The actual type based on what handler functions the user + /// called. + void setUserFacingNodeType(ext::oneapi::experimental::node_type Type); + public: handler(const handler &) = delete; handler(handler &&) = delete; @@ -2726,6 +2734,7 @@ class __SYCL_EXPORT handler { checkIfPlaceholderIsBoundToHandler(Dst); throwIfActionIsCreated(); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); // TODO add check:T must be an integral scalar value or a SYCL vector type static_assert(isValidTargetForExplicitOp(AccessTarget), "Invalid accessor target for the fill method."); @@ -2764,6 +2773,7 @@ class __SYCL_EXPORT handler { /// \param Count is the number of times to fill Pattern into Ptr. template void fill(void *Ptr, const T &Pattern, size_t Count) { throwIfActionIsCreated(); + setUserFacingNodeType(ext::oneapi::experimental::node_type::memfill); static_assert(is_device_copyable::value, "Pattern must be device copyable"); parallel_for<__usmfill>(range<1>(Count), [=](id<1> Index) { diff --git a/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp b/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp index f687e811cfe09..53f59b1b18f80 100644 --- a/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp +++ b/sycl/pi_win_proxy_loader/pi_win_proxy_loader.cpp @@ -133,31 +133,23 @@ void preloadLibraries() { MapT &dllMap = getDllMap(); - auto ocl_path = LibSYCLDir / __SYCL_OPENCL_PLUGIN_NAME; - dllMap.emplace(ocl_path, - LoadLibraryEx(ocl_path.wstring().c_str(), NULL, NULL)); - - auto l0_path = LibSYCLDir / __SYCL_LEVEL_ZERO_PLUGIN_NAME; - dllMap.emplace(l0_path, LoadLibraryEx(l0_path.wstring().c_str(), NULL, NULL)); - - auto cuda_path = LibSYCLDir / __SYCL_CUDA_PLUGIN_NAME; - dllMap.emplace(cuda_path, - LoadLibraryEx(cuda_path.wstring().c_str(), NULL, NULL)); - - auto esimd_path = LibSYCLDir / __SYCL_ESIMD_EMULATOR_PLUGIN_NAME; - dllMap.emplace(esimd_path, - LoadLibraryEx(esimd_path.wstring().c_str(), NULL, NULL)); - - auto hip_path = LibSYCLDir / __SYCL_HIP_PLUGIN_NAME; - dllMap.emplace(hip_path, - LoadLibraryEx(hip_path.wstring().c_str(), NULL, NULL)); - - auto ur_path = LibSYCLDir / __SYCL_UNIFIED_RUNTIME_PLUGIN_NAME; - dllMap.emplace(ur_path, LoadLibraryEx(ur_path.wstring().c_str(), NULL, NULL)); - - auto nativecpu_path = LibSYCLDir / __SYCL_NATIVE_CPU_PLUGIN_NAME; - dllMap.emplace(nativecpu_path, - LoadLibraryEx(nativecpu_path.wstring().c_str(), NULL, NULL)); + // When searching for dependencies of the plugins limit the + // list of directories to %windows%\system32 and the directory that contains + // the loaded DLL (the plugin). This is necessary to avoid loading dlls from + // current directory and some other directories which are considered unsafe. + auto loadPlugin = [&](auto pluginName, + DWORD flags = LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR | + LOAD_LIBRARY_SEARCH_SYSTEM32) { + auto path = LibSYCLDir / pluginName; + dllMap.emplace(path, LoadLibraryEx(path.wstring().c_str(), NULL, flags)); + }; + loadPlugin(__SYCL_OPENCL_PLUGIN_NAME); + loadPlugin(__SYCL_LEVEL_ZERO_PLUGIN_NAME); + loadPlugin(__SYCL_CUDA_PLUGIN_NAME); + loadPlugin(__SYCL_ESIMD_EMULATOR_PLUGIN_NAME); + loadPlugin(__SYCL_HIP_PLUGIN_NAME); + loadPlugin(__SYCL_UNIFIED_RUNTIME_PLUGIN_NAME); + loadPlugin(__SYCL_NATIVE_CPU_PLUGIN_NAME); // Restore system error handling. (void)SetErrorMode(SavedMode); diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 94a39137ec4f7..5eb06f37b2237 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1137,6 +1137,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 0f949af2c109c..5d5e8e4782066 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -44,6 +44,14 @@ else() set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_LIB_DIR}") endif() +# Set up defaults for UR +set(UR_HIP_INCLUDE_DIR "${PI_HIP_INCLUDE_DIR}" CACHE PATH + "Custom ROCm HIP include dir") +set(UR_HIP_HSA_INCLUDE_DIRS "${PI_HIP_HSA_INCLUDE_DIR}" CACHE PATH + "Custom ROCm HSA include dir") +set(UR_HIP_LIB_DIR "${PI_HIP_LIB_DIR}" CACHE PATH + "Custom ROCm HIP library dir") + # Mark override options for advanced usage mark_as_advanced(SYCL_BUILD_PI_HIP_INCLUDE_DIR SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR SYCL_BUILD_PI_HIP_LIB_DIR) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 7095526dc1d34..775183d82d239 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1145,6 +1145,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index eb5ab8a42259f..6cb5322fa778f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1303,6 +1303,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 6cc6a325af923..7512d411144ab 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1081,6 +1081,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c3273c9affb9e..f81efbc880999 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit c2d78257ba7e7bbc230333f291282d16145aaac7 - # Merge: 8bb539c5 b3a1d52d + # commit c53953ae492587698d5adbab8ffee254d97b6a4e + # Merge: 9f88cf88 66d52ace # Author: Kenneth Benzie (Benie) - # Date: Wed Jan 10 11:24:12 2024 +0000 - # Merge pull request #1129 from sarnex/adapters - # [UR][L0] Make urPlatformGetBackendOption return -ze-opt-level=2 for -O1 and -O2 - set(UNIFIED_RUNTIME_TAG c2d78257ba7e7bbc230333f291282d16145aaac7) + # Date: Wed Jan 10 14:50:23 2024 +0000 + # Merge pull request #1170 from jchlanda/jakub/hip_custom_dirs + # [HIP] Allow custom location of ROCm components + set(UNIFIED_RUNTIME_TAG c53953ae492587698d5adbab8ffee254d97b6a4e) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index d3051c47bd93b..75d1bd598e80a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4580,6 +4580,37 @@ inline pi_result piextCommandBufferMemBufferWrite( return PI_SUCCESS; } +inline pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + + HANDLE_ERRORS(urCommandBufferAppendMemBufferFillExp( + UrCommandBuffer, UrBuffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint)); + return PI_SUCCESS; +} + +inline pi_result piextCommandBufferFillUSM( + pi_ext_command_buffer CommandBuffer, void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + + ur_exp_command_buffer_handle_t UrCommandBuffer = + reinterpret_cast(CommandBuffer); + + HANDLE_ERRORS(urCommandBufferAppendUSMFillExp( + UrCommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint)); + return PI_SUCCESS; +} + inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index ab5b801c3fda3..7cb9fdbb9b554 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1102,6 +1102,27 @@ pi_result piextCommandBufferMemBufferWriteRect( NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } +pi_result piextCommandBufferMemBufferFill( + pi_ext_command_buffer CommandBuffer, pi_mem Buffer, const void *Pattern, + size_t PatternSize, size_t Offset, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferMemBufferFill( + CommandBuffer, Buffer, Pattern, PatternSize, Offset, Size, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); +} + +pi_result piextCommandBufferFillUSM(pi_ext_command_buffer CommandBuffer, + void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size, + pi_uint32 NumSyncPointsInWaitList, + const pi_ext_sync_point *SyncPointWaitList, + pi_ext_sync_point *SyncPoint) { + return pi2ur::piextCommandBufferFillUSM( + CommandBuffer, Ptr, Pattern, PatternSize, Size, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); +} + pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, pi_queue Queue, pi_uint32 NumEventsInWaitList, diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index bf69241266c28..4d46c104a70b0 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -70,7 +70,7 @@ void duplicateNode(const std::shared_ptr Node, NodeCopy = std::make_shared(); NodeCopy->MCGType = sycl::detail::CG::None; } else { - NodeCopy = std::make_shared(Node->MCGType, Node->getCGCopy()); + NodeCopy = std::make_shared(Node->MNodeType, Node->getCGCopy()); } } @@ -156,6 +156,32 @@ bool isPartitionRoot(std::shared_ptr Node) { return true; } +/// Takes a vector of weak_ptrs to node_impls and returns a vector of node +/// objects created from those impls, in the same order. +std::vector createNodesFromImpls( + const std::vector> &Impls) { + std::vector Nodes{}; + + for (std::weak_ptr Impl : Impls) { + Nodes.push_back(sycl::detail::createSyclObjFromImpl(Impl.lock())); + } + + return Nodes; +} + +/// Takes a vector of shared_ptrs to node_impls and returns a vector of node +/// objects created from those impls, in the same order. +std::vector createNodesFromImpls( + const std::vector> &Impls) { + std::vector Nodes{}; + + for (std::shared_ptr Impl : Impls) { + Nodes.push_back(sycl::detail::createSyclObjFromImpl(Impl)); + } + + return Nodes; +} + } // anonymous namespace void partition::schedule() { @@ -277,6 +303,7 @@ graph_impl::~graph_impl() { } std::shared_ptr graph_impl::addNodesToExits( + const std::shared_ptr &Impl, const std::list> &NodeList) { // Find all input and output nodes from the node list std::vector> Inputs; @@ -303,12 +330,14 @@ std::shared_ptr graph_impl::addNodesToExits( // Add all the new nodes to the node storage for (auto &Node : NodeList) { MNodeStorage.push_back(Node); + addEventForNode(Impl, std::make_shared(), Node); } - return this->add(Outputs); + return this->add(Impl, Outputs); } std::shared_ptr graph_impl::addSubgraphNodes( + const std::shared_ptr &Impl, const std::shared_ptr &SubGraphExec) { std::map, std::shared_ptr> NodesMap; @@ -331,7 +360,7 @@ std::shared_ptr graph_impl::addSubgraphNodes( } } - return addNodesToExits(NewNodesList); + return addNodesToExits(Impl, NewNodesList); } void graph_impl::addRoot(const std::shared_ptr &Root) { @@ -343,7 +372,8 @@ void graph_impl::removeRoot(const std::shared_ptr &Root) { } std::shared_ptr -graph_impl::add(const std::vector> &Dep) { +graph_impl::add(const std::shared_ptr &Impl, + const std::vector> &Dep) { // Copy deps so we can modify them auto Deps = Dep; @@ -355,7 +385,8 @@ graph_impl::add(const std::vector> &Dep) { MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); - + // Add an event associated with this explicit node for mixed usage + addEventForNode(Impl, std::make_shared(), NodeImpl); return NodeImpl; } @@ -382,11 +413,23 @@ graph_impl::add(const std::shared_ptr &Impl, if (Handler.MSubgraphNode) { return Handler.MSubgraphNode; } - return this->add(Handler.MCGType, std::move(Handler.MGraphNodeCG), Dep); + + node_type NodeType = + Handler.MImpl->MUserFacingNodeType != + ext::oneapi::experimental::node_type::empty + ? Handler.MImpl->MUserFacingNodeType + : ext::oneapi::experimental::detail::getNodeTypeFromCG( + Handler.MCGType); + + auto NodeImpl = this->add(NodeType, std::move(Handler.MGraphNodeCG), Dep); + // Add an event associated with this explicit node for mixed usage + addEventForNode(Impl, std::make_shared(), NodeImpl); + return NodeImpl; } std::shared_ptr -graph_impl::add(const std::vector Events) { +graph_impl::add(const std::shared_ptr &Impl, + const std::vector Events) { std::vector> Deps; @@ -401,11 +444,11 @@ graph_impl::add(const std::vector Events) { } } - return this->add(Deps); + return this->add(Impl, Deps); } std::shared_ptr -graph_impl::add(sycl::detail::CG::CGTYPE CGType, +graph_impl::add(node_type NodeType, std::unique_ptr CommandGroup, const std::vector> &Dep) { // Copy deps so we can modify them @@ -465,13 +508,13 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end()); const std::shared_ptr &NodeImpl = - std::make_shared(CGType, std::move(CommandGroup)); + std::make_shared(NodeType, std::move(CommandGroup)); MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); // Set barrier nodes as prerequisites (new start points) for subsequent nodes - if (CGType == sycl::detail::CG::Barrier) { + if (NodeImpl->MCGType == sycl::detail::CG::Barrier) { MExtraDependencies.push_back(NodeImpl); } @@ -927,7 +970,7 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { } graph_impl::WriteLock Lock(impl->MMutex); - std::shared_ptr NodeImpl = impl->add(DepImpls); + std::shared_ptr NodeImpl = impl->add(impl, DepImpls); return sycl::detail::createSyclObjFromImpl(NodeImpl); } @@ -1076,6 +1119,17 @@ void modifiable_command_graph::print_graph(std::string path, } } +std::vector modifiable_command_graph::get_nodes() const { + return createNodesFromImpls(impl->MNodeStorage); +} +std::vector modifiable_command_graph::get_root_nodes() const { + auto &Roots = impl->MRoots; + std::vector> Impls{}; + + std::copy(Roots.begin(), Roots.end(), std::back_inserter(Impls)); + return createNodesFromImpls(Impls); +} + executable_command_graph::executable_command_graph( const std::shared_ptr &Graph, const sycl::context &Ctx) : impl(std::make_shared(Ctx, Graph)) { @@ -1111,8 +1165,26 @@ void executable_command_graph::update( throw sycl::exception(sycl::make_error_code(errc::invalid), "Method not yet implemented"); } - } // namespace detail + +node_type node::get_type() const { return impl->MNodeType; } + +std::vector node::get_predecessors() const { + return detail::createNodesFromImpls(impl->MPredecessors); +} + +std::vector node::get_successors() const { + return detail::createNodesFromImpls(impl->MSuccessors); +} + +node node::get_node_from_event(event nodeEvent) { + auto EventImpl = sycl::detail::getSyclObjImpl(nodeEvent); + auto GraphImpl = EventImpl->getCommandGraph(); + + return sycl::detail::createSyclObjFromImpl( + GraphImpl->getNodeForEvent(EventImpl)); +} + } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 46bc15f7b8022..019a9b2dc7018 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -37,6 +37,42 @@ namespace oneapi { namespace experimental { namespace detail { +inline node_type getNodeTypeFromCG(sycl::detail::CG::CGTYPE CGType) { + using sycl::detail::CG; + + // TODO: Handle subgraph case when internal representation has been changed to + // contain a single subgraph node. The current approach copies nodes into the + // parent graph which prevents this. + switch (CGType) { + case CG::None: + return node_type::empty; + case CG::Kernel: + return node_type::kernel; + case CG::CopyAccToPtr: + case CG::CopyPtrToAcc: + case CG::CopyAccToAcc: + case CG::CopyUSM: + return node_type::memcpy; + case CG::Memset2DUSM: + return node_type::memset; + case CG::Fill: + case CG::FillUSM: + return node_type::memfill; + case CG::PrefetchUSM: + return node_type::prefetch; + case CG::AdviseUSM: + return node_type::memadvise; + case CG::Barrier: + case CG::BarrierWaitlist: + return node_type::ext_oneapi_barrier; + case CG::CodeplayHostTask: + return node_type::host_task; + default: + assert(false && "Invalid Graph Node Type"); + return node_type::empty; + } +} + /// Implementation of node class from SYCL_EXT_ONEAPI_GRAPH. class node_impl { public: @@ -48,6 +84,8 @@ class node_impl { std::vector> MPredecessors; /// Type of the command-group for the node. sycl::detail::CG::CGTYPE MCGType = sycl::detail::CG::None; + /// User facing type of the node + node_type MNodeType = node_type::empty; /// Command group object which stores all args etc needed to enqueue the node std::unique_ptr MCommandGroup; @@ -93,12 +131,13 @@ class node_impl { node_impl() {} /// Construct a node representing a command-group. - /// @param CGType Type of the command-group. + /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. - node_impl(sycl::detail::CG::CGTYPE CGType, + node_impl(node_type NodeType, std::unique_ptr &&CommandGroup) - : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} + : MCGType(CommandGroup->getType()), MNodeType(NodeType), + MCommandGroup(std::move(CommandGroup)) {} /// Checks if this node has a given requirement. /// @param Requirement Requirement to lookup. @@ -551,13 +590,12 @@ class graph_impl { void removeRoot(const std::shared_ptr &Root); /// Create a kernel node in the graph. - /// @param CGType Type of the command-group. + /// @param NodeType User facing type of the node. /// @param CommandGroup The CG which stores all information for this node. /// @param Dep Dependencies of the created node. /// @return Created node in the graph. std::shared_ptr - add(sycl::detail::CG::CGTYPE CGType, - std::unique_ptr CommandGroup, + add(node_type NodeType, std::unique_ptr CommandGroup, const std::vector> &Dep = {}); /// Create a CGF node in the graph. @@ -573,16 +611,20 @@ class graph_impl { const std::vector> &Dep = {}); /// Create an empty node in the graph. + /// @param Impl Graph implementation pointer /// @param Dep List of predecessor nodes. /// @return Created node in the graph. std::shared_ptr - add(const std::vector> &Dep = {}); + add(const std::shared_ptr &Impl, + const std::vector> &Dep = {}); /// Create an empty node in the graph. + /// @param Impl Graph implementation pointer /// @param Events List of events associated to this node. /// @return Created node in the graph. std::shared_ptr - add(const std::vector Events); + add(const std::shared_ptr &Impl, + const std::vector Events); /// Add a queue to the set of queues which are currently recording to this /// graph. @@ -607,10 +649,15 @@ class graph_impl { bool clearQueues(); /// Associate a sycl event with a node in the graph. + /// @param GraphImpl shared_ptr to Graph impl associated with this event, aka + /// this /// @param EventImpl Event to associate with a node in map. /// @param NodeImpl Node to associate with event in map. - void addEventForNode(std::shared_ptr EventImpl, + void addEventForNode(std::shared_ptr GraphImpl, + std::shared_ptr EventImpl, std::shared_ptr NodeImpl) { + if (!EventImpl->getCommandGraph()) + EventImpl->setCommandGraph(GraphImpl); MEventsMap[EventImpl] = NodeImpl; } @@ -632,12 +679,28 @@ class graph_impl { "No event has been recorded for the specified graph node"); } + std::shared_ptr + getNodeForEvent(std::shared_ptr EventImpl) { + ReadLock Lock(MMutex); + + if (auto NodeFound = MEventsMap.find(EventImpl); + NodeFound != std::end(MEventsMap)) { + return NodeFound->second; + } + + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No node in this graph is associated with this event"); + } + /// Duplicates and Adds sub-graph nodes from an executable graph to this /// graph. + /// @param Impl Graph implementation pointer /// @param SubGraphExec sub-graph to add to the parent. /// @return An empty node is used to schedule dependencies on this sub-graph. std::shared_ptr - addSubgraphNodes(const std::shared_ptr &SubGraphExec); + addSubgraphNodes(const std::shared_ptr &Impl, + const std::shared_ptr &SubGraphExec); /// Query for the context tied to this graph. /// @return Context associated with graph. @@ -873,10 +936,12 @@ class graph_impl { void addRoot(const std::shared_ptr &Root); /// Adds nodes to the exit nodes of this graph. + /// @param Impl Graph implementation pointer /// @param NodeList List of nodes from sub-graph in schedule order. /// @return An empty node is used to schedule dependencies on this sub-graph. std::shared_ptr - addNodesToExits(const std::list> &NodeList); + addNodesToExits(const std::shared_ptr &Impl, + const std::list> &NodeList); /// Adds dependencies for a new node, if it has no deps it will be /// added as a root node. diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index d98602ab02e35..a6f4622587fcf 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -11,6 +11,7 @@ #include "sycl/handler.hpp" #include #include +#include namespace sycl { inline namespace _V1 { @@ -117,6 +118,14 @@ class handler_impl { // Extra information for semaphore interoperability sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle; + + // The user facing node type, used for operations which are recorded to a + // graph. Since some operations may actually be a different type than the user + // submitted, e.g. a fill() which is performed as a kernel submission. This is + // used to pass the type that the user expects to graph nodes when they are + // created for later query by users. + sycl::ext::oneapi::experimental::node_type MUserFacingNodeType = + sycl::ext::oneapi::experimental::node_type::empty; }; } // namespace detail diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index ae357a8f4fe5b..d0071dbabd15a 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1666,6 +1666,50 @@ void MemoryManager::ext_oneapi_copy_usm_cmd_buffer( } } +void MemoryManager::ext_oneapi_fill_usm_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, + size_t Len, int Pattern, std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + + if (!DstMem) + throw runtime_error("NULL pointer argument in memory fill operation.", + PI_ERROR_INVALID_VALUE); + + const PluginPtr &Plugin = Context->getPlugin(); + // Pattern is interpreted as an unsigned char so pattern size is always 1. + size_t PatternSize = 1; + Plugin->call( + CommandBuffer, DstMem, &Pattern, PatternSize, Len, Deps.size(), + Deps.data(), OutSyncPoint); +} + +void MemoryManager::ext_oneapi_fill_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, SYCLMemObjI *SYCLMemObj, + void *Mem, size_t PatternSize, const char *Pattern, unsigned int Dim, + sycl::range<3> Size, sycl::range<3> AccessRange, sycl::id<3> AccessOffset, + unsigned int ElementSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { + assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + + const PluginPtr &Plugin = Context->getPlugin(); + if (SYCLMemObj->getType() != detail::SYCLMemObjI::MemObjType::Buffer) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Images are not supported in Graphs"); + } + if (Dim <= 1) { + Plugin->call( + CommandBuffer, pi::cast(Mem), Pattern, + PatternSize, AccessOffset[0] * ElementSize, + AccessRange[0] * ElementSize, Deps.size(), Deps.data(), OutSyncPoint); + return; + } + throw runtime_error("Not supported configuration of fill requested", + PI_ERROR_INVALID_OPERATION); +} + void MemoryManager::copy_image_bindless( void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index a1b68b1418c69..6169c99392f66 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -316,6 +316,24 @@ class __SYCL_EXPORT MemoryManager { void *DstMem, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + static void ext_oneapi_fill_usm_cmd_buffer( + sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *DstMem, + size_t Len, int Pattern, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + + static void + ext_oneapi_fill_cmd_buffer(sycl::detail::ContextImplPtr Context, + sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + SYCLMemObjI *SYCLMemObj, void *Mem, + size_t PatternSize, const char *Pattern, + unsigned int Dim, sycl::range<3> Size, + sycl::range<3> AccessRange, + sycl::id<3> AccessOffset, unsigned int ElementSize, + std::vector Deps, + sycl::detail::pi::PiExtSyncPoint *OutSyncPoint); + static void copy_image_bindless(void *Src, QueueImplPtr Queue, void *Dst, const sycl::detail::pi::PiMemImageDesc &Desc, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index db14a10943ce3..2ffc0ebd54a38 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2764,6 +2764,28 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return PI_SUCCESS; } + case CG::CGTYPE::Fill: { + CGFill *Fill = (CGFill *)MCommandGroup.get(); + Requirement *Req = (Requirement *)(Fill->getReqToFill()); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::ext_oneapi_fill_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, AllocaCmd->getSYCLMemObj(), + AllocaCmd->getMemAllocation(), Fill->MPattern.size(), + Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange, + Req->MOffset, Req->MElemSize, std::move(MSyncPointDeps), &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } + case CG::CGTYPE::FillUSM: { + CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get(); + MemoryManager::ext_oneapi_fill_usm_cmd_buffer( + MQueue->getContextImplPtr(), MCommandBuffer, Fill->getDst(), + Fill->getLength(), Fill->getFill(), std::move(MSyncPointDeps), + &OutSyncPoint); + MEvent->setSyncPoint(OutSyncPoint); + return PI_SUCCESS; + } default: throw runtime_error("CG type not implemented for command buffers.", PI_ERROR_INVALID_OPERATION); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index e07f53d15141e..f7c3c613235f7 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -510,6 +510,12 @@ event handler::finalize() { ext::oneapi::experimental::detail::graph_impl::WriteLock Lock( GraphImpl->MMutex); + ext::oneapi::experimental::node_type NodeType = + MImpl->MUserFacingNodeType != + ext::oneapi::experimental::node_type::empty + ? MImpl->MUserFacingNodeType + : ext::oneapi::experimental::detail::getNodeTypeFromCG(MCGType); + // Create a new node in the graph representing this command-group if (MQueue->isInOrder()) { // In-order queues create implicit linear dependencies between nodes. @@ -518,22 +524,20 @@ event handler::finalize() { auto DependentNode = GraphImpl->getLastInorderNode(MQueue); NodeImpl = DependentNode - ? GraphImpl->add(MCGType, std::move(CommandGroup), + ? GraphImpl->add(NodeType, std::move(CommandGroup), {DependentNode}) - : GraphImpl->add(MCGType, std::move(CommandGroup)); + : GraphImpl->add(NodeType, std::move(CommandGroup)); // If we are recording an in-order queue remember the new node, so it // can be used as a dependency for any more nodes recorded from this // queue. GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { - NodeImpl = GraphImpl->add(MCGType, std::move(CommandGroup)); + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); } // Associate an event with this new node and return the event. - GraphImpl->addEventForNode(EventImpl, NodeImpl); - - EventImpl->setCommandGraph(GraphImpl); + GraphImpl->addEventForNode(GraphImpl, EventImpl, NodeImpl); return detail::createSyclObjFromImpl(EventImpl); } @@ -897,6 +901,7 @@ void handler::memset(void *Dest, int Value, size_t Count) { MDstPtr = Dest; MPattern.push_back(static_cast(Value)); MLength = Count; + setUserFacingNodeType(ext::oneapi::experimental::node_type::memset); setType(detail::CG::FillUSM); } @@ -1418,7 +1423,7 @@ void handler::ext_oneapi_graph( // return it to the user later. // The nodes of the subgraph are duplicated when added to its parents. // This avoids changing properties of the graph added as a subgraph. - MSubgraphNode = ParentGraph->addSubgraphNodes(GraphImpl); + MSubgraphNode = ParentGraph->addSubgraphNodes(ParentGraph, GraphImpl); // If we are recording an in-order queue remember the subgraph node, so it // can be used as a dependency for any more nodes recorded from this queue. @@ -1427,8 +1432,7 @@ void handler::ext_oneapi_graph( } // Associate an event with the subgraph node. auto SubgraphEvent = std::make_shared(); - SubgraphEvent->setCommandGraph(ParentGraph); - ParentGraph->addEventForNode(SubgraphEvent, MSubgraphNode); + ParentGraph->addEventForNode(ParentGraph, SubgraphEvent, MSubgraphNode); } else { // Set the exec graph for execution during finalize. MExecGraph = GraphImpl; @@ -1443,6 +1447,10 @@ handler::getCommandGraph() const { return MQueue->getCommandGraph(); } +void handler::setUserFacingNodeType(ext::oneapi::experimental::node_type Type) { + MImpl->MUserFacingNodeType = Type; +} + std::optional> handler::getMaxWorkGroups() { auto Dev = detail::getSyclObjImpl(detail::getDeviceFromHandler(*this)); std::array PiResult = {}; diff --git a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp index 994b61485801e..747ce4ed77465 100644 --- a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp +++ b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp @@ -19,12 +19,12 @@ namespace s = sycl; constexpr s::access::mode sycl_read = s::access::mode::read; constexpr s::access::mode sycl_write = s::access::mode::write; -#define TEST_NUM 63 +#define TEST_NUM 64 double ref[TEST_NUM] = { - 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0, 1, 0, 2, 0, 0, - 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, 0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.5, 0, 2, 0, 0, 1, 0, 2, 0, 0, + 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, 0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; double refIptr = 1; @@ -59,6 +59,7 @@ template void device_cmath_test(s::queue &deviceQueue) { T minus_infinity = -INFINITY; double subnormal; *((uint64_t *)&subnormal) = 0xFFFFFFFFFFFFFULL; + res_access[i++] = std::fabs(-1.0); res_access[i++] = std::cos(0.0); res_access[i++] = std::sin(0.0); res_access[i++] = std::round(1.0); diff --git a/sycl/test-e2e/DeviceLib/cmath_test.cpp b/sycl/test-e2e/DeviceLib/cmath_test.cpp index 3cc359f79fb94..77aeb312571d2 100644 --- a/sycl/test-e2e/DeviceLib/cmath_test.cpp +++ b/sycl/test-e2e/DeviceLib/cmath_test.cpp @@ -143,15 +143,15 @@ template void device_cmath_test_1(s::queue &deviceQueue) { assert(quo == 0); } -// MSVC implements std::ldexp and std::frexp by invoking the -// 'double' version of corresponding C math functions(ldexp and frexp). Those -// 2 functions can only work on Windows with fp64 extension support from -// underlying device. +// MSVC implements std::ldexp, std::fabs and std::frexp by +// invoking the 'double' version of corresponding C math functions(ldexp, fabs +// and frexp). Those functions can only work on Windows with fp64 extension +// support from underlying device. #ifndef _WIN32 template void device_cmath_test_2(s::queue &deviceQueue) { s::range<1> numOfItems{2}; T result[2] = {-1}; - T ref[2] = {0, 2}; + T ref[3] = {0, 2, 1}; // Variable exponent is an integer value to store the exponent in frexp // function int exponent = -1; @@ -166,6 +166,7 @@ template void device_cmath_test_2(s::queue &deviceQueue) { int i = 0; res_access[i++] = std::frexp(0.0f, &exp_access[0]); res_access[i++] = std::ldexp(1.0f, 1); + res_access[i++] = std::fabs(-1.0f); }); }); } diff --git a/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp b/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp new file mode 100644 index 0000000000000..73b961994a72b --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_fill.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} +// +// TODO enable cuda once buffer issue investigated and fixed +// UNSUPPORTED: cuda + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_fill.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp new file mode 100644 index 0000000000000..a8a42abc1acd0 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} + +// USM memset command not supported for OpenCL +// UNSUPPORTED: opencl + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_memset.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp b/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp new file mode 100644 index 0000000000000..351194dadda0f --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_fill.cpp @@ -0,0 +1,88 @@ +// Tests adding a Buffer fill operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + const size_t N = 10; + const float Pattern = 3.14f; + std::vector Data(N); + buffer Buffer{Data}; + + const uint64_t PatternI64 = 0x3333333355555555; + std::vector DataI64(N); + buffer BufferI64{DataI64}; + + const uint32_t PatternI32 = 888; + std::vector DataI32(N); + buffer BufferI32{DataI32}; + + const uint16_t PatternI16 = 777; + std::vector DataI16(N); + buffer BufferI16{DataI16}; + + const uint8_t PatternI8 = 33; + std::vector DataI8(N); + buffer BufferI8{DataI8}; + + Buffer.set_write_back(false); + BufferI64.set_write_back(false); + BufferI32.set_write_back(false); + BufferI16.set_write_back(false); + BufferI8.set_write_back(false); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Pattern); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI64.get_access(CGH); + CGH.fill(Acc, PatternI64); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI32.get_access(CGH); + CGH.fill(Acc, PatternI32); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI16.get_access(CGH); + CGH.fill(Acc, PatternI16); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = BufferI8.get_access(CGH); + CGH.fill(Acc, PatternI8); + }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + } + host_accessor HostData(Buffer); + host_accessor HostDataI64(BufferI64); + host_accessor HostDataI32(BufferI32); + host_accessor HostDataI16(BufferI16); + host_accessor HostDataI8(BufferI8); + for (int i = 0; i < N; i++) { + assert(HostData[i] == Pattern); + assert(HostDataI64[i] == PatternI64); + assert(HostDataI32[i] == PatternI32); + assert(HostDataI16[i] == PatternI16); + assert(HostDataI8[i] == PatternI8); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_memset.cpp b/sycl/test-e2e/Graph/Inputs/usm_memset.cpp new file mode 100644 index 0000000000000..f357b9b3a5adf --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_memset.cpp @@ -0,0 +1,34 @@ +// Tests adding a USM memset operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue{{sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + if (!are_graphs_supported(Queue)) { + return 0; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + unsigned char *Arr = malloc_device(N, Queue); + + int Value = 77; + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.memset(Arr, Value, N); }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == Value); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp new file mode 100644 index 0000000000000..91729ace49742 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_fill.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} +// +// TODO enable cuda once buffer issue investigated and fixed +// UNSUPPORTED: cuda + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_fill.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp new file mode 100644 index 0000000000000..acbb0a502c67f --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp @@ -0,0 +1,11 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %} + +// USM memset command not supported for OpenCL +// UNSUPPORTED: opencl + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_memset.cpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp index 619cac27a4d28..d90d46a90f801 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp @@ -6,18 +6,21 @@ template (A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 4349b32745751..0c458f67658d5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -76,14 +76,6 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { assert(rowsA % tM == 0); assert(colsA % tK == 0); assert(colsB % tN == 0); - - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - // submit main kernel std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); @@ -94,6 +86,15 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { // loop global // loop localrange [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); auto m2 = it.get_group(0); auto n2 = it.get_group(1); auto m1 = it.get_local_id(0); diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp index c2716c94a359c..f11ef0eadd7df 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -19,19 +19,22 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp index 3607eab14fbc0..82116e6ea2b2a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp @@ -18,20 +18,21 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q, unsigned int vnniFactor) { // Add one iteration for the out of bounds dpas instruction size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0); size_t NDRangeN = N / TN; - - auto pA = address_space_cast(A); - auto pB = address_space_cast(B); - auto pC = address_space_cast(C); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 3564472c5d958..9c435d1ee4337 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -13,20 +13,21 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - auto p_input = address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index abf73cce97bd0..fcdf008702292 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -84,9 +84,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 3940b6d80677a..c0dacf2632e9b 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -84,9 +84,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 38b3a420b2e71..a2bd23cbf26ce 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -83,9 +83,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 11ee74902849b..8bece2c54db32 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -83,9 +83,11 @@ piSamplerRetain piTearDown piextBindlessImageSamplerCreate piextCommandBufferCreate +piextCommandBufferFillUSM piextCommandBufferFinalize piextCommandBufferMemBufferCopy piextCommandBufferMemBufferCopyRect +piextCommandBufferMemBufferFill piextCommandBufferMemBufferRead piextCommandBufferMemBufferReadRect piextCommandBufferMemBufferWrite diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 62336dad2ed28..4ee98875d4444 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3715,6 +3715,7 @@ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_ _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_semaphore_handleERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext6oneapi12experimental33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE +_ZN4sycl3_V13ext6oneapi12experimental4node19get_node_from_eventENS0_5eventE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev @@ -3926,10 +3927,12 @@ _ZN4sycl3_V16detail13MemoryManager21copy_to_device_globalEPKvbSt10shared_ptrINS1 _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvRKSt6vectorIP9_pi_eventSaISB_EEPSB_ _ZN4sycl3_V16detail13MemoryManager23copy_from_device_globalEPKvbSt10shared_ptrINS1_10queue_implEEmmPvRKSt6vectorIP9_pi_eventSaISB_EEPSB_RKS5_INS1_10event_implEE _ZN4sycl3_V16detail13MemoryManager24allocateInteropMemObjectESt10shared_ptrINS1_12context_implEEPvRKS3_INS1_10event_implEERKS5_RKNS0_13property_listERP9_pi_event +_ZN4sycl3_V16detail13MemoryManager26ext_oneapi_fill_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvmPKcjNS0_5rangeILi3EEESE_NS0_2idILi3EEEjSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSA_jSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyD2H_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPvjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjPcjSC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager29ext_oneapi_copyH2D_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPNS1_11SYCLMemObjIEPcjNS0_5rangeILi3EEENS0_2idILi3EEEjPvjSC_SC_SE_jSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager30ext_oneapi_copy_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEPKvP22_pi_ext_command_buffermPvSt6vectorIjSaIjEEPj +_ZN4sycl3_V16detail13MemoryManager30ext_oneapi_fill_usm_cmd_bufferESt10shared_ptrINS1_12context_implEEP22_pi_ext_command_bufferPvmiSt6vectorIjSaIjEEPj _ZN4sycl3_V16detail13MemoryManager3mapEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEENS0_6access4modeEjNS0_5rangeILi3EEESC_NS0_2idILi3EEEjSt6vectorIP9_pi_eventSaISH_EERSH_ _ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_ _ZN4sycl3_V16detail13MemoryManager4copyEPNS1_11SYCLMemObjIEPvSt10shared_ptrINS1_10queue_implEEjNS0_5rangeILi3EEESA_NS0_2idILi3EEEjS5_S8_jSA_SA_SC_jSt6vectorIP9_pi_eventSaISF_EERSF_RKS6_INS1_10event_implEE @@ -4124,6 +4127,7 @@ _ZN4sycl3_V17handler20associateWithHandlerEPNS0_6detail30UnsampledImageAccessorB _ZN4sycl3_V17handler20memcpyToDeviceGlobalEPKvS3_bmm _ZN4sycl3_V17handler20setKernelCacheConfigE23_pi_kernel_cache_config _ZN4sycl3_V17handler20setStateSpecConstSetEv +_ZN4sycl3_V17handler21setUserFacingNodeTypeENS0_3ext6oneapi12experimental9node_typeE _ZN4sycl3_V17handler22ext_oneapi_fill2d_implEPvmPKvmmm _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE @@ -4210,8 +4214,13 @@ _ZNK4sycl3_V114interop_handle16getNativeContextEv _ZNK4sycl3_V115device_selector13select_deviceEv _ZNK4sycl3_V116default_selectorclERKNS0_6deviceE _ZNK4sycl3_V120accelerator_selectorclERKNS0_6deviceE +_ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv +_ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv +_ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph11print_graphENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb +_ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph14get_root_nodesEv _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph8finalizeERKNS0_13property_listE +_ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9get_nodesEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem16get_channel_typeEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem16get_num_channelsEv _ZNK4sycl3_V13ext6oneapi12experimental9image_mem17get_channel_orderEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index c3e000b4f9553..01881bc1d38d7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1041,6 +1041,8 @@ ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ ?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z +?ext_oneapi_fill_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAX_KPEBDIV?$range@$02@34@6V?$id@$02@34@IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_fill_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KHV?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ ?ext_oneapi_get_kernel@kernel_bundle_plain@detail@_V1@sycl@@QEAA?AVkernel@34@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z ?ext_oneapi_get_state@queue@_V1@sycl@@QEBA?AW4queue_state@experimental@oneapi@ext@23@XZ @@ -1263,32 +1265,38 @@ ?get_max_statement_size@stream@_V1@sycl@@QEBA_KXZ ?get_max_statement_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ ?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVdevice@45@AEBVcontext@45@@Z -?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z -?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z -?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ -?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ -?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z -?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ +?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z +?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z +?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ +?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z +?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ +?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z +?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ ?get_pitch@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$01@34@XZ ?get_platform@context@_V1@sycl@@QEBA?AVplatform@23@XZ ?get_platform@device@_V1@sycl@@QEBA?AVplatform@23@XZ ?get_platforms@platform@_V1@sycl@@SA?AV?$vector@Vplatform@_V1@sycl@@V?$allocator@Vplatform@_V1@sycl@@@std@@@std@@XZ -?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z -?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z -?get_precision@stream@_V1@sycl@@QEBA_KXZ -?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ -?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ -?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ -?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ -?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ -?get_size@stream@_V1@sycl@@QEBA_KXZ -?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ -?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z -?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ -?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ -?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ -?get_width@stream@_V1@sycl@@QEBA_KXZ -?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ +?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z +?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z +?get_precision@stream@_V1@sycl@@QEBA_KXZ +?get_predecessors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ +?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ +?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ +?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ +?get_root_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ +?get_size@stream@_V1@sycl@@QEBA_KXZ +?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ +?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z +?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ +?get_successors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ +?get_type@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4node_type@23456@XZ +?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ +?get_width@stream@_V1@sycl@@QEBA_KXZ +?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ ?get_work_item_buffer_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ ?gpu_selector_v@_V1@sycl@@YAHAEBVdevice@12@@Z ?handleHostData@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXPEAX@Z@std@@_K_N@Z @@ -1472,12 +1480,13 @@ ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ -?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ -?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ -?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z -?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z -?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z -?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ +?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ +?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ +?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z +?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z +?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z +?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z +?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXXZ ?set_final_data_internal@image_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 8ad08257a1046..660f84c63c3b9 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -804,7 +804,9 @@ TEST_F(CommandGraphTest, RecordSubGraph) { // The first and fourth nodes should have events associated with MainGraph but // not graph. The second and third nodes were added as a sub-graph and - // duplicated. They should not have events associated with Graph or MainGraph. + // duplicated. They should only have events associated with MainGraph, however + // these events are created internally in the graph and not present in user + // code. ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); ASSERT_EQ( @@ -812,13 +814,13 @@ TEST_F(CommandGraphTest, RecordSubGraph) { sycl::detail::getSyclObjImpl(Node1MainGraph)); ScheduleIt++; - ASSERT_ANY_THROW( + ASSERT_NO_THROW( sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt)); ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); ScheduleIt++; - ASSERT_ANY_THROW( + ASSERT_NO_THROW( sycl::detail::getSyclObjImpl(MainGraph)->getEventForNode(*ScheduleIt)); ASSERT_ANY_THROW( sycl::detail::getSyclObjImpl(Graph)->getEventForNode(*ScheduleIt)); @@ -1941,6 +1943,149 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) { ASSERT_FALSE(PartitionsList[4]->isHostTask()); } +TEST_F(CommandGraphTest, GetNodeQueries) { + // Tests graph and node queries for correctness + + // Add some nodes to the graph for testing and test after each addition. + auto RootA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 1lu); + ASSERT_EQ(GraphNodes.size(), 1lu); + } + auto RootB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 2lu); + } + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootA, RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 3lu); + } + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(RootB)}); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 2lu); + ASSERT_EQ(GraphNodes.size(), 4lu); + } + auto RootC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + { + auto GraphRoots = Graph.get_root_nodes(); + auto GraphNodes = Graph.get_nodes(); + ASSERT_EQ(GraphRoots.size(), 3lu); + ASSERT_EQ(GraphNodes.size(), 5lu); + } + + ASSERT_EQ(RootA.get_predecessors().size(), 0lu); + ASSERT_EQ(RootA.get_successors().size(), 1lu); + ASSERT_EQ(RootB.get_predecessors().size(), 0lu); + ASSERT_EQ(RootB.get_successors().size(), 2lu); + ASSERT_EQ(RootC.get_predecessors().size(), 0lu); + ASSERT_EQ(RootC.get_successors().size(), 0lu); + ASSERT_EQ(NodeA.get_predecessors().size(), 2lu); + ASSERT_EQ(NodeA.get_successors().size(), 0lu); + ASSERT_EQ(NodeB.get_predecessors().size(), 1lu); + ASSERT_EQ(NodeB.get_successors().size(), 0lu); + + // List of nodes that we've added in the order they were added. + std::vector NodeList{RootA, RootB, NodeA, NodeB, RootC}; + auto GraphNodes = Graph.get_nodes(); + + // Check ordering of all nodes is correct + for (size_t i = 0; i < GraphNodes.size(); i++) { + ASSERT_EQ(sycl::detail::getSyclObjImpl(GraphNodes[i]), + sycl::detail::getSyclObjImpl(NodeList[i])); + } +} + +TEST_F(CommandGraphTest, NodeTypeQueries) { + + // Allocate some pointers for testing memory nodes + int *PtrA = malloc_device(16, Queue); + int *PtrB = malloc_device(16, Queue); + + auto NodeKernel = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_EQ(NodeKernel.get_type(), experimental::node_type::kernel); + + auto NodeMemcpy = Graph.add( + [&](sycl::handler &cgh) { cgh.memcpy(PtrA, PtrB, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemcpy.get_type(), experimental::node_type::memcpy); + + auto NodeMemset = Graph.add( + [&](sycl::handler &cgh) { cgh.memset(PtrB, 7, 16 * sizeof(int)); }); + ASSERT_EQ(NodeMemset.get_type(), experimental::node_type::memset); + + auto NodeMemfill = + Graph.add([&](sycl::handler &cgh) { cgh.fill(PtrB, 7, 16); }); + ASSERT_EQ(NodeMemfill.get_type(), experimental::node_type::memfill); + + auto NodePrefetch = Graph.add( + [&](sycl::handler &cgh) { cgh.prefetch(PtrA, 16 * sizeof(int)); }); + ASSERT_EQ(NodePrefetch.get_type(), experimental::node_type::prefetch); + + auto NodeMemadvise = Graph.add( + [&](sycl::handler &cgh) { cgh.mem_advise(PtrA, 16 * sizeof(int), 1); }); + ASSERT_EQ(NodeMemadvise.get_type(), experimental::node_type::memadvise); + + // Use queue recording for barrier since it is not supported in explicit API + Graph.begin_recording(Queue); + auto EventBarrier = + Queue.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + Graph.end_recording(); + + auto NodeBarrier = experimental::node::get_node_from_event(EventBarrier); + ASSERT_EQ(NodeBarrier.get_type(), + experimental::node_type::ext_oneapi_barrier); + + auto NodeHostTask = + Graph.add([&](sycl::handler &cgh) { cgh.host_task([]() {}); }); + ASSERT_EQ(NodeHostTask.get_type(), experimental::node_type::host_task); + + auto NodeEmpty = Graph.add(); + ASSERT_EQ(NodeEmpty.get_type(), experimental::node_type::empty); + + // TODO: Test subgraph case once changes have been implemented. +} + +TEST_F(CommandGraphTest, GetNodeFromEvent) { + // Test getting a node from a recorded event and using that as a dependency + // for an explicit node + Graph.begin_recording(Queue); + auto EventKernel = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + experimental::node NodeKernelA = + experimental::node::get_node_from_event(EventKernel); + + // Add node as a dependency with the property + auto NodeKernelB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + experimental::property::node::depends_on(NodeKernelA)); + + // Test adding a dependency through make_edge + auto NodeKernelC = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + ASSERT_NO_THROW(Graph.make_edge(NodeKernelA, NodeKernelC)); + + auto GraphExec = Graph.finalize(); +} + TEST_F(CommandGraphTest, ProfilingException) { Graph.begin_recording(Queue); auto Event1 = Queue.submit( @@ -2178,3 +2323,65 @@ TEST_F(MultiThreadGraphTest, Finalize) { ASSERT_EQ(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl), true); } } + +// Test adding fill and memset nodes to a graph +TEST_F(CommandGraphTest, FillMemsetNodes) { + const int Value = 7; + // Buffer fill + buffer Buffer{range<1>{1}}; + Buffer.set_write_back(false); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto NodeA = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + auto NodeB = Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.fill(Acc, Value); + }); + + auto NodeAImpl = sycl::detail::getSyclObjImpl(NodeA); + auto NodeBImpl = sycl::detail::getSyclObjImpl(NodeB); + + // Check Operator== + EXPECT_EQ(NodeAImpl, NodeAImpl); + EXPECT_NE(NodeAImpl, NodeBImpl); + } + + // USM + { + int *USMPtr = malloc_device(1, Queue); + + // We need to create some differences between nodes because unlike buffer + // fills they are not differentiated on accessor ptr value. + auto FillNodeA = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value, 1); }); + auto FillNodeB = + Graph.add([&](handler &CGH) { CGH.fill(USMPtr, Value + 1, 1); }); + auto MemsetNodeA = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 1); }); + auto MemsetNodeB = + Graph.add([&](handler &CGH) { CGH.memset(USMPtr, Value, 2); }); + + auto FillNodeAImpl = sycl::detail::getSyclObjImpl(FillNodeA); + auto FillNodeBImpl = sycl::detail::getSyclObjImpl(FillNodeB); + auto MemsetNodeAImpl = sycl::detail::getSyclObjImpl(MemsetNodeA); + auto MemsetNodeBImpl = sycl::detail::getSyclObjImpl(MemsetNodeB); + + // Check Operator== + EXPECT_EQ(FillNodeAImpl, FillNodeAImpl); + EXPECT_EQ(FillNodeBImpl, FillNodeBImpl); + EXPECT_NE(FillNodeAImpl, FillNodeBImpl); + + EXPECT_EQ(MemsetNodeAImpl, MemsetNodeAImpl); + EXPECT_EQ(MemsetNodeBImpl, MemsetNodeBImpl); + EXPECT_NE(MemsetNodeAImpl, MemsetNodeBImpl); + sycl::free(USMPtr, Queue); + } +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index f12917b1e70b9..decc1a7e309ee 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1376,6 +1376,23 @@ inline pi_result mock_piextCommandBufferMemBufferCopyRect( return PI_SUCCESS; } +inline pi_result mock_piextCommandBufferMemBufferFill( + pi_ext_command_buffer command_buffer, pi_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t size, + pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + +inline pi_result mock_piextCommandBufferFillUSM( + pi_ext_command_buffer command_buffer, void *ptr, const void *pattern, + size_t pattern_size, size_t size, pi_uint32 num_sync_points_in_wait_list, + const pi_ext_sync_point *sync_point_wait_list, + pi_ext_sync_point *sync_point) { + return PI_SUCCESS; +} + inline pi_result mock_piTearDown(void *PluginParameter) { return PI_SUCCESS; } inline pi_result mock_piPluginGetLastError(char **message) {