[OpenCL] Registers DepthwiseConv2dNativeBackpropInput #143

Open
wants to merge 2 commits into base: dev/eigen_mehdi
236 changes: 236 additions & 0 deletions tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -45,6 +45,9 @@ namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL

// Common code between the two backward pass kernels: verifies that the
// dimensions all match and extract the padded rows and columns.
@@ -612,6 +615,239 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};

#ifdef TENSORFLOW_USE_SYCL
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
class DepthwiseConv2dBackpropInputSYCLKernelNHWC {
public:
using write_accessor =
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
cl::sycl::access::target::global_buffer>;
using read_accessor =
cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
cl::sycl::access::target::global_buffer>;
DepthwiseConv2dBackpropInputSYCLKernelNHWC(const DepthwiseArgs args,
read_accessor out_backprop_accessor,
read_accessor filter_accessor,
write_accessor in_backprop_accessor)
: args_(args),
out_backprop_accessor_(out_backprop_accessor),
filter_accessor_(filter_accessor),
in_backprop_accessor_(in_backprop_accessor) {}
void operator()(cl::sycl::item<1> item) {
T* out_backprop_data = ConvertToActualTypeSycl(T, out_backprop_accessor_);
T* filter_data = ConvertToActualTypeSycl(T, filter_accessor_);
T* in_backprop_data = ConvertToActualTypeSycl(T, in_backprop_accessor_);

const int thread_id = item.get_linear_id();

const int filter_rows =
kKnownFilterHeight < 0 ? args_.filter_rows : kKnownFilterHeight;
const int filter_cols =
kKnownFilterWidth < 0 ? args_.filter_cols : kKnownFilterWidth;
const int depth_multiplier =
kKnownDepthMultiplier < 0 ? args_.depth_multiplier : kKnownDepthMultiplier;
// Compute the indexes of this thread in the output.
const int in_d = thread_id % args_.in_depth;
const int in_c = (thread_id / args_.in_depth) % args_.in_cols;
const int in_r = (thread_id / args_.in_depth / args_.in_cols) % args_.in_rows;
const int b = thread_id / args_.in_depth / args_.in_cols / args_.in_rows;
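// thread_id enumerates the NHWC input gradient, flattened as
// ((b * in_rows + in_r) * in_cols + in_c) * in_depth + in_d, so the
// decomposition above peels off depth first, then columns, then rows.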

T sum = T(0);

const int out_r_start =
std::max<int>(0, (in_r - filter_rows + args_.pad_rows + args_.stride) / args_.stride);
const int out_r_end = std::min(args_.out_rows - 1, (in_r + args_.pad_rows) / args_.stride);
const int out_c_start =
std::max(0, (in_c - filter_cols + args_.pad_cols + args_.stride) / args_.stride);
const int out_c_end = std::min(args_.out_cols - 1, (in_c + args_.pad_cols) / args_.stride);
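// Only output positions whose filter window (given the stride and padding)
// covers this input element contribute to its gradient; clamp the range to
// the valid output extent.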

for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
const int f_r = in_r + args_.pad_rows - out_r * args_.stride;
const int temp_out_backprop_offset =
args_.out_depth * args_.out_cols * (out_r + args_.out_rows * b);
const int temp_filter_offset = filter_cols * f_r;
for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
const int f_c = in_c + args_.pad_cols - out_c * args_.stride;
int filter_offset =
depth_multiplier * (in_d + args_.in_depth * (f_c + temp_filter_offset));
const int out_backprop_offset =
args_.out_depth * out_c + temp_out_backprop_offset;
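// Accumulate the contribution of every depth-multiplier channel that was
// produced from this input channel.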
#pragma unroll 6
for (int i = 0; i < depth_multiplier; ++i) {
sum +=
out_backprop_data[out_backprop_offset + in_d * depth_multiplier + i]
* filter_data[filter_offset + i];
}
}
}
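// Write the accumulated gradient at this thread's NHWC position in the
// input-gradient tensor.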
const int in_backprop_offset =
in_d + args_.in_depth * (in_c + args_.in_cols * (in_r + args_.in_rows * b));
in_backprop_data[in_backprop_offset] = sum;

}
private:
const DepthwiseArgs args_;
const read_accessor out_backprop_accessor_;
const read_accessor filter_accessor_;
write_accessor in_backprop_accessor_;
};

template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
void LaunchDepthwiseConv2dBackpropInputSYCL(const SYCLDevice& d,
const DepthwiseArgs args,
const Tensor& out_backprop,
const Tensor& filter, Tensor* in_backprop,
TensorFormat data_format) {
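// Launch one SYCL work-item per element of the input-gradient tensor.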
const int num_threads = in_backprop->NumElements();

auto out_backprop_buffer =
d.get_sycl_buffer(out_backprop.template flat<T>().data());
auto filter_buffer =
d.get_sycl_buffer(filter.template flat<T>().data());
auto in_backprop_buffer =
d.get_sycl_buffer(in_backprop->template flat<T>().data());
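// get_sycl_buffer wraps the raw tensor allocations; the accessors requested
// inside the command group below declare how each buffer is used
// (read for out_backprop and filter, write for in_backprop).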

d.sycl_queue().submit([&](cl::sycl::handler& cgh) {
auto out_backprop_access =
out_backprop_buffer
.template get_access<cl::sycl::access::mode::read>(cgh);
auto filter_access =
filter_buffer
.template get_access<cl::sycl::access::mode::read>(cgh);
auto in_backprop_access =
in_backprop_buffer
.template get_access<cl::sycl::access::mode::write>(cgh);

if (data_format == FORMAT_NHWC) {
DepthwiseConv2dBackpropInputSYCLKernelNHWC<T, kKnownFilterWidth,
kKnownFilterHeight, kKnownDepthMultiplier> functor(
args, out_backprop_access, filter_access, in_backprop_access);
cgh.parallel_for(cl::sycl::range<1>(num_threads), functor);
} else {
assert(false && "Incorrect data format");
return;
}
});
}

template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
void LaunchDepthwiseConv2dBackpropInputSYCL(const SYCLDevice& d,
const DepthwiseArgs args,
const Tensor& out_backprop,
const Tensor& filter, Tensor* in_backprop,
TensorFormat data_format) {
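// Dispatch on the depth multiplier: the common case of 1 gets its own
// instantiation so the inner loop is resolved at compile time; any other
// value falls back to reading the multiplier from args at run time (-1).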
if (args.depth_multiplier == 1) {
LaunchDepthwiseConv2dBackpropInputSYCL<T, kKnownFilterWidth,
kKnownFilterHeight, 1>(
d, args, out_backprop, filter, in_backprop, data_format);
} else {
LaunchDepthwiseConv2dBackpropInputSYCL<T, kKnownFilterWidth,
kKnownFilterHeight, -1>(
d, args, out_backprop, filter, in_backprop, data_format);
}
}

template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<SYCLDevice, T> {
static void launch(OpKernelContext* ctx, const DepthwiseArgs args,
const Tensor& out_backprop, const Tensor& filter,
Tensor* in_backprop, TensorFormat data_format) {
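// Specialize for 3x3 filters; all other filter sizes take the runtime (-1)
// path.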
const SYCLDevice& d = ctx->eigen_device<SYCLDevice>();
if (args.filter_rows == 3 && args.filter_cols == 3) {
LaunchDepthwiseConv2dBackpropInputSYCL<T, 3, 3>(
d, args, out_backprop, filter, in_backprop, data_format);
} else {
LaunchDepthwiseConv2dBackpropInputSYCL<T, -1, -1>(
d, args, out_backprop, filter, in_backprop, data_format);
}
}
};

template <class T>
class DepthwiseConv2dNativeBackpropInputOp<SYCLDevice, T> : public OpKernel {
public:
explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
: OpKernel(context) {
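// Read and validate the strides, data_format, and padding attributes once
// at kernel construction time.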
OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
OP_REQUIRES(context, strides_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));

string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));

stride_ = GetTensorDim(strides_, data_format_, 'H');
const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');

OP_REQUIRES(context, stride_ == stride_w,
errors::InvalidArgument(
"Current implementation only supports equal length "
"strides in the row and column dimensions."));
OP_REQUIRES(
context, (stride_n == 1 && stride_c == 1),
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
}

void Compute(OpKernelContext* context) override {
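// input_sizes (input 0) carries the shape of the original forward input;
// the gradient tensor allocated below has that same shape.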
const Tensor& input_sizes = context->input(0);
const Tensor& filter = context->input(1);
OP_REQUIRES(
context, TensorShapeUtils::IsVector(input_sizes.shape()),
errors::InvalidArgument(
"Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
input_sizes.dims()));
TensorShape input_shape;
const int32* in_sizes_data = input_sizes.template flat<int32>().data();
for (int i = 0; i < input_sizes.NumElements(); ++i) {
OP_REQUIRES(context, in_sizes_data[i] >= 0,
errors::InvalidArgument("Dimension ", i,
" of input_sizes must be >= 0"));
input_shape.AddDim(in_sizes_data[i]);
}
const TensorShape& filter_shape = filter.shape();
EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
Tensor* in_backprop = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
{0}, 0, input_shape, &in_backprop));
if (input_shape.num_elements() == 0) {
return;
}
LaunchDepthwiseConvBackpropInputOp<SYCLDevice, T>::launch(
context, args, out_backprop, filter, in_backprop,
data_format_);
}

private:
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
int64 stride_;

TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};
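// Register the SYCL kernel for float and double. input_sizes stays in host
// memory because its values are only read on the host to build the output
// shape.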

REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_SYCL)
.TypeConstraint<float>("T")
.HostMemory("input_sizes"),
DepthwiseConv2dNativeBackpropInputOp<SYCLDevice, float>);

REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_SYCL)
.TypeConstraint<double>("T")
.HostMemory("input_sizes"),
DepthwiseConv2dNativeBackpropInputOp<SYCLDevice, double>);
#endif // TENSORFLOW_USE_SYCL

#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
.Device(DEVICE_CPU) \