From a4494ea365dcbd52fa3db5aa3ed590be84e78f40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Miguel=20Sousa?= <luis.sousa@amd.com>
Date: Fri, 13 Dec 2024 11:12:10 +0100
Subject: [PATCH 1/4] Fix bug where emitc constants wouldn't be directly
 emitted in subscripts. (#411)

* Fix bug where emitc constants wouldn't be directly emitted in subscripts.

* Use ConstantOp as a deferred op when constantsAsVariables=false
---
 mlir/lib/Target/Cpp/TranslateToCpp.cpp        | 74 +++++++++++--------
 .../Cpp/emitc-constants-as-variables.mlir     | 50 ++++++++++++-
 2 files changed, 90 insertions(+), 34 deletions(-)
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index a2e368e502737d..e09fe1d1126be5 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -254,6 +254,16 @@ struct CppEmitter {
     return operandExpression == emittedExpression;
   };
 
+  /// Determine whether expression \p expressionOp should be emitted inline,
+  /// i.e. as part of its user. This function recommends inlining of any
+  /// expressions that can be inlined unless it is used by another expression,
+  /// under the assumption that  any expression fusion/re-materialization was
+  /// taken care of by transformations run by the backend.
+  bool shouldBeInlined(ExpressionOp expressionOp);
+
+  /// This emitter will only emit translation units whos id matches this value.
+  StringRef willOnlyEmitTu() { return onlyTu; }
+
 private:
   using ValueMapper = llvm::ScopedHashTable<Value, std::string>;
   using BlockMapper = llvm::ScopedHashTable<Block *, std::string>;
@@ -297,21 +307,22 @@ struct CppEmitter {
       return lowestPrecedence();
     return emittedExpressionPrecedence.back();
   }
+
+  /// Determine whether expression \p op should be emitted in a deferred way.
+  bool hasDeferredEmission(Operation *op);
 };
 } // namespace
 
-/// Determine whether expression \p op should be emitted in a deferred way.
-static bool hasDeferredEmission(Operation *op) {
+bool CppEmitter::hasDeferredEmission(Operation *op) {
+  if (llvm::isa_and_nonnull<emitc::ConstantOp>(op)) {
+    return !shouldUseConstantsAsVariables();
+  }
+
   return isa_and_nonnull<emitc::GetGlobalOp, emitc::LiteralOp, emitc::MemberOp,
                          emitc::MemberOfPtrOp, emitc::SubscriptOp>(op);
 }
 
-/// Determine whether expression \p expressionOp should be emitted inline, i.e.
-/// as part of its user. This function recommends inlining of any expressions
-/// that can be inlined unless it is used by another expression, under the
-/// assumption that  any expression fusion/re-materialization was taken care of
-/// by transformations run by the backend.
-static bool shouldBeInlined(ExpressionOp expressionOp) {
+bool CppEmitter::shouldBeInlined(ExpressionOp expressionOp) {
   // Do not inline if expression is marked as such.
   if (expressionOp.getDoNotInline())
     return false;
@@ -373,6 +384,25 @@ static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation,
 static LogicalResult printOperation(CppEmitter &emitter,
                                     emitc::ConstantOp constantOp) {
   if (!emitter.shouldUseConstantsAsVariables()) {
+    std::string out;
+    llvm::raw_string_ostream ss(out);
+
+    /// Temporary emitter object that writes to our stream instead of the output
+    /// allowing for the capture and caching of the produced string.
+    CppEmitter sniffer = CppEmitter(ss, emitter.shouldDeclareVariablesAtTop(),
+                                    emitter.willOnlyEmitTu(),
+                                    emitter.shouldUseConstantsAsVariables());
+
+    ss << "(";
+    if (failed(sniffer.emitType(constantOp.getLoc(), constantOp.getType())))
+      return failure();
+    ss << ") ";
+
+    if (failed(
+            sniffer.emitAttribute(constantOp.getLoc(), constantOp.getValue())))
+      return failure();
+
+    emitter.cacheDeferredOpResult(constantOp.getResult(), out);
     return success();
   }
 
@@ -838,7 +868,7 @@ static LogicalResult printOperation(CppEmitter &emitter, emitc::CastOp castOp) {
 
 static LogicalResult printOperation(CppEmitter &emitter,
                                     emitc::ExpressionOp expressionOp) {
-  if (shouldBeInlined(expressionOp))
+  if (emitter.shouldBeInlined(expressionOp))
     return success();
 
   Operation &op = *expressionOp.getOperation();
@@ -892,7 +922,7 @@ static LogicalResult printOperation(CppEmitter &emitter, emitc::ForOp forOp) {
         dyn_cast_if_present<ExpressionOp>(value.getDefiningOp());
     if (!expressionOp)
       return false;
-    return shouldBeInlined(expressionOp);
+    return emitter.shouldBeInlined(expressionOp);
   };
 
   os << "for (";
@@ -1114,7 +1144,7 @@ static LogicalResult printFunctionBody(CppEmitter &emitter,
         functionOp->walk<WalkOrder::PreOrder>([&](Operation *op) -> WalkResult {
           if (isa<emitc::ExpressionOp>(op->getParentOp()) ||
               (isa<emitc::ExpressionOp>(op) &&
-               shouldBeInlined(cast<emitc::ExpressionOp>(op))))
+               emitter.shouldBeInlined(cast<emitc::ExpressionOp>(op))))
             return WalkResult::skip();
           for (OpResult result : op->getResults()) {
             if (failed(emitter.emitVariableDeclaration(
@@ -1494,22 +1524,6 @@ LogicalResult CppEmitter::emitExpression(ExpressionOp expressionOp) {
 
 LogicalResult CppEmitter::emitOperand(Value value) {
   Operation *def = value.getDefiningOp();
-  if (!shouldUseConstantsAsVariables()) {
-    if (auto constant = dyn_cast_if_present<ConstantOp>(def)) {
-      os << "((";
-
-      if (failed(emitType(constant.getLoc(), constant.getType()))) {
-        return failure();
-      }
-      os << ") ";
-
-      if (failed(emitAttribute(constant.getLoc(), constant.getValue()))) {
-        return failure();
-      }
-      os << ")";
-      return success();
-    }
-  }
 
   if (isPartOfCurrentExpression(value)) {
     assert(def && "Expected operand to be defined by an operation");
@@ -1721,11 +1735,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
             cacheDeferredOpResult(op.getResult(), op.getValue());
             return success();
           })
-          .Case<emitc::MemberOp>([&](auto op) {
-            cacheDeferredOpResult(op.getResult(), createMemberAccess(op));
-            return success();
-          })
-          .Case<emitc::MemberOfPtrOp>([&](auto op) {
+          .Case<emitc::MemberOp, emitc::MemberOfPtrOp>([&](auto op) {
             cacheDeferredOpResult(op.getResult(), createMemberAccess(op));
             return success();
           })
diff --git a/mlir/test/Target/Cpp/emitc-constants-as-variables.mlir b/mlir/test/Target/Cpp/emitc-constants-as-variables.mlir
index 5774bdc47308ff..c908ecc460edff 100644
--- a/mlir/test/Target/Cpp/emitc-constants-as-variables.mlir
+++ b/mlir/test/Target/Cpp/emitc-constants-as-variables.mlir
@@ -11,9 +11,55 @@ func.func @test() {
 
   return
 }
+// CPP-DEFAULT-LABEL: void test() {
+// CPP-DEFAULT-NEXT:   for (size_t v1 = (size_t) 0; v1 < (size_t) 10; v1 += (size_t) 1) {
+// CPP-DEFAULT-NEXT:   }
+// CPP-DEFAULT-NEXT:   return;
+// CPP-DEFAULT-NEXT: }
+
+// -----
+
+func.func @test_subscript(%arg0: !emitc.array<4xf32>) -> (!emitc.lvalue<f32>) {
+  %c0 = "emitc.constant"() <{value = 0 : index}> : () -> !emitc.size_t
+  %0 = emitc.subscript %arg0[%c0] : (!emitc.array<4xf32>, !emitc.size_t) -> !emitc.lvalue<f32>
+  return %0 : !emitc.lvalue<f32>
+}
+// CPP-DEFAULT-LABEL: float test_subscript(float v1[4]) {
+// CPP-DEFAULT-NEXT:  return v1[(size_t) 0];
+// CPP-DEFAULT-NEXT: }
+
+// -----
 
-// CPP-DEFAULT: void test() {
-// CPP-DEFAULT-NEXT:   for (size_t v1 = ((size_t) 0); v1 < ((size_t) 10); v1 += ((size_t) 1)) {
+func.func @emitc_switch_ui64() {
+  %0 = "emitc.constant"(){value = 1 : ui64} : () -> ui64
+
+  emitc.switch %0 : ui64
+  default {
+    emitc.call_opaque "func2" (%0) : (ui64) -> ()
+    emitc.yield
+  }
+  return
+}
+// CPP-DEFAULT-LABEL: void emitc_switch_ui64() {
+// CPP-DEFAULT:   switch ((uint64_t) 1) {
+// CPP-DEFAULT-NEXT:   default: {
+// CPP-DEFAULT-NEXT:     func2((uint64_t) 1);
+// CPP-DEFAULT-NEXT:     break;
 // CPP-DEFAULT-NEXT:   }
 // CPP-DEFAULT-NEXT:   return;
 // CPP-DEFAULT-NEXT: }
+
+// -----
+
+func.func @negative_values() {
+  %1 = "emitc.constant"() <{value = 10 : index}> : () -> !emitc.size_t
+  %2 = "emitc.constant"() <{value = -3000000000 : index}> : () -> !emitc.ssize_t
+
+  %3 = emitc.add %1, %2 : (!emitc.size_t, !emitc.ssize_t) -> !emitc.ssize_t
+
+  return
+}
+// CPP-DEFAULT-LABEL: void negative_values() {
+// CPP-DEFAULT-NEXT:   ssize_t v1 = (size_t) 10 + (ssize_t) -3000000000;
+// CPP-DEFAULT-NEXT:   return;
+// CPP-DEFAULT-NEXT: }
\ No newline at end of file

From ae055899259a9dfb7229c7559fc086aaa31952b7 Mon Sep 17 00:00:00 2001
From: Matthias Gehre <matthias.gehre@amd.com>
Date: Mon, 16 Dec 2024 11:56:32 +0100
Subject: [PATCH 2/4] TOSA: concat: fix canonicalization that would result in
 concat with no operands

The fold() of the concat had two issues:
a) It would fold `concat (tensor<0x2>, tensor<0x4>), axis=1 -> tensor<0x6>` into `concat ()` (no operands)
based on the observation that both operands have zero elements. This is invalid as concat needs to have
at least one operand.

b) After that fix, it would fold `concat (tensor<0x2>, tensor<0x4>), axis=1 -> tensor<0x6>`
int `concat (tensor<0x4>), axis=1 -> tensor<0x6>`. This is also invalid; even though the concat
still produces the same number of elements (= none), shape relations are not corret (0x4 input, but 0x6 output).

I fixed that by only removing operands from the concat when
- the operand has zero dim on the concatenation axis (i.e. it doesn't contribute to the result shape)
- there is at least one operand left after removing
---
 .../Dialect/Tosa/IR/TosaCanonicalizations.cpp | 12 +++------
 mlir/test/Dialect/Tosa/canonicalize.mlir      | 26 +++++++++++++++++++
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 5d31b18ed7525c..1af7ba4bbd5134 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -1349,19 +1349,13 @@ OpFoldResult tosa::AbsOp::fold(FoldAdaptor adaptor) {
   return {};
 }
 
-static bool hasZeroSize(Type ty) {
-  auto ranked = dyn_cast<RankedTensorType>(ty);
-  if (!ranked)
-    return false;
-  return any_of(ranked.getShape(), [](auto d) { return d == 0; });
-}
-
 OpFoldResult ConcatOp::fold(FoldAdaptor adaptor) {
   /// Remove operands that have zero elements.
   bool changed = false;
   for (size_t i = 0; i < getInput1().size(); ) {
-    auto input = getInput1()[i];
-    if (hasZeroSize(input.getType())) {
+    auto input = cast<RankedTensorType>(getInput1()[i].getType());
+    // Ensure that we have at least one operand left.
+    if (input.getDimSize(getAxis()) == 0 && getInput1().size() > 1) {
       getInput1Mutable().erase(i);
       changed = true;
     } else {
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index f35df639cca523..4e10833a775c39 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -204,6 +204,32 @@ func.func @concat_fold_zero(%arg0: tensor<?x0xf32>, %arg1: tensor<?x1xf32>, %arg
   %0 = tosa.concat %arg0, %arg1, %arg2 {axis = 1 : i32}: (tensor<?x0xf32>, tensor<?x1xf32>, tensor<?x2xf32>) -> tensor<?x3xf32>
   return %0 : tensor<?x3xf32>
 }
+// -----
+
+// CHECK-LABEL: @concat_fold_zero
+func.func @concat_fold_zero_all(%arg0: tensor<?x0xf32>, %arg1: tensor<?x0xf32>) -> tensor<?x0xf32> {
+  // CHECK: return %arg1
+  %0 = tosa.concat %arg0, %arg1 {axis = 1 : i32}: (tensor<?x0xf32>, tensor<?x0xf32>) -> tensor<?x0xf32>
+  return %0 : tensor<?x0xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @concat_fold_zero
+func.func @concat_fold_zero_different_axis(%arg0: tensor<0x2xf32>, %arg1: tensor<0x4xf32>) -> tensor<0x6xf32> {
+  // CHECK: tosa.concat %arg0, %arg1
+  %0 = tosa.concat %arg0, %arg1 {axis = 1 : i32}: (tensor<0x2xf32>, tensor<0x4xf32>) -> tensor<0x6xf32>
+  return %0 : tensor<0x6xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @concat_fold_zero_size
+func.func @concat_fold_zero_size(%arg0: tensor<?x0xf32>, %arg1: tensor<?x1xf32>, %arg2: tensor<?x2xf32>) -> tensor<?x3xf32> {
+  // CHECK: tosa.concat %arg1, %arg2 {axis = 1 : i32}
+  %0 = tosa.concat %arg0, %arg1, %arg2 {axis = 1 : i32}: (tensor<?x0xf32>, tensor<?x1xf32>, tensor<?x2xf32>) -> tensor<?x3xf32>
+  return %0 : tensor<?x3xf32>
+}
 
 // -----
 

From 992dad34ac36e7f8c32e6ebbc9612264678c9aee Mon Sep 17 00:00:00 2001
From: Tina Jung <tinamaria.jung@amd.com>
Date: Tue, 17 Dec 2024 09:39:38 +0100
Subject: [PATCH 3/4] EmitC: Allow casts between opaque and float types (#428)

* EmitC: Allow casts between opaque and float types

* Use EmitC cast compatibility check

* Allow opaque types in casts (also for array types)
---
 mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp | 4 ++--
 mlir/lib/Dialect/EmitC/IR/EmitC.cpp               | 4 ++++
 mlir/test/Dialect/EmitC/ops.mlir                  | 4 ++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index 938bc73d439969..6a84ead33d5c2b 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -757,7 +757,7 @@ class TruncFConversion : public OpConversionPattern<arith::TruncFOp> {
       return rewriter.notifyMatchFailure(castOp,
                                          "unsupported cast destination type");
 
-    if (!castOp.areCastCompatible(operandType, dstType))
+    if (!emitc::CastOp::areCastCompatible(operandType, dstType))
       return rewriter.notifyMatchFailure(castOp, "cast-incompatible types");
 
     rewriter.replaceOpWithNewOp<emitc::CastOp>(castOp, dstType,
@@ -787,7 +787,7 @@ class ExtFConversion : public OpConversionPattern<arith::ExtFOp> {
       return rewriter.notifyMatchFailure(castOp,
                                          "unsupported cast destination type");
 
-    if (!castOp.areCastCompatible(operandType, dstType))
+    if (!emitc::CastOp::areCastCompatible(operandType, dstType))
       return rewriter.notifyMatchFailure(castOp, "cast-incompatible types");
 
     rewriter.replaceOpWithNewOp<emitc::CastOp>(castOp, dstType,
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 8ed1d609b91818..66421c2f6fff66 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -313,6 +313,10 @@ LogicalResult emitc::AssignOp::verify() {
 bool CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
   Type input = inputs.front(), output = outputs.front();
 
+  // Opaque types are always allowed
+  if (isa<emitc::OpaqueType>(input) || isa<emitc::OpaqueType>(output))
+    return true;
+
   // Cast to array is only possible from an array
   if (isa<emitc::ArrayType>(input) != isa<emitc::ArrayType>(output))
     return false;
diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir
index ad70ea61cb2958..80a33b2b9621fe 100644
--- a/mlir/test/Dialect/EmitC/ops.mlir
+++ b/mlir/test/Dialect/EmitC/ops.mlir
@@ -36,11 +36,15 @@ emitc.func private @extern(i32) attributes {specifiers = ["extern"]}
 
 func.func @cast(%arg0: i32) {
   %1 = emitc.cast %arg0: i32 to f32
+  %2 = emitc.cast %1: f32 to !emitc.opaque<"some type">
+  %3 = emitc.cast %2: !emitc.opaque<"some type"> to !emitc.size_t
   return
 }
 
 func.func @cast_array(%arg : !emitc.array<4xf32>) {
     %1 = emitc.cast %arg: !emitc.array<4xf32> to !emitc.array<4xf32> ref
+    %2 = emitc.cast %arg: !emitc.array<4xf32> to !emitc.opaque<"some type">
+    %3 = emitc.cast %2: !emitc.opaque<"some type"> to !emitc.array<4xf32> ref
     return
 }
 

From 55180428a10562867a5dafea01c596194fd028b9 Mon Sep 17 00:00:00 2001
From: josel-amd <166385423+josel-amd@users.noreply.github.com>
Date: Wed, 18 Dec 2024 17:17:07 +0100
Subject: [PATCH 4/4] ElementwiseOpFusion: option for disable empty (#430)

* ElementwiseOpFusion: option for disable empty

---------

Co-authored-by: Matthias Gehre <matthias.gehre@amd.com>
---
 mlir/include/mlir/Dialect/Linalg/Passes.td          |  5 +++++
 .../mlir/Dialect/Linalg/Transforms/Transforms.h     |  3 ++-
 .../Linalg/Transforms/ElementwiseOpFusion.cpp       | 11 +++++++----
 .../fusion-elementwise-ops-no-remove-outs-deps.mlir | 13 +++++++++++++
 4 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/fusion-elementwise-ops-no-remove-outs-deps.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index d96ad919b65f0a..0c315102f3fce5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -75,6 +75,11 @@ def LinalgElementwiseOpFusionPass : Pass<"linalg-fuse-elementwise-ops"> {
   let dependentDialects = [
     "affine::AffineDialect", "linalg::LinalgDialect", "memref::MemRefDialect"
   ];
+  let options = [
+    Option<"removeOutsDependency", "remove-outs-dependency", "bool",
+           /*default=*/"true",
+           "Replace out by tensor.empty">,
+  ];
 }
 
 def LinalgNamedOpConversionPass: Pass<"linalg-named-op-conversion"> {
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0208f854f799ec..d8920b34f7bcc8 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1701,7 +1701,8 @@ using ControlFusionFn = std::function<bool(OpOperand *fusedOperand)>;
 /// when both operations are fusable elementwise operations.
 void populateElementwiseOpsFusionPatterns(
     RewritePatternSet &patterns,
-    const ControlFusionFn &controlElementwiseOpFusion);
+    const ControlFusionFn &controlElementwiseOpFusion,
+    bool replaceOutsDependency = true);
 
 /// Function type which is used to control propagation of tensor.pack/unpack
 /// ops.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 6c806fb9828dc2..cd0de6dfaf9411 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -2134,11 +2134,13 @@ void mlir::linalg::populateFoldReshapeOpsByCollapsingPatterns(
 
 void mlir::linalg::populateElementwiseOpsFusionPatterns(
     RewritePatternSet &patterns,
-    const ControlFusionFn &controlElementwiseOpsFusion) {
+    const ControlFusionFn &controlElementwiseOpsFusion,
+    bool removeOutsDependency) {
   auto *context = patterns.getContext();
   patterns.add<FuseElementwiseOps>(context, controlElementwiseOpsFusion);
-  patterns.add<FoldFillWithGenericOp, FoldScalarOrSplatConstant,
-               RemoveOutsDependency>(context);
+  patterns.add<FoldFillWithGenericOp, FoldScalarOrSplatConstant>(context);
+  if (removeOutsDependency)
+    patterns.add<RemoveOutsDependency>(context);
   // Add the patterns that clean up dead operands and results.
   populateEraseUnusedOperandsAndResultsPatterns(patterns);
 }
@@ -2180,7 +2182,8 @@ struct LinalgElementwiseOpFusionPass
     };
 
     // Add elementwise op fusion patterns.
-    populateElementwiseOpsFusionPatterns(patterns, defaultControlFn);
+    populateElementwiseOpsFusionPatterns(patterns, defaultControlFn,
+                                         removeOutsDependency);
     populateFoldReshapeOpsByExpansionPatterns(patterns, defaultControlFn);
     tensor::populateBubbleUpExpandShapePatterns(patterns);
 
diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops-no-remove-outs-deps.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops-no-remove-outs-deps.mlir
new file mode 100644
index 00000000000000..cf40f3ff519014
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops-no-remove-outs-deps.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt %s -p 'builtin.module(func.func(linalg-fuse-elementwise-ops{remove-outs-dependency=0}))' -split-input-file | FileCheck %s
+
+#identity = affine_map<(d0) -> (d0)>
+
+func.func @keep_outs_dependency(%arg: tensor<4xf32>) -> tensor<4xf32> {
+  // CHECK-NOT: tensor.empty
+  %1 = linalg.generic {indexing_maps = [#identity, #identity], iterator_types = ["parallel"] } ins(%arg: tensor<4xf32>) outs(%arg: tensor<4xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %exp = arith.negf %in: f32
+      linalg.yield %exp : f32
+  } -> tensor<4xf32>
+  return %1 : tensor<4xf32>
+}