From e4d6564ed594fab79e9e42021a486b2896d241e1 Mon Sep 17 00:00:00 2001 From: ergawy Date: Mon, 4 Mar 2024 22:22:41 -0600 Subject: [PATCH 1/2] [flang][OpenMP] Add initial pass to map `do concurrent` to OMP Adds a new pass to map `do concurrent` loops to OpenMP. So far the pass only maps to the host, i.e. `parallel do`. Later, it will be extended to map to the device/target as well. This diff also adds the pass the code-gen pipeline flang. To invoke the new pass from flang, use `-fdo-concurrent-parallel=`. The option is only active when OpenMP is enabled throught `-fopenmp`. --- clang/include/clang/Driver/Options.td | 4 + clang/lib/Driver/ToolChains/Flang.cpp | 3 +- .../include/flang/Frontend/CodeGenOptions.def | 2 + flang/include/flang/Frontend/CodeGenOptions.h | 8 + .../flang/Optimizer/Transforms/Passes.h | 2 + .../flang/Optimizer/Transforms/Passes.td | 20 ++ flang/lib/Frontend/CompilerInvocation.cpp | 28 +++ flang/lib/Frontend/FrontendActions.cpp | 29 ++- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Transforms/DoConcurrentConversion.cpp | 206 ++++++++++++++++++ flang/test/Driver/driver-help-hidden.f90 | 2 + flang/test/Driver/driver-help.f90 | 4 + flang/test/Transforms/DoConcurrent/basic.f90 | 44 ++++ flang/test/Transforms/DoConcurrent/basic.mlir | 60 +++++ 14 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp create mode 100644 flang/test/Transforms/DoConcurrent/basic.f90 create mode 100644 flang/test/Transforms/DoConcurrent/basic.mlir diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3a028fadb25b18..682c0d2c671b94 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6473,6 +6473,10 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays", defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stride", PosFlag, NegFlag>; + +def do_concurrent_parallel_EQ 
: Joined<["-"], "fdo-concurrent-parallel=">, + HelpText<"Try to map `do concurrent` loops to OpenMP (on host or device)">, + Values<"none,host,device">; } // let Visibility = [FC1Option, FlangOption] def J : JoinedOrSeparate<["-"], "J">, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 6168b42dc78292..bb66ae75be2bd6 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -150,7 +150,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_flang_experimental_polymorphism, options::OPT_fno_ppc_native_vec_elem_order, - options::OPT_fppc_native_vec_elem_order}); + options::OPT_fppc_native_vec_elem_order, + options::OPT_do_concurrent_parallel_EQ}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 9d03ec88a56b8a..a6128130baadc3 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -40,5 +40,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers +ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP + #undef CODEGENOPT #undef ENUM_CODEGENOPT diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index 0c318e4023af43..cd5c5e592d8926 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -129,6 +129,14 @@ class CodeGenOptions : public CodeGenOptionsBase 
{ /// transformation. OptRemark OptimizationRemarkAnalysis; + /// Optionally map `do concurrent` loops to OpenMP. This is only valid of + /// OpenMP is enabled. + enum class DoConcurrentMappingKind { + DCMK_None, // Do not lower `do concurrent` to OpenMP. + DCMK_Host, // Lower to run in parallel on the CPU. + DCMK_Device // Lower to run in parallel on the GPU. + }; + // Define accessors/mutators for code generation options of enumeration type. #define CODEGENOPT(Name, Bits, Default) #define ENUM_CODEGENOPT(Name, Type, Bits, Default) \ diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index e1d22c8c986da7..25a526ab0cbfcb 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noNaNsFPMath, bool approxFuncFPMath, bool noSignedZerosFPMath, bool unsafeFPMath); +std::unique_ptr createDoConcurrentConversionPass(); + // declarative passes #define GEN_PASS_REGISTRATION #include "flang/Optimizer/Transforms/Passes.h.inc" diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 5fb576fd876254..06de4a1d28a929 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { let constructor = "::fir::createFunctionAttrPass()"; } +def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> { + let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops."; + + let description = [{ This is an experimental pass to map `DO CONCURRENR` loops + to their correspnding equivalent OpenMP worksharing constructs. + + For now the following is supported: + - Mapping simple loops to `parallel do`. 
+ + Still to TODO: + - More extensive testing. + - Mapping to `target teams distribute parallel do`. + - Allowing the user to control mapping behavior: either to the host or + target. + }]; + + let constructor = "::fir::createDoConcurrentConversionPass()"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 4707de0e976ca7..79663b89942c0f 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -154,6 +154,32 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, return true; } +static bool parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, + llvm::opt::ArgList &args, + clang::DiagnosticsEngine &diags) { + llvm::opt::Arg *arg = + args.getLastArg(clang::driver::options::OPT_do_concurrent_parallel_EQ); + if (!arg) + return true; + + using DoConcurrentMappingKind = Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + std::optional val = + llvm::StringSwitch>( + arg->getValue()) + .Case("none", DoConcurrentMappingKind::DCMK_None) + .Case("host", DoConcurrentMappingKind::DCMK_Host) + .Case("device", DoConcurrentMappingKind::DCMK_Device) + .Default(std::nullopt); + + if (!val.has_value()) { + diags.Report(clang::diag::err_drv_invalid_value) + << arg->getAsString(args) << arg->getValue(); + return false; + } + opts.setDoConcurrentMapping(val.value()); + return true; +} + static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { @@ -385,6 +411,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } + + parseDoConcurrentMapping(opts, args, diags); } /// Parses all target input arguments and populates the target diff --git a/flang/lib/Frontend/FrontendActions.cpp 
b/flang/lib/Frontend/FrontendActions.cpp index 849b3c8e4dc027..b2f7a7abf64ac9 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -320,8 +320,9 @@ bool CodeGenAction::beginSourceFileAction() { // Add OpenMP-related passes // WARNING: These passes must be run immediately after the lowering to ensure // that the FIR is correct with respect to OpenMP operations/attributes. - if (ci.getInvocation().getFrontendOpts().features.IsEnabled( - Fortran::common::LanguageFeature::OpenMP)) { + bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled( + Fortran::common::LanguageFeature::OpenMP); + if (isOpenMPEnabled) { bool isDevice = false; if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) @@ -332,6 +333,30 @@ bool CodeGenAction::beginSourceFileAction() { fir::createOpenMPFIRPassPipeline(pm, isDevice); } + using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + DoConcurrentMappingKind selectedKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); + if (selectedKind != DoConcurrentMappingKind::DCMK_None) { + if (!isOpenMPEnabled) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "lowering `do concurrent` loops to OpenMP is only supported if " + "OpenMP is enabled"); + ci.getDiagnostics().Report(diagID); + } else { + bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device; + + if (mapToDevice) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "TODO: lowering `do concurrent` loops to OpenMP device is not " + "supported yet"); + ci.getDiagnostics().Report(diagID); + } else + pm.addPass(fir::createDoConcurrentConversionPass()); + } + } + pm.enableVerifier(/*verifyPasses=*/true); pm.addPass(std::make_unique()); diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 
ba2e267996150e..cf83bb496bb5e8 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -22,6 +22,7 @@ add_flang_library(FIRTransforms OMPMarkDeclareTarget.cpp VScaleAttr.cpp FunctionAttr.cpp + DoConcurrentConversion.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp new file mode 100644 index 00000000000000..4534d514b86d76 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -0,0 +1,206 @@ +//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +#include + +namespace fir { +#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "fopenmp-do-concurrent-conversion" + +namespace { +class DoConcurrentConversion : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + 
mlir::OpPrintingFlags flags; + flags.printGenericOpForm(); + + mlir::omp::ParallelOp parallelOp = + rewriter.create(doLoop.getLoc()); + + mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion()); + + rewriter.setInsertionPointToEnd(block); + rewriter.create(doLoop.getLoc()); + + rewriter.setInsertionPointToStart(block); + + // ==== TODO (1) Start ==== + // + // The goal of the few lines below is to collect and clone + // the list of operations that define the loop's lower and upper bounds as + // well as the step. Should we, instead of doing this here, split it into 2 + // stages? + // + // 1. **Stage 1**: add an analysis that extracts all the relevant + // operations defining the lower-bound, upper-bound, and + // step. + // 2. **Stage 2**: clone the collected operations in the parallel region. + // + // So far, the pass has been tested with very simple loops (where the bounds + // and step are constants) so the goal of **Stage 1** is to have a + // well-defined component that has the sole responsibility of collecting all + // the ops relevant to the loop header. This way we can test this + // in isolation for more complex loops and better organize the code. **Stage + // 2** would then be responsible for the actual cloning of the collected + // loop header preparation/allocation operations. + + // Clone the LB, UB, step defining ops inside the parallel region. 
+ llvm::SmallVector lowerBound, upperBound, step; + lowerBound.push_back( + rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0)); + upperBound.push_back( + rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0)); + step.push_back( + rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0)); + // ==== TODO (1) End ==== + + auto wsLoopOp = rewriter.create( + doLoop.getLoc(), lowerBound, upperBound, step); + wsLoopOp.setInclusive(true); + + auto outlineableOp = + mlir::dyn_cast(*parallelOp); + rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock()); + + // ==== TODO (2) Start ==== + // + // The goal of the following simple work-list algorithm and + // the following `for` loop is to collect all the operations related to the + // allocation of the induction variable for the `do concurrent` loop. The + // operations collected by this algorithm are very similar to what is + // usually emitted for privatized variables, e.g. for omp.parallel loops. + // Therefore, I think we can: + // + // 1. **Stage 1**: Add an analysis that collects all these operations. The + // goal is similar to **Stage 1** of TODO (1): isolate the + // algorithm as an individually-testable component so that + // we properly implement and test it for more complicated + // `do concurrent` loops. + // 2. **Stage 2**: Using the collected operations, create and populate an + // `omp.private {type=private}` op to serve as the + // delayed privatizer for the new work-sharing loop. + + // For the induction variable, we need to privatize its allocation and + // binding inside the parallel region. + llvm::SmallSetVector workList; + // Therefore, we first discover the induction variable by discovering + // `fir.store`s where the source is the loop's block argument. 
+ workList.insert(doLoop.getInductionVar().getUsers().begin(), + doLoop.getInductionVar().getUsers().end()); + llvm::SmallSetVector inductionVarTargetStores; + + // Walk the def-chain of the loop's block argument until we hit `fir.store`. + while (!workList.empty()) { + mlir::Operation *item = workList.front(); + + if (auto storeOp = mlir::dyn_cast(item)) { + inductionVarTargetStores.insert(storeOp); + } else { + workList.insert(item->getUsers().begin(), item->getUsers().end()); + } + + workList.remove(item); + } + + // For each collected `fir.sotre`, find the target memref's alloca's and + // declare ops. + llvm::SmallSetVector declareAndAllocasToClone; + for (auto storeOp : inductionVarTargetStores) { + mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp(); + + for (auto operand : storeTarget->getOperands()) { + declareAndAllocasToClone.insert(operand.getDefiningOp()); + } + declareAndAllocasToClone.insert(storeTarget); + } + // ==== TODO (2) End ==== + // + // TODO (1 & 2): Isolating analyses proposed in both TODOs, I think we can + // more easily generalize the pass to work for targets other than OpenMP, + // e.g. OpenACC, I think can, can reuse the results of the analyses and only + // change the code-gen/rewriting. + + mlir::IRMapping mapper; + + // Collect the memref defining ops in the parallel region. + for (mlir::Operation *opToClone : declareAndAllocasToClone) { + rewriter.clone(*opToClone, mapper); + } + + // Clone the loop's body inside the worksharing construct using the mapped + // memref values. 
+ rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(), + wsLoopOp.getRegion().begin(), mapper); + + mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator(); + rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back()); + rewriter.create(terminator->getLoc()); + rewriter.eraseOp(terminator); + + rewriter.eraseOp(doLoop); + + return mlir::success(); + } +}; + +class DoConcurrentConversionPass + : public fir::impl::DoConcurrentConversionPassBase< + DoConcurrentConversionPass> { +public: + void runOnOperation() override { + mlir::func::FuncOp func = getOperation(); + + if (func.isDeclaration()) { + return; + } + + auto *context = &getContext(); + mlir::RewritePatternSet patterns(context); + patterns.insert(context); + mlir::ConversionTarget target(*context); + target.addLegalDialect(); + + target.addDynamicallyLegalOp( + [](fir::DoLoopOp op) { return !op.getUnordered(); }); + + if (mlir::failed(mlir::applyFullConversion(getOperation(), target, + std::move(patterns)))) { + mlir::emitError(mlir::UnknownLoc::get(context), + "error in converting do-concurrent op"); + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr fir::createDoConcurrentConversionPass() { + return std::make_unique(); +} + diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90 index 44dbac44772b29..b379138db13913 100644 --- a/flang/test/Driver/driver-help-hidden.f90 +++ b/flang/test/Driver/driver-help-hidden.f90 @@ -35,6 +35,8 @@ ! CHECK-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! CHECK-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! CHECK-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! CHECK-NEXT: -fdo-concurrent-parallel= +! CHECK-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! CHECK-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations ! 
CHECK-NEXT: -ffixed-form Process source files in fixed form ! CHECK-NEXT: -ffixed-line-length= diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index b4280a454e3128..cf645e8c3d3847 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -31,6 +31,8 @@ ! HELP-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! HELP-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! HELP-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! HELP-NEXT: -fdo-concurrent-parallel= +! HELP-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! HELP-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations ! HELP-NEXT: -ffixed-form Process source files in fixed form ! HELP-NEXT: -ffixed-line-length= @@ -186,6 +188,8 @@ ! HELP-FC1-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! HELP-FC1-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! HELP-FC1-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! HELP-FC1-NEXT: -fdo-concurrent-parallel= +! HELP-FC1-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! HELP-FC1-NEXT: -fembed-offload-object= ! HELP-FC1-NEXT: Embed Offloading device-side binary into host object file as a section. ! HELP-FC1-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations diff --git a/flang/test/Transforms/DoConcurrent/basic.f90 b/flang/test/Transforms/DoConcurrent/basic.f90 new file mode 100644 index 00000000000000..a555a25c9bad5d --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic.f90 @@ -0,0 +1,44 @@ +! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: | FileCheck %s + +! 
CHECK-LABEL: do_concurrent_basic +program do_concurrent_basic + ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + implicit none + integer :: a(10) + integer :: i + + ! CHECK-NOT: fir.do_loop + + ! CHECK: omp.parallel { + + ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + ! CHECK: %[[STEP:.*]] = arith.constant 1 : index + + ! CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) -> !fir.ref + ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + ! CHECK-NEXT: omp.yield + ! CHECK-NEXT: } + + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + do concurrent (integer :: i=1:10) + a(i) = i + end do + + ! CHECK-NOT: fir.do_loop +end program do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic.mlir new file mode 100644 index 00000000000000..7d62463f36d422 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic.mlir @@ -0,0 +1,60 @@ +// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. 
+ +// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s + +// CHECK-LABEL: func.func @do_concurrent_basic +func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { + // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + // CHECK: %[[C1:.*]] = arith.constant 1 : i32 + // CHECK: %[[C10:.*]] = arith.constant 10 : i32 + + %0 = fir.alloca i32 {bindc_name = "i"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %2 = fir.address_of(@_QFEa) : !fir.ref> + %c10 = arith.constant 10 : index + %3 = fir.shape %c10 : (index) -> !fir.shape<1> + %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_i32 = arith.constant 1 : i32 + %7 = fir.convert %c1_i32 : (i32) -> index + %c10_i32 = arith.constant 10 : i32 + %8 = fir.convert %c10_i32 : (i32) -> index + %c1 = arith.constant 1 : index + + // CHECK-NOT: fir.do_loop + + // CHECK: omp.parallel { + + // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + // CHECK: %[[STEP:.*]] = arith.constant 1 : index + + // CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) 
-> !fir.ref + // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + // CHECK-NEXT: omp.yield + // CHECK-NEXT: } + + // CHECK-NEXT: omp.terminator + // CHECK-NEXT: } + fir.do_loop %arg0 = %7 to %8 step %c1 unordered { + %13 = fir.convert %arg0 : (index) -> i32 + fir.store %13 to %1#1 : !fir.ref + %14 = fir.load %1#0 : !fir.ref + %15 = fir.load %1#0 : !fir.ref + %16 = fir.convert %15 : (i32) -> i64 + %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref + hlfir.assign %14 to %17 : i32, !fir.ref + } + + // CHECK-NOT: fir.do_loop + + return + } From 385768aff2412c1b8e1e0e10f6911c5cd15b50e3 Mon Sep 17 00:00:00 2001 From: ergawy Date: Mon, 11 Mar 2024 06:41:50 -0500 Subject: [PATCH 2/2] Handle review comments. --- .../flang/Optimizer/Transforms/Passes.td | 2 +- .../Transforms/DoConcurrentConversion.cpp | 40 +++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 06de4a1d28a929..95ffd199bc0fe4 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -400,7 +400,7 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> { let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops."; - let description = [{ This is an experimental pass to map `DO CONCURRENR` loops + let description = [{ This is an experimental pass to map `DO CONCURRENT` loops to their correspnding equivalent OpenMP worksharing constructs. 
For now the following is supported: diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 4534d514b86d76..f5a3d925ab5d9a 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -70,13 +70,39 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { // loop header preparation/allocation operations. // Clone the LB, UB, step defining ops inside the parallel region. + mlir::Operation* lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation* ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation* stepOp = doLoop.getStep().getDefiningOp(); + + if (lbOp == nullptr || ubOp == nullptr || stepOp == nullptr) { + return rewriter.notifyMatchFailure( + doLoop, "At least one of the loop's LB, UB, or step doesn't have a " + "defining operation."); + } + + std::function isOpUltimatelyConstant = + [&](mlir::Operation *operation) { + if (mlir::isa_and_present(operation)) + return true; + + if (fir::ConvertOp convertOp = + mlir::dyn_cast_if_present(operation)) + return isOpUltimatelyConstant(convertOp.getValue().getDefiningOp()); + + return false; + }; + + if (!isOpUltimatelyConstant(lbOp) || !isOpUltimatelyConstant(ubOp) || + !isOpUltimatelyConstant(stepOp)) { + return rewriter.notifyMatchFailure( + doLoop, "`do concurrent` conversion is currently only supported for " + "constant LB, UB, and step values."); + } + llvm::SmallVector lowerBound, upperBound, step; - lowerBound.push_back( - rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0)); - upperBound.push_back( - rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0)); - step.push_back( - rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0)); + lowerBound.push_back(rewriter.clone(*lbOp)->getResult(0)); + upperBound.push_back(rewriter.clone(*ubOp)->getResult(0)); + 
step.push_back(rewriter.clone(*stepOp)->getResult(0)); // ==== TODO (1) End ==== auto wsLoopOp = rewriter.create( @@ -127,7 +153,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { workList.remove(item); } - // For each collected `fir.sotre`, find the target memref's alloca's and + // For each collected `fir.store`, find the target memref's alloca's and // declare ops. llvm::SmallSetVector declareAndAllocasToClone; for (auto storeOp : inductionVarTargetStores) {