diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 2c069860ffdca..0fdb302fe10ca 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/OpenMP/Utils.h" @@ -28,8 +29,10 @@ namespace looputils { /// Stores info needed about the induction/iteration variable for each `do /// concurrent` in a loop nest. struct InductionVariableInfo { - InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); } - + InductionVariableInfo(fir::DoConcurrentLoopOp loop, + mlir::Value inductionVar) { + populateInfo(loop, inductionVar); + } /// The operation allocating memory for iteration variable. mlir::Operation *iterVarMemDef; /// the operation(s) updating the iteration variable with the current @@ -45,7 +48,7 @@ struct InductionVariableInfo { /// ... /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ... /// ... - /// fir.do_loop %ind_var = %lb to %ub step %s unordered { + /// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) { /// %ind_var_conv = fir.convert %ind_var : (index) -> i32 /// fir.store %ind_var_conv to %i#1 : !fir.ref /// ... @@ -62,14 +65,14 @@ struct InductionVariableInfo { /// Note: The current implementation is dependent on how flang emits loop /// bodies; which is sufficient for the current simple test/use cases. If this /// proves to be insufficient, this should be made more generic. - void populateInfo(fir::DoLoopOp doLoop) { + void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) { mlir::Value result = nullptr; // Checks if a StoreOp is updating the memref of the loop's iteration // variable. 
auto isStoringIV = [&](fir::StoreOp storeOp) { // Direct store into the IV memref. - if (storeOp.getValue() == doLoop.getInductionVar()) { + if (storeOp.getValue() == inductionVar) { indVarUpdateOps.push_back(storeOp); return true; } @@ -77,7 +80,7 @@ struct InductionVariableInfo { // Indirect store into the IV memref. if (auto convertOp = mlir::dyn_cast( storeOp.getValue().getDefiningOp())) { - if (convertOp.getOperand() == doLoop.getInductionVar()) { + if (convertOp.getOperand() == inductionVar) { indVarUpdateOps.push_back(convertOp); indVarUpdateOps.push_back(storeOp); return true; @@ -87,7 +90,7 @@ struct InductionVariableInfo { return false; }; - for (mlir::Operation &op : doLoop) { + for (mlir::Operation &op : loop) { if (auto storeOp = mlir::dyn_cast(op)) if (isStoringIV(storeOp)) { result = storeOp.getMemref(); @@ -100,219 +103,7 @@ struct InductionVariableInfo { } }; -using LoopNestToIndVarMap = - llvm::MapVector; - -/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff -/// there are no operations in \p outerloop's body other than: -/// -/// 1. the operations needed to assign/update \p outerLoop's induction variable. -/// 2. \p innerLoop itself. -/// -/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop -/// according to the above definition. -bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) { - mlir::ForwardSliceOptions forwardSliceOptions; - forwardSliceOptions.inclusive = true; - // The following will be used as an example to clarify the internals of this - // function: - // ``` - // 1. fir.do_loop %i_idx = %34 to %36 step %c1 unordered { - // 2. %i_idx_2 = fir.convert %i_idx : (index) -> i32 - // 3. fir.store %i_idx_2 to %i_iv#1 : !fir.ref - // - // 4. fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered { - // 5. %j_idx_2 = fir.convert %j_idx : (index) -> i32 - // 6. fir.store %j_idx_2 to %j_iv#1 : !fir.ref - // ... loop nest body, possible uses %i_idx ... 
- // } - // } - // ``` - // In this example, the `j` loop is perfectly nested inside the `i` loop and - // below is how we find that. - - // We don't care about the outer-loop's induction variable's uses within the - // inner-loop, so we filter out these uses. - // - // This filter tells `getForwardSlice` (below) to only collect operations - // which produce results defined above (i.e. outside) the inner-loop's body. - // - // Since `outerLoop.getInductionVar()` is a block argument (to the - // outer-loop's body), the filter effectively collects uses of - // `outerLoop.getInductionVar()` inside the outer-loop but outside the - // inner-loop. - forwardSliceOptions.filter = [&](mlir::Operation *op) { - return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion()); - }; - - llvm::SetVector indVarSlice; - // The forward slice of the `i` loop's IV will be the 2 ops in line 1 & 2 - // above. Uses of `%i_idx` inside the `j` loop are not collected because of - // the filter. - mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice, - forwardSliceOptions); - llvm::DenseSet indVarSet(indVarSlice.begin(), - indVarSlice.end()); - - llvm::DenseSet outerLoopBodySet; - // The following walk collects ops inside `outerLoop` that are **not**: - // * the outer-loop itself, - // * or the inner-loop, - // * or the `fir.result` op (the outer-loop's terminator). - // - // For the above example, this will also populate `outerLoopBodySet` with ops - // in line 1 & 2 since we skip the `i` loop, the `j` loop, and the terminator. 
- outerLoop.walk([&](mlir::Operation *op) { - if (op == outerLoop) - return mlir::WalkResult::advance(); - - if (op == innerLoop) - return mlir::WalkResult::skip(); - - if (mlir::isa(op)) - return mlir::WalkResult::advance(); - - outerLoopBodySet.insert(op); - return mlir::WalkResult::advance(); - }); - - // If `outerLoopBodySet` ends up having the same ops as `indVarSet`, then - // `outerLoop` only contains ops that setup its induction variable + - // `innerLoop` + the `fir.result` terminator. In other words, `innerLoop` is - // perfectly nested inside `outerLoop`. - bool result = (outerLoopBodySet == indVarSet); - LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc() - << " is" << (result ? "" : " not") - << " perfectly nested\n"); - - return result; -} - -/// Starting with `currentLoop` collect a perfectly nested loop nest, if any. -/// This function collects as much as possible loops in the nest; it case it -/// fails to recognize a certain nested loop as part of the nest it just returns -/// the parent loops it discovered before. -mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop, - LoopNestToIndVarMap &loopNest) { - assert(currentLoop.getUnordered()); - - while (true) { - loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)}); - llvm::SmallVector unorderedLoops; - - for (auto nestedLoop : currentLoop.getRegion().getOps()) - if (nestedLoop.getUnordered()) - unorderedLoops.push_back(nestedLoop); - - if (unorderedLoops.empty()) - break; - - // Having more than one unordered loop means that we are not dealing with a - // perfect loop nest (i.e. a mulit-range `do concurrent` loop); which is the - // case we are after here. 
- if (unorderedLoops.size() > 1) - return mlir::failure(); - - fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front(); - - if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop)) - return mlir::failure(); - - currentLoop = nestedUnorderedLoop; - } - - return mlir::success(); -} - -/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In -/// particular, this function would take this input IR: -/// ``` -/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered { -/// fir.store %i_iv to %i#1 : !fir.ref -/// %j_lb = arith.constant 1 : i32 -/// %j_ub = arith.constant 10 : i32 -/// %j_step = arith.constant 1 : index -/// -/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered { -/// fir.store %j_iv to %j#1 : !fir.ref -/// ... -/// } -/// } -/// ``` -/// -/// into the following form (using generic op form since the result is -/// technically an invalid `fir.do_loop` op: -/// -/// ``` -/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({ -/// ^bb0(%i_iv: index): -/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32 -/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32 -/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index -/// -/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({ -/// ^bb0(%new_i_iv: index, %new_j_iv: index): -/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref) -> () -/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref) -> () -/// ... -/// }) -/// ``` -/// -/// What happened to the loop nest is the following: -/// -/// * the innermost loop's entry block was updated from having one operand to -/// having `n` operands where `n` is the number of loops in the nest, -/// -/// * the outer loop(s)' ops that update the IVs were sank inside the innermost -/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above), -/// -/// * the innermost loop's entry block's arguments were mapped in order from the -/// outermost to the innermost IV. 
-/// -/// With this IR change, we can directly inline the innermost loop's region into -/// the newly generated `omp.loop_nest` op. -/// -/// Note that this function has a pre-condition that \p loopNest consists of -/// perfectly nested loops; i.e. there are no in-between ops between 2 nested -/// loops except for the ops to setup the inner loop's LB, UB, and step. These -/// ops are handled/cloned by `genLoopNestClauseOps(..)`. -void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest) { - if (loopNest.size() <= 1) - return; - - fir::DoLoopOp innermostLoop = loopNest.back().first; - mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front(); - - llvm::SmallVector argTypes; - llvm::SmallVector argLocs; - - for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) { - // Sink the IV update ops to the innermost loop. We need to do for all loops - // except for the innermost one, hence the `drop_end` usage above. - for (mlir::Operation *op : indVarInfo.indVarUpdateOps) - op->moveBefore(&innermostFirstOp); - - argTypes.push_back(doLoop.getInductionVar().getType()); - argLocs.push_back(doLoop.getInductionVar().getLoc()); - } - - mlir::Region &innermmostRegion = innermostLoop.getRegion(); - // Extend the innermost entry block with arguments to represent the outer IVs. - innermmostRegion.addArguments(argTypes, argLocs); - - unsigned idx = 1; - // In reverse, remap the IVs of the loop nest from the old values to the new - // ones. We do that in reverse since the first argument before this loop is - // the old IV for the innermost loop. Therefore, we want to replace it first - // before the old value (1st argument in the block) is remapped to be the IV - // of the outermost loop in the nest. 
- for (auto &[doLoop, _] : llvm::reverse(loopNest)) { - doLoop.getInductionVar().replaceAllUsesWith( - innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx)); - ++idx; - } -} +using InductionVariableInfos = llvm::SmallVector; /// Collects values that are local to a loop: "loop-local values". A loop-local /// value is one that is used exclusively inside the loop but allocated outside @@ -326,9 +117,9 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, /// used exclusively inside. /// /// \param [out] locals - the list of loop-local values detected for \p doLoop. -void collectLoopLocalValues(fir::DoLoopOp doLoop, +void collectLoopLocalValues(fir::DoConcurrentLoopOp loop, llvm::SetVector &locals) { - doLoop.walk([&](mlir::Operation *op) { + loop.walk([&](mlir::Operation *op) { for (mlir::Value operand : op->getOperands()) { if (locals.contains(operand)) continue; @@ -340,11 +131,11 @@ void collectLoopLocalValues(fir::DoLoopOp doLoop, // Values defined inside the loop are not interesting since they do not // need to be localized. 
- if (doLoop->isAncestor(operand.getDefiningOp())) + if (loop->isAncestor(operand.getDefiningOp())) continue; for (auto *user : operand.getUsers()) { - if (!doLoop->isAncestor(user)) { + if (!loop->isAncestor(user)) { isLocal = false; break; } @@ -373,39 +164,42 @@ static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion, } } // namespace looputils -class DoConcurrentConversion : public mlir::OpConversionPattern { +class DoConcurrentConversion + : public mlir::OpConversionPattern { public: - using mlir::OpConversionPattern::OpConversionPattern; + using mlir::OpConversionPattern::OpConversionPattern; - DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice, - llvm::DenseSet &concurrentLoopsToSkip) + DoConcurrentConversion( + mlir::MLIRContext *context, bool mapToDevice, + llvm::DenseSet &concurrentLoopsToSkip) : OpConversionPattern(context), mapToDevice(mapToDevice), concurrentLoopsToSkip(concurrentLoopsToSkip) {} mlir::LogicalResult - matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, + matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { if (mapToDevice) return doLoop.emitError( "not yet implemented: Mapping `do concurrent` loops to device"); - looputils::LoopNestToIndVarMap loopNest; - bool hasRemainingNestedLoops = - failed(looputils::collectLoopNest(doLoop, loopNest)); - if (hasRemainingNestedLoops) - mlir::emitWarning(doLoop.getLoc(), - "Some `do concurent` loops are not perfectly-nested. 
" - "These will be serialized."); + looputils::InductionVariableInfos ivInfos; + auto loop = mlir::cast( + doLoop.getRegion().back().getTerminator()); + + auto indVars = loop.getLoopInductionVars(); + assert(indVars.has_value()); + + for (mlir::Value indVar : *indVars) + ivInfos.emplace_back(loop, indVar); llvm::SetVector locals; - looputils::collectLoopLocalValues(loopNest.back().first, locals); - looputils::sinkLoopIVArgs(rewriter, loopNest); + looputils::collectLoopLocalValues(loop, locals); mlir::IRMapping mapper; mlir::omp::ParallelOp parallelOp = - genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper); + genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper); mlir::omp::LoopNestOperands loopNestClauseOps; - genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper, + genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper, loopNestClauseOps); for (mlir::Value local : locals) @@ -413,41 +207,56 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter); mlir::omp::LoopNestOp ompLoopNest = - genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps, + genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps, /*isComposite=*/mapToDevice); - rewriter.eraseOp(doLoop); + rewriter.setInsertionPoint(doLoop); + fir::FirOpBuilder builder( + rewriter, + fir::getKindMapping(doLoop->getParentOfType())); + + // Collect iteration variable(s) allocations so that we can move them + // outside the `fir.do_concurrent` wrapper (before erasing it). + llvm::SmallVector opsToMove; + for (mlir::Operation &op : llvm::drop_end(doLoop)) + opsToMove.push_back(&op); + + mlir::Block *allocBlock = builder.getAllocaBlock(); + + for (mlir::Operation *op : llvm::reverse(opsToMove)) { + rewriter.moveOpBefore(op, allocBlock, allocBlock->begin()); + } // Mark `unordered` loops that are not perfectly nested to be skipped from // the legality check of the `ConversionTarget` since we are not interested // in mapping them to OpenMP. 
- ompLoopNest->walk([&](fir::DoLoopOp doLoop) { - if (doLoop.getUnordered()) { - concurrentLoopsToSkip.insert(doLoop); - } + ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) { + concurrentLoopsToSkip.insert(doLoop); }); + rewriter.eraseOp(doLoop); + return mlir::success(); } private: - mlir::omp::ParallelOp genParallelOp(mlir::Location loc, - mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest, - mlir::IRMapping &mapper) const { + mlir::omp::ParallelOp + genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + looputils::InductionVariableInfos &ivInfos, + mlir::IRMapping &mapper) const { auto parallelOp = rewriter.create(loc); rewriter.createBlock(&parallelOp.getRegion()); rewriter.setInsertionPoint(rewriter.create(loc)); - genLoopNestIndVarAllocs(rewriter, loopNest, mapper); + genLoopNestIndVarAllocs(rewriter, ivInfos, mapper); return parallelOp; } void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest, + looputils::InductionVariableInfos &ivInfos, mlir::IRMapping &mapper) const { - for (auto &[_, indVarInfo] : loopNest) + for (auto &indVarInfo : ivInfos) genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper); } @@ -471,10 +280,11 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { return result; } - void genLoopNestClauseOps( - mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, - mlir::omp::LoopNestOperands &loopNestClauseOps) const { + void + genLoopNestClauseOps(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, + mlir::omp::LoopNestOperands &loopNestClauseOps) const { assert(loopNestClauseOps.loopLowerBounds.empty() && "Loop nest bounds were already emitted!"); @@ -483,43 +293,42 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { 
bounds.push_back(var.getDefiningOp()->getResult(0)); }; - for (auto &[doLoop, _] : loopNest) { - populateBounds(doLoop.getLowerBound(), loopNestClauseOps.loopLowerBounds); - populateBounds(doLoop.getUpperBound(), loopNestClauseOps.loopUpperBounds); - populateBounds(doLoop.getStep(), loopNestClauseOps.loopSteps); + for (auto [lb, ub, st] : llvm::zip_equal( + loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) { + populateBounds(lb, loopNestClauseOps.loopLowerBounds); + populateBounds(ub, loopNestClauseOps.loopUpperBounds); + populateBounds(st, loopNestClauseOps.loopSteps); } loopNestClauseOps.loopInclusive = rewriter.getUnitAttr(); } mlir::omp::LoopNestOp - genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop, - mlir::IRMapping &mapper, + genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, + fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, const mlir::omp::LoopNestOperands &clauseOps, bool isComposite) const { - auto wsloopOp = rewriter.create(doLoop.getLoc()); + auto wsloopOp = rewriter.create(loop.getLoc()); wsloopOp.setComposite(isComposite); rewriter.createBlock(&wsloopOp.getRegion()); auto loopNestOp = - rewriter.create(doLoop.getLoc(), clauseOps); + rewriter.create(loop.getLoc(), clauseOps); // Clone the loop's body inside the loop nest construct using the // mapped values. 
- rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(), + rewriter.cloneRegionBefore(loop.getRegion(), loopNestOp.getRegion(), loopNestOp.getRegion().begin(), mapper); - mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator(); rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back()); - rewriter.create(terminator->getLoc()); - rewriter.eraseOp(terminator); + rewriter.create(loop->getLoc()); return loopNestOp; } bool mapToDevice; - llvm::DenseSet &concurrentLoopsToSkip; + llvm::DenseSet &concurrentLoopsToSkip; }; class DoConcurrentConversionPass @@ -548,19 +357,16 @@ class DoConcurrentConversionPass return; } - llvm::DenseSet concurrentLoopsToSkip; + llvm::DenseSet concurrentLoopsToSkip; mlir::RewritePatternSet patterns(context); patterns.insert( context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device, concurrentLoopsToSkip); mlir::ConversionTarget target(*context); - target.addDynamicallyLegalOp([&](fir::DoLoopOp op) { - // The goal is to handle constructs that eventually get lowered to - // `fir.do_loop` with the `unordered` attribute (e.g. array expressions). - // Currently, this is only enabled for the `do concurrent` construct since - // the pass runs early in the pipeline. 
- return !op.getUnordered() || concurrentLoopsToSkip.contains(op); - }); + target.addDynamicallyLegalOp( + [&](fir::DoConcurrentOp op) { + return concurrentLoopsToSkip.contains(op); + }); target.markUnknownOpDynamicallyLegal( [](mlir::Operation *) { return true; }); diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir index d7fcc40e4a7f9..0ca48943864c8 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.mlir +++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir @@ -1,8 +1,6 @@ // RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { - %0 = fir.alloca i32 {bindc_name = "i"} - %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) %2 = fir.address_of(@_QFEa) : !fir.ref> %c10 = arith.constant 10 : index %3 = fir.shape %c10 : (index) -> !fir.shape<1> @@ -14,15 +12,19 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas %c1 = arith.constant 1 : index // expected-error@+2 {{not yet implemented: Mapping `do concurrent` loops to device}} - // expected-error@below {{failed to legalize operation 'fir.do_loop'}} - fir.do_loop %arg0 = %7 to %8 step %c1 unordered { - %13 = fir.convert %arg0 : (index) -> i32 - fir.store %13 to %1#1 : !fir.ref - %14 = fir.load %1#0 : !fir.ref - %15 = fir.load %1#0 : !fir.ref - %16 = fir.convert %15 : (i32) -> i64 - %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref - hlfir.assign %14 to %17 : i32, !fir.ref + // expected-error@below {{failed to legalize operation 'fir.do_concurrent'}} + fir.do_concurrent { + %0 = fir.alloca i32 {bindc_name = "i"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) { + %13 = fir.convert %arg0 : (index) -> i32 + fir.store %13 to %1#1 : !fir.ref + %14 = fir.load 
%1#0 : !fir.ref + %15 = fir.load %1#0 : !fir.ref + %16 = fir.convert %15 : (i32) -> i64 + %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref + hlfir.assign %14 to %17 : i32, !fir.ref + } } return diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index b84d4481ac766..12f63031cbaee 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ diff --git a/flang/test/Transforms/DoConcurrent/basic_host.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir index b53ecd687c039..5425829404d7b 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.mlir +++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir @@ -6,8 +6,6 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - %0 = fir.alloca i32 {bindc_name = "i"} - %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) %2 = fir.address_of(@_QFEa) : !fir.ref> %c10 = arith.constant 10 : index %3 = fir.shape %c10 : (index) -> !fir.shape<1> @@ -18,7 +16,7 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas %8 = fir.convert %c10_i32 : (i32) -> index %c1 = arith.constant 1 : index - // CHECK-NOT: fir.do_loop + // CHECK-NOT: fir.do_concurrent // CHECK: %[[C1:.*]] = arith.constant 1 : i32 // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index @@ -46,17 +44,21 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas // CHECK-NEXT: omp.terminator // CHECK-NEXT: } - fir.do_loop %arg0 = %7 
to %8 step %c1 unordered { - %13 = fir.convert %arg0 : (index) -> i32 - fir.store %13 to %1#1 : !fir.ref - %14 = fir.load %1#0 : !fir.ref - %15 = fir.load %1#0 : !fir.ref - %16 = fir.convert %15 : (i32) -> i64 - %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref - hlfir.assign %14 to %17 : i32, !fir.ref + fir.do_concurrent { + %0 = fir.alloca i32 {bindc_name = "i"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%7) to (%8) step (%c1) { + %13 = fir.convert %arg0 : (index) -> i32 + fir.store %13 to %1#1 : !fir.ref + %14 = fir.load %1#0 : !fir.ref + %15 = fir.load %1#0 : !fir.ref + %16 = fir.convert %15 : (i32) -> i64 + %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref + hlfir.assign %14 to %17 : i32, !fir.ref + } } - // CHECK-NOT: fir.do_loop + // CHECK-NOT: fir.do_concurrent return } diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 index 4e13c0919589a..f82696669eca6 100644 --- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 +++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests that "loop-local values" are properly handled by localizing them to the ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue` ! for a definition of "loop-local values" and how they are handled. diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 deleted file mode 100644 index adc4a488d1ec9..0000000000000 --- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 +++ /dev/null @@ -1,92 +0,0 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - -! Tests loop-nest detection algorithm for do-concurrent mapping. - -! REQUIRES: asserts - -! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host \ -! RUN: -mmlir -debug -mmlir -mlir-disable-threading %s -o - 2> %t.log || true - -! RUN: FileCheck %s < %t.log - -program main - implicit none - -contains - -subroutine foo(n) - implicit none - integer :: n, m - integer :: i, j, k - integer :: x - integer, dimension(n) :: a - integer, dimension(n, n, n) :: b - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested - do concurrent(i=1:n, j=1:bar(n*m, n/m)) - a(i) = n - end do - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested - do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m)) - a(i) = n - end do - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested - do concurrent(i=bar(n, x):n) - do concurrent(j=1:bar(n*m, n/m)) - a(i) = n - end do - end do - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested - do concurrent(i=1:n) - x = 10 - do concurrent(j=1:m) - b(i,j,k) = i * j + k - end do - end do - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested - do concurrent(i=1:n) - do concurrent(j=1:m) - b(i,j,k) = i * j + k - end do - x = 10 - end do - - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested - do concurrent(i=1:n) - do concurrent(j=1:m) - b(i,j,k) = i * j + k - x = 10 - end do - end do - - ! Verify the (i,j) and (j,k) pairs of loops are detected as perfectly nested. - ! - ! CHECK: Loop pair starting at location - ! CHECK: loc("{{.*}}":[[# @LINE + 3]]:{{.*}}) is perfectly nested - ! CHECK: Loop pair starting at location - ! 
CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested - do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m), k=1:bar(n*m, bar(n*m, n/m))) - a(i) = n - end do -end subroutine - -pure function bar(n, m) - implicit none - integer, intent(in) :: n, m - integer :: bar - - bar = n + m -end function - -end program main diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index 26800678d381c..d0210726de83e 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests mapping of a `do concurrent` loop with multiple iteration ranges. ! RUN: split-file %s %t diff --git a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 index 23a3aae976c07..cd1bd4f98a3f5 100644 --- a/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 +++ b/flang/test/Transforms/DoConcurrent/non_const_bounds.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 index d1c02101318ab..74799359e0476 100644 --- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -1,6 +1,3 @@ -! Fails until we update the pass to use the `fir.do_concurrent` op. -! XFAIL: * - ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that ! we skip converting the not-perfectly nested `do concurrent` loop. @@ -22,23 +19,24 @@ program main end do end -! 
CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} -! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] - -! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"} -! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] - ! CHECK: omp.parallel { ! CHECK: omp.wsloop { ! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { -! CHECK: fir.do_loop %[[J_IV:.*]] = {{.*}} { -! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32 +! CHECK: fir.do_concurrent { + +! CHECK: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"} +! CHECK: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]] + +! CHECK: %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"} +! CHECK: %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]] + +! CHECK: fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} { +! CHECK: %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32 ! CHECK: fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0 -! CHECK: fir.do_loop %[[K_IV:.*]] = {{.*}} { ! CHECK: %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32 -! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0 +! CHECK: fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0 ! CHECK: } ! CHECK: } ! CHECK: omp.yield