From f2b287279f02e452e7600345bc4e3fd9aee096bd Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Sat, 11 May 2024 13:38:36 -0700 Subject: [PATCH 01/11] [mlir][SCF] Allow tiling by specifying maximum number of tiles. --- .../Linalg/TransformOps/LinalgTransformOps.h | 6 +- .../Dialect/Linalg/Transforms/Transforms.h | 27 +- .../SCF/Transforms/TileUsingInterface.h | 35 ++- .../TransformOps/LinalgTransformOps.cpp | 47 ++- mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp | 182 ------------ .../SCF/Transforms/TileUsingInterface.cpp | 271 +++++++++++++----- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 1 - .../TestTilingInterfaceTransformOps.cpp | 6 +- 8 files changed, 272 insertions(+), 303 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h index 3af642752724c..db25c9b241734 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h @@ -30,6 +30,10 @@ class GenericOp; class LinalgOp; } // namespace linalg +namespace scf { +struct SCFTilingResult; +} // namespace scf + namespace tensor { class InsertSliceOp; class PackOp; @@ -60,7 +64,7 @@ tileToForallOpImpl(RewriterBase &rewriter, transform::TransformState &state, ArrayRef mixedNumThreads, ArrayRef mixedTileSizes, std::optional mapping, - linalg::ForallTilingResult &tilingResult); + scf::SCFTilingResult &tilingResult); } // namespace transform } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 0c7a8edff222f..248e626346b52 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -866,31 +866,8 @@ FailureOr computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions); 
-/// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying -/// tiling by `numThreads`. -/// If non-empty, the `mapping` is added as an attribute to the -/// resulting `scf.forall`. -/// Zero tile sizes indicate that the dimension is not tiled, and can be -/// thought of as tiling by the full size of data. It is the user's -/// responsibility to ensure that `numThreads` is a valid tiling specification -/// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case). -struct ForallTilingResult { - Operation *tileOp; - Operation *tiledOp; -}; -FailureOr tileToForallOp(RewriterBase &builder, - TilingInterface op, - ArrayRef numThreads, - std::optional mapping); - -/// Same as `tileToForallOp`, but calculate the number of threads -/// required using the given tileSizes. -FailureOr -tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op, - ArrayRef tileSizes, - std::optional mapping); - -/// Transformation information returned after reduction tiling. + + /// Transformation information returned after reduction tiling. struct ForallReductionTilingResult { /// The partial reduction tiled op generated. SmallVector parallelTiledOps; diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index d68ca11207376..23f63ca0906ab 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -32,9 +32,13 @@ using SCFTileSizeComputationFunction = /// Options to use to control tiling. struct SCFTilingOptions { - /// Computation function that returns the tile sizes for each operation. - /// Delayed construction of constant tile sizes should occur to interoperate - /// with folding. + /// Computation function that returns the tile sizes to use for each loop. + /// Returning a tile size of zero implies no tiling for that loop. 
If the + /// size of the returned vector is smaller than the number of loops, the inner + /// loops are not tiled. If the size of the returned vector is larger, then + /// the vector is truncated to number of loops. Only one of + /// `tileSizeComputationFunction` or `maxNumTilesComputationFunction` should + /// be used. SCFTileSizeComputationFunction tileSizeComputationFunction = nullptr; SCFTilingOptions & @@ -45,7 +49,25 @@ struct SCFTilingOptions { /// Convenience function to set the `tileSizeComputationFunction` to a /// function that computes tile sizes at the point they are needed. Allows /// proper interaction with folding. - SCFTilingOptions &setTileSizes(ArrayRef ts); + SCFTilingOptions &setTileSizes(ArrayRef tileSizes); + + /// Computation function that returns the maximum number of tiles to use for + /// each loop. Returning a tile size of zero implies no tiling for that loop. + /// If the size of the returned vector is smaller than the number of loops, + /// the inner loops are not tiled. If the size of the returned vector is + /// larger, then the vector is truncated to number of loops. Only one of + /// `tileSizeComputationFunction` or `maxNumTilesComputationFunction` should + /// be used. + SCFTileSizeComputationFunction maxNumTilesComputationFunction = nullptr; + + SCFTilingOptions & + setMaxNumTilesComputationFunction(SCFTileSizeComputationFunction fun) { + maxNumTilesComputationFunction = std::move(fun); + return *this; + } + /// Convenience function to set the `maxNumTilesComputationFunction` to a + /// function that returns the maximum number of tiles at the point needed. + SCFTilingOptions &setMaxNumTiles(ArrayRef numTiles); /// The interchange vector to reorder the tiled loops. 
SmallVector interchangeVector = {}; @@ -67,9 +89,8 @@ struct SCFTilingOptions { /// when using loop constructs that dont support such a mapping (like /// `scf.for`) SmallVector mappingVector = {}; - SCFTilingOptions &setMapping(ArrayRef mapping) { - mappingVector = llvm::map_to_vector( - mapping, [](auto attr) -> Attribute { return attr; }); + SCFTilingOptions &setMapping(ArrayRef mapping) { + mappingVector = llvm::to_vector(mapping); return *this; } }; diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index b611347b8de2e..3cbe3d5b8e78d 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3156,7 +3156,7 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( TransformOpInterface transformOp, Operation *target, ArrayRef mixedNumThreads, ArrayRef mixedTileSizes, std::optional mapping, - linalg::ForallTilingResult &tilingResult) { + scf::SCFTilingResult &tilingResult) { // Transform all targets one by one. 
auto tileableOp = dyn_cast(target); if (!tileableOp) { @@ -3167,18 +3167,38 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( return diag; } rewriter.setInsertionPoint(tileableOp); - FailureOr maybeTilingResult = failure(); + scf::SCFTilingOptions options; + options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); if (!mixedNumThreads.empty()) { - maybeTilingResult = - linalg::tileToForallOp(rewriter, tileableOp, mixedNumThreads, mapping); + options.setMaxNumTiles(mixedNumThreads); } else { - maybeTilingResult = linalg::tileToForallOpUsingTileSizes( - rewriter, tileableOp, mixedTileSizes, mapping); + SmallVector loopRanges = tileableOp.getIterationDomain(rewriter); + unsigned nLoops = loopRanges.size(); + SmallVector numThreads; + numThreads.reserve(nLoops); + AffineExpr s0, s1; + bindSymbols(rewriter.getContext(), s0, s1); + AffineExpr divExpr = s0.ceilDiv(s1); + for (int i = 0, e = std::min(mixedTileSizes.size(), loopRanges.size()); + i < e; ++i) { + OpFoldResult numTiles = mixedTileSizes[i]; + if (!isConstantIntValue(numTiles, 0)) + numTiles = affine::makeComposedFoldedAffineApply( + rewriter, tileableOp.getLoc(), divExpr, + {loopRanges[i].size, numTiles}); + numThreads.push_back(numTiles); + } + options.setMaxNumTiles(numThreads); + } + if (mapping) { + options.setMapping(mapping.value().getValue()); } + FailureOr maybeTilingResult = + scf::tileUsingSCF(rewriter, tileableOp, options); if (failed(maybeTilingResult)) return transformOp.emitDefaultSilenceableFailure(tileableOp); - rewriter.replaceOp(tileableOp, maybeTilingResult->tileOp->getResults()); + rewriter.replaceOp(tileableOp, maybeTilingResult->replacements); tilingResult = *maybeTilingResult; return DiagnosedSilenceableFailure::success(); @@ -3214,14 +3234,14 @@ DiagnosedSilenceableFailure transform::TileUsingForallOp::apply( return status; for (Operation *target : state.getPayloadOps(getTarget())) { - linalg::ForallTilingResult tilingResult; + scf::SCFTilingResult tilingResult; 
DiagnosedSilenceableFailure diag = tileToForallOpImpl( rewriter, state, transformOp, target, mixedNumThreads, mixedTileSizes, getMapping(), tilingResult); if (!diag.succeeded()) return diag; - tileOps.push_back(tilingResult.tileOp); - tiledOps.push_back(tilingResult.tiledOp); + tileOps.push_back(tilingResult.loops.front()); + tiledOps.append(tilingResult.tiledOps); } transformResults.set(cast(getForallOp()), tileOps); @@ -3699,7 +3719,7 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne( // OpBuilder only used to compute attributes. OpBuilder b(getContext()); - linalg::ForallTilingResult tilingResult; + scf::SCFTilingResult tilingResult; DiagnosedSilenceableFailure diag = tileToForallOpImpl( /*rewriter=*/rewriter, /*state=*/state, @@ -3712,8 +3732,9 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne( if (!diag.succeeded()) return diag; - results.push_back(tilingResult.tileOp); - results.push_back(tilingResult.tiledOp); + results.push_back(tilingResult.loops.front()); + for (auto op : tilingResult.tiledOps) + results.push_back(op); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp index 8ef8651646829..fb6ab2055e7dd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -435,188 +435,6 @@ static void calculateTileOffsetsAndSizes( } } -/// Returns a vector of bools representing if, for each axis, `op` can be tiled -/// without incurring in a race condition and thus it is thread-safe to do the -/// tiling. This is checked by iterating over numThreads and ensuring that the -/// corresponding iterator type is "parallel". If it is not, then we know that -/// such dimension is unsafe to tile. 
-SmallVector safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp, - ArrayRef numThreads) { - auto iterators = linalgOp.getIteratorTypesArray(); - SmallVector safeToTile(numThreads.size(), true); - - for (unsigned i = 0, e = numThreads.size(); i != e; i++) { - if (auto attr = llvm::dyn_cast_if_present(numThreads[i])) { - if (cast(attr).getValue().getSExtValue() > 1) { - safeToTile[i] = iterators[i] == utils::IteratorType::parallel; - } - } else { - safeToTile[i] = iterators[i] == utils::IteratorType::parallel; - } - } - return safeToTile; -} - -/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The -/// tiling is specified by the number of tiles/threads `numThreads` and the -/// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is -/// not specified, then it is derived from `numThreads` as `ceilDiv(dimSize[i], -/// numThreads[i])`. If non-empty, the `mapping` is added as an -/// attribute to the resulting `scf.forall`. A zero tile sizes indicate -/// that the dimension is not tiled, and can be thought of as tiling by the full -/// size of data. -/// It is the user's responsibility to ensure that `numThreads` is a valid -/// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the -/// Linalg case). If the dimension is not parallelizable, a warning is issued to -/// notify the user that the generated code is not safe to parallelize. If -/// `omitTileOffsetBoundsCheck` is true, then the function will assume that -/// `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds. 
-static FailureOr tileToForallOpImpl( - RewriterBase &b, TilingInterface op, ArrayRef numThreads, - std::optional> nominalTileSizes, - std::optional mapping, bool omitTileOffsetBoundsCheck) { - Location loc = op->getLoc(); - OpBuilder::InsertionGuard g(b); - - SmallVector loopRanges = op.getIterationDomain(b); - if (loopRanges.empty()) - return op->emitOpError("expected non-empty loop ranges"); - auto hasStrideOne = [](Range r) { return !isConstantIntValue(r.stride, 1); }; - if (llvm::any_of(loopRanges, hasStrideOne)) - return op->emitOpError("only stride-1 supported atm"); - - // Gather destination tensors. - SmallVector dest; - if (failed(tensor::getOrCreateDestinations(b, loc, op, dest))) - return op->emitOpError("failed to get destination tensors"); - - SmallVector nonZeroNumThreads = - llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) { - return !isConstantIntValue(ofr, 0); - })); - SmallVector materializedNonZeroNumThreads = - llvm::to_vector(llvm::map_range(nonZeroNumThreads, [&](OpFoldResult ofr) { - return getValueOrCreateConstantIndexOp(b, loc, ofr); - })); - - LinalgOp linalgOp = dyn_cast(op.getOperation()); - if (linalgOp) { - // Check if tiling is thread safe and print a warning if not. - SmallVector tilingSafety = - safeToTileToForall(b.getContext(), linalgOp, numThreads); - for (size_t i = 0; i < tilingSafety.size(); i++) - if (!tilingSafety[i]) - op.emitWarning() << "tiling is not thread safe at axis #" << i; - } - - // 1. Create the ForallOp. We don't use the lambda body-builder - // version because we require the use of RewriterBase in the body, so we - // manually move the insertion point to the body below. - scf::ForallOp forallOp = b.create( - loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping); - - // 2. Fill out the ForallOp body. 
- SmallVector tiledOffsets, tiledSizes; - calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges, - omitTileOffsetBoundsCheck, nominalTileSizes, - tiledOffsets, tiledSizes); - - // 3. Clone the tileable op and update its destination operands to use the - // output bbArgs of the ForallOp. - ArrayRef destBbArgs = forallOp.getRegionIterArgs(); - Operation *tiledOp = nullptr; - SmallVector tiledValues; - { - // 3.a. RAII guard, inserting within forallOp, before terminator. - OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(forallOp.getTerminator()); - Operation *clonedOp = b.clone(*op.getOperation()); - auto destinationStyleOp = dyn_cast(clonedOp); - if (destinationStyleOp) { - for (OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) { - // Swap tensor inits with the corresponding block argument of the - // scf.forall op. Memref inits remain as is. - if (isa(outOperand.get().getType())) { - auto *it = llvm::find(dest, outOperand.get()); - assert(it != dest.end() && "could not find destination tensor"); - unsigned destNum = std::distance(dest.begin(), it); - outOperand.set(destBbArgs[destNum]); - } - } - } - - // 4. Tile the cloned op and delete the clone. - FailureOr tilingResult = - cast(clonedOp).getTiledImplementation(b, tiledOffsets, - tiledSizes); - if (failed(tilingResult)) - return clonedOp->emitError("Failed to tile op: "); - if (tilingResult->tiledOps.size() != 1) { - return clonedOp->emitError("expected a single produced tiled op, got ") - << tilingResult->tiledOps.size(); - } - - b.eraseOp(clonedOp); - tiledOp = tilingResult->tiledOps.front(); - tiledValues = tilingResult->tiledValues; - } - - // 5. Parallel insert back into the result tensor. - for (auto it : llvm::zip(llvm::seq(unsigned(0), unsigned(dest.size())), - tiledValues, destBbArgs)) { - // 5.a. Partial subset information is inserted just before the terminator. 
- OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(forallOp.getTerminator()); - - SmallVector resultOffsets, resultSizes; - if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets, - tiledSizes, resultOffsets, - resultSizes))) - return op->emitOpError("output offsets couldn't be calculated"); - SmallVector strides(resultSizes.size(), b.getIndexAttr(1)); - - // 5.b. Parallel insertions are inserted at the end of the combining - // terminator. - b.setInsertionPointToEnd(forallOp.getTerminator().getBody()); - b.create(loc, std::get<1>(it), - std::get<2>(it), resultOffsets, - resultSizes, strides); - } - return ForallTilingResult{forallOp, tiledOp}; -} - -FailureOr -linalg::tileToForallOp(RewriterBase &b, TilingInterface op, - ArrayRef numThreads, - std::optional mapping) { - return tileToForallOpImpl(b, op, numThreads, - /*nominalTileSizes=*/std::nullopt, mapping, - /*omitTileOffsetBoundsCheck=*/false); -} - -FailureOr -linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op, - ArrayRef tileSizes, - std::optional mapping) { - SmallVector loopRanges = op.getIterationDomain(b); - unsigned nLoops = loopRanges.size(); - SmallVector numThreads; - numThreads.reserve(nLoops); - AffineExpr s0, s1; - bindSymbols(b.getContext(), s0, s1); - AffineExpr divExpr = s0.ceilDiv(s1); - for (const auto &it : llvm::zip(tileSizes, loopRanges)) { - OpFoldResult numTiles = std::get<0>(it); - if (!isConstantIntValue(numTiles, 0)) - numTiles = makeComposedFoldedAffineApply( - b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)}); - numThreads.push_back(numTiles); - } - return tileToForallOpImpl(b, op, numThreads, - /*nominalTileSizes=*/tileSizes, mapping, - /*omitTileOffsetBoundsCheck=*/true); -} - template static FailureOr tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef tileSizes, diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index a1392813d6de3..3acae673449c8 
100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -42,6 +42,16 @@ scf::SCFTilingOptions::setTileSizes(ArrayRef ts) { return *this; } +scf::SCFTilingOptions & +scf::SCFTilingOptions::setMaxNumTiles(ArrayRef mnt) { + assert(!maxNumTilesComputationFunction && "max num tiles already set"); + auto maxNumTiles = llvm::to_vector(mnt); + maxNumTilesComputationFunction = [maxNumTiles](OpBuilder &b, Operation *op) { + return maxNumTiles; + }; + return *this; +} + /// Helper method to adjust the interchange vector to match the iteration /// domain. static SmallVector @@ -61,6 +71,85 @@ fillInterchangeVector(ArrayRef interchangeVector, // tileUsingSCF implementation. //===----------------------------------------------------------------------===// +/// Verify the tile size options are set in a consistent manner. +static LogicalResult +verifyTileSizeOptions(RewriterBase &rewriter, Location loc, + const scf::SCFTilingOptions &options) { + if (!options.tileSizeComputationFunction && + !options.maxNumTilesComputationFunction) { + return rewriter.notifyMatchFailure( + loc, "at least one of tile size computation function or max num tiles " + "computation must be specified."); + } + if (options.tileSizeComputationFunction && + options.maxNumTilesComputationFunction) { + return rewriter.notifyMatchFailure( + loc, "only one of tile size computation function or max num tiles " + "computation function can be specified"); + } + + // If specified, check that the interchange vector is a permutation. + if (!options.interchangeVector.empty()) { + if (!isPermutationVector(options.interchangeVector)) { + return rewriter.notifyMatchFailure( + loc, "invalid interchange vector, not a permutation of the entire " + "iteration space"); + } + } + return success(); +} + +/// Compute the tile sizes and num tiles values. The `numTiles` +/// is empty if the `maxNumTilesComputationFunction` is not specified. 
+static std::tuple, SmallVector> +getTileSizesAndNumTiles(RewriterBase &rewriter, TilingInterface op, + ArrayRef iterationDomain, + const scf::SCFTilingOptions &options) { + SmallVector tileSizes, numTiles; + + // Enforce the convention that "tiling by zero" + // skips tiling a particular dimension. This convention is significantly + // simpler to handle instead of adjusting affine maps to account for missing + // dimensions. + auto numLoops = iterationDomain.size(); + if (options.tileSizeComputationFunction) { + tileSizes = options.tileSizeComputationFunction(rewriter, op); + tileSizes.resize(numLoops, rewriter.getIndexAttr(0)); + return {tileSizes, numTiles}; + } + + assert(options.maxNumTilesComputationFunction && + "expected at least one of tile sizes computation function or max num " + "tiles computation function"); + // Enforce the convention that "maxNumTiles of zero" + // skips tiling a particular dimension. This convention is significantly + // simpler to handle instead of adjusting affine maps to account for missing + // dimensions. 
+ SmallVector maxNumTiles = + options.maxNumTilesComputationFunction(rewriter, op); + maxNumTiles.resize(numLoops, rewriter.getIndexAttr(0)); + + // Use the maxNumTiles to compute the tile sizes as + // - niters = ceilDiv(ub - lb, step) + // - tileSize = ceilDiv(niters, maxNumTiles) + AffineExpr s0, s1, s2, s3; + bindSymbols(rewriter.getContext(), s0, s1, s2, s3); + AffineExpr numIters = (s1 - s0).ceilDiv(s2); + AffineExpr tileSizeExpr = numIters.ceilDiv(s3); + tileSizes.resize(numLoops, rewriter.getIndexAttr(0)); + for (auto [index, maxNumTile] : llvm::enumerate(maxNumTiles)) { + if (isConstantIntValue(maxNumTile, 0)) + continue; + + tileSizes[index] = affine::makeComposedFoldedAffineApply( + rewriter, op.getLoc(), tileSizeExpr, + {iterationDomain[index].offset, iterationDomain[index].size, + iterationDomain[index].stride, maxNumTile}); + } + + return {tileSizes, maxNumTiles}; +} + // Check if `stride` evenly divides the trip count `size - offset`. static bool tileDividesIterationDomain(Range loopRange) { std::optional offsetAsInt = getConstantIntValue(loopRange.offset); @@ -100,6 +189,46 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, b, loc, minMap, SmallVector{iv, tileSize, size}); } +/// Compute the tile offsets and sizes. 
+static std::tuple, SmallVector> +getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, + ArrayRef iterationDomain, + ArrayRef tileSizes, bool isLoopNormalized) { + SmallVector offsets, sizes; + int materializedLoopNum = 0; + + AffineExpr d0, s0, s1, s2; + AffineExpr offsetExpr; + if (isLoopNormalized) { + bindDims(rewriter.getContext(), d0); + bindSymbols(rewriter.getContext(), s0, s1, s2); + offsetExpr = s0 + d0 * s1 * s2; + } + + for (auto [tileSize, loopRange] : + llvm::zip_equal(tileSizes, iterationDomain)) { + if (isConstantIntValue(tileSize, 0)) { + offsets.push_back(loopRange.offset); + sizes.push_back(loopRange.size); + continue; + } + // If loop is normalized, the offset is (lb + iv * step * tileSize) + Value iv = ivs[materializedLoopNum++]; + OpFoldResult offset; + if (isLoopNormalized) { + offset = affine::makeComposedFoldedAffineApply( + rewriter, loc, offsetExpr, + ArrayRef{iv, loopRange.offset, loopRange.stride, + tileSize}); + } else { + offset = getAsOpFoldResult(iv); + } + offsets.push_back(offset); + sizes.push_back(getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize)); + } + return {offsets, sizes}; +} + /// A function that allows returning additional yielded values during /// `yieldTiledValuesAndReplace`. /// - `ivs` induction variable for the loop. @@ -145,8 +274,8 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter, /// populated. 
static LogicalResult generateLoopNestUsingForOp( RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ValueRange destinationTensors, - YieldTiledValuesFn yieldTiledValuesFn, + ArrayRef tileSizes, ArrayRef numTiles, + ValueRange destinationTensors, YieldTiledValuesFn yieldTiledValuesFn, SmallVector &loops) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); assert(loopRanges.size() == tileSizes.size() && @@ -154,15 +283,30 @@ static LogicalResult generateLoopNestUsingForOp( OpBuilder::InsertionGuard guard(rewriter); SmallVector ivs; - for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) { + Value zero, one; + if (!numTiles.empty()) { + zero = rewriter.create(loc, 0); + ; + one = rewriter.create(loc, 1); + } + + for (auto [index, loopRange, tileSize] : + llvm::enumerate(loopRanges, tileSizes)) { // No loops if tile size is zero. Set offset and size to the loop // offset and size. if (isConstantIntValue(tileSize, 0)) continue; - Value lb = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.offset); - Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.size); - Value step = getValueOrCreateConstantIndexOp(rewriter, loc, tileSize); + Value lb, ub, step; + if (numTiles.empty()) { + lb = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.offset); + ub = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.size); + step = getValueOrCreateConstantIndexOp(rewriter, loc, tileSize); + } else { + lb = zero; + ub = getValueOrCreateConstantIndexOp(rewriter, loc, numTiles[index]); + step = one; + } auto loop = rewriter.create(loc, lb, ub, step, destinationTensors, [](OpBuilder &bodyBuilder, Location bodyLoc, @@ -224,32 +368,45 @@ static LogicalResult generateLoopNestUsingForOp( /// populated. 
static LogicalResult generateLoopNestUsingForallOp( RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ArrayRef mappingVector, - ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn, - SmallVector &loops) { - SmallVector lbs, ubs, steps; + ArrayRef tileSizes, ArrayRef numTiles, + ArrayRef mappingVector, ValueRange destinationTensors, + YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); assert(loopRanges.size() == tileSizes.size() && "expected as many tile sizes as loop ranges"); + assert((numTiles.empty() || numTiles.size() == loopRanges.size()) && + "expected max number of tiles to be either empty or equal to number " + "of loops"); OpBuilder::InsertionGuard guard(rewriter); SmallVector offsets(loopRanges.size()), sizes(loopRanges.size()); - for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, loopRanges)) { - if (isConstantIntValue(tileSize, 0)) - continue; - lbs.push_back(loopRange.offset); - ubs.push_back(loopRange.size); - steps.push_back(tileSize); - } - assert(!lbs.empty() && "Expected at least one loop range"); - std::optional mappingAttr; if (!mappingVector.empty()) mappingAttr = rewriter.getArrayAttr(mappingVector); - auto forallOp = rewriter.create( - loc, lbs, ubs, steps, destinationTensors, mappingAttr); + scf::ForallOp forallOp; + SmallVector lbs, ubs, steps; + if (numTiles.empty()) { + for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, loopRanges)) { + if (isConstantIntValue(tileSize, 0)) + continue; + lbs.push_back(loopRange.offset); + ubs.push_back(loopRange.size); + steps.push_back(tileSize); + } + assert(!lbs.empty() && "Expected at least one loop range"); + forallOp = rewriter.create(loc, lbs, ubs, steps, + destinationTensors, mappingAttr); + } else { + SmallVector numThreads; + for (auto maxNumTile : numTiles) { + if (!isConstantIntValue(maxNumTile, 0)) + numThreads.push_back(maxNumTile); + } + forallOp = 
rewriter.create(loc, numThreads, + destinationTensors, mappingAttr); + } loops.push_back(forallOp); rewriter.setInsertionPoint(forallOp.getTerminator()); @@ -286,13 +443,11 @@ static LogicalResult generateLoopNestUsingForallOp( /// loop. /// - `loops` is an in-out parameter into which the generated loops are /// populated. -static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc, - const scf::SCFTilingOptions &options, - ArrayRef loopRanges, - ArrayRef tileSizes, - ValueRange destinationTensors, - YieldTiledValuesFn tiledBodyFn, - SmallVector &loops) { +static LogicalResult generateLoopNest( + RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options, + ArrayRef loopRanges, ArrayRef tileSizes, + ArrayRef numTiles, ValueRange destinationTensors, + YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { // If the tile sizes are all zero, no loops are generated. Just call the // callback function to handle untiled case. if (llvm::all_of(tileSizes, isZeroIndex)) { @@ -303,11 +458,12 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc, } if (options.loopType == scf::SCFTilingOptions::LoopType::ForOp) { return generateLoopNestUsingForOp(rewriter, loc, loopRanges, tileSizes, - destinationTensors, tiledBodyFn, loops); + numTiles, destinationTensors, tiledBodyFn, + loops); } if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) { return generateLoopNestUsingForallOp( - rewriter, loc, loopRanges, tileSizes, options.mappingVector, + rewriter, loc, loopRanges, tileSizes, numTiles, options.mappingVector, destinationTensors, tiledBodyFn, loops); } return rewriter.notifyMatchFailure(loc, "unhandled loop type"); @@ -531,28 +687,20 @@ static LogicalResult addInitOperandsToLoopNest( FailureOr mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const scf::SCFTilingOptions &options) { + if (failed(verifyTileSizeOptions(rewriter, op.getLoc(), options))) { + return failure(); + } + 
OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointAfter(op); - if (!options.tileSizeComputationFunction) { - return rewriter.notifyMatchFailure( - op, "missing tile size computation function"); - } - // 1. Get the range of the loops that are represented by the operation. SmallVector iterationDomain = op.getIterationDomain(rewriter); - size_t numLoops = iterationDomain.size(); - // 2. Materialize the tile sizes. Enforce the convention that "tiling by zero" - // skips tiling a particular dimension. This convention is significantly - // simpler to handle instead of adjusting affine maps to account for missing - // dimensions. - SmallVector tileSizes = - options.tileSizeComputationFunction(rewriter, op); - if (tileSizes.size() < iterationDomain.size()) { - auto zero = rewriter.getIndexAttr(0); - tileSizes.append(numLoops - tileSizes.size(), zero); - } + // 2. Materialize the tile sizes or max num tiles; + SmallVector tileSizes, numTiles; + std::tie(tileSizes, numTiles) = + getTileSizesAndNumTiles(rewriter, op, iterationDomain, options); // 3. If there is an interchange specified, permute the iteration domain and // the tile sizes. 
@@ -560,16 +708,13 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, if (!options.interchangeVector.empty()) { interchangeVector = fillInterchangeVector(options.interchangeVector, iterationDomain.size()); - } - if (!interchangeVector.empty()) { - if (!isPermutationVector(interchangeVector)) { - return rewriter.notifyMatchFailure( - op, "invalid intechange vector, not a permutation of the entire " - "iteration space"); - } + assert(isPermutationVector(interchangeVector) && + "expected interchange vector to be a permutation"); applyPermutationToVector(iterationDomain, interchangeVector); applyPermutationToVector(tileSizes, interchangeVector); + if (!numTiles.empty()) + applyPermutationToVector(numTiles, interchangeVector); } FailureOr tilingResult; @@ -583,21 +728,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, -> LogicalResult { // 4a. Compute the `offsets` and `sizes` to use for tiling. SmallVector offsets, sizes; - { - int materializedLoopNum = 0; - for (auto [tileSize, loopRange] : - llvm::zip_equal(tileSizes, iterationDomain)) { - if (isConstantIntValue(tileSize, 0)) { - offsets.push_back(loopRange.offset); - sizes.push_back(loopRange.size); - continue; - } - Value iv = ivs[materializedLoopNum++]; - offsets.push_back(iv); - sizes.push_back( - getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize)); - } - } + std::tie(offsets, sizes) = getTileOffsetAndSizes( + rewriter, loc, ivs, iterationDomain, tileSizes, !numTiles.empty()); // 4b. If interchange was provided, apply inverse of the interchange // to get back the offsets/sizes in the order to be specified. @@ -665,7 +797,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 7. Generate the tiled loops nest using the callback defined above. 
SmallVector loops; if (failed(generateLoopNest(rewriter, op.getLoc(), options, iterationDomain, - tileSizes, destinationTensors, + tileSizes, numTiles, destinationTensors, innerYieldTiledValuesFn, loops))) return op.emitOpError("failed to generate tiling loops"); assert(succeeded(tilingResult) && @@ -781,6 +913,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, scf::SCFTilingOptions options; options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp); if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector, + /*numTiles=*/ArrayRef{}, initTensors, innerYieldTiledValuesFn, loops))) return b.notifyMatchFailure(op, "failed to tile for parallel reduction"); diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index 8545dfd25eccf..f33739f119eaf 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -177,7 +177,6 @@ module attributes {transform.with_named_sequence} { } } - // ----- // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp index 8f206d9077272..a99441cd7147b 100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp @@ -234,11 +234,7 @@ applyTileToAll(RewriterBase &rewriter, Operation *transformOp, scf::SCFTilingOptions tilingOptions; tilingOptions.setTileSizes(tileSizes).setInterchange(interchange); if (mapping) { - auto mappingAttrs = - llvm::map_to_vector(mapping.value(), [](Attribute attr) { - return cast(attr); - }); - tilingOptions.setMapping(mappingAttrs); + tilingOptions.setMapping(mapping.value().getValue()); } tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); From 2310b01b911ebc386c4f2254fd33fdef51bfde30 Mon 
Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Mon, 20 May 2024 23:17:56 -0700 Subject: [PATCH 02/11] Allow specifying both numThreads and tileSizes to keep the same existing semantics of distribution using number of threads. --- .../SCF/Transforms/TileUsingInterface.h | 28 +- .../TransformOps/LinalgTransformOps.cpp | 5 +- .../SCF/Transforms/TileUsingInterface.cpp | 266 ++++++++++-------- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 52 ++-- .../Dialect/Linalg/transform-op-tile.mlir | 29 +- .../tile-pad-using-interface.mlir | 10 +- .../TilingInterface/tile-using-interface.mlir | 50 ++-- 7 files changed, 234 insertions(+), 206 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index 23f63ca0906ab..9291c91cfa1ed 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -36,9 +36,7 @@ struct SCFTilingOptions { /// Returning a tile size of zero implies no tiling for that loop. If the /// size of the returned vector is smaller than the number of loops, the inner /// loops are not tiled. If the size of the returned vector is larger, then - /// the vector is truncated to number of loops. Only one of - /// `tileSizeComputationFunction` or `maxNumTilesComputationFunction` should - /// be used. + /// the vector is truncated to number of loops. SCFTileSizeComputationFunction tileSizeComputationFunction = nullptr; SCFTilingOptions & @@ -51,23 +49,25 @@ struct SCFTilingOptions { /// proper interaction with folding. SCFTilingOptions &setTileSizes(ArrayRef tileSizes); - /// Computation function that returns the maximum number of tile to use for - /// each loop. Returning a tile size of zero implies no tiling for that loop. - /// If the size of the returned vector is smaller than the number of loops, - /// the inner loops are not tiled. 
If the size of the returned vector is - /// larger, then the vector is truncated to number of loops. Only one of - /// `tileSizeComputationFunction` or `maxNumTilesComputationFunction` should - /// be used. - SCFTileSizeComputationFunction maxNumTilesComputationFunction = nullptr; + /// Computation function that returns the number of threads to use for + /// each loop. Returning a num threads of zero implies no tiling for that + /// loop. If the size of the returned vector is smaller than the number of + /// loops, the inner loops are not tiled. If the size of the returned vector + /// is larger, then the vector is truncated to number of loops. Note: This + /// option is only supported with loopType set to `LoopType::ForallOp`. If the + /// tile size function is not specified while the num threads computation is, + /// then the tile size is determined automatically to map at most one tile per + /// thread. + SCFTileSizeComputationFunction numThreadsComputationFunction = nullptr; SCFTilingOptions & - setMaxNumTilesComputationFunction(SCFTileSizeComputationFunction fun) { - maxNumTilesComputationFunction = std::move(fun); + setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun) { + numThreadsComputationFunction = std::move(fun); return *this; } /// Convenience function to set the `tileSizeComputationFunction` to a /// function that computes tile sizes at the point they are needed. - SCFTilingOptions &setMaxNumTiles(ArrayRef numTiles); + SCFTilingOptions &setNumThreads(ArrayRef numThreads); /// The interchange vector to reorder the tiled loops. 
SmallVector interchangeVector = {}; diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 3cbe3d5b8e78d..2fd4c9f48201e 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3170,7 +3170,7 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( scf::SCFTilingOptions options; options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); if (!mixedNumThreads.empty()) { - options.setMaxNumTiles(mixedNumThreads); + options.setNumThreads(mixedNumThreads); } else { SmallVector loopRanges = tileableOp.getIterationDomain(rewriter); unsigned nLoops = loopRanges.size(); @@ -3188,7 +3188,8 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( {loopRanges[i].size, numTiles}); numThreads.push_back(numTiles); } - options.setMaxNumTiles(numThreads); + options.setNumThreads(numThreads); + options.setTileSizes(mixedTileSizes); } if (mapping) { options.setMapping(mapping.value().getValue()); } diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 3acae673449c8..cb26b5bf44f4e 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -43,11 +43,11 @@ scf::SCFTilingOptions::setTileSizes(ArrayRef ts) { } scf::SCFTilingOptions & -scf::SCFTilingOptions::setMaxNumTiles(ArrayRef mnt) { - assert(!maxNumTilesComputationFunction && "max num tiles already set"); - auto maxNumTiles = llvm::to_vector(mnt); - maxNumTilesComputationFunction = [maxNumTiles](OpBuilder &b, Operation *op) { - return maxNumTiles; +scf::SCFTilingOptions::setNumThreads(ArrayRef nt) { + assert(!numThreadsComputationFunction && "num threads already set"); + auto numThreads = llvm::to_vector(nt); + numThreadsComputationFunction = [numThreads](OpBuilder &b, Operation *op) { + return numThreads;
}; return *this; } @@ -75,17 +75,12 @@ fillInterchangeVector(ArrayRef interchangeVector, static LogicalResult verifyTileSizeOptions(RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options) { - if (!options.tileSizeComputationFunction && - !options.maxNumTilesComputationFunction) { + // Specifying number of threads is only supported on `scf.forall` op. + if (options.numThreadsComputationFunction && + options.loopType != scf::SCFTilingOptions::LoopType::ForallOp) { return rewriter.notifyMatchFailure( - loc, "at least one of tile size computation function or max num tiles " - "computation must be specified."); - } - if (options.tileSizeComputationFunction && - options.maxNumTilesComputationFunction) { - return rewriter.notifyMatchFailure( - loc, "only one of tile size computation function or max num tiles " - "computation function can be specified"); + loc, "number of tiles/threads can only be specified when loop type is " + "set to use `scf.forall`"); } // If specified, check that the interchange vector is a permutation. @@ -99,58 +94,94 @@ verifyTileSizeOptions(RewriterBase &rewriter, Location loc, return success(); } -/// Compute the tile sizes and num tiles values. The `numTiles` -/// is empty if the `maxNumTilesComputationFunction` is not specified. +/// Compute the tile sizes and num threads values passed in. static std::tuple, SmallVector> -getTileSizesAndNumTiles(RewriterBase &rewriter, TilingInterface op, - ArrayRef iterationDomain, - const scf::SCFTilingOptions &options) { - SmallVector tileSizes, numTiles; +getTileSizes(RewriterBase &rewriter, TilingInterface op, + ArrayRef iterationDomain, + const scf::SCFTilingOptions &options) { + OpFoldResult zero = rewriter.getIndexAttr(0); + SmallVector tileSizes, numThreads; + size_t numLoops = iterationDomain.size(); + + // Check whether the number of threads to use is specified.
+ if (options.numThreadsComputationFunction) { + numThreads = options.numThreadsComputationFunction(rewriter, op); + numThreads.resize(numLoops, zero); + + // If the number of tiles is also specified, use that. + if (options.tileSizeComputationFunction) { + tileSizes = options.tileSizeComputationFunction(rewriter, op); + } else { + // Compute the tile sizes from the iteration domain and number + // of tiles as follows + // - niters = ceilDiv(ub - lb, step) + // - tileSize = ceilDiv(niters, numThreads) + AffineExpr s0, s1, s2, s3; + bindSymbols(rewriter.getContext(), s0, s1, s2, s3); + AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); + AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s3); + tileSizes.resize(numLoops, zero); + for (auto [index, range, nt] : + llvm::enumerate(iterationDomain, numThreads)) { + if (isConstantIntValue(nt, 0)) + continue; + + tileSizes[index] = affine::makeComposedFoldedAffineApply( + rewriter, op.getLoc(), tileSizeExpr, + {range.offset, range.size, range.stride, nt}); + } + } + tileSizes.resize(numLoops, zero); + return {tileSizes, numThreads}; + } // Enforce the convention that "tiling by zero" // skips tiling a particular dimension. This convention is significantly // simpler to handle instead of adjusting affine maps to account for missing // dimensions. - auto numLoops = iterationDomain.size(); if (options.tileSizeComputationFunction) { tileSizes = options.tileSizeComputationFunction(rewriter, op); - tileSizes.resize(numLoops, rewriter.getIndexAttr(0)); - return {tileSizes, numTiles}; } + tileSizes.resize(numLoops, zero); - assert(options.maxNumTilesComputationFunction && - "expected at least one of tile sizes cpomputation function or max num " - "tiles computation function"); - // Enforce the convention that "maxNumTiles to zero" - // skips tiling a particular dimension. This convention is significantly - // simpler to handle instead of adjusting affine maps to account for missing - // dimensions. 
- SmallVector maxNumTiles = - options.maxNumTilesComputationFunction(rewriter, op); - maxNumTiles.resize(numLoops, rewriter.getIndexAttr(0)); - - // Use the maxNumTiles to compute the tile sizes as - // - niters = ceilDiv(ub - lb, step) - // - tileSize = ceilDiv(niters, maxNumTiles) - AffineExpr s0, s1, s2, s3; - bindSymbols(rewriter.getContext(), s0, s1, s2, s3); - AffineExpr numIters = (s1 - s0).ceilDiv(s2); - AffineExpr tileSizeExpr = numIters.ceilDiv(s3); - tileSizes.resize(numLoops, rewriter.getIndexAttr(0)); - for (auto [index, maxNumTile] : llvm::enumerate(maxNumTiles)) { - if (isConstantIntValue(maxNumTile, 0)) + return {tileSizes, numThreads}; +} + +/// Checks if any of the tiled loops are not parallel. +static void checkSafeToTileToForall(TilingInterface op, + ArrayRef tileSizes, + ArrayRef numThreads) { + auto iterators = op.getLoopIteratorTypes(); + assert(iterators.size() == tileSizes.size() && + "expected as many tile size values as number of loops"); + assert((numThreads.empty() || (numThreads.size() == iterators.size())) && + "when specified, expected number of threads to use for each loop"); + + for (auto [index, iterator, tileSize] : + llvm::enumerate(iterators, tileSizes)) { + // If num threads is specified, check that it is greater than one only for + // parallel dimensions. 
+ if (!numThreads.empty()) { + if (std::optional constNumThreads = + getConstantIntValue(numThreads[index])) { + if (constNumThreads.value() > 1 && + iterator != utils::IteratorType::parallel) { + op.emitWarning() << "tiling is not thread safe at axis #" << index; + } + } continue; + } - tileSizes[index] = affine::makeComposedFoldedAffineApply( - rewriter, op.getLoc(), tileSizeExpr, - {iterationDomain[index].offset, iterationDomain[index].size, - iterationDomain[index].stride, maxNumTile}); + if (std::optional constTileSize = getConstantIntValue(tileSize)) { + if (constTileSize.value() > 0 && + iterator != utils::IteratorType::parallel) { + op.emitWarning() << "tiling is not thread safe at axis #" << index; + } + } } - - return {tileSizes, maxNumTiles}; } -// Check if `stride` evenly divides the trip count `size - offset`. +/// Check if `stride` evenly divides the trip count `size - offset`. static bool tileDividesIterationDomain(Range loopRange) { std::optional offsetAsInt = getConstantIntValue(loopRange.offset); if (!offsetAsInt) @@ -164,10 +195,10 @@ static bool tileDividesIterationDomain(Range loopRange) { return ((sizeAsInt.value() - offsetAsInt.value()) % strideAsInt.value() == 0); } -/// Returns the bounded tile size given the current `iv`, `loopRange` and -/// `tileSize`, i.e., `min(tileSize, range.end() - iv)`. +/// Returns the bounded tile size given the current `offset`, `loopRange` and +/// `tileSize`, i.e., `min(tileSize, range.end() - offset)`. 
static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, - Range loopRange, Value iv, + Range loopRange, OpFoldResult offset, OpFoldResult tileSize) { std::optional ts = getConstantIntValue(tileSize); if (ts && ts.value() == 1) @@ -186,7 +217,7 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, AffineMap minMap = AffineMap::get(1, 2, {s0, s1 - d0}, b.getContext()); Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size); return affine::makeComposedFoldedAffineMin( - b, loc, minMap, SmallVector{iv, tileSize, size}); + b, loc, minMap, SmallVector{offset, tileSize, size}); } /// Compute the tile offsets and sizes. @@ -224,11 +255,29 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, offset = getAsOpFoldResult(iv); } offsets.push_back(offset); - sizes.push_back(getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize)); + sizes.push_back( + getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize)); } return {offsets, sizes}; } +/// Function to return the bounds of the loops to be generated. +static std::tuple, SmallVector, + SmallVector> +getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef loopRanges, + ArrayRef tileSizes) { + SmallVector lbs, ubs, steps; + for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) { + // No loop if the tile size is 0. + if (isConstantIntValue(tileSize, 0)) + continue; + lbs.push_back(loopRange.offset); + ubs.push_back(loopRange.size); + steps.push_back(tileSize); + } + return {lbs, ubs, steps}; +} + /// A function that allows returning additional yielded values during /// `yieldTiledValuesAndReplace`. /// - `ivs` induction variable for the loop. @@ -274,39 +323,26 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter, /// populated. 
static LogicalResult generateLoopNestUsingForOp( RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ArrayRef numTiles, - ValueRange destinationTensors, YieldTiledValuesFn yieldTiledValuesFn, + ArrayRef tileSizes, ValueRange destinationTensors, + YieldTiledValuesFn yieldTiledValuesFn, SmallVector &loops) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); assert(loopRanges.size() == tileSizes.size() && "expected as many tile sizes as loop ranges"); OpBuilder::InsertionGuard guard(rewriter); - SmallVector ivs; - Value zero, one; - if (!numTiles.empty()) { - zero = rewriter.create(loc, 0); - ; - one = rewriter.create(loc, 1); - } - - for (auto [index, loopRange, tileSize] : - llvm::enumerate(loopRanges, tileSizes)) { - // No loops if tile size is zero. Set offset and size to the loop - // offset and size. - if (isConstantIntValue(tileSize, 0)) - continue; + SmallVector lbs, ubs, steps; + std::tie(lbs, ubs, steps) = + getLoopBounds(rewriter, loc, loopRanges, tileSizes); + SmallVector lbVals = + getValueOrCreateConstantIndexOp(rewriter, loc, lbs); + SmallVector ubVals = + getValueOrCreateConstantIndexOp(rewriter, loc, ubs); + SmallVector stepVals = + getValueOrCreateConstantIndexOp(rewriter, loc, steps); - Value lb, ub, step; - if (numTiles.empty()) { - lb = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.offset); - ub = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.size); - step = getValueOrCreateConstantIndexOp(rewriter, loc, tileSize); - } else { - lb = zero; - ub = getValueOrCreateConstantIndexOp(rewriter, loc, numTiles[index]); - step = one; - } + SmallVector ivs; + for (auto [lb, ub, step] : llvm::zip_equal(lbVals, ubVals, stepVals)) { auto loop = rewriter.create(loc, lb, ub, step, destinationTensors, [](OpBuilder &bodyBuilder, Location bodyLoc, @@ -368,15 +404,12 @@ static LogicalResult generateLoopNestUsingForOp( /// populated. 
static LogicalResult generateLoopNestUsingForallOp( RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ArrayRef numTiles, + ArrayRef tileSizes, ArrayRef numThreads, ArrayRef mappingVector, ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); assert(loopRanges.size() == tileSizes.size() && "expected as many tile sizes as loop ranges"); - assert((numTiles.empty() || numTiles.size() == loopRanges.size()) && - "expected max number of tiles to be either empty or equal to number " - "of loops"); OpBuilder::InsertionGuard guard(rewriter); SmallVector offsets(loopRanges.size()), sizes(loopRanges.size()); @@ -386,25 +419,23 @@ static LogicalResult generateLoopNestUsingForallOp( mappingAttr = rewriter.getArrayAttr(mappingVector); scf::ForallOp forallOp; - SmallVector lbs, ubs, steps; - if (numTiles.empty()) { - for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, loopRanges)) { - if (isConstantIntValue(tileSize, 0)) + bool useNumThreads = !numThreads.empty(); + + if (useNumThreads) { + // Prune the zero numthreads. 
+ SmallVector nonZeroNumThreads; + for (auto nt : numThreads) { + if (isConstantIntValue(nt, 0)) continue; - lbs.push_back(loopRange.offset); - ubs.push_back(loopRange.size); - steps.push_back(tileSize); + nonZeroNumThreads.push_back(nt); } - assert(!lbs.empty() && "Expected at least one loop range"); - forallOp = rewriter.create(loc, lbs, ubs, steps, + forallOp = rewriter.create(loc, nonZeroNumThreads, destinationTensors, mappingAttr); } else { - SmallVector numThreads; - for (auto maxNumTile : numTiles) { - if (!isConstantIntValue(maxNumTile, 0)) - numThreads.push_back(maxNumTile); - } - forallOp = rewriter.create(loc, numThreads, + SmallVector lbs, ubs, steps; + std::tie(lbs, ubs, steps) = + getLoopBounds(rewriter, loc, loopRanges, tileSizes); + forallOp = rewriter.create(loc, lbs, ubs, steps, destinationTensors, mappingAttr); } loops.push_back(forallOp); @@ -446,7 +477,7 @@ static LogicalResult generateLoopNestUsingForallOp( static LogicalResult generateLoopNest( RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options, ArrayRef loopRanges, ArrayRef tileSizes, - ArrayRef numTiles, ValueRange destinationTensors, + ArrayRef numThreads, ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { // If the tile sizes are all zero, no loops are generated. Just call the // callback function to handle untiled case. 
@@ -458,12 +489,11 @@ static LogicalResult generateLoopNest( } if (options.loopType == scf::SCFTilingOptions::LoopType::ForOp) { return generateLoopNestUsingForOp(rewriter, loc, loopRanges, tileSizes, - numTiles, destinationTensors, tiledBodyFn, - loops); + destinationTensors, tiledBodyFn, loops); } if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) { return generateLoopNestUsingForallOp( - rewriter, loc, loopRanges, tileSizes, numTiles, options.mappingVector, + rewriter, loc, loopRanges, tileSizes, numThreads, options.mappingVector, destinationTensors, tiledBodyFn, loops); } return rewriter.notifyMatchFailure(loc, "unhandled loop type"); @@ -697,10 +727,16 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 1. Get the range of the loops that are represented by the operation. SmallVector iterationDomain = op.getIterationDomain(rewriter); - // 2. Materialize the tile sizes or max num tiles; - SmallVector tileSizes, numTiles; - std::tie(tileSizes, numTiles) = - getTileSizesAndNumTiles(rewriter, op, iterationDomain, options); + // 2. Materialize the tile sizes and/or number of threads. + SmallVector tileSizes, numThreads; + std::tie(tileSizes, numThreads) = + getTileSizes(rewriter, op, iterationDomain, options); + + // Check if it is safe to tile. This is a holdover from previous iterations + // of tile to for-all. Consider dropping it. + if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) { + checkSafeToTileToForall(op, tileSizes, numThreads); + } // 3. If there is an interchange specified, permute the iteration domain and // the tile sizes.
@@ -713,8 +749,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, applyPermutationToVector(iterationDomain, interchangeVector); applyPermutationToVector(tileSizes, interchangeVector); - if (!numTiles.empty()) - applyPermutationToVector(numTiles, interchangeVector); + if (!numThreads.empty()) + applyPermutationToVector(numThreads, interchangeVector); } FailureOr tilingResult; @@ -729,7 +765,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 4a. Compute the `offsets` and `sizes` to use for tiling. SmallVector offsets, sizes; std::tie(offsets, sizes) = getTileOffsetAndSizes( - rewriter, loc, ivs, iterationDomain, tileSizes, !numTiles.empty()); + rewriter, loc, ivs, iterationDomain, tileSizes, !numThreads.empty()); // 4b. If interchange was provided, apply inverse of the interchange // to get back the offsets/sizes in the order to be specified. @@ -797,7 +833,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 7. Generate the tiled loops nest using the callback defined above. 
SmallVector loops; if (failed(generateLoopNest(rewriter, op.getLoc(), options, iterationDomain, - tileSizes, numTiles, destinationTensors, + tileSizes, numThreads, destinationTensors, innerYieldTiledValuesFn, loops))) return op.emitOpError("failed to generate tiling loops"); assert(succeeded(tilingResult) && @@ -913,7 +949,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, scf::SCFTilingOptions options; options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp); if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector, - /*numTiles=*/ArrayRef{}, + /*numThreads=*/ArrayRef{}, initTensors, innerYieldTiledValuesFn, loops))) return b.notifyMatchFailure(op, "failed to tile for parallel reduction"); diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index f33739f119eaf..d1ed468fce323 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -3,9 +3,9 @@ // Offset per thread: // CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 10))> // Per thread tile size. -// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 10)) + s0, s0 ceildiv 10)> +// CHECK-DAG: affine_map<(d0)[s0] -> (s0 ceildiv 10, -(d0 * (s0 ceildiv 10)) + s0)> // CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 20))> -// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 20)) + s0, s0 ceildiv 20)> +// CHECK-DAG: affine_map<(d0)[s0] -> (s0 ceildiv 20, -(d0 * (s0 ceildiv 20)) + s0)> module { // CHECK-LABEL: matmul( @@ -96,7 +96,7 @@ module { // In this test case, matmul dims and tile size are dynamic. 
// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -(d0 * s0) + s1)> // CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> // CHECK-LABEL: matmul_tile_size_dynamic_dynamic( @@ -140,7 +140,7 @@ module attributes {transform.with_named_sequence} { // Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot. -// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 300, 15)> +// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (15, d0 * -15 + 300)> // CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)> // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 15)> @@ -176,30 +176,29 @@ module attributes {transform.with_named_sequence} { transform.yield } } - // ----- -// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> -// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> +// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 * 20)> -// CHECK-LABEL: matmul_tile_size_dynamic( +// CHECK: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, 
%C: tensor) -> tensor { // CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 : // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 : - // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] - // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] + // CHECK: %[[NT0:.+]] = affine.apply #[[MAP0]]()[%[[M]]] + // CHECK: %[[NT1:.+]] = affine.apply #[[MAP1]]()[%[[N]]] // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] - // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) + // CHECK: %[[TS0:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[M]]] + // CHECK: %[[TS1:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N]]] + // CHECK: %[[LB0:.+]] = affine.apply #[[MAP4]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[MAP5]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -219,26 +218,25 @@ module attributes {transform.with_named_sequence} { transform.yield } } - // ----- // Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot. 
-// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 21)> +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 21)> -// CHECK-LABEL: matmul_tile_size_static( +// CHECK: matmul_tile_size_static( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) + // CHECK: %[[TS:.+]] = affine.min #[[MAP0]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min - // CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]]) + // CHECK: %[[LB0:.+]] = affine.apply #[[MAP1]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[MAP2]](%[[IV1]]) // CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] : // CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] : // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] : @@ -298,7 +296,7 @@ module { // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -(d0 * s0) + s1)> // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> // CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> // CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) 
-> (d0 * 20)> diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir index 955ea6b0ebbbd..727e3c361f054 100644 --- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics --cse %s | FileCheck %s module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { @@ -178,12 +178,11 @@ module { // CHECK-LABEL: func.func @scalable_tile( // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[ARG_2:.*]]: tensor, -// CHECK: %[[C4:.*]] = arith.constant 0 : index -// CHECK: %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C4]] : tensor +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C0]] : tensor // CHECK: %[[VEC_SIZE:.*]] = arith.constant 4 : index // CHECK: %[[VS:.*]] = vector.vscale // CHECK: %[[STEP:.*]] = arith.muli %[[VEC_SIZE]], %[[VS]] : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: scf.for %[[IV:.*]] = %[[C0]] to %[[DIM]] step %[[STEP]] iter_args(%[[VAL:.*]] = %[[ARG_2]]) -> (tensor) { // CHECK: %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%[[IV]])[%[[STEP]], %[[DIM]]] // CHECK: %[[SLICE_ARG0:.*]] = tensor.extract_slice %[[ARG_0]][%[[IV]]] [%[[SIZE]]] [1] : tensor to tensor @@ -202,20 +201,14 @@ module { // ----- // CHECK-LABEL: func.func @scalable_and_fixed_length_tile -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[VS:.*]] = vector.vscale -// CHECK: %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %[[STEP_0:.*]] = 
arith.constant 4 : index -// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[STEP_0]] -// CHECK: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[C128_1:.*]] = arith.constant 128 : index -// CHECK: %[[STEP_1:.*]] = arith.constant 4 : index -// CHECK: scf.for %[[VAL_16:.*]] = %[[C0_1]] to %[[C128_1]] step %[[STEP_1]] -// CHECK: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK: %[[C128_2:.*]] = arith.constant 128 : index -// CHECK: scf.for %{{.*}} = %[[C0_2]] to %[[C128_2]] step %[[STEP_2]] +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[VS:.*]] = vector.vscale +// CHECK-DAG: %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index +// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[C4]] +// CHECK: scf.for %[[VAL_16:.*]] = %[[C0]] to %[[C128]] step %[[C4]] +// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C128]] step %[[STEP_2]] func.func @scalable_and_fixed_length_tile( %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) diff --git a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir index 7d247aefcf6b1..ccf8e37c094f4 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir @@ -31,8 +31,8 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[DIM_IN1:.+]] = tensor.dim %[[IN]], %[[C1]] // CHECK-DAG: %[[DIM1:.+]] = affine.apply #[[MAP1]]()[%[[DIM_IN1]]] // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index // CHECK: %[[RESULT:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[DIM0]] step %[[C2]] -// CHECK: %[[C3:.+]] = arith.constant 3 : index // CHECK: scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] 
iter_args(%[[INNER_OUT:.*]] = // CHECK: %[[SWAP_RESULT:.*]] = scf.if // CHECK: tensor.generate @@ -62,8 +62,8 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 8)> -// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 7)> +// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 + 8)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 + 7)> // CHECK: func @dynamic_2d_pad_tensor_inner_tiling( // CHECK-SAME: %[[IN:.*]]: tensor // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index @@ -107,9 +107,9 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK: %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]] -// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK: scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] = // CHECK: %[[SWAP_RESULT:.*]] = scf.if // CHECK: tensor.generate diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir index 488a52e8e3e91..08be9737f4302 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir @@ -24,13 +24,13 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index // CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]] // CHECK-DAG: %[[N:.+]] = tensor.dim 
%[[ARG1]], %[[C1]] +// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]] // CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]]) -// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: %[[INNER:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]] // CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]]) // CHECK-DAG: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]] @@ -77,14 +77,14 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index // CHECK-DAG: %[[M:.+]] = memref.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[K:.+]] = memref.dim %[[ARG0]], %[[C1]] // CHECK-DAG: %[[N:.+]] = memref.dim %[[ARG1]], %[[C1]] +// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index +// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]] -// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]] -// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]] // CHECK-DAG: %[[TS_M:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]] // CHECK-DAG: %[[TS_N:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[N]]] @@ -130,15 +130,15 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)> // CHECK-LABEL: func.func @multi_result( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index -// 
CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK-DAG: %[[INIT0:.+]] = tensor.empty() // CHECK-DAG: %[[INIT1:.+]] = tensor.empty() +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index +// CHECK-DAG: %[[C300:.+]] = arith.constant 300 : index +// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: %[[OUTER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C10]] // CHECK-SAME: iter_args(%[[ARG1:[a-zA-Z0-9]+]] = %[[INIT0]], %[[ARG2:[a-zA-Z0-9]+]] = %[[INIT1]]) -// CHECK-DAG: %[[C300:.+]] = arith.constant 300 : index -// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: %[[INNER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[C300]] step %[[C20]] // CHECK-SAME: iter_args(%[[ARG3:[a-zA-Z0-9]+]] = %[[ARG1]], %[[ARG4:[a-zA-Z0-9]+]] = %[[ARG2]]) // CHECK-DAG: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]]) @@ -193,7 +193,6 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index // CHECK-DAG: %[[N:.+]] = tensor.dim %[[INPUT]], %[[C0]] // CHECK-DAG: %[[C:.+]] = tensor.dim %[[INPUT]], %[[C3]] // CHECK-DAG: %[[P:.+]] = tensor.dim %[[FILTER]], %[[C0]] @@ -201,12 +200,13 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[F:.+]] = tensor.dim %[[FILTER]], %[[C3]] // CHECK-DAG: %[[R:.+]] = tensor.dim %[[INIT]], %[[C1]] // CHECK-DAG: %[[S:.+]] = tensor.dim %[[INIT]], %[[C2]] +// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index +// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[P]] step %[[C10]] // CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[INIT]]) -// 
CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[Q]] step %[[C20]] // CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]]) -// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[C]] step %[[C30]] // CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]]) // CHECK-DAG: %[[TS_P:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[P]]] @@ -259,15 +259,15 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> -// CHECK-LABEL: @indexed_semantics -// CHECK: scf.for %[[I0:.+]] = %{{.*}} to %{{.*}} step %{{.*}} -// CHECK: scf.for %[[I1:.+]] = %{{.*}} to %{{.*}} step %{{.*}} -// CHECK: %[[INDEX0:.+]] = linalg.index 0 -// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX0]], %[[I0]]) -// CHECK: %[[INDEX1:.+]] = linalg.index 1 -// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX1]], %[[I1]]) -// CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] +// CHECK: #[[MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> +// CHECK: @indexed_semantics +// CHECK: scf.for %[[I0:.+]] = %{{.*}} to %{{.*}} step %{{.*}} +// CHECK: scf.for %[[I1:.+]] = %{{.*}} to %{{.*}} step %{{.*}} +// CHECK: %[[INDEX0:.+]] = linalg.index 0 +// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[MAP_ADD]](%[[INDEX0]], %[[I0]]) +// CHECK: %[[INDEX1:.+]] = linalg.index 1 +// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[MAP_ADD]](%[[INDEX1]], %[[I1]]) +// CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] // ----- @@ -296,16 +296,16 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index // CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], 
%[[C1]] // CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]] +// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index +// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]] // CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]]) -// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index // CHECK: %[[INNER1:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]] // CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]]) -// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index // CHECK: %[[INNER2:[a-zA-Z0-9]+]] = scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]] // CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]]) // CHECK-DAG: %[[TS_N:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[N]]] From 3a823c2657fbdd7770376f238d564096c319fd4f Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Thu, 23 May 2024 12:00:55 -0700 Subject: [PATCH 03/11] Add logic to account for negative tile sizes. --- .../SCF/Transforms/TileUsingInterface.cpp | 102 +++++++++++++----- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 49 +++++---- 2 files changed, 102 insertions(+), 49 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index cb26b5bf44f4e..033f2977d1cbd 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -220,45 +220,93 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, b, loc, minMap, SmallVector{offset, tileSize, size}); } +/// Returns true if the maximum tile offset `tileSize * numThreads-1` is less +/// than `iterationSize`. 
+static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, + OpFoldResult numThreads, + OpFoldResult iterationSize) { + std::optional tileSizeConst = getConstantIntValue(tileSize); + std::optional numThreadsConst = getConstantIntValue(numThreads); + std::optional iterSizeConst = getConstantIntValue(iterationSize); + if (!tileSizeConst || !numThreadsConst || !iterSizeConst) + return false; + return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst; +} + /// Compute the tile offsets and sizes. static std::tuple, SmallVector> getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, ArrayRef iterationDomain, - ArrayRef tileSizes, bool isLoopNormalized) { + ArrayRef tileSizes, + ArrayRef numThreads) { SmallVector offsets, sizes; int materializedLoopNum = 0; - AffineExpr d0, s0, s1, s2; - AffineExpr offsetExpr; - if (isLoopNormalized) { - bindDims(rewriter.getContext(), d0); + if (!numThreads.empty()) { + AffineExpr d0, d1, s0, s1, s2; + AffineExpr offsetExpr, residualTileSizeExpr; + bindDims(rewriter.getContext(), d0, d1); bindSymbols(rewriter.getContext(), s0, s1, s2); - offsetExpr = s0 + d0 * s1 * s2; - } + offsetExpr = d0 + d1 * s0 * s1; + residualTileSizeExpr = s2 - (d0 + d1 * s0 * s1); - for (auto [tileSize, loopRange] : - llvm::zip_equal(tileSizes, iterationDomain)) { - if (isConstantIntValue(tileSize, 0)) { - offsets.push_back(loopRange.offset); - sizes.push_back(loopRange.size); - continue; - } - // If loop is normalized, the offset is (lb + iv * step * tileSize) - Value iv = ivs[materializedLoopNum++]; - OpFoldResult offset; - if (isLoopNormalized) { - offset = affine::makeComposedFoldedAffineApply( + for (auto [nt, tileSize, loopRange] : + llvm::zip_equal(numThreads, tileSizes, iterationDomain)) { + + if (isConstantIntValue(nt, 0) || isConstantIntValue(nt, 1)) { + offsets.push_back(loopRange.offset); + sizes.push_back(loopRange.size); + continue; + } + + Value iv = ivs[materializedLoopNum++]; + OpFoldResult offset = 
affine::makeComposedFoldedAffineApply( rewriter, loc, offsetExpr, - ArrayRef{iv, loopRange.offset, loopRange.stride, + ArrayRef{loopRange.offset, iv, loopRange.stride, tileSize}); - } else { - offset = getAsOpFoldResult(iv); + OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply( + rewriter, loc, residualTileSizeExpr, + {loopRange.offset, nt, loopRange.stride, tileSize, loopRange.size}); + OpFoldResult size = tileSize; + if (!isConstantIntValue(residualTileSize, 0)) { + OpFoldResult sizeMinusOffsetPerThread = + affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0, + {offset, loopRange.size}); + size = affine::makeComposedFoldedAffineMin( + rewriter, loc, + AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()), + {sizeMinusOffsetPerThread, tileSize}); + } + if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) { + AffineMap maxMap = + AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()); + size = affine::makeComposedFoldedAffineMax( + rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size}); + } + + offsets.push_back(offset); + sizes.push_back(size); + } + return {offsets, sizes}; + } else { + for (auto [tileSize, loopRange] : + llvm::zip_equal(tileSizes, iterationDomain)) { + + if (isConstantIntValue(tileSize, 0)) { + offsets.push_back(loopRange.offset); + sizes.push_back(loopRange.size); + continue; + } + + Value iv = ivs[materializedLoopNum++]; + OpFoldResult offset = getAsOpFoldResult(iv); + offsets.push_back(offset); + OpFoldResult size = + getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize); + sizes.push_back(size); } - offsets.push_back(offset); - sizes.push_back( - getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize)); + return {offsets, sizes}; } - return {offsets, sizes}; } /// Function to return the bounds of the loops to be generated. @@ -765,7 +813,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 4a. Compute the `offsets` and `sizes` to use for tiling. 
SmallVector offsets, sizes; std::tie(offsets, sizes) = getTileOffsetAndSizes( - rewriter, loc, ivs, iterationDomain, tileSizes, !numThreads.empty()); + rewriter, loc, ivs, iterationDomain, tileSizes, numThreads); // 4b. If interchange was provided, apply inverse of the interchange // to get back the offsets/sizes in the order to be specified. diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index d1ed468fce323..c0ba5a8402d5f 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -3,9 +3,9 @@ // Offset per thread: // CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 10))> // Per thread tile size. -// CHECK-DAG: affine_map<(d0)[s0] -> (s0 ceildiv 10, -(d0 * (s0 ceildiv 10)) + s0)> +// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 10)) + s0, s0 ceildiv 10)> // CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 20))> -// CHECK-DAG: affine_map<(d0)[s0] -> (s0 ceildiv 20, -(d0 * (s0 ceildiv 20)) + s0)> +// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 20)) + s0, s0 ceildiv 20)> module { // CHECK-LABEL: matmul( @@ -96,7 +96,7 @@ module { // In this test case, matmul dims and tile size are dynamic. // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -(d0 * s0) + s1)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> // CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> // CHECK-LABEL: matmul_tile_size_dynamic_dynamic( @@ -140,7 +140,7 @@ module attributes {transform.with_named_sequence} { // Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot. 
-// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (15, d0 * -15 + 300)> +// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 300, 15)> // CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)> // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 15)> @@ -176,6 +176,7 @@ module attributes {transform.with_named_sequence} { transform.yield } } + // ----- // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> @@ -296,7 +297,7 @@ module { // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> -// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -(d0 * s0) + s1)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> // CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> // CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)> @@ -339,7 +340,6 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)> -// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)> // CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)> @@ -352,8 +352,7 @@ module attributes {transform.with_named_sequence} { %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>) -> (tensor<100xf32>, tensor<100xf32>) { // CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]]) -// CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]]) -// CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]]) +// CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV0]]) // CHECK-NOT: affine.min // CHECK-NOT: affine.max // CHECK: %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]]) @@ -453,9 +452,10 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: 
#[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)> +// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (0, d0)> +// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> // CHECK-LABEL: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor @@ -470,10 +470,12 @@ func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]] - // CHECK: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]]) + // CHECK: %[[TSMIN0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK: %[[TS0:.+]] = affine.max #[[$map3]](%[[TSMIN0]]) + // CHECK: %[[TSMIN1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] + // CHECK: %[[TS1:.+]] = affine.max #[[$map3]](%[[TSMIN1]]) + // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] : // CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] : // CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] : @@ -521,9 +523,10 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: 
#[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)> +// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (0, d0)> +// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> // CHECK-LABEL: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor @@ -538,10 +541,12 @@ func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]] - // CHECK: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]]) + // CHECK: %[[TSMIN0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK: %[[TS0:.+]] = affine.max #[[$map3]](%[[TSMIN0]]) + // CHECK: %[[TSMIN1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] + // CHECK: %[[TS1:.+]] = affine.max #[[$map3]](%[[TSMIN1]]) + // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] : // CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] : // CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] : From 5cfeb8175d3de31d4d5b359a61080a3644daeda9 Mon Sep 17 00:00:00 2001 From: 
MaheshRavishankar Date: Thu, 23 May 2024 21:51:44 -0700 Subject: [PATCH 04/11] Put back CHECK-LABELs --- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 42 +++++++++---------- .../TilingInterface/tile-using-interface.mlir | 18 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index c0ba5a8402d5f..6e92deaf4cf0d 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -179,27 +179,27 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 * 20)> - -// CHECK: matmul_tile_size_dynamic( +// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> +// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> +// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> + +// CHECK-LABEL: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C: tensor) -> tensor { // CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 : // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 : - // CHECK: %[[NT0:.+]] = affine.apply #[[MAP0]]()[%[[M]]] - // CHECK: %[[NT1:.+]] = affine.apply #[[MAP1]]()[%[[N]]] + // CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]] + // 
CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]] // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS0:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS1:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N]]] - // CHECK: %[[LB0:.+]] = affine.apply #[[MAP4]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[MAP5]](%[[IV1]]) + // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] + // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -223,21 +223,21 @@ module attributes {transform.with_named_sequence} { // Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot. -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 21)> +// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)> +// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 21)> -// CHECK: matmul_tile_size_static( +// CHECK-LABEL: matmul_tile_size_static( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor // CHECK-SAME: %[[B:[0-9a-z]+]]: tensor // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS:.+]] = affine.min #[[MAP0]](%[[IV1]]) + // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min - // CHECK: %[[LB0:.+]] = affine.apply #[[MAP1]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = 
affine.apply #[[MAP2]](%[[IV1]]) + // CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]]) + // CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]]) // CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] : // CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] : // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] : diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir index 08be9737f4302..0a4d4c45f10be 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir @@ -259,15 +259,15 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK: #[[MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> -// CHECK: @indexed_semantics -// CHECK: scf.for %[[I0:.+]] = %{{.*}} to %{{.*}} step %{{.*}} -// CHECK: scf.for %[[I1:.+]] = %{{.*}} to %{{.*}} step %{{.*}} -// CHECK: %[[INDEX0:.+]] = linalg.index 0 -// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[MAP_ADD]](%[[INDEX0]], %[[I0]]) -// CHECK: %[[INDEX1:.+]] = linalg.index 1 -// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[MAP_ADD]](%[[INDEX1]], %[[I1]]) -// CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] +// CHECK: #[[$MAP_ADD:.+]] = affine_map<(d0, d1) -> (d0 + d1)> +// CHECK-LABEL: @indexed_semantics +// CHECK: scf.for %[[I0:.+]] = %{{.*}} to %{{.*}} step %{{.*}} +// CHECK: scf.for %[[I1:.+]] = %{{.*}} to %{{.*}} step %{{.*}} +// CHECK: %[[INDEX0:.+]] = linalg.index 0 +// CHECK: %[[INDEX0_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX0]], %[[I0]]) +// CHECK: %[[INDEX1:.+]] = linalg.index 1 +// CHECK: %[[INDEX1_AMENDED:.+]] = affine.apply #[[$MAP_ADD]](%[[INDEX1]], %[[I1]]) +// CHECK: arith.addi %[[INDEX0_AMENDED]], %[[INDEX1_AMENDED]] // ----- From a5f069f4ec87fb89ba21186e7112045c70d4fdd9 Mon Sep 17 00:00:00 
2001 From: MaheshRavishankar Date: Thu, 23 May 2024 21:59:11 -0700 Subject: [PATCH 05/11] Address comments --- mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h | 4 ++-- mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index 9291c91cfa1ed..1f21af6d6a29a 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -65,8 +65,8 @@ struct SCFTilingOptions { numThreadsComputationFunction = std::move(fun); return *this; } - /// Convenience function to set the `tileSizeComputationFunction` to a - /// function that computes tile sizes at the point they are needed. + /// Convenience function to set the `numThreadsComputationFunction` to a + /// function that computes num threads at the point they are needed. SCFTilingOptions &setNumThreads(ArrayRef numThreads); /// The interchange vector to reorder the tiled loops. diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 033f2977d1cbd..89fdd43cad9f7 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -87,7 +87,7 @@ verifyTileSizeOptions(RewriterBase &rewriter, Location loc, if (!options.interchangeVector.empty()) { if (!isPermutationVector(options.interchangeVector)) { return rewriter.notifyMatchFailure( - loc, "invalid intechange vector, not a permutation of the entire " + loc, "invalid interchange vector, not a permutation of the entire " "iteration space"); } } From 6d57db2e38b8091ad00a6358e5cabc494fe7c5a7 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Sun, 26 May 2024 17:38:05 -0700 Subject: [PATCH 06/11] Next round of comments. 
--- .../SCF/Transforms/TileUsingInterface.cpp | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 89fdd43cad9f7..298c561cfbfd1 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -75,11 +75,11 @@ fillInterchangeVector(ArrayRef interchangeVector, static LogicalResult verifyTileSizeOptions(RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options) { - // Specifying number of tile is only supported on `scf.forall` op. + // Specifying number of threads is only supported on `scf.forall` op. if (options.numThreadsComputationFunction && options.loopType != scf::SCFTilingOptions::LoopType::ForallOp) { return rewriter.notifyMatchFailure( - loc, "number of tiles/threads can only by specified when loop type is " + loc, "number of threads can only be specified when loop type is " "set to use `scf.forall`"); } @@ -111,25 +111,27 @@ getTileSizes(RewriterBase &rewriter, TilingInterface op, // If the number of tiles is also specified, use that.
if (options.tileSizeComputationFunction) { tileSizes = options.tileSizeComputationFunction(rewriter, op); - } else { - // Compute the tile sizes from the iteration domain and number - // of tiles as follows - // - niters = ceilDiv(ub - lb, step) - // - tileSize = ceilDiv(niters, numThreads) - AffineExpr s0, s1, s2, s3; - bindSymbols(rewriter.getContext(), s0, s1, s2, s3); - AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); - AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s3); tileSizes.resize(numLoops, zero); - for (auto [index, range, nt] : - llvm::enumerate(iterationDomain, numThreads)) { - if (isConstantIntValue(nt, 0)) - continue; + return {tileSizes, numThreads}; + } - tileSizes[index] = affine::makeComposedFoldedAffineApply( - rewriter, op.getLoc(), tileSizeExpr, - {range.offset, range.size, range.stride, nt}); - } + // Compute the tile sizes from the iteration domain and number + // of tiles as follows + // - niters = ceilDiv(ub - lb, step) + // - tileSize = ceilDiv(niters, numThreads) + AffineExpr s0, s1, s2, s3; + bindSymbols(rewriter.getContext(), s0, s1, s2, s3); + AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); + AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s3); + tileSizes.resize(numLoops, zero); + for (auto [index, range, nt] : + llvm::enumerate(iterationDomain, numThreads)) { + if (isConstantIntValue(nt, 0)) + continue; + + tileSizes[index] = affine::makeComposedFoldedAffineApply( + rewriter, op.getLoc(), tileSizeExpr, + {range.offset, range.size, range.stride, nt}); } tileSizes.resize(numLoops, zero); return {tileSizes, numThreads}; @@ -139,9 +141,9 @@ getTileSizes(RewriterBase &rewriter, TilingInterface op, // skips tiling a particular dimension. This convention is significantly // simpler to handle instead of adjusting affine maps to account for missing // dimensions. 
- if (options.tileSizeComputationFunction) { - tileSizes = options.tileSizeComputationFunction(rewriter, op); - } + assert(options.tileSizeComputationFunction && + "expected tile sizes to be specified"); + tileSizes = options.tileSizeComputationFunction(rewriter, op); tileSizes.resize(numLoops, zero); return {tileSizes, numThreads}; From bc03c8ae8587a7c56c737a6fa93a4e8e794d366d Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Thu, 30 May 2024 16:54:38 -0700 Subject: [PATCH 07/11] Drop support for non-unit strides, and assert that strides of iteration domain are 1. --- .../SCF/Transforms/TileUsingInterface.cpp | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 298c561cfbfd1..8ca6ed226f9c9 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -99,6 +99,11 @@ static std::tuple, SmallVector> getTileSizes(RewriterBase &rewriter, TilingInterface op, ArrayRef iterationDomain, const scf::SCFTilingOptions &options) { + assert( + llvm::all_of(iterationDomain, + [](Range r) { return isConstantIntValue(r.stride, 1); }) && + "tile size computation assumes that all dimensions of the iteration " + "domain have stride 1"); OpFoldResult zero = rewriter.getIndexAttr(0); SmallVector tileSizes, numThreads; size_t numLoops = iterationDomain.size(); @@ -119,10 +124,11 @@ getTileSizes(RewriterBase &rewriter, TilingInterface op, // of tiles as follows // - niters = ceilDiv(ub - lb, step) // - tileSize = ceilDiv(niters, numThreads) - AffineExpr s0, s1, s2, s3; - bindSymbols(rewriter.getContext(), s0, s1, s2, s3); - AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); - AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s3); + AffineExpr s0, s1, s2; + bindSymbols(rewriter.getContext(), s0, s1, s2); + // TODO: The step here is assumed to be 1. 
+ AffineExpr numItersExpr = (s1 - s0); + AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s2); tileSizes.resize(numLoops, zero); for (auto [index, range, nt] : llvm::enumerate(iterationDomain, numThreads)) { @@ -130,8 +136,7 @@ getTileSizes(RewriterBase &rewriter, TilingInterface op, continue; tileSizes[index] = affine::makeComposedFoldedAffineApply( - rewriter, op.getLoc(), tileSizeExpr, - {range.offset, range.size, range.stride, nt}); + rewriter, op.getLoc(), tileSizeExpr, {range.offset, range.size, nt}); } tileSizes.resize(numLoops, zero); return {tileSizes, numThreads}; @@ -244,13 +249,19 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, SmallVector offsets, sizes; int materializedLoopNum = 0; + assert( + llvm::all_of(iterationDomain, + [](Range r) { return isConstantIntValue(r.stride, 1); }) && + "the offset and tile size computation assumes stride 1 for all " + "dimensions of the iteration domain"); + if (!numThreads.empty()) { - AffineExpr d0, d1, s0, s1, s2; + AffineExpr d0, d1, s0, s1; AffineExpr offsetExpr, residualTileSizeExpr; bindDims(rewriter.getContext(), d0, d1); - bindSymbols(rewriter.getContext(), s0, s1, s2); - offsetExpr = d0 + d1 * s0 * s1; - residualTileSizeExpr = s2 - (d0 + d1 * s0 * s1); + bindSymbols(rewriter.getContext(), s0, s1); + offsetExpr = d0 + d1 * s0; + residualTileSizeExpr = s1 - (d0 + d1 * s0); for (auto [nt, tileSize, loopRange] : llvm::zip_equal(numThreads, tileSizes, iterationDomain)) { @@ -264,11 +275,11 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, Value iv = ivs[materializedLoopNum++]; OpFoldResult offset = affine::makeComposedFoldedAffineApply( rewriter, loc, offsetExpr, - ArrayRef{loopRange.offset, iv, loopRange.stride, - tileSize}); + ArrayRef{loopRange.offset, iv, tileSize}); OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply( rewriter, loc, residualTileSizeExpr, - {loopRange.offset, nt, loopRange.stride, tileSize, loopRange.size}); + 
{loopRange.offset, nt, tileSize, loopRange.size}); + OpFoldResult size = tileSize; if (!isConstantIntValue(residualTileSize, 0)) { OpFoldResult sizeMinusOffsetPerThread = @@ -776,6 +787,11 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 1. Get the range of the loops that are represented by the operation. SmallVector iterationDomain = op.getIterationDomain(rewriter); + if (llvm::any_of(iterationDomain, + [](Range r) { return !isConstantIntValue(r.stride, 1); })) { + return rewriter.notifyMatchFailure( + op, "unhandled tiling of iteration domain with non-unit stride"); + } // 2. Materialize the tile sizes and/or number of threads; SmallVector tileSizes, numThreads; From c3e8ca947a09d3d20dc31b7b863704271beddc7d Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Thu, 30 May 2024 18:32:40 -0700 Subject: [PATCH 08/11] Remove use of `getLoopBounds` to avoid unnecessary lit test churn. --- .../SCF/Transforms/TileUsingInterface.cpp | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 8ca6ed226f9c9..b583bd194935b 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -94,16 +94,12 @@ verifyTileSizeOptions(RewriterBase &rewriter, Location loc, return success(); } -/// Compute the tile sizes and num threads values passed in. +/// Method to instantiate the tile sizes and/or number of threads specified +/// by the user. 
static std::tuple, SmallVector> -getTileSizes(RewriterBase &rewriter, TilingInterface op, - ArrayRef iterationDomain, - const scf::SCFTilingOptions &options) { - assert( - llvm::all_of(iterationDomain, - [](Range r) { return isConstantIntValue(r.stride, 1); }) && - "tile size computation assumes that all dimensions of the iteration " - "domain have stride 1"); +getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op, + ArrayRef iterationDomain, + const scf::SCFTilingOptions &options) { OpFoldResult zero = rewriter.getIndexAttr(0); SmallVector tileSizes, numThreads; size_t numLoops = iterationDomain.size(); @@ -240,7 +236,9 @@ static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst; } -/// Compute the tile offsets and sizes. +/// Compute the `OpFoldResult`s that represents the multi-dimensional +/// `offset`s and `size`s of the tile of the iteration space that the +/// innermost loop body of the generated tiled loops corresponds to. static std::tuple, SmallVector> getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, ArrayRef iterationDomain, @@ -249,12 +247,6 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, SmallVector offsets, sizes; int materializedLoopNum = 0; - assert( - llvm::all_of(iterationDomain, - [](Range r) { return isConstantIntValue(r.stride, 1); }) && - "the offset and tile size computation assumes stride 1 for all " - "dimensions of the iteration domain"); - if (!numThreads.empty()) { AffineExpr d0, d1, s0, s1; AffineExpr offsetExpr, residualTileSizeExpr; @@ -266,7 +258,9 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, for (auto [nt, tileSize, loopRange] : llvm::zip_equal(numThreads, tileSizes, iterationDomain)) { - if (isConstantIntValue(nt, 0) || isConstantIntValue(nt, 1)) { + // Non-tiled cases, set the offset and size to the + // `loopRange.offset/size`. 
+ if (isConstantIntValue(nt, 0)) { offsets.push_back(loopRange.offset); sizes.push_back(loopRange.size); continue; @@ -290,6 +284,16 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()), {sizeMinusOffsetPerThread, tileSize}); } + + // Consider the case where the original loop was `[0, 100)`. + // If number of threads are `7`, the tile size would be computed as + // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6) + // - `offset = 0 + 6 * 15 = 105` + // - `tileSize = min(15, 100 - 105) = -5` + // To avoid negative tile sizes, we need to do a further + // `nonNegativeTileSize = affine.max(0, tileSize)`. + // This `max` can be avoided if + // `offset + tileSize * (numThreads - 1) < (ub - lb)` if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) { AffineMap maxMap = AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()); @@ -305,6 +309,8 @@ getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, iterationDomain)) { + // Non-tiled cases, set the offset and size to the + // `loopRange.offset/size`. if (isConstantIntValue(tileSize, 0)) { offsets.push_back(loopRange.offset); sizes.push_back(loopRange.size); @@ -787,16 +793,11 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 1. Get the range of the loops that are represented by the operation. SmallVector iterationDomain = op.getIterationDomain(rewriter); - if (llvm::any_of(iterationDomain, - [](Range r) { return !isConstantIntValue(r.stride, 1); })) { - return rewriter.notifyMatchFailure( - op, "unhandled tiling of iteration domain with non-unit stride"); - } // 2. 
Materialize the tile sizes and/or number of threads; SmallVector tileSizes, numThreads; std::tie(tileSizes, numThreads) = - getTileSizes(rewriter, op, iterationDomain, options); + getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options); // Check if it is safe to tile. This is hold over from previous iterations // of tile to for-all. Consider dropping it. From 876704f714822678049990a76c1120e1979f697f Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Thu, 13 Jun 2024 20:19:18 -0700 Subject: [PATCH 09/11] Add method to normalize `scf.forall` op. --- mlir/include/mlir/Dialect/SCF/Utils/Utils.h | 7 ++ .../TransformOps/LinalgTransformOps.cpp | 117 +++++++++++++++--- .../SCF/Transforms/TileUsingInterface.cpp | 4 +- mlir/lib/Dialect/SCF/Utils/Utils.cpp | 34 +++++ mlir/test/Dialect/Linalg/tile-tensors.mlir | 2 +- mlir/test/Dialect/Linalg/tile-to-forall.mlir | 68 +++++----- .../Dialect/Linalg/transform-op-tile.mlir | 2 +- .../tile-and-fuse-using-interface.mlir | 2 +- .../TilingInterface/tile-using-interface.mlir | 24 ++-- .../TilingInterface/tile-using-scfforall.mlir | 20 +-- 10 files changed, 199 insertions(+), 81 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h index b7d6e99b5fdcc..8d15a293058d3 100644 --- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h @@ -195,6 +195,13 @@ scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter); +/// Normalize an `scf.forall` operation. Returns `failure()`if normalization fails. +// On `success()` returns the +/// newly created operation with all uses of the original operation replaced +/// with results of the new operation. 
+FailureOr normalizeForallOp(RewriterBase &rewriter, + scf::ForallOp forallOp); + } // namespace mlir #endif // MLIR_DIALECT_SCF_UTILS_UTILS_H_ diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 2fd4c9f48201e..9baf358a95503 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" @@ -3151,6 +3152,94 @@ void transform::TileUsingForallOp::build(OpBuilder &builder, /*mapping=*/mapping); } +/// Given `lbs`, `ubs` and `steps` of loops, return (for each loop) the +/// normalized upper bound. +static SmallVector +normalizeUpperBounds(RewriterBase &rewriter, Location loc, + ArrayRef lbs, ArrayRef ubs, + ArrayRef steps) { + AffineExpr s0, s1, s2; + bindSymbols(rewriter.getContext(), s0, s1, s2); + AffineExpr normalizedUbExpr = (s1 - s0).ceilDiv(s2); + SmallVector normalizedUbs; + for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) { + OpFoldResult normalizedUb = affine::makeComposedFoldedAffineApply( + rewriter, loc, normalizedUbExpr, {lb, ub, step}); + normalizedUbs.push_back(normalizedUb); + } + return normalizedUbs; + } + +/// When a loop is normalized, the uses of the induction variable within the +/// loop need to be replaced with `original_lb + old_iv * original_step`. 
+static SmallVector denormalizeIndVar(RewriterBase &rewriter, + Location loc, ValueRange ivs, + ArrayRef lbs, + ArrayRef steps) { + AffineExpr s0, s1; + AffineExpr d0; + bindSymbols(rewriter.getContext(), s0, s1); + bindDims(rewriter.getContext(), d0); + AffineExpr denormExpr = s0 + d0 * s1; + SmallVector denormalizedIvs; + + for (auto [iv, lb, step] : llvm::zip_equal(ivs, lbs, steps)) { + OpFoldResult denormValue = affine::makeComposedFoldedAffineApply( + rewriter, loc, denormExpr, ArrayRef{iv, lb, step}); + denormalizedIvs.push_back( + getValueOrCreateConstantIndexOp(rewriter, loc, denormValue)); + } + return denormalizedIvs; + } + +/// Given a `scf.forall` loop, return a loop op with the loop bounds +/// normalized. +/// TODO: Replace this with a general utility to normalize `scf.forall`. +/// At the time of writing, this wasn't done since adding this to the `scf` +/// dialect would disallow the use of `affine.apply` operations due +/// to cyclic dependencies. To avoid churn in lit tests +/// with the change this was added with, defer that to a follow up. 
+static scf::ForallOp normalizeForallLoopOp(RewriterBase &rewriter, + scf::ForallOp loop) { + SmallVector lbs = loop.getMixedLowerBound(); + SmallVector ubs = loop.getMixedUpperBound(); + SmallVector steps = loop.getMixedStep(); + + if (llvm::all_of( + lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) && + llvm::all_of( + steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) { + return loop; + } + + Location loc = loop.getLoc(); + SmallVector normalizedUbs = + normalizeUpperBounds(rewriter, loc, lbs, ubs, steps); + SmallVector normalizedLbs(normalizedUbs.size(), + rewriter.getIndexAttr(0)); + SmallVector normalizedSteps(normalizedUbs.size(), + rewriter.getIndexAttr(1)); + + auto normalizedForallOp = rewriter.create( + loc, normalizedLbs, normalizedUbs, normalizedSteps, loop.getOutputs(), + loop.getMapping(), [](OpBuilder &, Location, ValueRange) {}); + + auto normalizedLoopIvs = normalizedForallOp.getInductionVars(); + OpBuilder::InsertionGuard g(rewriter); + Block *normalizedLoopBlock = normalizedForallOp.getBody(); + rewriter.setInsertionPointToStart(normalizedLoopBlock); + + SmallVector argValues = + denormalizeIndVar(rewriter, loc, normalizedLoopIvs, lbs, steps); + argValues.append(normalizedForallOp.getRegionIterArgs().begin(), + normalizedForallOp.getRegionIterArgs().end()); + Block *origLoopBlock = loop.getBody(); + rewriter.mergeBlocks(origLoopBlock, normalizedLoopBlock, argValues); + + rewriter.replaceOp(loop, normalizedForallOp); + return normalizedForallOp; +} + DiagnosedSilenceableFailure transform::tileToForallOpImpl( RewriterBase &rewriter, transform::TransformState &state, TransformOpInterface transformOp, Operation *target, @@ -3172,23 +3261,6 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( if (!mixedNumThreads.empty()) { options.setNumThreads(mixedNumThreads); } else { - SmallVector loopRanges = tileableOp.getIterationDomain(rewriter); - unsigned nLoops = loopRanges.size(); - SmallVector numThreads; - 
numThreads.reserve(nLoops); - AffineExpr s0, s1; - bindSymbols(rewriter.getContext(), s0, s1); - AffineExpr divExpr = s0.ceilDiv(s1); - for (int i = 0, e = std::min(mixedTileSizes.size(), loopRanges.size()); - i < e; ++i) { - OpFoldResult numTiles = mixedTileSizes[i]; - if (!isConstantIntValue(numTiles, 0)) - numTiles = affine::makeComposedFoldedAffineApply( - rewriter, tileableOp.getLoc(), divExpr, - {loopRanges[i].size, numTiles}); - numThreads.push_back(numTiles); - } - options.setNumThreads(numThreads); options.setTileSizes(mixedTileSizes); } if (mapping) { @@ -3199,9 +3271,20 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( if (failed(maybeTilingResult)) return transformOp.emitDefaultSilenceableFailure(tileableOp); + rewriter.replaceOp(tileableOp, maybeTilingResult->replacements); tilingResult = *maybeTilingResult; + + if (mixedNumThreads.empty()) { + auto generatedForallOp = cast(tilingResult.loops.front()); + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(generatedForallOp); + scf::ForallOp normalizedForallOp = + normalizeForallLoopOp(rewriter, generatedForallOp); + tilingResult.loops.front() = normalizedForallOp; + } + return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index b583bd194935b..e404c01010a32 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -217,10 +217,10 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, AffineExpr s0, s1, d0; bindDims(b.getContext(), d0); bindSymbols(b.getContext(), s0, s1); - AffineMap minMap = AffineMap::get(1, 2, {s0, s1 - d0}, b.getContext()); + AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext()); Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size); return affine::makeComposedFoldedAffineMin( - b, loc, minMap, SmallVector{offset, 
tileSize, size}); + b, loc, minMap, SmallVector{offset, size, tileSize}); } /// Returns true if the maximum tile offset `tileSize * numThreads-1` is less diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index c0ee9d2afe91c..0018c99d7636a 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -1363,3 +1363,37 @@ scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target, return fusedLoop; } + +FailureOr mlir::normalizeForallOp(RewriterBase &rewriter, + scf::ForallOp forallOp) { + SmallVector lbs = forallOp.getMixedLowerBound(); + SmallVector ubs = forallOp.getMixedUpperBound(); + SmallVector steps = forallOp.getMixedStep(); + + if (llvm::all_of( + lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) && + llvm::all_of( + steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) { + return forallOp; + } + + SmallVector newLbs, newUbs, newSteps; + for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) { + LoopParams normalizedLoopParams = + emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step); + newLbs.push_back(normalizedLoopParams.lowerBound); + newUbs.push_back(normalizedLoopParams.upperBound); + newSteps.push_back(normalizedLoopParams.step); + } + + auto normalizedForallOp = rewriter.create( + forallOp.getLoc(), newLbs, newUbs, newSteps, forallOp.getOutputs(), + forallOp.getMapping(), [](OpBuilder &, Location, ValueRange) {}); + + rewriter.inlineRegionBefore(forallOp.getBodyRegion(), + normalizedForallOp.getBodyRegion(), + normalizedForallOp.getBodyRegion().begin()); + + rewriter.replaceAllOpUsesWith(forallOp, normalizedForallOp); + return success(); +} diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir index 89183813c080b..8f13c69070457 100644 --- a/mlir/test/Dialect/Linalg/tile-tensors.mlir +++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir @@ -119,7 +119,7 @@ module attributes 
{transform.with_named_sequence} { // ----- -// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)> +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> // CHECK: fold_extract_slice // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir index 6e92deaf4cf0d..778d5bb8b9c84 100644 --- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir +++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir @@ -196,10 +196,10 @@ func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C // CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]] // CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]] // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] - // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) + // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] + // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) + // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]] // CHECK: tensor.extract_slice %[[B]] // CHECK: tensor.extract_slice %[[C_BLK]] @@ -233,11 +233,11 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[C:[0-9a-z]+]]: tensor func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> { // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) + // CHECK-DAG: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]]) + // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]]) + // CHECK-DAG: 
%[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]]) // CHECK-NOT: affine.max // CHECK-NOT: affine.min - // CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]]) // CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] : // CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] : // CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] : @@ -452,10 +452,9 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (0, d0)> -// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> +// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)> // CHECK-LABEL: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor @@ -464,18 +463,16 @@ module attributes {transform.with_named_sequence} { func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C: tensor) -> tensor { // CHECK: %[[c1:.*]] = arith.constant 1 : index // CHECK: %[[c0:.*]] = arith.constant 0 : index - // CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] : - // CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] : - // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] - // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] - // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : + // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] : + // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] : + // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]] 
+ // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] + // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TSMIN0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS0:.+]] = affine.max #[[$map3]](%[[TSMIN0]]) - // CHECK: %[[TSMIN1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] - // CHECK: %[[TS1:.+]] = affine.max #[[$map3]](%[[TSMIN1]]) - // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) + // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]] + // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]]) + // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] : // CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] : // CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] : @@ -523,10 +520,9 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)> // CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)> // CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)> -// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (0, d0)> -// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> -// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)> -// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)> +// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)> +// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)> +// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)> // CHECK-LABEL: matmul_tile_size_dynamic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor @@ -535,18 +531,16 @@ module attributes 
{transform.with_named_sequence} { func.func @matmul_tile_size_dynamic(%A: tensor, %B: tensor, %C: tensor) -> tensor { // CHECK: %[[c1:.*]] = arith.constant 1 : index // CHECK: %[[c0:.*]] = arith.constant 0 : index - // CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] : - // CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] : - // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]] - // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] - // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : + // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] : + // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] : + // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]] + // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]] + // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] : // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]]) - // CHECK: %[[TSMIN0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] - // CHECK: %[[TS0:.+]] = affine.max #[[$map3]](%[[TSMIN0]]) - // CHECK: %[[TSMIN1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]] - // CHECK: %[[TS1:.+]] = affine.max #[[$map3]](%[[TSMIN1]]) - // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]]) - // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]]) + // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]] + // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]] + // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]]) + // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]]) // CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] : // CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] : // CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] : diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir index 727e3c361f054..7bac850d0b7fe 100644 --- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir +++ 
b/mlir/test/Dialect/Linalg/transform-op-tile.mlir @@ -184,7 +184,7 @@ module { // CHECK: %[[VS:.*]] = vector.vscale // CHECK: %[[STEP:.*]] = arith.muli %[[VEC_SIZE]], %[[VS]] : index // CHECK: scf.for %[[IV:.*]] = %[[C0]] to %[[DIM]] step %[[STEP]] iter_args(%[[VAL:.*]] = %[[ARG_2]]) -> (tensor) { -// CHECK: %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%[[IV]])[%[[STEP]], %[[DIM]]] +// CHECK: %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>(%[[IV]])[%[[DIM]], %[[STEP]]] // CHECK: %[[SLICE_ARG0:.*]] = tensor.extract_slice %[[ARG_0]][%[[IV]]] [%[[SIZE]]] [1] : tensor to tensor // CHECK: %[[SLICE_ARG1:.*]] = tensor.extract_slice %[[ARG_1]][%[[IV]]] [%[[SIZE]]] [1] : tensor to tensor // CHECK: %[[SLICE_ARG2:.*]] = tensor.extract_slice %[[VAL]][%[[IV]]] [%[[SIZE]]] [1] : tensor to tensor diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir index 11ab30a7d237c..d1aed593f4545 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir @@ -428,7 +428,7 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> +// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> // CHECK: func @matmul_sequence_fusion( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir index 0a4d4c45f10be..8eb1311170c66 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir @@ -16,8 +16,8 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = 
affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> // CHECK-LABEL: func.func @simple_matmul( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor @@ -68,9 +68,9 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)> // CHECK-LABEL: func.func @simple_matmul_memref( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref @@ -127,7 +127,7 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)> // CHECK-LABEL: func.func @multi_result( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>) // CHECK-DAG: %[[INIT0:.+]] = tensor.empty() @@ -180,9 +180,9 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)> // CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)> // 
CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)> // CHECK-LABEL: func.func @conv2D( @@ -287,9 +287,9 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> // CHECK-LABEL: func.func @interchange_matmul( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir index c5aff744b57ee..53dd0c6a2425c 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir @@ -17,8 +17,8 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> // CHECK: func.func @simple_matmul( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor @@ -65,8 +65,8 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> // CHECK-LABEL: func.func @simple_matmul_memref( // 
CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref @@ -117,7 +117,7 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)> // CHECK-LABEL: func.func @multi_result( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>) // CHECK-DAG: %[[INIT0:.+]] = tensor.empty() @@ -161,9 +161,9 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)> // CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)> // CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)> // CHECK-LABEL: func.func @conv2D( @@ -264,8 +264,8 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)> +// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)> // CHECK-LABEL: func.func @interchange_matmul( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor From 3375498b221526868b5ebd2d2c990515da5fd3e7 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 30 Jul 2024 12:48:29 -0700 Subject: [PATCH 10/11] Rebase fixes Signed-off-by: MaheshRavishankar --- .../mlir/Dialect/Linalg/Transforms/Transforms.h | 12 +++++++----- mlir/include/mlir/Dialect/SCF/Utils/Utils.h | 3 ++- 
mlir/lib/Dialect/SCF/Utils/Utils.cpp | 12 ++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 248e626346b52..477ef7bfafb18 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -867,7 +867,7 @@ computeContinuousTileSizes(OpBuilder &builder, TilingInterface op, unsigned dimension, OpFoldResult targetSize, bool emitAssertions); - /// Transformation information returned after reduction tiling. +/// Transformation information returned after reduction tiling. struct ForallReductionTilingResult { /// The partial reduction tiled op generated. SmallVector parallelTiledOps; @@ -1727,10 +1727,12 @@ void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m, void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns); /// Adds patterns that reduce the rank of named contraction ops that have -/// unit dimensions in the operand(s) by converting to a sequence of `collapse_shape`, -/// ``, `expand_shape` (if on tensors). For example a -/// `linalg.batch_matmul` with unit batch size will convert to `linalg.matmul` -/// and a `linalg.matvec` with with unit spatial dim in lhs will convert to a `linalg.dot`. +/// unit dimensions in the operand(s) by converting to a sequence of +/// `collapse_shape`, +/// ``, `expand_shape` (if on tensors). For +/// example a `linalg.batch_matmul` with unit batch size will convert to +/// `linalg.matmul` and a `linalg.matvec` with with unit spatial dim in lhs will +/// convert to a `linalg.dot`. 
void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns); } // namespace linalg diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h index 8d15a293058d3..4001ba3fc84c9 100644 --- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h @@ -195,7 +195,8 @@ scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target, scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source, RewriterBase &rewriter); -/// Normalize an `scf.forall` operation. Returns `failure()`if normalization fails. +/// Normalize an `scf.forall` operation. Returns `failure()`if normalization +/// fails. // On `success()` returns the /// newly created operation with all uses of the original operation replaced /// with results of the new operation. diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 0018c99d7636a..9df6e24de178f 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -294,8 +294,8 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, } /// Returns the trip count of `forOp` if its' low bound, high bound and step are -/// constants, or optional otherwise. Trip count is computed as ceilDiv(highBound -/// - lowBound, step). +/// constants, or optional otherwise. Trip count is computed as +/// ceilDiv(highBound - lowBound, step). 
static std::optional getConstantTripCount(scf::ForOp forOp) { std::optional lbCstOp = getConstantIntValue(forOp.getLowerBound()); std::optional ubCstOp = getConstantIntValue(forOp.getUpperBound()); @@ -1379,11 +1379,11 @@ FailureOr mlir::normalizeForallOp(RewriterBase &rewriter, SmallVector newLbs, newUbs, newSteps; for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) { - LoopParams normalizedLoopParams = + Range normalizedLoopParams = emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step); - newLbs.push_back(normalizedLoopParams.lowerBound); - newUbs.push_back(normalizedLoopParams.upperBound); - newSteps.push_back(normalizedLoopParams.step); + newLbs.push_back(normalizedLoopParams.offset); + newUbs.push_back(normalizedLoopParams.size); + newSteps.push_back(normalizedLoopParams.stride); } auto normalizedForallOp = rewriter.create( From 998d0063acdb170ca76d5cb561f495e631009f15 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 30 Jul 2024 22:49:21 -0700 Subject: [PATCH 11/11] Fix Bazel build files. Signed-off-by: MaheshRavishankar --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 407523c690cb3..8493823114012 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -11213,6 +11213,7 @@ cc_library( ":AffineDialect", ":Analysis", ":ArithDialect", + ":ArithUtils", ":AsmParser", ":BufferizationDialect", ":BufferizationTransforms",