Skip to content

Commit dbaea96

Browse files
test
1 parent 7d157e7 commit dbaea96

File tree

3 files changed

+92
-28
lines changed

3 files changed

+92
-28
lines changed

lib/gc/Transforms/GPU/GpuLoopTiling.cpp

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -41,34 +41,30 @@ struct GpuLoopTiling final : GpuPass<GpuLoopTiling>,
4141

4242
void runOnOperation() override {
4343
IRRewriter rewriter(&getContext());
44-
auto euThreads = static_cast<double>(getEuThreads(rewriter));
44+
size_t euThreads = getEuThreads(rewriter);
4545
getOperation().walk<WalkOrder::PreOrder>([&](scf::ParallelOp loop) {
4646
if (!loop->getParentOfType<scf::ParallelOp>()) {
47-
tile(loop, euThreads);
47+
SmallVector<int64_t> loopSizes;
48+
auto steps = loop.getStep();
49+
loopSizes.reserve(steps.size());
50+
51+
for (auto step : steps) {
52+
if (auto v = getConstIdxValue(step)) {
53+
loopSizes.push_back(v);
54+
} else {
55+
loopSizes.push_back(32);
56+
}
57+
}
58+
59+
SmallVector<int64_t> tileSizes;
60+
normaliseTiles(euThreads, loopSizes, tileSizes);
61+
tileParallelLoop(loop, tileSizes, false);
4862
}
4963
return WalkResult::skip();
5064
});
5165
if (failed(simplifyRegions(rewriter, getOperation()->getRegions()))) {
5266
gcLogD("Failed to simplify regions");
5367
}
5468
}
55-
56-
private:
57-
static void tile(scf::ParallelOp loop, double euThreads) {
58-
SmallVector<int64_t> tileSizes;
59-
auto steps = loop.getStep();
60-
tileSizes.reserve(steps.size());
61-
62-
for (auto step : steps) {
63-
if (auto v = getConstIdxValue(step)) {
64-
tileSizes.push_back(static_cast<int64_t>(
65-
std::ceil(static_cast<double>(v) / euThreads)));
66-
} else {
67-
tileSizes.push_back(32);
68-
}
69-
}
70-
71-
tileParallelLoop(loop, tileSizes, false);
72-
}
7369
};
7470
} // namespace

lib/gc/Transforms/GPU/GpuTilingAndFusion.cpp

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,21 @@ struct GpuTilingAndFusion final
4646
void runOnOperation() override {
4747
IRRewriter rewriter(&getContext());
4848
scf::SCFTileAndFuseOptions opts;
49+
opts.setFusionControlFn(
50+
[&](tensor::ExtractSliceOp candidateSliceOp, OpResult originalProducer,
51+
bool isDestinationOperand)
52+
-> std::optional<scf::SCFTileAndFuseOptions::ControlFnResult> {
53+
Operation *op = originalProducer.getOwner();
54+
if (!op) {
55+
return std::nullopt;
56+
}
57+
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
58+
if (!linalgOp.hasOnlyProjectedPermutations()) {
59+
return std::nullopt;
60+
}
61+
}
62+
return scf::SCFTileAndFuseOptions::ControlFnResult{};
63+
});
4964
opts.tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
5065
// The outer loop is converted to a GPU kernel and the tile sizes are mapped
5166
// to the grid sizes.
@@ -77,13 +92,15 @@ struct GpuTilingAndFusion final
7792
assert(itTypes.size() == itDomains.size());
7893

7994
// TODO: Add a parameter to the options?
80-
size_t totalSize = calcOperandsSize(op) * euThreads;
95+
size_t totalSize = calcOperandsSize(op);
8196
unsigned loopCount = 0;
97+
SmallVector<int64_t> sizes;
8298

8399
for (auto [t, r] : zip(itTypes, itDomains)) {
84100
if (t == utils::IteratorType::parallel) {
85101
if (auto v = getConstantIntValue(r.size)) {
86102
loopCount++;
103+
sizes.emplace_back(*v);
87104
totalSize *= *v;
88105
} else {
89106
return calcDynamicSizes(builder, ti, euMem, euThreads);
@@ -95,19 +112,25 @@ struct GpuTilingAndFusion final
95112
return {};
96113
}
97114

98-
// TODO: In case of different sizes, calculate the ratio for each loop
99-
double ratio = std::pow(static_cast<double>(totalSize) /
100-
static_cast<double>(euMem),
101-
1.0 / loopCount);
102-
ratio = std::max(1.0, ratio);
115+
auto outerTileSize = static_cast<size_t>(
116+
std::ceil(static_cast<double>(euMem) /
117+
static_cast<double>(calcOperandsSize(op))));
118+
SmallVector<int64_t> outerTiles;
119+
SmallVector<int64_t> innerTiles;
120+
normaliseTiles(outerTileSize, sizes, outerTiles);
121+
normaliseTiles(euThreads, sizes, innerTiles);
122+
123+
unsigned counter = 0;
103124
SmallVector<OpFoldResult> tiles;
104125
tiles.reserve(itDomains.size());
105126

106127
for (auto [t, r] : zip(itTypes, itDomains)) {
107128
if (t != utils::IteratorType::parallel) {
108129
tiles.emplace_back(builder.getIndexAttr(1));
109130
} else if (auto v = getConstantIntValue(r.size)) {
110-
tiles.emplace_back(ceil(builder, *v, ratio));
131+
tiles.emplace_back(
132+
ceil(builder, outerTiles[counter], innerTiles[counter]));
133+
counter++;
111134
} else {
112135
abort(); // Must never get here
113136
}
@@ -174,7 +197,8 @@ struct GpuTilingAndFusion final
174197
static std::optional<TilingInterface> findTi(Operation *op) {
175198
std::optional<TilingInterface> last;
176199
op->walk<WalkOrder::PreOrder>([&](linalg::LinalgOp linalgOp) {
177-
if (!linalgOp->getParentOfType<scf::ForallOp>()) {
200+
if (linalgOp.hasOnlyProjectedPermutations() &&
201+
!linalgOp->getParentOfType<scf::ForallOp>()) {
178202
if (auto ti = dyn_cast<TilingInterface>(linalgOp.getOperation())) {
179203
last = ti;
180204
}

lib/gc/Transforms/GPU/GpuUtils.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#ifndef GPUUTILS_H
99
#define GPUUTILS_H
1010

11+
#include <numeric>
12+
1113
#include "mlir/IR/Builders.h"
1214
#include "mlir/IR/BuiltinOps.h"
1315
#include "mlir/Interfaces/DataLayoutInterfaces.h"
@@ -69,4 +71,46 @@ static int64_t getConstIdxValue(Value value) {
6971
}
7072
return 0;
7173
}
74+
75+
static void normaliseTiles(size_t totalSize,
76+
SmallVector<int64_t> &loopSizes,
77+
SmallVector<int64_t> &tiles) {
78+
size_t loopCount = loopSizes.size();
79+
auto size = static_cast<int64_t>(
80+
std::pow(totalSize, 1.0 / static_cast<double>(loopCount)));
81+
tiles.assign(loopCount, size);
82+
size_t product = 1;
83+
for (auto ptr = tiles.begin(), end = tiles.end(); ptr != end - 1; ++ptr) {
84+
product *= *ptr + 1;
85+
if (std::accumulate(ptr + 1, end, product, std::multiplies<>()) >
86+
totalSize) {
87+
break;
88+
}
89+
*ptr += 1;
90+
}
91+
}
92+
93+
static void normaliseTiles2(size_t totalSize, SmallVector<int64_t> &loopSizes,
94+
SmallVector<int64_t> &tiles) {
95+
size_t loopCount = loopSizes.size();
96+
assert(loopCount > 0);
97+
std::vector<std::pair<int64_t, size_t>> sorted;
98+
sorted.reserve(loopCount);
99+
for (size_t i = 0; i < loopCount; ++i) {
100+
sorted.emplace_back(loopSizes[i], i);
101+
}
102+
std::sort(sorted.begin(), sorted.end());
103+
tiles.assign(loopCount, 1);
104+
105+
// Distribute the totalSize among the tiles
106+
for (size_t i = 0; i < loopCount; ++i) {
107+
auto factor = static_cast<int64_t>(
108+
std::pow(totalSize, 1.0 / static_cast<double>(loopCount - i)));
109+
if (factor >= sorted[i].first) {
110+
factor = sorted[i].first;
111+
}
112+
tiles[sorted[i].second] = factor;
113+
totalSize /= factor;
114+
}
115+
}
72116
#endif

0 commit comments

Comments
 (0)