19
19
#include " gtest/gtest.h"
20
20
#include < memory>
21
21
22
- #include < mlir/Dialect/GPU/Transforms/Passes.h>
23
-
24
- #include " gc/Transforms/Passes.h"
25
22
#include " mlir/Target/LLVMIR/Export.h"
26
23
#include " mlir/Target/LLVMIR/ModuleTranslation.h"
27
24
#include < CL/cl_ext.h>
@@ -31,12 +28,12 @@ using namespace gc::gpu;
31
28
32
29
// MLIR source for the TestAddStatic cases: element-wise add of two 64x64 f32
// buffers (%arg0 + %arg1) materialized into %arg2. All shapes are static.
constexpr char addStatic[] = R"mlir(
module @test {
func.func @entry(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) {
%0 = bufferization.to_tensor %arg0 restrict : memref<64x64xf32>
%1 = bufferization.to_tensor %arg1 restrict : memref<64x64xf32>
%2 = tensor.empty() : tensor<64x64xf32>
%3 = linalg.add ins(%1, %0 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32>
bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<64x64xf32>, memref<64x64xf32>) -> ()
return
}
}
@@ -59,40 +56,69 @@ module @test {
59
56
}
60
57
)mlir" ;
61
58
62
- template <unsigned N, unsigned M = N> struct TestAdd {
59
// MLIR source for TestMatmulAddStatic: zero-fill, then
// matmul_transpose_b(%arg0, %arg1) followed by an element-wise add of %arg0,
// stored into %arg2. The dlti attribute carries a 32x32 tile-size hint.
constexpr char matmulAddStatic[] = R"mlir(
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
func.func @entry(%arg0: memref<64x128xf32>, %arg1: memref<128x128xf32>, %arg2: memref<64x128xf32>) {
%0 = bufferization.to_tensor %arg0 restrict : memref<64x128xf32>
%1 = bufferization.to_tensor %arg1 restrict : memref<128x128xf32>
%2 = tensor.empty() : tensor<64x128xf32>
%cst = arith.constant 0.000000e+00 : f32
%3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x128xf32>) -> tensor<64x128xf32>
%4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<64x128xf32>, tensor<128x128xf32>) outs(%3 : tensor<64x128xf32>) -> tensor<64x128xf32>
%5 = tensor.empty() : tensor<64x128xf32>
%6 = linalg.add ins(%4, %0 : tensor<64x128xf32>, tensor<64x128xf32>) outs(%5 : tensor<64x128xf32>) -> tensor<64x128xf32>
bufferization.materialize_in_destination %6 in restrict writable %arg2 : (tensor<64x128xf32>, memref<64x128xf32>) -> ()
return
}
}
)mlir";
75
+
76
// Shared state for the GPU OpenCL runtime tests: one OpenCL runtime, a command
// queue, a context bound to both, and an MLIR context with the compiler's
// dialects registered. NOTE: member declaration order matters — runtime must
// be constructed before queue, and queue before ctx.
struct TestBase {
  OclRuntime runtime = gcGetOrReport(OclRuntime::get());
  cl_command_queue queue = gcGetOrReport(runtime.createQueue());
  OclContext ctx{runtime, queue};
  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};

  // Runs the compiled module; each concrete test fixture supplies its own
  // argument-passing strategy.
  virtual void exec(std::shared_ptr<const OclModule> &mod) = 0;

  // Releases the queue created above; derived destructors run first and free
  // their USM buffers before the queue goes away.
  virtual ~TestBase() { gcGetOrReport(runtime.releaseQueue(queue)); }

  // Parses an in-memory MLIR source string into a module owned by mlirCtx.
  OwningOpRef<ModuleOp> parse(const char *code) {
    std::unique_ptr<llvm::MemoryBuffer> memBuf =
        llvm::MemoryBuffer::getMemBuffer(code);
    llvm::SourceMgr srcMgr;
    srcMgr.AddNewSourceBuffer(std::move(memBuf), SMLoc());
    return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
  }
};
65
94
95
+ template <unsigned N, unsigned M = N> struct TestAdd : TestBase {
66
96
static constexpr unsigned size = N * M;
67
97
float *buf0 = gcGetOrReport(runtime.usmNewDev<float >(size));
68
98
float *buf1 = gcGetOrReport(runtime.usmNewDev<float >(size));
69
99
float *buf2 = gcGetOrReport(runtime.usmNewShared<float >(size));
70
- MLIRContext mlirCtx{gc::initCompilerAndGetDialects ()};
71
- float cpuBuf1[size] = {};
72
- float cpuBuf2[size] = {};
73
100
74
- explicit TestAdd () { std::fill (cpuBuf1, cpuBuf1 + size, 2 .0f ); }
101
+ explicit TestAdd () {
102
+ float cpuBuf[size];
103
+ std::fill (cpuBuf, cpuBuf + size, 2 .0f );
104
+ assert (runtime.usmCpy (ctx, cpuBuf, buf0, size));
105
+ assert (runtime.usmCpy (ctx, cpuBuf, buf1, size));
106
+ gcGetOrReport (ctx.finish ());
107
+ }
75
108
76
- virtual ~TestAdd () {
77
- gcGetOrReport (runtime.releaseQueue (queue));
109
+ ~TestAdd () override {
78
110
assert (runtime.usmFree (buf0));
79
111
assert (runtime.usmFree (buf1));
80
112
assert (runtime.usmFree (buf2));
81
113
}
82
114
83
- virtual void exec (std::shared_ptr<const OclModule> &mod, OclContext &ctx) = 0;
84
-
85
115
void test (const char *code) {
86
- OclContext ctx (runtime, queue);
87
- assert (runtime.usmCpy (ctx, cpuBuf1, buf0, size));
88
- assert (runtime.usmCpy (ctx, cpuBuf1, buf1, size));
89
-
90
116
OclModuleBuilder builder (parse (code));
91
117
auto mod = gcGetOrReport (builder.build (runtime));
118
+ exec (mod);
92
119
93
- exec (mod, ctx);
94
-
95
- assert (runtime.usmCpy (ctx, buf2, cpuBuf2, size));
120
+ float cpuBuf[size];
121
+ assert (runtime.usmCpy (ctx, buf2, cpuBuf, size));
96
122
gcGetOrReport (ctx.finish ());
97
123
98
124
for (unsigned i = 0 ; i < size; i++) {
@@ -101,24 +127,51 @@ template <unsigned N, unsigned M = N> struct TestAdd {
101
127
}
102
128
// std::cout << "\n";
103
129
104
- for (float i : cpuBuf2 ) {
105
- // std::cout << cpuBuf2[i] << " ";
130
+ for (float i : cpuBuf ) {
131
+ // std::cout << i << " ";
106
132
assert (i == 4 .0f );
107
133
}
108
134
}
135
+ };
109
136
110
- OwningOpRef<ModuleOp> parse (const char *code) {
111
- std::unique_ptr<llvm::MemoryBuffer> memBuf =
112
- llvm::MemoryBuffer::getMemBuffer (code);
113
- llvm::SourceMgr srcMgr;
114
- srcMgr.AddNewSourceBuffer (std::move (memBuf), SMLoc ());
115
- return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
137
+ template <unsigned N, unsigned M = N> struct TestMatmulAdd : TestBase {
138
+ static constexpr unsigned size1 = N * M;
139
+ static constexpr unsigned size2 = M * M;
140
+ float *buf0 = gcGetOrReport(runtime.usmNewDev<float >(size1));
141
+ float *buf1 = gcGetOrReport(runtime.usmNewDev<float >(size2));
142
+ float *buf2 = gcGetOrReport(runtime.usmNewShared<float >(size1));
143
+
144
+ explicit TestMatmulAdd () {
145
+ float cpuBuf[size2];
146
+ std::fill (cpuBuf, cpuBuf + size2, 2 );
147
+ assert (runtime.usmCpy (ctx, cpuBuf, buf0, size1));
148
+ assert (runtime.usmCpy (ctx, cpuBuf, buf1, size2));
149
+ gcGetOrReport (ctx.finish ());
150
+ }
151
+
152
+ ~TestMatmulAdd () override {
153
+ assert (runtime.usmFree (buf0));
154
+ assert (runtime.usmFree (buf1));
155
+ assert (runtime.usmFree (buf2));
156
+ }
157
+
158
+ void test (const char *code) {
159
+ OclModuleBuilder builder (parse (code));
160
+ auto mod = gcGetOrReport (builder.build (runtime));
161
+ exec (mod);
162
+
163
+ gcGetOrReport (ctx.finish ());
164
+ for (unsigned i = 0 ; i < size1; i++) {
165
+ // std::cout << buf2[i] << " ";
166
+ assert (buf2[i] == 514 );
167
+ }
168
+ // std::cout << "\n";
116
169
}
117
170
};
118
171
119
172
TEST (GpuOclRuntime, TestAddStatic) {
120
- struct TestAddStatic1 : TestAdd<32 > {
121
- void exec (std::shared_ptr<const OclModule> &mod, OclContext &ctx ) override {
173
+ struct TestAddStatic1 : TestAdd<64 > {
174
+ void exec (std::shared_ptr<const OclModule> &mod) override {
122
175
assert (mod->isStatic );
123
176
StaticExecutor<3 > exec (mod);
124
177
exec (ctx, buf0, buf1, buf2);
@@ -128,8 +181,8 @@ TEST(GpuOclRuntime, TestAddStatic) {
128
181
} test1;
129
182
test1.test (addStatic);
130
183
131
- struct TestAddStatic2 : TestAdd<32 > {
132
- void exec (std::shared_ptr<const OclModule> &mod, OclContext &ctx ) override {
184
+ struct TestAddStatic2 : TestAdd<64 > {
185
+ void exec (std::shared_ptr<const OclModule> &mod) override {
133
186
assert (mod->isStatic );
134
187
StaticExecutor<3 > exec (mod);
135
188
exec.arg (buf0);
@@ -146,7 +199,7 @@ TEST(GpuOclRuntime, TestAddStatic) {
146
199
TEST (GpuOclRuntime, TestAddDynamic) {
147
200
GTEST_SKIP () << " Dynamic shapes are not yet supported" ;
148
201
struct TestAddDynamic : TestAdd<32 , 64 > {
149
- void exec (std::shared_ptr<const OclModule> &mod, OclContext &ctx ) override {
202
+ void exec (std::shared_ptr<const OclModule> &mod) override {
150
203
assert (!mod->isStatic );
151
204
int64_t shape[] = {32 , 64 };
152
205
int64_t strides[] = {64 , 1 };
@@ -161,3 +214,15 @@ TEST(GpuOclRuntime, TestAddDynamic) {
161
214
} test;
162
215
test.test (addDynamic);
163
216
}
217
+
218
// End-to-end check of the static matmul+add pipeline: compile matmulAddStatic,
// run it through the 3-argument static executor, verify the result host-side.
TEST(GpuOclRuntime, TestMatmulAddStatic) {
  struct Test : TestMatmulAdd<64, 128> {
    void exec(std::shared_ptr<const OclModule> &mod) override {
      assert(mod->isStatic);
      StaticExecutor<3> exec(mod);
      exec(ctx, buf0, buf1, buf2);
      // NOTE(review): isSmall() presumably indicates the 3 args fit the
      // executor's inline/small-argument path — confirm against OclRuntime.
      assert(exec.isSmall());
    }
  } test;
  test.test(matmulAddStatic);
}
0 commit comments