diff --git a/cmake/functions.cmake b/cmake/functions.cmake
index cbd173e75..1ba8accc9 100644
--- a/cmake/functions.cmake
+++ b/cmake/functions.cmake
@@ -117,4 +117,36 @@ function(gc_add_mlir_dialect_library name)
     if(GcInterface IN_LIST ARGN)
         target_link_libraries(obj.${name} PUBLIC GcInterface)
     endif()
-endfunction()
\ No newline at end of file
+endfunction()
+
+# gc_add_mlir_tool(<name> ...): forwards all arguments to add_mlir_tool(), links
+# <name> with GcInterface and ${MLIR_LINK_COMPONENTS}, calls
+# llvm_update_compile_flags() and appends <name> to the GC_TOOLS global property.
+# Caller-defined LLVM_LINK_COMPONENTS/MLIR_LINK_COMPONENTS are honored; otherwise
+# the defaults below are set and, this being a macro, stay in the caller's scope.
+macro(gc_add_mlir_tool name)
+    # Default lists copied from upstream mlir/tools/mlir-cpu-runner/CMakeLists.txt.
+    if(NOT DEFINED LLVM_LINK_COMPONENTS)
+        set(LLVM_LINK_COMPONENTS Core Support nativecodegen native)
+    endif()
+    if(NOT DEFINED MLIR_LINK_COMPONENTS)
+        gc_set_mlir_link_components(MLIR_LINK_COMPONENTS
+          MLIRAnalysis
+          MLIRBuiltinToLLVMIRTranslation
+          MLIRExecutionEngine
+          MLIRIR
+          MLIRJitRunner
+          MLIRLLVMDialect
+          MLIRLLVMToLLVMIRTranslation
+          MLIRToLLVMIRTranslationRegistration
+          MLIRParser
+          MLIRTargetLLVMIRExport
+          MLIRSupport
+        )
+    endif()
+    add_mlir_tool(${ARGV})
+    # LLVM_LINK_COMPONENTS is consumed by LLVM's CMake in add_llvm_executable.
+    target_link_libraries(${name} PRIVATE GcInterface ${MLIR_LINK_COMPONENTS})
+    llvm_update_compile_flags(${name})
+    set_property(GLOBAL APPEND PROPERTY GC_TOOLS ${name})
+endmacro()
diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
index c54f0d94e..282c68503 100644
--- a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
+++ b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -240,6 +240,7 @@ struct OclModule {
 
 struct OclModuleBuilderOpts {
   StringRef funcName = {};
+  bool printIr = false;
   bool enableObjectDump = false;
   ArrayRef<StringRef> sharedLibPaths = {};
   void (*pipeline)(OpPassManager &) = nullptr;
@@ -267,6 +268,7 @@ struct OclModuleBuilder {
 
 private:
   ModuleOp mlirModule;
+  const bool printIr;
   const bool enableObjectDump;
   const ArrayRef<StringRef> sharedLibPaths;
   void (*const pipeline)(OpPassManager &);
diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
index 6a33c9d05..1f12bc2f2 100644
--- a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -749,7 +749,8 @@ ArrayRef<Type> getArgTypes(const StringRef &funcName, ModuleOp &mod) {
 
 OclModuleBuilder::OclModuleBuilder(ModuleOp module,
                                    const OclModuleBuilderOpts &opts)
-    : mlirModule(module), enableObjectDump(opts.enableObjectDump),
+    : mlirModule(module), printIr(opts.printIr),
+      enableObjectDump(opts.enableObjectDump),
       sharedLibPaths(opts.sharedLibPaths),
       pipeline(opts.pipeline
                    ? opts.pipeline
@@ -799,6 +800,10 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
 
   auto staticMain = createStaticMain(mod, funcName, argTypes);
 
+  if (printIr) {
+    mod.dump();
+  }
+
   ExecutionEngineOptions opts;
   opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
   opts.enableObjectDump = enableObjectDump;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a1af4a91a..873924bd4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,3 +1,21 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
 add_subdirectory(dnnl)
 add_subdirectory(gc-cpu-runner)
+add_subdirectory(gc-gpu-runner)
 add_subdirectory(gc-opt)
diff --git a/src/gc-cpu-runner/CMakeLists.txt b/src/gc-cpu-runner/CMakeLists.txt
index eaab1242d..a0037d6b7 100644
--- a/src/gc-cpu-runner/CMakeLists.txt
+++ b/src/gc-cpu-runner/CMakeLists.txt
@@ -29,38 +29,8 @@ if(GC_DEV_LINK_LLVM_DYLIB)
     MLIRExecutionEngineShared
     MLIRJitRunner
   )
-else()
-  # the dependency list copied from mlir/tools/mlir-cpu-runner/CMakeLists.txt of upstream
-  set(LLVM_LINK_COMPONENTS
-    Core
-    Support
-    nativecodegen
-    native
-  )
-  set(MLIR_LINK_COMPONENTS
-    MLIRAnalysis
-    MLIRBuiltinToLLVMIRTranslation
-    MLIRExecutionEngine
-    MLIRIR
-    MLIRJitRunner
-    MLIRLLVMDialect
-    MLIRLLVMToLLVMIRTranslation
-    MLIRToLLVMIRTranslationRegistration
-    MLIRParser
-    MLIRTargetLLVMIRExport
-    MLIRSupport
-  )
 endif()
 
-#LLVM_LINK_COMPONENTS is processed by LLVM cmake in add_llvm_executable
-set(gc_cpu_runner_libs
-        ${MLIR_LINK_COMPONENTS}
-        GcCpuRuntime)
-add_mlir_tool(gc-cpu-runner
-        gc-cpu-runner.cpp
-
-)
-llvm_update_compile_flags(gc-cpu-runner)
-
-target_link_libraries(gc-cpu-runner PRIVATE GcInterface ${gc_cpu_runner_libs})
+gc_add_mlir_tool(gc-cpu-runner gc-cpu-runner.cpp)
+target_link_libraries(gc-cpu-runner PRIVATE GcCpuRuntime)
 mlir_check_all_link_libraries(gc-cpu-runner)
diff --git a/src/gc-gpu-runner/CMakeLists.txt b/src/gc-gpu-runner/CMakeLists.txt
new file mode 100644
index 000000000..47a685353
--- /dev/null
+++ b/src/gc-gpu-runner/CMakeLists.txt
@@ -0,0 +1,28 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
+# Skip this directory unless both GC_ENABLE_TOOLS and GC_ENABLE_IMEX are enabled.
+if(NOT (GC_ENABLE_TOOLS AND GC_ENABLE_IMEX))
+  message(STATUS "Gpu runner is not enabled.")
+  return()
+endif()
+
+# Create the tool, then link the libraries it needs on top of those already
+# linked by gc_add_mlir_tool().
+gc_add_mlir_tool(gc-gpu-runner GpuRunner.cpp)
+target_link_libraries(gc-gpu-runner PRIVATE GcJitWrapper GcGpuOclRuntime)
+mlir_check_all_link_libraries(gc-gpu-runner)
diff --git a/src/gc-gpu-runner/GpuRunner.cpp b/src/gc-gpu-runner/GpuRunner.cpp
new file mode 100644
index 000000000..6cae0dd51
--- /dev/null
+++ b/src/gc-gpu-runner/GpuRunner.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "gc/ExecutionEngine/Driver/Driver.h"
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+#include "gc/Transforms/Passes.h"
+#include "gc/Utils/Error.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/ExecutionEngine/JitRunner.h"
+#include "mlir/ExecutionEngine/OptUtils.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Tools/ParseUtilities.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+
+using namespace mlir;
+
+namespace {
+struct Options { // Command-line options of the gc-gpu-runner tool.
+  llvm::cl::OptionCategory runnerCategory{"GPU runner options"};
+  llvm::cl::opt<std::string> inputFilename{
+      llvm::cl::Positional, llvm::cl::desc("<input file>"), llvm::cl::init("-"),
+      llvm::cl::cat(runnerCategory)};
+  llvm::cl::opt<std::string> mainFuncName{
+      "e",
+      llvm::cl::desc("The function to be executed. If not specified, the "
+                     "first matching function in the module to be used."),
+      llvm::cl::value_desc("function name"), llvm::cl::cat(runnerCategory)};
+  llvm::cl::opt<bool> skipPipeline{
+      "skip-pipeline",
+      llvm::cl::desc("Skip the GPU pipeline. It's expected, that the input is "
+                     "already lowered with 'gc-opt --gc-gpu-pipeline'."),
+      llvm::cl::init(false), llvm::cl::cat(runnerCategory)};
+  llvm::cl::list<std::string> sharedLibs{
+      "shared-libs",
+      llvm::cl::desc("Comma separated library paths to link dynamically."),
+      llvm::cl::MiscFlags::CommaSeparated,
+      llvm::cl::value_desc("lib1,lib2,..."), llvm::cl::cat(runnerCategory)};
+  llvm::cl::opt<bool> printIr{
+      "print-ir",
+      llvm::cl::desc("Print the resulting IR before the execution."),
+      llvm::cl::init(false), llvm::cl::cat(runnerCategory)};
+  llvm::cl::opt<std::string> objDumpFile{
+      "obj-dump-file",
+      llvm::cl::desc("Dump the compiled object to the specified file."),
+      llvm::cl::value_desc("file path"), llvm::cl::cat(runnerCategory)};
+};
+} // namespace
+
+// Selects the function to execute and stores its name in opts.mainFuncName.
+// With --skip-pipeline the function must take (!llvm.ptr, !llvm.ptr, i64),
+// otherwise it must take no arguments. When -e is not given, the first public,
+// non-external llvm.func/func.func in the module body with such a signature is
+// picked. Failures go to gcReportErr(), which is assumed not to return.
+void findFunc(Options &opts, ModuleOp mod) {
+  // Tests argument types against the signature the current mode expects.
+  auto hasExpectedArgs = [&](ArrayRef<Type> argTypes) {
+    if (!opts.skipPipeline)
+      return argTypes.empty();
+    if (argTypes.size() != 3)
+      return false;
+    auto ctx = mod.getContext();
+    auto ptrType = LLVM::LLVMPointerType::get(ctx);
+    return argTypes[0] == ptrType && argTypes[1] == ptrType &&
+           argTypes[2] == IntegerType::get(ctx, 64);
+  };
+
+  if (opts.mainFuncName.empty()) {
+    // Records the candidate's name if it is a suitable entry point.
+    auto tryPick = [&](auto candidate) {
+      if (!candidate || candidate.isExternal() || !candidate.isPublic() ||
+          !hasExpectedArgs(candidate.getArgumentTypes()))
+        return false;
+      opts.mainFuncName = candidate.getName().str();
+      return true;
+    };
+    for (Operation &op : mod.getBody()->getOperations()) {
+      if (tryPick(dyn_cast<LLVM::LLVMFuncOp>(op)) ||
+          tryPick(dyn_cast<func::FuncOp>(op)))
+        return;
+    }
+    gcReportErr("No matching function found.");
+  }
+
+  // Look up the named function: llvm.func first, then func.func.
+  ArrayRef<Type> argTypes;
+  if (auto llvmFn = mod.lookupSymbol<LLVM::LLVMFuncOp>(opts.mainFuncName))
+    argTypes = llvmFn.getArgumentTypes();
+  else if (auto fn = mod.lookupSymbol<func::FuncOp>(opts.mainFuncName))
+    argTypes = fn.getArgumentTypes();
+  else
+    gcReportErr("The function '", opts.mainFuncName.c_str(), "' not found.");
+
+  if (hasExpectedArgs(argTypes))
+    return;
+  // The error text names the signature that was expected in the current mode.
+  if (opts.skipPipeline)
+    gcReportErr("The function '", opts.mainFuncName.c_str(),
+                "' signature does not match (!llvm.ptr, !llvm.ptr, i64).");
+  gcReportErr("The function '", opts.mainFuncName.c_str(),
+              "' must have no arguments.");
+}
+
+// Entry point: reads and parses the input IR, builds it with OclModuleBuilder
+// and runs the selected function; failures go to gcReportErr()/gcGetOrReport().
+int main(int argc, char **argv) {
+  Options opts;
+  llvm::cl::ParseCommandLineOptions(argc, argv, "GraphCompiler GPU runner\n");
+
+  std::string errMsg;
+  auto file = openInputFile(opts.inputFilename, &errMsg);
+  if (!file)
+    gcReportErr("Failed to read input IR: ", errMsg.c_str());
+  auto srcMgr = std::make_shared<llvm::SourceMgr>();
+  srcMgr->AddNewSourceBuffer(std::move(file), SMLoc());
+  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
+  auto mlirMod = parseSourceFile<ModuleOp>(srcMgr, {&mlirCtx});
+  if (!mlirMod) // A failed parse yields a null module; do not dereference it.
+    gcReportErr("Failed to parse input IR.");
+  findFunc(opts, *mlirMod);
+
+  gc::gpu::OclModuleBuilderOpts builderOpts;
+  SmallVector<StringRef, 4> sharedLibs(opts.sharedLibs.begin(),
+                                       opts.sharedLibs.end());
+  builderOpts.funcName = opts.mainFuncName;
+  builderOpts.printIr = opts.printIr;
+  builderOpts.enableObjectDump = !opts.objDumpFile.getValue().empty();
+  builderOpts.sharedLibPaths = sharedLibs;
+  builderOpts.pipeline =
+      opts.skipPipeline ? [](OpPassManager &) {} : [](OpPassManager &pm) {
+        gc::GPUPipelineOptions pipelineOpts;
+        pipelineOpts.isUsmArgs = false;
+        pipelineOpts.callFinish = true;
+        populateGPUPipeline(pm, pipelineOpts);
+      };
+  gc::gpu::OclModuleBuilder builder{mlirMod, builderOpts};
+  auto runtime = gcGetOrReport(gc::gpu::OclRuntime::get());
+  auto oclMod = gcGetOrReport(builder.build(runtime));
+  assert(oclMod->isStatic);
+  if (!opts.objDumpFile.getValue().empty()) {
+    gcLogD("Dumping the compiled object to ", opts.objDumpFile.getValue());
+    oclMod->dumpToObjectFile(opts.objDumpFile.getValue());
+  }
+
+  auto queue = gcGetOrReport(runtime.createQueue());
+  gc::gpu::OclContext ctx{runtime, queue};
+  gc::gpu::StaticExecutor<0> exec{oclMod};
+  gcLogD("Executing function ", opts.mainFuncName.c_str(), "()");
+  exec(ctx);
+  gcGetOrReport(ctx.finish());
+  gcGetOrReport(runtime.releaseQueue(queue));
+  return 0;
+}
diff --git a/test/mlir/test/CMakeLists.txt b/test/mlir/test/CMakeLists.txt
index d631a194f..0e2764c79 100644
--- a/test/mlir/test/CMakeLists.txt
+++ b/test/mlir/test/CMakeLists.txt
@@ -32,7 +32,7 @@ set(GC_OPT_TEST_DEPENDS
 
 if(GC_ENABLE_IMEX)
         include(imex)
-        list(APPEND GC_OPT_TEST_DEPENDS GcOpenclRuntime)
+        list(APPEND GC_OPT_TEST_DEPENDS gc-gpu-runner)
 endif()
 
 if(GC_ENABLE_BINDINGS_PYTHON)
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose.mlir
index 27d1cbb63..1302d6809 100644
--- a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose.mlir
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose.mlir
@@ -1,6 +1,5 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" \
-// RUN: | gc-cpu-runner -e main --entry-point-result=void \
-// RUN:   --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
+
 module{
 
 func.func @linalg_matmul(%arg0: tensor<128x256xf16>,
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose_sep.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose_sep.mlir
index 7047a728b..0a06bda8e 100644
--- a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose_sep.mlir
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_128x64_transpose_sep.mlir
@@ -1,6 +1,5 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" \
-// RUN: | gc-cpu-runner -e main --entry-point-result=void \
-// RUN:   --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
+
 module{
 
 func.func @linalg_matmul(%arg0: tensor<128x256xf16>,
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir
index 3978c457d..8b5fbbdea 100644
--- a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir
@@ -1,6 +1,5 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" \
-// RUN: | gc-cpu-runner -e main --entry-point-result=void \
-// RUN:   --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
+
 module{
 
 func.func @linalg_matmul(%arg0: tensor<64x64xf16>,
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir
index c08fc99a0..cb3f59728 100644
--- a/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir
@@ -1,6 +1,4 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" \
-// RUN: | gc-cpu-runner -e main --entry-point-result=void \
-// RUN:   --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
 
 module {
   func.func @linalg_mlp(%arg0: tensor<32x4096xf16>, %arg1: tensor<4096x4096xf16>, %arg2 : tensor<32x4096xf16>,
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096_transpose.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096_transpose.mlir
index f1b662981..ff88f71ec 100644
--- a/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096_transpose.mlir
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096_transpose.mlir
@@ -1,6 +1,4 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" \
-// RUN: | gc-cpu-runner -e main --entry-point-result=void \
-// RUN:   --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
 
 module {
   func.func @linalg_mlp(%arg0: tensor<32x4096xf16>, %arg1: tensor<4096x4096xf16>, %arg2 : tensor<32x4096xf16>, 
diff --git a/test/mlir/test/gc/gpu-runner/mlp.mlir b/test/mlir/test/gc/gpu-runner/mlp.mlir
index c6cf901a7..6f914beb8 100644
--- a/test/mlir/test/gc/gpu-runner/mlp.mlir
+++ b/test/mlir/test/gc/gpu-runner/mlp.mlir
@@ -1,4 +1,5 @@
-// RUN: gc-opt %s --gc-gpu-pipeline="is-usm-args=false use-gpu-ocl=false" | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
+// RUN: gc-gpu-runner --shared-libs=%mlir_runner_utils %s | FileCheck %s
+
 #map0 = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
diff --git a/test/mlir/test/lit.cfg.py b/test/mlir/test/lit.cfg.py
index 09b0451e5..08fc42a33 100644
--- a/test/mlir/test/lit.cfg.py
+++ b/test/mlir/test/lit.cfg.py
@@ -49,7 +49,7 @@
 llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
 
 tool_dirs = [config.gc_tools_dir, config.llvm_tools_dir]
-tools = ["gc-opt", "gc-cpu-runner"]
+tools = ["gc-opt", "gc-cpu-runner", "gc-gpu-runner"]
 
 llvm_config.add_tool_substitutions(tools, tool_dirs)