Skip to content

Add all-in-one pass pipeline #75

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions include/gc/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,34 @@
#include "mlir/Pass/Pass.h"

namespace mlir {

namespace LLVM {
class LLVMDialect;
}

namespace scf {
class SCFDialect;
}

namespace openmp {
class OpenMPDialect;
}

namespace linalg {
class LinalgDialect;
}

namespace MemRef {
class MemRefDialect;
}

class PassManager;

namespace gc {

void populateFrontendPasses(mlir::PassManager &);
void populateCPUPipeline(mlir::PassManager &);

#define GEN_PASS_DECL
#include "gc/Transforms/Passes.h.inc"

Expand Down
13 changes: 13 additions & 0 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,17 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> {
];
}

def GCCPUPipeline: Pass<"gc-cpu-pipeline"> {
let summary = "All-in-one pipeline for GC for CPU";
let dependentDialects = ["onednn_graph::OneDNNGraphDialect",
"tensor::TensorDialect",
"memref::MemRefDialect",
"linalg::LinalgDialect",
"LLVM::LLVMDialect",
"scf::SCFDialect",
"bufferization::BufferizationDialect",
"omp::OpenMPDialect",
"vector::VectorDialect"];
}

#endif // GC_DIALECT_GC_PASSES
3 changes: 2 additions & 1 deletion lib/gc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ include(functions)

add_subdirectory(CAPI)
add_subdirectory(Dialect)
add_subdirectory(Transforms)
add_subdirectory(Transforms)
add_subdirectory(ExecutionEngine)
1 change: 1 addition & 0 deletions lib/gc/ExecutionEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_subdirectory(CPURuntime)
15 changes: 15 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
find_package(OpenMP REQUIRED)

if ("iomp" IN_LIST OpenMP_C_LIB_NAMES OR "omp" IN_LIST OpenMP_C_LIB_NAMES OR "omp5" IN_LIST OpenMP_C_LIB_NAMES)
else()
add_definitions("-DGC_NEEDS_OMP_WRAPPER=1")
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_mlir_library(GCCpuRuntime
SHARED
Parallel.cpp

EXCLUDE_FROM_LIBMLIR
)
188 changes: 188 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
//===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <assert.h>
#include <atomic>
#include <chrono>
#include <immintrin.h>
#include <omp.h>
#include <stdarg.h>

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define WEAK_SYMBOL __attribute__((weak))

namespace {
struct barrier_t {
alignas(64) std::atomic<int32_t> pending_;
std::atomic<int32_t> rounds_;
uint64_t total_;
// pad barrier to size of cacheline to avoid false sharing
char padding_[64 - 4 * sizeof(int32_t)];
};

using barrier_idle_func = uint64_t (*)(std::atomic<int32_t> *remaining,
int32_t expected_remain, int32_t tid,
void *args);
} // namespace

extern "C" {
int gc_runtime_keep_alive = 0;
void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
void *idle_args) {
auto cur_round = b->rounds_.load(std::memory_order_acquire);
auto cnt = --b->pending_;
assert(cnt >= 0);
if (cnt == 0) {
b->pending_.store(b->total_);
b->rounds_.store(cur_round + 1);
} else {
if (idle_func) {
if (cur_round != b->rounds_.load()) {
return;
}
idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
}
while (cur_round == b->rounds_.load()) {
_mm_pause();
}
}
}

static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64-byte");

void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) {
for (int i = 0; i < num_barriers; i++) {
b[i].total_ = thread_count;
b[i].pending_.store(thread_count);
b[i].rounds_.store(0);
}
}

#if GC_NEEDS_OMP_WRAPPER
void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) {
#pragma omp barrier
}

int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
return omp_get_thread_num();
}

// The implementation was extracted and simplified from LLVM libomp
// at openmp/runtime/src/kmp_sched.cpp
void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
if (unlikely(schedtype != 34)) {
std::abort();
}
const int32_t FALSE = 0;
const int32_t TRUE = 1;
using UT = uint64_t;
// using ST = int64_t;
/* this all has to be changed back to TID and such.. */
uint32_t tid = gtid;
uint32_t nth = omp_get_num_threads();
UT trip_count;

/* special handling for zero-trip loops */
if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
if (plastiter != nullptr)
*plastiter = FALSE;
/* leave pupper and plower set to entire iteration space */
*pstride = incr; /* value should never be used */
return;
}

if (nth == 1) {
if (plastiter != nullptr)
*plastiter = TRUE;
*pstride =
(incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
return;
}

/* compute trip count */
if (incr == 1) {
trip_count = *pupper - *plower + 1;
} else if (incr == -1) {
trip_count = *plower - *pupper + 1;
} else if (incr > 0) {
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
if (trip_count < nth) {
if (tid < trip_count) {
*pupper = *plower = *plower + tid * incr;
} else {
// set bounds so non-active threads execute no iterations
*plower = *pupper + (incr > 0 ? 1 : -1);
}
if (plastiter != nullptr)
*plastiter = (tid == trip_count - 1);
} else {
UT small_chunk = trip_count / nth;
UT extras = trip_count % nth;
*plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
*pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
if (plastiter != nullptr)
*plastiter = (tid == nth - 1);
}
*pstride = trip_count;
}

void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {}

static thread_local int next_num_threads = 0;

/*!
@ingroup PARALLEL
The type for a microtask which gets passed to @ref __kmpc_fork_call().
The arguments to the outlined function are
@param global_tid the global thread identity of the thread executing the
function.
@param bound_tid the local identity of the thread executing the function
@param ... pointers to shared variables accessed by the function.
*/
using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...);
void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) {
if (unlikely(argc != 1 && argc != 0)) {
std::abort();
}
va_list ap;
va_start(ap, pfunc);
void *c = va_arg(ap, void *);
int32_t global_tid = 0;
if (unlikely(next_num_threads)) {
#pragma omp parallel num_threads(next_num_threads)
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
next_num_threads = 0;
} else {
#pragma omp parallel
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
}
va_end(ap);
}

void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid,
int32_t num_threads) {
next_num_threads = num_threads;
}
#endif
}
1 change: 1 addition & 0 deletions lib/gc/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ gc_set_mlir_link_components(MLIR_LINK_COMPONENTS

add_mlir_library(GCPasses
OneDNNGraphToLinalg.cpp
Pipeline.cpp
TileNamed.cpp

ADDITIONAL_HEADER_DIRS
Expand Down
Loading