Add threadpool support to xnn_executor_runner.

backends/xnnpack/CMakeLists.txt
  * Reflow the option() comments and calls.
  * Append optimized_native_cpu_ops_lib to xnn_executor_runner_libs when
    EXECUTORCH_BUILD_KERNELS_OPTIMIZED is set and portable_ops_lib otherwise,
    instead of always linking portable_ops_lib.
  * When EXECUTORCH_BUILD_PTHREADPOOL is set, link extension_threadpool and
    pthreadpool into xnn_executor_runner and define ET_USE_THREADPOOL for it.

examples/portable/executor_runner/executor_runner.cpp
  * Add a --cpu_threads flag (default -1).
  * When ET_USE_THREADPOOL is defined, reset the threadpool to that many
    threads, or to get_num_performant_cores() when the flag is negative.

Note: the added lines differ slightly from the patch as first generated, so the
post-image blob ids on the `index` lines are those of the original patch.

diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index a703d67c1b2..dc42a52a234 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -33,14 +33,14 @@ if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
 endif()
 
-# NB: Enabling this will serialize execution of delegate instances
-# Keeping this OFF by default to maintain existing behavior, to be revisited.
+# NB: Enabling this will serialize execution of delegate instances Keeping this
+# OFF by default to maintain existing behavior, to be revisited.
 option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
-       "Enable workspace sharing across different delegate instances" ON)
-# Keeping this OFF by default due to regressions in decode
-# and model load with kleidi kernels
-option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI
-       "Enable Arm Kleidi kernels" OFF)
+       "Enable workspace sharing across different delegate instances" ON
+)
+# Keeping this OFF by default due to regressions in decode and model load with
+# kleidi kernels
+option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
 if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
   add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
 endif()
@@ -100,8 +100,7 @@ include(cmake/Dependencies.cmake)
 list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs})
 target_link_libraries(
-  xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core
-  xnnpack_schema
+  xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core xnnpack_schema
 )
 
 target_include_directories(
@@ -119,6 +118,14 @@ target_include_directories(
 target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options})
 target_link_options_shared_lib(xnnpack_backend)
 
+# Pick the kernel library for xnn_executor_runner: the optimized ops when they
+# are being built, the portable ops otherwise.
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  list(APPEND xnn_executor_runner_libs optimized_native_cpu_ops_lib)
+else()
+  list(APPEND xnn_executor_runner_libs portable_ops_lib)
+endif()
+
 list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)
 
 # ios can only build library but not binary
@@ -134,14 +141,21 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
     if(EXECUTORCH_BUILD_DEVTOOLS)
       list(APPEND xnn_executor_runner_libs etdump)
     else()
-      message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
+      message(
+        SEND_ERROR
+          "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled."
+      )
     endif()
   endif()
 
-  target_link_libraries(
-    xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs}
-  )
+  target_link_libraries(xnn_executor_runner gflags ${xnn_executor_runner_libs})
   target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options})
+  # ET_USE_THREADPOOL gates the threadpool reset in
+  # examples/portable/executor_runner/executor_runner.cpp.
+  if(EXECUTORCH_BUILD_PTHREADPOOL)
+    target_link_libraries(xnn_executor_runner extension_threadpool pthreadpool)
+    target_compile_definitions(xnn_executor_runner PRIVATE ET_USE_THREADPOOL)
+  endif()
 endif()
 
 install(
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index f7702fae3de..187d6f34489 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -35,6 +35,11 @@
 #include <executorch/devtools/etdump/etdump_flatcc.h>
 #endif // ET_EVENT_TRACER_ENABLED
 
+#if defined(ET_USE_THREADPOOL)
+#include <executorch/extension/threadpool/cpuinfo_utils.h>
+#include <executorch/extension/threadpool/threadpool.h>
+#endif
+
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
 static uint8_t temp_allocator_pool[1024U * 1024U];
 
@@ -47,6 +52,10 @@ DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
 #endif // ET_EVENT_TRACER_ENABLED
+DEFINE_int32(
+    cpu_threads,
+    -1,
+    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
 using executorch::extension::FileDataLoader;
 using executorch::runtime::Error;
@@ -124,6 +133,20 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+#if defined(ET_USE_THREADPOOL)
+  // Any negative --cpu_threads value selects the heuristic; casting it to
+  // uint32_t would otherwise request a huge thread count.
+  auto cpu_threads = FLAGS_cpu_threads;
+  uint32_t num_performant_cores = cpu_threads < 0
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif // ET_USE_THREADPOOL
   // Create a loader to get the data of the program file. There are other
   // DataLoaders that use mmap() or point to data that's already in memory, and
   // users can create their own DataLoaders to load from arbitrary sources.