diff --git a/CMakeLists.txt b/CMakeLists.txt
index b34ed07a10e..6dbb66afdaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,13 +66,13 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index dbe35f8eefd..d9d72b5be3f 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
             # -Wmacro-redefined, and we only care about getting
             # reasonable vectorization and Sleef support.
             "-DCPU_CAPABILITY_AVX2",
+            "-DET_USE_PYTORCH_HEADERS",
             "-DHAVE_AVX2_CPU_DEFINITION",
             "-DSTANDALONE_TORCH_HEADER",
         ] + get_sleef_preprocessor_flags(),
@@ -86,5 +87,5 @@ def define_common_targets():
             # linker failure.
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]), ) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . 
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+  CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+  echo "Build size test"
+  cmake --build cmake-out/test -j9 --config MinSizeRel
+
+  echo 'ExecuTorch with no ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test
+
+  echo 'ExecuTorch with portable ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_ops
+
+  echo 'ExecuTorch with optimized ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                              "coreml_inmemoryfs;coreml_util"
+    "coreml_inmemoryfs;coreml_util"
   )
 endif()
 
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+    "cpuinfo;pthreadpool"
+  )
 endif()
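The patch wires the same ET_USE_PYTORCH_HEADERS switch through the Buck targets in targets.bzl and through the optimized and optimized-portable kernel targets in CMake, alongside the PyTorch include path from find_package_torch_headers(). The sketch below is not part of the diff; it is a minimal, hypothetical illustration of the kind of code such a flag typically guards, assuming a kernel opts into ATen's header-only at::vec::Vectorized when the headers are available (the function name sum_floats is invented for the example):

// Hypothetical example, not a file in this diff: sum a float buffer using
// ATen's header-only Vectorized<float> when ET_USE_PYTORCH_HEADERS is
// defined, falling back to a plain scalar loop otherwise.
#include <cstddef>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>
#endif

float sum_floats(const float* data, std::size_t n) {
#ifdef ET_USE_PYTORCH_HEADERS
  using Vec = at::vec::Vectorized<float>;
  Vec acc(0.0f);
  std::size_t i = 0;
  // Vectorized main loop: unaligned loads + element-wise adds.
  for (; i + Vec::size() <= n; i += Vec::size()) {
    acc = acc + Vec::loadu(data + i);
  }
  // Horizontal reduction through a small stack buffer.
  float lanes[Vec::size()];
  acc.store(lanes);
  float result = 0.0f;
  for (int k = 0; k < Vec::size(); ++k) {
    result += lanes[k];
  }
  // Scalar tail.
  for (; i < n; ++i) {
    result += data[i];
  }
  return result;
#else
  float result = 0.0f;
  for (std::size_t i = 0; i < n; ++i) {
    result += data[i];
  }
  return result;
#endif
}

Toggling the target_compile_definitions(... ET_USE_PYTORCH_HEADERS) lines added above selects between the two branches with no source changes, which is presumably why optimized_portable_kernels also gains ${TORCH_INCLUDE_DIRS} on its include path.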