diff --git a/CMakeLists.txt b/CMakeLists.txt
index b34ed07a10e..6dbb66afdaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,13 +66,13 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index dbe35f8eefd..d9d72b5be3f 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
             # -Wmacro-redefined, and we only care about getting
             # reasonable vectorization and Sleef support.
             "-DCPU_CAPABILITY_AVX2",
+            "-DET_USE_PYTORCH_HEADERS",
             "-DHAVE_AVX2_CPU_DEFINITION",
             "-DSTANDALONE_TORCH_HEADER",
         ] + get_sleef_preprocessor_flags(),
@@ -86,5 +87,5 @@ def define_common_targets():
             # linker failure.
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]), ) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . 
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+  CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+  echo "Build size test"
+  cmake --build cmake-out/test -j9 --config MinSizeRel
+
+  echo 'ExecuTorch with no ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test
+
+  echo 'ExecuTorch with portable ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_ops
+
+  echo 'ExecuTorch with optimized ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                              "coreml_inmemoryfs;coreml_util"
+    "coreml_inmemoryfs;coreml_util"
   )
 endif()
 
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+    "cpuinfo;pthreadpool"
+  )
 endif()
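The patch wires the same ET_USE_PYTORCH_HEADERS switch through the Buck targets in targets.bzl and through the optimized and optimized-portable kernel targets in CMake, alongside the PyTorch include path from find_package_torch_headers(). The sketch below is not part of the diff; it is a minimal, hypothetical illustration of the kind of code such a flag typically guards, assuming a kernel opts into ATen's header-only at::vec::Vectorized when the headers are available (the function name sum_floats is invented for the example):

// Hypothetical example, not a file in this diff: sum a float buffer using
// ATen's header-only Vectorized<float> when ET_USE_PYTORCH_HEADERS is
// defined, falling back to a plain scalar loop otherwise.
#include <cstddef>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>
#endif

float sum_floats(const float* data, std::size_t n) {
#ifdef ET_USE_PYTORCH_HEADERS
  using Vec = at::vec::Vectorized<float>;
  Vec acc(0.0f);
  std::size_t i = 0;
  // Vectorized main loop: unaligned loads + element-wise adds.
  for (; i + Vec::size() <= n; i += Vec::size()) {
    acc = acc + Vec::loadu(data + i);
  }
  // Horizontal reduction through a small stack buffer.
  float lanes[Vec::size()];
  acc.store(lanes);
  float result = 0.0f;
  for (int k = 0; k < Vec::size(); ++k) {
    result += lanes[k];
  }
  // Scalar tail.
  for (; i < n; ++i) {
    result += data[i];
  }
  return result;
#else
  float result = 0.0f;
  for (std::size_t i = 0; i < n; ++i) {
    result += data[i];
  }
  return result;
#endif
}

Toggling the target_compile_definitions(... ET_USE_PYTORCH_HEADERS) lines added above selects between the two branches with no source changes, which is presumably why optimized_portable_kernels also gains ${TORCH_INCLUDE_DIRS} on its include path.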