From 8848e4450e7db722b4d8600bd4d61e87a570578b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:31:35 -0800 Subject: [PATCH 01/19] dynamic link --- backends/aoti/common_shims.cpp | 67 ++++++++++++++++++++++- backends/aoti/common_shims.h | 97 ++++++++++++++++++++++++---------- backends/cuda/CMakeLists.txt | 30 +++++++++-- 3 files changed, 158 insertions(+), 36 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index deb10478778..52d1f030ee5 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,8 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; } // namespace internal extern "C" { @@ -204,6 +204,69 @@ void cleanup_tensor_metadata() { internal::tensor_to_strides.clear(); } +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + (void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 91bb785b684..e2f66039549 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -15,6 +15,13 @@ #include #include +#if defined(EXPORT_AOTI_FUNCTIONS) +#include +#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT +#else +#define AOTI_SHIM_EXPORT +#endif + namespace executorch { namespace backends { namespace aoti { @@ -23,57 +30,89 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; - // Attribute-related operations (memory-irrelevant) 
-AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); -int32_t aoti_torch_dtype_bool(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); + +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + 
int32_t device_index, + Tensor** ret_new_tensor); } // extern "C" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c95d34247be..d90b2539ffc 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -76,8 +76,27 @@ set(_aoti_cuda_sources runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu runtime/platform/platform.cpp + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC +if(MSVC) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + + # Define export macros for Windows DLL + target_compile_definitions( + aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS + ) + + # Ensure proper DLL import/export library naming on Windows with + # config-specific paths + set_target_properties( + aoti_cuda + PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF + ) +else() + add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +endif() + target_include_directories( aoti_cuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ @@ -95,11 +114,12 @@ target_link_options( # Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch # CUDA libraries target_link_libraries( - aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) -# If you need other CUDA libraries, link them similarly: -# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) + +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) From 0645ce021479448260d709269a98e55c713e6a2d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:48:01 -0800 Subject: [PATCH 02/19] factor out shim, dynamic linkage --- backends/aoti/common_shims.h | 8 +- backends/aoti/export.h | 25 +++++ backends/cuda/CMakeLists.txt | 92 +++++++++++++------ backends/cuda/runtime/shims/cuda_guard.h | 13 +-- backends/cuda/runtime/shims/int4mm.h | 3 +- backends/cuda/runtime/shims/memory.h | 11 ++- .../cuda/runtime/shims/tensor_attribute.h | 5 +- 7 files changed, 108 insertions(+), 49 deletions(-) create mode 100644 backends/aoti/export.h diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index e2f66039549..b2bac6f41cd 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -15,13 +16,6 @@ #include #include -#if defined(EXPORT_AOTI_FUNCTIONS) -#include -#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT -#else -#define AOTI_SHIM_EXPORT -#endif - namespace executorch { namespace backends { namespace aoti { diff --git a/backends/aoti/export.h b/backends/aoti/export.h new file mode 100644 index 00000000000..879aa942035 --- /dev/null +++ b/backends/aoti/export.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). 
On non-Windows platforms, the macro is empty and has +// no effect. +#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_SHIM_EXPORT __declspec(dllexport) +#else +#define AOTI_SHIM_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_SHIM_EXPORT +#endif diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index d90b2539ffc..893627302a1 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -67,9 +67,8 @@ install( DESTINATION lib ) -# CUDA-specific AOTI functionality -set(_aoti_cuda_sources - runtime/cuda_backend.cpp +# CUDA-specific AOTI shim symbols (dynamically linked) +set(_aoti_cuda_shim_sources runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp runtime/guard.cpp @@ -78,59 +77,96 @@ set(_aoti_cuda_sources runtime/platform/platform.cpp ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC -if(MSVC) - add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) - # Define export macros for Windows DLL +add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) + +# Define export macros for shared library +if(MSVC) target_compile_definitions( - aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS + aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS ) - # Ensure proper DLL import/export library naming on Windows with - # config-specific paths + # Ensure proper DLL import/export library naming on Windows set_target_properties( - aoti_cuda + aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF ) -else() - add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) endif() target_include_directories( - aoti_cuda + aoti_cuda_shims PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ $ ) + target_compile_options( - aoti_cuda PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_shims PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) + # Ensure symbols are exported properly target_link_options( - aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic> + aoti_cuda_shims PUBLIC + $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch -# CUDA libraries +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker target_link_libraries( - aoti_cuda PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) if(NOT MSVC) - executorch_target_link_options_shared_lib(aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda_shims) endif() +install( + TARGETS aoti_cuda_shims + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# CUDA backend implementation +set(_aoti_cuda_backend_sources + runtime/cuda_backend.cpp +) + +# CUDA backend implementation +add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) + +target_include_directories( + aoti_cuda_backend + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ + $ +) +target_compile_options( + aoti_cuda_backend PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_cuda_backend PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against shims library and other dependencies +target_link_libraries( + aoti_cuda_backend PUBLIC aoti_cuda_shims extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} +) + +executorch_target_link_options_shared_lib(aoti_cuda_backend) + +install( + TARGETS aoti_cuda_backend + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# Alias for backward compatibility +add_library(aoti_cuda ALIAS 
aoti_cuda_backend) + if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( - multimodal_benchmark PUBLIC aoti_cuda extension_module_static + multimodal_benchmark PUBLIC aoti_cuda_backend extension_module_static extension_flat_tensor portable_ops_lib ) endif() - -install( - TARGETS aoti_cuda - EXPORT ExecuTorchTargets - DESTINATION lib -) diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..9472bedb70e 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -33,7 +34,7 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( int32_t device_index, CUDAGuardHandle* ret_guard); @@ -44,7 +45,7 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -54,7 +55,7 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( CUDAGuardHandle guard, int32_t device_index); @@ -69,7 +70,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +82,7 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
@@ -91,7 +92,7 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( int32_t device_index, void** ret_stream); diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..87a9916b0aa 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..8e0e133fe8a 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,7 +137,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 6b61b5bd3b8..cf65c79ac6a 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -24,12 +25,12 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTITorchError aoti_torch_get_device_type( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_type( Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t 
aoti_torch_device_type_cuda(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" From 6751c66d1715fef41318bc6aa31741d2aa25d182 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:52:16 -0800 Subject: [PATCH 03/19] shim linkage --- backends/cuda/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 893627302a1..bea1e812ad6 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -149,7 +149,8 @@ target_link_options( # Link against shims library and other dependencies target_link_libraries( - aoti_cuda_backend PUBLIC aoti_cuda_shims extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend PRIVATE aoti_cuda_shims + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) executorch_target_link_options_shared_lib(aoti_cuda_backend) From 2f78880ddd721126645e4f9f0715148fa2e16cfd Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:52:36 -0800 Subject: [PATCH 04/19] lint --- backends/aoti/common_shims.cpp | 6 ++-- backends/cuda/CMakeLists.txt | 35 +++++++++---------- backends/cuda/runtime/shims/cuda_guard.h | 21 ++++++----- .../cuda/runtime/shims/tensor_attribute.h | 5 ++- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 52d1f030ee5..6733a5e7afd 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; -AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index bea1e812ad6..0ce59dc547d 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -82,14 +82,11 @@ add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) # Define export macros for shared library if(MSVC) - target_compile_definitions( - aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS - ) + target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS) # Ensure proper DLL import/export library naming on Windows set_target_properties( - aoti_cuda_shims - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF + aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF ) endif() @@ -100,19 +97,20 @@ target_include_directories( ) target_compile_options( - aoti_cuda_shims PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_shims + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) # Ensure symbols are exported properly target_link_options( - aoti_cuda_shims PUBLIC - $<$>:-Wl,--export-dynamic> + aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) # Link against CUDA::cudart, common AOTI library, cuda_tensor_maker target_link_libraries( - aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart + ${CMAKE_DL_LIBS} ) if(NOT MSVC) @@ -126,9 +124,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources - runtime/cuda_backend.cpp -) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC 
${_aoti_cuda_backend_sources}) @@ -139,18 +135,21 @@ target_include_directories( $ ) target_compile_options( - aoti_cuda_backend PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_backend + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) # Ensure symbols are exported properly target_link_options( - aoti_cuda_backend PUBLIC $<$>:-Wl,--export-dynamic> + aoti_cuda_backend PUBLIC + $<$>:-Wl,--export-dynamic> ) # Link against shims library and other dependencies target_link_libraries( - aoti_cuda_backend PRIVATE aoti_cuda_shims - PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend + PRIVATE aoti_cuda_shims + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) executorch_target_link_options_shared_lib(aoti_cuda_backend) diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index 9472bedb70e..83fceabb98f 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -34,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -45,7 +44,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -55,9 +55,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle gua * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -82,7 +81,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
@@ -92,9 +92,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGu * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index cf65c79ac6a..683f270ccda 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -25,9 +25,8 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); From 23dd27bcb703d33b462da3ba7eed3d35a3c703cc Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 14:36:51 -0800 Subject: [PATCH 05/19] preproc flag for aoti_common --- backends/aoti/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index bcff1d56769..2d1f5859d3c 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -38,6 +38,10 @@ target_compile_options( PUBLIC $<$:/EHsc /GR> $<$>:-fexceptions -frtti -fPIC> ) +target_compile_definitions( + aoti_common + PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> +) # Ensure symbols are exported properly if(APPLE) target_link_options(aoti_common PUBLIC -Wl,-export_dynamic) From 2af37f1e64147071b2fee2ec91f23cb90e550cfb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:06:28 -0800 Subject: [PATCH 06/19] build platform against runtime not shims --- backends/cuda/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 0ce59dc547d..7263b0abbb6 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -74,7 +74,6 @@ set(_aoti_cuda_shim_sources runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu - runtime/platform/platform.cpp ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) @@ -124,7 +123,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp runtime/platform/platform.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) From 717feaec2a5a294e6ee185ac23bbfb478b5c9d55 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:14:53 -0800 Subject: [PATCH 07/19] refactor platform layer --- backends/cuda/CMakeLists.txt | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 7263b0abbb6..d603c9215eb 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -67,6 +67,31 @@ install( DESTINATION lib ) +# Platform utilities (load_library, close_library, etc.) 
+set(_cuda_platform_sources runtime/platform/platform.cpp) +add_library(cuda_platform STATIC ${_cuda_platform_sources}) + +target_include_directories( + cuda_platform + PUBLIC $ $ + $ +) + +target_compile_options( + cuda_platform + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) + +# Link against ExecuTorch core libraries +target_link_libraries(cuda_platform PUBLIC executorch ${CMAKE_DL_LIBS}) + +install( + TARGETS cuda_platform + EXPORT ExecuTorchTargets + DESTINATION lib +) + # CUDA-specific AOTI shim symbols (dynamically linked) set(_aoti_cuda_shim_sources runtime/shims/memory.cpp @@ -106,10 +131,11 @@ target_link_options( aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and platform utilities target_link_libraries( - aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart - ${CMAKE_DL_LIBS} + aoti_cuda_shims + PRIVATE cuda_platform + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) if(NOT MSVC) @@ -123,7 +149,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp runtime/platform/platform.cpp) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) @@ -147,7 +173,7 @@ target_link_options( # Link against shims library and other dependencies target_link_libraries( aoti_cuda_backend - PRIVATE aoti_cuda_shims + PRIVATE aoti_cuda_shims cuda_platform PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) From 9b02f052e73f9a589900fe3d402a828f78f4b98a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:20:49 -0800 Subject: [PATCH 08/19] public symbol missing --- backends/cuda/runtime/shims/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 8e0e133fe8a..8d9d37037de 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -141,7 +141,7 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_SHIM_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda From 15bac110bd117ad1419cf5f041f100caebc09450 Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Thu, 6 Nov 2025 19:14:54 +0100 Subject: [PATCH 09/19] NXP backend: Replace move relu before concat optimization (#15394) ### Summary This PR replaces optimization in `move_relu_before_concat.py` by `MoveActivationBeforeConcat` aten pass. The pass moves selected activations that are supported for fusion on Neutron (Relu, Relu6, Sigmoid, Tanh) before the `concat` node if concat input nodes are either Conv 2D or Linear 2D. The whole node Logic is determined by target specs, now supporting Neutron-C. Tests updated. ### Test plan Unit tests provided (test_move_activation_before_concatenation.py). 
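For reference, a minimal sketch of the pattern the pass targets, mirroring the `ConvConcatActivationModule` used in the new unit tests (module/field names here are illustrative only):

```python
# Illustrative sketch: two conv branches feeding a concat, followed by one activation.
# MoveActivationBeforeConcat rewrites this so each conv branch gets its own relu
# and the activation after the concat is removed (see the ASCII diagram in the pass).
import torch
from torch import nn


class ConvConcatRelu(nn.Module):
    def __init__(self, channels: int = 4):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, (3, 3), padding=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x1 = self.conv(x)
        x2 = self.conv(x)
        return self.relu(torch.cat((x1, x2), dim=1))
```

The rewrite only fires when every concat input is a fusable Conv 2D / Linear 2D with a single user and the trailing activation is one Neutron can fuse, so quantization can later share parameters across the moved activations.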
cc @robert-kalmar --- .../move_activation_before_concat.py | 102 ++ .../aten_passes/neutron_aten_pass_manager.py | 9 +- .../optimizations/move_relu_before_concat.py | 107 -- .../backend/ir/tflite_optimizer/optimizer.py | 8 - backends/nxp/quantizer/neutron_quantizer.py | 10 +- backends/nxp/quantizer/patterns.py | 147 ++- backends/nxp/tests/test_batch_norm_fusion.py | 9 +- backends/nxp/tests/test_gru_splitting.py | 17 +- .../nxp/tests/test_linear_and_add_fusion.py | 55 +- ...st_move_activation_before_concatenation.py | 947 ++++++++++++++++++ .../test_removing_nodes_with_known_outputs.py | 13 +- .../nxp/tests/test_split_group_convolution.py | 20 +- 12 files changed, 1283 insertions(+), 161 deletions(-) create mode 100644 backends/nxp/aten_passes/move_activation_before_concat.py delete mode 100755 backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py create mode 100644 backends/nxp/tests/test_move_activation_before_concatenation.py diff --git a/backends/nxp/aten_passes/move_activation_before_concat.py b/backends/nxp/aten_passes/move_activation_before_concat.py new file mode 100644 index 00000000000..8ba306d42e2 --- /dev/null +++ b/backends/nxp/aten_passes/move_activation_before_concat.py @@ -0,0 +1,102 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch + +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec + +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + + +class MoveActivationBeforeConcat(PassBase): + """Move some operators around in the following pattern. + This is a common pattern that emerges from the conversion of separable convolutions. + This optimization works together with joint quantization of compute nodes and activations. Without it, + it is not beneficial. + + │ │ │ │ + ┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ + │ aten.conv2d │ ... │ aten.conv2d │ │ aten.conv2d │ ... │ aten.conv2d │ + └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + └───────┐ ┌──────┘ │ │ + ┌──▼─────▼─┐ replace with ┌─────▼─────┐ ┌─────▼─────┐ + │ aten.cat │ ──────────────► │ aten.relu │ ... │ aten.relu │ + └────┬─────┘ └─────┬─────┘ └─────┬─────┘ + │ └───────┐ ┌───────┘ + ┌─────▼─────┐ ┌──▼─────▼─┐ + │ aten.relu │ │ aten.cat │ + └─────┬─────┘ └────┬─────┘ + │ │ + """ + + def __init__(self, neutron_target_spec: NeutronTargetSpec): + self.neutron_target_spec = neutron_target_spec + + def call(self, module: GraphModule) -> bool: + def _is_concat(node_: Node) -> bool: + return ( + node_.op == "call_function" + and node_.target == torch.ops.aten.cat.default + ) + + made_changes = False + + for node in module.graph.nodes: + if not _is_concat(node): + continue # Not cat node. + + cat_node = node + activation = next(iter(cat_node.users)) + + # Check if all cat inputs nodes are conv 2D or linear 2D type and their only user is cat. + if not all( + self.neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ): + continue + + # Check if following activation is supported on Neutron as fused activation. + if not ( + len(cat_node.users) == 1 + and self.neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + activation + ) + ): + continue + + # Loop all Cat input nodes and insert new activation after node. 
+ for input_node in cat_node.all_input_nodes: + with module.graph.inserting_after(input_node): + new_activation = module.graph.call_function( + activation.target, + args=(), + kwargs=activation.kwargs, + ) + + new_activation.meta["source_fn_stack"] = [ + ( + new_activation.name, + activation.meta["source_fn_stack"][-1][-1], + ) + ] + new_activation.meta["val"] = input_node.meta["val"] + + # Replace the uses of the input node with the new activation node. + input_node.replace_all_uses_with(new_activation) + new_activation.args = (input_node, *activation.args[1:]) + + # Replace the uses of the activation node with the cat node. + activation.replace_all_uses_with(cat_node) + + module.graph.erase_node(activation) + + made_changes = True + + return PassResult(module, made_changes) diff --git a/backends/nxp/aten_passes/neutron_aten_pass_manager.py b/backends/nxp/aten_passes/neutron_aten_pass_manager.py index 407ebf5da61..35205c76c68 100644 --- a/backends/nxp/aten_passes/neutron_aten_pass_manager.py +++ b/backends/nxp/aten_passes/neutron_aten_pass_manager.py @@ -16,6 +16,9 @@ from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( FuseLinearAndAddPass, ) +from executorch.backends.nxp.aten_passes.move_activation_before_concat import ( + MoveActivationBeforeConcat, +) from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) @@ -25,6 +28,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.exir.pass_manager import PassManager from torch import nn from torch.fx.passes.infra.pass_base import PassResult @@ -34,7 +38,9 @@ class NeutronAtenPassManager(PassManager): - def __init__(self, passes: list[PassType] = None): + def __init__( + self, neutron_target_spec: NeutronTargetSpec, passes: list[PassType] = None + ): passes: list[PassType] = passes or [ FuseBatchNormWithConvPass(), FuseBatchNormWithLinearPass(), @@ -42,6 +48,7 @@ def __init__(self, passes: list[PassType] = None): SplitGRUBasedOnNumLayers(), RemoveNodesWithKnownOutputs(), FuseLinearAndAddPass(), + MoveActivationBeforeConcat(neutron_target_spec), ] super().__init__(passes) diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py deleted file mode 100755 index 4d10b7c80ae..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from collections import defaultdict -from copy import deepcopy - -from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model -from executorch.backends.nxp.backend.ir.tflite_optimizer.operator_rules import ( - AllInputsComeFrom, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - TensorHasOneConsumer, - TensorsHaveSameQuantization, -) - - -class MoveActivationBeforeConcatenation(BaseOptimization): - """ - Move some operators around in the following pattern. 
- This is a common pattern that emerges from the conversion of separable convolutions. - - │ │ │ │ - ┌───▼────┐ ┌───▼────┐ ┌───▼────┐ ┌───▼────┐ - │ Conv2D │ ... │ Conv2D │ │ Conv2D │ ... │ Conv2D │ - └───┬────┘ └───┬────┘ └───┬────┘ └───┬────┘ - └──┐ ┌──┘ │ │ - ┌──▼──────────▼─┐ ┌──▼───┐ ┌──▼───┐ - │ Concatenation │ ─────► │ Relu │ ... │ Relu │ - └───────┬───────┘ └──┬───┘ └──┬───┘ - │ 'x' └──┐ ┌──┘ - ┌──▼───┐ ┌──▼──────────▼─┐ - │ Relu │ │ Concatenation │ - └──┬───┘ └───────┬───────┘ - │ 'y' │ - """ - - activations = ["Relu", "ReluN1To1", "Relu6", "Tanh", "Sign"] - - def __call__(self) -> bool: - matcher = PatternMatcher( - self._builder, - [ - Op(["Concatenation"], None, ["x"], [AllInputsComeFrom("Conv2D")]), - Op(self.activations, ["x"], ["y"]), - ], - [ - TensorHasOneConsumer("x"), - # If the activation function is not changing the quantization parameters, it can be moved without - # messing with the quantization elsewhere. - TensorsHaveSameQuantization(["x", "y"]), - ], - ) - - to_remove = [] - - # Mapping an operator to a list of operators. These operators (value) will later be added into the TFLite - # model's `operators` in front of the specified operator (key). - to_add: dict[tflite_model.Operator, list[tflite_model.Operator]] = defaultdict( - lambda: [] - ) - - for [concat, activation], _, _, _ in matcher.match_patterns(): - new_concat_inputs = [] - for concat_input in concat.tmp_inputs: - # Create a new operator for the activation function. - new_activation = deepcopy(activation) - new_activation.tmp_inputs = [concat_input] - new_activation_output = self._builder.duplicate_tensor(concat_input) - new_activation.tmp_outputs = [new_activation_output] - - to_add[concat].append( - new_activation - ) # Insert the new activation into the model later. - - new_concat_inputs.append( - new_activation_output - ) # Connect the activation with the `Concatenation`. - - concat.tmp_inputs = new_concat_inputs - - # Tensor rule ensures that only the activation functions is using the output of the `Concatenation`. - # It is safe to bypass. - concat.tmp_outputs[0] = activation.tmp_outputs[0] - to_remove.append(activation) - - operators = self._builder.get_operators() - - # Add the new activations into the model. - for concat, activations in to_add.items(): - idx = operators.index(concat) - for activation in activations: - operators.insert(idx, activation) - - # Remove the old activations. 
- for activation in to_remove: - operators.remove(activation) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index 3611c55e995..52de6f224eb 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -11,9 +11,6 @@ from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.conversion_config import ConversionConfig -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( - MoveActivationBeforeConcatenation, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.permute_fully_connected_weights_after_reshape import ( PermuteFullyConnectedWeightsAfterReshape, ) @@ -29,8 +26,6 @@ class Optimization(Enum): PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE = 12 - MOVE_ACTIVATION_BEFORE_CONCAT = 15 - class Optimizer: """ @@ -68,9 +63,6 @@ def __init__( Optimization.PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE: PermuteFullyConnectedWeightsAfterReshape( builder, conversion_config ), - Optimization.MOVE_ACTIVATION_BEFORE_CONCAT: MoveActivationBeforeConcatenation( - builder, conversion_config - ), } def optimize( diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py index 6564c19d7b9..f476e16628e 100644 --- a/backends/nxp/quantizer/neutron_quantizer.py +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -12,6 +12,7 @@ from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.quantizer.patterns import ( AbsPattern, + ActivationsConcatClusterPattern, AdaptiveAvgPoolPattern, AddmmPattern, AddTensorPattern, @@ -225,13 +226,16 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec): self.op_to_applied_quantizer = { pt: False for q in self.quantizers for pt in q.pattern.partition_types() } + self.cluster_quantizers = [ + NeutronAtenQuantizer(ActivationsConcatClusterPattern(self), static_qconfig) + ] def transform_for_annotation( self, model: torch.fx.GraphModule ) -> torch.fx.GraphModule: model.graph.eliminate_dead_code() # Remove dead code to simplify the graph for the passes. - model = NeutronAtenPassManager()(model).graph_module + model = NeutronAtenPassManager(self.neutron_target_spec)(model).graph_module model.graph.eliminate_dead_code() # Remove dead code again, in case it was created by the passes. 
@@ -240,6 +244,10 @@ def transform_for_annotation( def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: self._annotate_inputs(model) + # Annotate node clusters in model + for cluster_quantizer in self.cluster_quantizers: + cluster_quantizer.annotate(model) + nodes = list(model.graph.nodes) for node in nodes: if ( diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py index ccd579d5c52..ee92cd42ef1 100644 --- a/backends/nxp/quantizer/patterns.py +++ b/backends/nxp/quantizer/patterns.py @@ -13,6 +13,7 @@ from executorch.backends.nxp.quantizer.utils import get_bias_qparams from torch import fx from torch._ops import OpOverload +from torch.fx import Node from torchao.quantization.pt2e import PerChannelMinMaxObserver from torchao.quantization.pt2e.quantizer import ( DerivedQuantizationSpec, @@ -20,6 +21,7 @@ QuantizationSpec, SharedQuantizationSpec, ) + from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY @@ -199,7 +201,6 @@ def partition_types(self) -> list[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] bias_qspec = DerivedQuantizationSpec( @@ -745,3 +746,147 @@ def get_anchors( return get_anchors_for_fixed_quant_specs( fused_partition, scale=1.0 / 128.0, zero_point=0 ) + + +class ActivationsConcatClusterPattern(QuantizationPattern): + """ + Quantizer for activations concat cluster pattern. + + The quantizer matches a pattern where concat node is preceded by activation nodes preceded by Conv 2D or Linear. + All activation nodes quantization parameters must be the same. Only activations, that have support for fusion + to preceding compute node on Neutron are allowed. This cluster is usually produced by MoveActivationBeforeConcat + pass. Cluster schema: + + │ │ + ┌──────▼──────┐ ┌──────▼──────┐ + │ aten.conv2d │ ... │ aten.conv2d │ + └──────┬──────┘ └──────┬──────┘ + │ │ + ┌─────▼─────┐ ┌─────▼─────┐ + │ aten.relu │ ... 
│ aten.relu │ + └─────┬─────┘ └─────┬─────┘ + └───────┐ ┌───────┘ + ┌──▼─────▼─┐ + │ aten.cat │ + └────┬─────┘ + │ + """ + + def __init__(self, neutron_quantizer): + self.neutron_quantizer = neutron_quantizer + self.neutron_target_info = ( + self.neutron_quantizer.neutron_target_spec.neutron_target_info + ) + + @staticmethod + def _all_activations_are_equal(activations: list[Node]) -> bool: + first_input_node = activations[0] + hardtanh_t = [ + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ] + relu_t = [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ] + tanh_t = [ + torch.ops.aten.tanh.default, + torch.ops.aten.tanh_.default, + ] + + def _activations_are_equal(activation1: Node, activation2: Node) -> bool: + if ( # Targets are equal also with their inplace variants + (activation1.target in hardtanh_t and activation2.target in hardtanh_t) + or (activation1.target in relu_t and activation2.target in relu_t) + or (activation1.target in tanh_t and activation2.target in tanh_t) + or ( + activation1.target == torch.ops.aten.sigmoid.default + and activation2.target == torch.ops.aten.sigmoid.default + ) + ): + return True + elif ( # Hardtanh with min_val 0 and max_val 'inf' is equal to Relu + activation1.target in hardtanh_t + and activation1.args[1:] == (0.0, float("inf")) + and activation2.target in relu_t + ) or ( + activation1.target in relu_t + and activation2.target in hardtanh_t + and activation2.args[1:] == (0.0, float("inf")) + ): + return True + else: + return False + + return all( + _activations_are_equal(activation, first_input_node) + for activation in activations + ) + + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.cat.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: + cat_node = fused_partition[0].nodes[-1] + + # Check all cat inputs are supported activations + if not all( + self.neutron_target_info.is_supported_fused_activation__aten(input_node) + for input_node in cat_node.all_input_nodes + ): + return None + + # Check all cat inputs are equal activations + if not self._all_activations_are_equal(cat_node.all_input_nodes): + return None + + # Check compute nodes are Conv 2D or Linear + if not all( + self.neutron_target_info.is_fusable_conv_or_linear__aten(compute_node) + for input_node in cat_node.all_input_nodes + for compute_node in input_node.all_input_nodes + ): + return None + + # Annotate compute nodes + for input_node in cat_node.all_input_nodes: + for compute_node in input_node.all_input_nodes: + if compute_node.target not in self.neutron_quantizer.op_to_quantizer: + return None + compute_node_quantizer = self.neutron_quantizer.op_to_quantizer[ + compute_node.target + ] + compute_node_quantizer.annotate(gm) + del compute_node.meta["quantization_annotation"].output_qspec + + # Annotate activations + for input_node in cat_node.all_input_nodes: + if input_node.target not in self.neutron_quantizer.op_to_quantizer: + return None + activation_quantizer = self.neutron_quantizer.op_to_quantizer[ + input_node.target + ] + activation_quantizer.annotate(gm) + input_node.meta["quantization_annotation"].input_qspec_map = {} + + # Annotate cat node + inputs = [] + first_input_node = cat_node.all_input_nodes[0] + for idx in range(len(cat_node.all_input_nodes)): + inputs.append( + ( + cat_node, + NodeArgsIdx(0, idx), + SharedQuantizationSpec(first_input_node), + ) + ) + outputs = [(cat_node, SharedQuantizationSpec(first_input_node))] + + 
return PartitionAnchors( + inputs=inputs, + weights=[], + biases=[], + output=outputs, + ) diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index fce11ce5aa2..eeb4b03d7a6 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -18,7 +18,10 @@ from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.view_copy_converter import ( ViewCopyConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executorch_pipeline import ( + neutron_target_spec, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import OverrideTargetSupportCheck from torch import nn @@ -98,7 +101,7 @@ def test_batch_norm_conv_fusing(bias: bool, input_shape: list[int]): program = torch.export.export(module, example_input, strict=True) og_module = program.module() - pm = NeutronAtenPassManager() + pm = NeutronAtenPassManager(neutron_target_spec) graph_module_out = pm(deepcopy(program.module())).graph_module # Make sure the fusion worked. @@ -133,7 +136,7 @@ def test_batch_norm_linear_fusing(bias: bool): program = torch.export.export(module, example_input, strict=True) og_module = program.module() - pm = NeutronAtenPassManager() + pm = NeutronAtenPassManager(neutron_target_spec) graph_module_out = pm(deepcopy(program.module())).graph_module # Make sure the fusion worked. diff --git a/backends/nxp/tests/test_gru_splitting.py b/backends/nxp/tests/test_gru_splitting.py index a2e9d324f69..297f9677fb2 100644 --- a/backends/nxp/tests/test_gru_splitting.py +++ b/backends/nxp/tests/test_gru_splitting.py @@ -13,6 +13,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec @pytest.fixture(autouse=True) @@ -94,7 +95,9 @@ def test_gru_splitting__with_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) post_pass_output = [t.detach() for t in exir_program_aten(*example_input)] @@ -143,7 +146,9 @@ def test_gru_splitting__no_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) post_pass_output = [t.detach() for t in exir_program_aten(*example_input)] @@ -193,7 +198,9 @@ def test_gru_splitting__bidirectional__no_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) nodes = list(exir_program_aten.graph.nodes) @@ -239,7 +246,9 @@ def test_gru_splitting__bidirectional__with_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. 
- pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) nodes = list(exir_program_aten.graph.nodes) diff --git a/backends/nxp/tests/test_linear_and_add_fusion.py b/backends/nxp/tests/test_linear_and_add_fusion.py index 16d3c4140a2..222d748001c 100644 --- a/backends/nxp/tests/test_linear_and_add_fusion.py +++ b/backends/nxp/tests/test_linear_and_add_fusion.py @@ -18,6 +18,7 @@ from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from parameterized import parameterized @@ -121,10 +122,11 @@ def test_linear_add_fusing__static__no_bias__valid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -167,10 +169,11 @@ def test_linear_add_fusing__static__no_bias__invalid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -209,10 +212,11 @@ def test_linear_add_fusing__static__bias__valid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -253,10 +257,11 @@ def test_linear_add_fusing__static__no_bias__reverse_order(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -295,10 +300,11 @@ def test_linear_add_fusing__static__bias__reverse_order(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -340,10 +346,11 @@ def test_linear_add_fusing__static__alpha__no_bias(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -381,10 +388,11 @@ def test_linear_add_fusing__static__alpha__bias(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. 
@@ -424,10 +432,11 @@ def test_linear_add_fusing__static__alpha__reversed_add_inputs(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -474,9 +483,9 @@ def test_linear_add_fusing__dynamic__no_bias__valid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -513,9 +522,9 @@ def test_linear_add_fusing__dynamic__no_bias__invalid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -550,9 +559,9 @@ def test_linear_add_fusing__dynamic__bias__valid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -584,9 +593,9 @@ def test_linear_add_fusing__dynamic__reverse_order(self): program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -618,9 +627,9 @@ def test_linear_add_fusing__dynamic__alpha(self): program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) diff --git a/backends/nxp/tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/test_move_activation_before_concatenation.py new file mode 100644 index 00000000000..779c958c049 --- /dev/null +++ b/backends/nxp/tests/test_move_activation_before_concatenation.py @@ -0,0 +1,947 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math +import unittest + +import kgb +import numpy as np +import torch +from executorch.backends.nxp.aten_passes.move_activation_before_concat import ( + MoveActivationBeforeConcat, +) +from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( + NeutronAtenPassManager, +) +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from executorch.backends.nxp.tests.executorch_pipeline import ( + _quantize_model, + get_random_calibration_inputs, + neutron_target_spec, + to_model_input_spec, + to_quantized_edge_program, +) +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import get_activation +from executorch.exir.dialects._ops import ops as exir_ops +from parameterized import parameterized +from torch import nn +from torch.export import ExportedProgram +from torch.fx import GraphModule + +concat_cluster_ops = [ + exir_ops.edge.aten.addmm.default, + exir_ops.edge.aten.convolution.default, + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.cat.default, +] + + +class ConvConcatActivationModule(torch.nn.Module): + def __init__(self, activation: str, inplace: bool, in_channels: int): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + in_channels, + (3, 3), + padding=1, + ) + + self.activation = get_activation(activation, inplace) + self.eval() + + def forward(self, x): + x1 = self.conv(x) + x2 = self.conv(x) + x = torch.cat((x1, x2), dim=1) + return self.activation(x) + + +class LinearConcatActivationModule(nn.Module): + def __init__( + self, activation: str, inplace: bool, in_channels: int, mode: str = "linear" + ): + super().__init__() + self.mode = mode.lower() + assert self.mode in [ + "linear", + "addmm", + "mm", + ], "Mode must be 'linear', 'addmm', or 'mm'" + + if self.mode == "linear": + self.linear = nn.Linear(in_channels, in_channels) + else: + # Manual weight and bias for addmm/mm. 
+ self.weight = nn.Parameter(torch.empty(in_channels, in_channels)) + self.bias = nn.Parameter(torch.empty(in_channels)) + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias, -bound, bound) + + self.activation = get_activation(activation, inplace) + self.eval() + + def forward(self, x): + x1, x2 = None, None + + if self.mode == "linear": + x1 = self.linear(x) + x2 = self.linear(x) + if self.mode == "addmm": + x1 = torch.addmm(self.bias, x, self.weight) + x2 = torch.addmm(self.bias, x, self.weight) + elif self.mode == "mm": + x1 = torch.mm(x, self.weight) + x2 = torch.mm(x, self.weight) + + x = torch.cat((x1, x2), dim=1) + return self.activation(x) + + +class ConvActivationConcatModule(torch.nn.Module): + def __init__( + self, + activation1: str, + activation2: str, + act1_inplace: bool, + act2_inplace: bool, + in_channels: int, + ): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + in_channels, + (3, 3), + padding=1, + ) + + self.activation1 = get_activation(activation1, act1_inplace) + self.activation2 = get_activation(activation2, act2_inplace) + self.eval() + + def forward(self, x): + x1 = self.conv(x) + x1 = self.activation1(x1) + x2 = self.conv(x) + x2 = self.activation2(x2) + return torch.cat((x1, x2), dim=1) + + +class LinearActivationConcatModule(torch.nn.Module): + def __init__( + self, + activation1: str, + activation2: str, + act1_inplace: bool, + act2_inplace: bool, + in_channels: int, + ): + super().__init__() + self.linear = nn.Linear(in_channels, in_channels) + + self.activation1 = get_activation(activation1, act1_inplace) + self.activation2 = get_activation(activation2, act2_inplace) + self.eval() + + def forward(self, x): + x1 = self.linear(x) + x1 = self.activation1(x1) + x2 = self.linear(x) + x2 = self.activation2(x2) + return torch.cat((x1, x2), dim=1) + + +class TestMoveActivationBeforeConcat(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__conv(self, activation, inplace): + input_shape = (1, 3, 8, 8) + model = ConvConcatActivationModule( + activation=activation, inplace=inplace, in_channels=3 + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. 
+ assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check convolution and activation are in same QDQ cluster. + nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 26 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[18] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[19] + ) + ) + assert ( + nodes[20].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__linear(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="linear" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. 
+ assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check linear and activation are in same QDQ cluster. + nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__addmm(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="addmm" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check addmm and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__mm(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="mm" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 7 + cat_node = nodes[4] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[5] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 8 + cat_node = nodes[6] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[7].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check mm and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 19 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[7] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[8] + ) + ) + assert ( + nodes[9].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[11] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[12] + ) + ) + assert ( + nodes[13].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__conv( + self, activation, inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8, 8, 8) + model = ConvConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__linear( + self, activation, inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="linear" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__addmm( + self, activation, inplace + ): + torch.manual_seed(23) + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="addmm" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + atol=1.0, + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__mm(self, activation, inplace): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="mm" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + @parameterized.expand( + [ + ["relu", "relu", True, False], + ["relu6", "relu6", False, True], + ["tanh", "tanh", True, False], + ["sigmoid", "sigmoid", False, True], + ["relu", "relu_hardtanh", True, True], + ] + ) + def test_concat_cluster_quantization__conv( + self, activation1, activation2, act1_inplace, act2_inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy: + input_shape = (1, 8, 8, 8) + model = ConvActivationConcatModule( + activation1, activation2, act1_inplace, act2_inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, + ops=concat_cluster_ops, + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[ + -1 + ].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0] + + # Check convolution and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 26 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[18] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[19] + ) + assert ( + nodes[20].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + input_data = ( + np.random.random(input_shape).astype(np.float32) * 50 + ).astype(np.int8) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + @parameterized.expand( + [ + ["relu", "relu", True, False], + ["relu6", "relu6", False, True], + ["tanh", "tanh", True, False], + ["sigmoid", "sigmoid", False, True], + ["relu", "relu_hardtanh", True, True], + ] + ) + def test_concat_cluster_quantization__linear( + self, activation1, activation2, act1_inplace, act2_inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy: + input_shape = (1, 8) + model = LinearActivationConcatModule( + activation1, activation2, act1_inplace, act2_inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, + ops=concat_cluster_ops, + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[ + -1 + ].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0] + + # Check linear and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + input_data = ( + np.random.random(input_shape).astype(np.float32) * 50 + ).astype(np.int8) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) diff --git a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py index 8f5549c8526..0c496356791 100644 --- a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py +++ b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py @@ -17,6 +17,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from parameterized import parameterized from torch import nn @@ -57,7 +58,9 @@ def test_removing_nodes__zeros(self): outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] # Apply the optimization. - NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])( + exir_program_aten + ) # Make sure the `aten.zeros` is no longer in the model. assert not graph_contains_any_of_ops( @@ -81,7 +84,9 @@ def test_removing_nodes__split(self, num_layers): exir_program_aten = torch.export.export(model, example_input).module() # Apply the pass to split the `aten.gru.input` into multiple instances, and add a `split` node. - NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [SplitGRUBasedOnNumLayers()])( + exir_program_aten + ) # Make sure the `aten.zeros` and `torch.split` are in the model. assert graph_contains_any_of_ops( @@ -93,7 +98,9 @@ def test_removing_nodes__split(self, num_layers): outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] # Apply the optimization. - NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])( + exir_program_aten + ) # Make sure the `aten.zeros` and `torch.split` are no longer in the model. 
assert not graph_contains_any_of_ops( diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py index 8b2d5723dbb..6e084699307 100644 --- a/backends/nxp/tests/test_split_group_convolution.py +++ b/backends/nxp/tests/test_split_group_convolution.py @@ -88,9 +88,9 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input, strict=True).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Make sure the fusion worked. original_nodes = list(original_module.graph.nodes) @@ -145,9 +145,9 @@ def test_split_group_convolution__1d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Make sure the fusion worked. original_nodes = list(original_module.graph.nodes) @@ -199,9 +199,9 @@ def test_split_group_convolution__3d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Verify that the pass has NOT made any changes, as it is disabled for 3D convolution. original_nodes = list(original_module.graph.nodes) @@ -233,7 +233,7 @@ def test_split_group_convolution__applied_by_default(self): graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager()( + modified_module = NeutronAtenPassManager(neutron_target_spec)( graph_module ).graph_module # Default passes. 
From e519de0d7609e4096d468d33634867ecd0a34409 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:59:16 -0800 Subject: [PATCH 10/19] voxtral cmake changes --- examples/models/voxtral/CMakeLists.txt | 32 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 866d17160ba..02d6cf6df7b 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) @@ -104,11 +102,21 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda) + if(NOT MSVC) + # On non-MSVC, use shared lib options + executorch_target_link_options_shared_lib(aoti_cuda) + endif() +endif() + target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) From 762f06a79dac0549fc349081bfa4aaf1b0422cec Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 14:52:47 -0800 Subject: [PATCH 11/19] cuda_backend --- examples/models/voxtral/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 02d6cf6df7b..ea0be1f03ca 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -110,7 +110,7 @@ endif() # Link CUDA backend if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) + list(APPEND link_libraries aoti_cuda_backend) if(NOT MSVC) # On non-MSVC, use shared lib options executorch_target_link_options_shared_lib(aoti_cuda) From c9a3cd0dd2f349bddcb1ea793b9913f65d6a03b2 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:30:35 -0800 Subject: [PATCH 12/19] cuda shenanigans --- examples/models/voxtral/CMakeLists.txt | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index ea0be1f03ca..ea4c3b533c4 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -87,8 +87,11 @@ list( # Link CUDA backend if(EXECUTORCH_BUILD_CUDA) 
find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) + list(APPEND link_libraries aoti_cuda_backend) + if(NOT MSVC) + # On non-MSVC, use shared lib options + executorch_target_link_options_shared_lib(aoti_cuda) + endif() endif() if(EXECUTORCH_BUILD_METAL) @@ -107,16 +110,6 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") endif() endif() -# Link CUDA backend -if(EXECUTORCH_BUILD_CUDA) - find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda_backend) - if(NOT MSVC) - # On non-MSVC, use shared lib options - executorch_target_link_options_shared_lib(aoti_cuda) - endif() -endif() - target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) From f13ba262ef2c320da7999b5f475a3c8bcb122a64 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:31:32 -0800 Subject: [PATCH 13/19] cuda shenanigans2 --- examples/models/voxtral/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index ea4c3b533c4..80f76da7af9 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -90,7 +90,7 @@ if(EXECUTORCH_BUILD_CUDA) list(APPEND link_libraries aoti_cuda_backend) if(NOT MSVC) # On non-MSVC, use shared lib options - executorch_target_link_options_shared_lib(aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda_backend) endif() endif() From 089dcc94e6922b64924c5928b479e848a34203b9 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:35:24 -0800 Subject: [PATCH 14/19] lint --- backends/aoti/CMakeLists.txt | 3 +-- backends/cuda/CMakeLists.txt | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 2d1f5859d3c..d5582dfe7c7 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -39,8 +39,7 @@ target_compile_options( $<$>:-fexceptions -frtti -fPIC> ) target_compile_definitions( - aoti_common - PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> + aoti_common PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> ) # Ensure symbols are exported properly if(APPLE) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index d603c9215eb..ac93621831e 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -94,11 +94,8 @@ install( # CUDA-specific AOTI shim symbols (dynamically linked) set(_aoti_cuda_shim_sources - runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp - runtime/guard.cpp - runtime/shims/cuda_guard.cpp - runtime/shims/int4mm.cu + runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp + runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) @@ -131,7 +128,8 @@ target_link_options( aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and platform utilities +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and +# platform utilities target_link_libraries( aoti_cuda_shims PRIVATE cuda_platform From bfefdeba40d54e799efc587ec8c39300053de7bb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:02:59 -0800 Subject: [PATCH 15/19] copy dll automatically --- 
examples/models/voxtral/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 80f76da7af9..85cc62aa8f6 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -113,3 +113,15 @@ endif() target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) + +# On Windows, copy required DLLs to the executable directory +if(MSVC AND EXECUTORCH_BUILD_CUDA) + add_custom_command( + TARGET voxtral_runner + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + COMMENT "Copying aoti_cuda_shims.dll to voxtral_runner directory" + ) +endif() From a6ba1767d3c0b7edaf1738c6c135b5d678575b7c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:18:27 -0800 Subject: [PATCH 16/19] remove alias --- backends/cuda/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index ac93621831e..ec7fc9b4fd3 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -183,9 +183,6 @@ install( DESTINATION lib ) -# Alias for backward compatibility -add_library(aoti_cuda ALIAS aoti_cuda_backend) - if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( From 5596c453fe8c1013b05ba0c2677205960ffed543 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:24:57 -0800 Subject: [PATCH 17/19] lint --- examples/models/voxtral/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 85cc62aa8f6..24a1096c889 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -119,8 +119,7 @@ if(MSVC AND EXECUTORCH_BUILD_CUDA) add_custom_command( TARGET voxtral_runner POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - $ + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ + $ COMMENT "Copying aoti_cuda_shims.dll to voxtral_runner directory" ) From a6b20e64b312f3f2cf61476aa077a076afccfd85 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 17:54:07 -0800 Subject: [PATCH 18/19] fix test build and have some unintuitive linkage for now --- CMakeLists.txt | 5 +++-- backends/cuda/CMakeLists.txt | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6d6f26b41f..51573d276b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -591,8 +591,9 @@ endif() if(EXECUTORCH_BUILD_CUDA) # Build CUDA-specific AOTI functionality add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) - # Add aoti_cuda to backends - it already depends on aoti_common - list(APPEND _executorch_backends aoti_cuda) + # Add aoti_cuda_backend to backends - it transitively includes aoti_cuda_shims + # and cuda_platform + list(APPEND _executorch_backends aoti_cuda_backend) endif() if(EXECUTORCH_BUILD_METAL) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index ec7fc9b4fd3..2eb923f2ab9 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -168,13 +168,21 @@ target_link_options( $<$>:-Wl,--export-dynamic> ) -# Link against shims library and other dependencies +# Link against shims library and other dependencies. On
Windows (MSVC), use +# PRIVATE linkage for aoti_cuda_shims since the DLL is copied to the executable +# directory. On other platforms, use PUBLIC so the dependency propagates to +# consumers. target_link_libraries( - aoti_cuda_backend - PRIVATE aoti_cuda_shims cuda_platform - PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend PUBLIC cuda_platform extension_tensor cuda_tensor_maker + CUDA::cudart ${CMAKE_DL_LIBS} ) +if(MSVC) + target_link_libraries(aoti_cuda_backend PRIVATE aoti_cuda_shims) +else() + target_link_libraries(aoti_cuda_backend PUBLIC aoti_cuda_shims) +endif() + executorch_target_link_options_shared_lib(aoti_cuda_backend) install( From 5f73ad29aa28eed0af59098f0d38ea3c693af9d3 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 10 Nov 2025 10:12:08 -0800 Subject: [PATCH 19/19] update ET_platform deps --- backends/cuda/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 2eb923f2ab9..8f121bdbd32 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -58,7 +58,9 @@ else() endif() # Link against ExecuTorch core libraries -target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS}) +target_link_libraries( + cuda_tensor_maker PRIVATE executorch_core ${CMAKE_DL_LIBS} +) executorch_target_link_options_shared_lib(cuda_tensor_maker) install( @@ -84,7 +86,7 @@ target_compile_options( ) # Link against ExecuTorch core libraries -target_link_libraries(cuda_platform PUBLIC executorch ${CMAKE_DL_LIBS}) +target_link_libraries(cuda_platform PRIVATE executorch_core ${CMAKE_DL_LIBS}) install( TARGETS cuda_platform