From 8848e4450e7db722b4d8600bd4d61e87a570578b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:31:35 -0800 Subject: [PATCH 01/19] dynamic link --- backends/aoti/common_shims.cpp | 67 ++++++++++++++++++++++- backends/aoti/common_shims.h | 97 ++++++++++++++++++++++++---------- backends/cuda/CMakeLists.txt | 30 +++++++++-- 3 files changed, 158 insertions(+), 36 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index deb10478778..52d1f030ee5 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,8 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; } // namespace internal extern "C" { @@ -204,6 +204,69 @@ void cleanup_tensor_metadata() { internal::tensor_to_strides.clear(); } +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + (void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 91bb785b684..e2f66039549 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -15,6 +15,13 @@ #include #include +#if defined(EXPORT_AOTI_FUNCTIONS) +#include +#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT +#else +#define AOTI_SHIM_EXPORT +#endif + namespace executorch { namespace backends { namespace aoti { @@ -23,57 +30,89 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> tensor_to_strides; - // Attribute-related operations (memory-irrelevant) 
-AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); -int32_t aoti_torch_dtype_bool(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); + +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + 
int32_t device_index, + Tensor** ret_new_tensor); } // extern "C" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c95d34247be..d90b2539ffc 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -76,8 +76,27 @@ set(_aoti_cuda_sources runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu runtime/platform/platform.cpp + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC +if(MSVC) + add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) + + # Define export macros for Windows DLL + target_compile_definitions( + aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS + ) + + # Ensure proper DLL import/export library naming on Windows with + # config-specific paths + set_target_properties( + aoti_cuda + PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF + ) +else() + add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +endif() + target_include_directories( aoti_cuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ @@ -95,11 +114,12 @@ target_link_options( # Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch # CUDA libraries target_link_libraries( - aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) -# If you need other CUDA libraries, link them similarly: -# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) + +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda) +endif() if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) From 0645ce021479448260d709269a98e55c713e6a2d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:48:01 -0800 Subject: [PATCH 02/19] factor out shim, dynamic linkage --- backends/aoti/common_shims.h | 8 +- backends/aoti/export.h | 25 +++++ backends/cuda/CMakeLists.txt | 92 +++++++++++++------ backends/cuda/runtime/shims/cuda_guard.h | 13 +-- backends/cuda/runtime/shims/int4mm.h | 3 +- backends/cuda/runtime/shims/memory.h | 11 ++- .../cuda/runtime/shims/tensor_attribute.h | 5 +- 7 files changed, 108 insertions(+), 49 deletions(-) create mode 100644 backends/aoti/export.h diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index e2f66039549..b2bac6f41cd 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -15,13 +16,6 @@ #include #include -#if defined(EXPORT_AOTI_FUNCTIONS) -#include -#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT -#else -#define AOTI_SHIM_EXPORT -#endif - namespace executorch { namespace backends { namespace aoti { diff --git a/backends/aoti/export.h b/backends/aoti/export.h new file mode 100644 index 00000000000..879aa942035 --- /dev/null +++ b/backends/aoti/export.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). 
On non-Windows platforms, the macro is empty and has +// no effect. +#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_SHIM_EXPORT __declspec(dllexport) +#else +#define AOTI_SHIM_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_SHIM_EXPORT +#endif diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index d90b2539ffc..893627302a1 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -67,9 +67,8 @@ install( DESTINATION lib ) -# CUDA-specific AOTI functionality -set(_aoti_cuda_sources - runtime/cuda_backend.cpp +# CUDA-specific AOTI shim symbols (dynamically linked) +set(_aoti_cuda_shim_sources runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp runtime/guard.cpp @@ -78,59 +77,96 @@ set(_aoti_cuda_sources runtime/platform/platform.cpp ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC -if(MSVC) - add_library(aoti_cuda SHARED ${_aoti_cuda_sources}) - # Define export macros for Windows DLL +add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) + +# Define export macros for shared library +if(MSVC) target_compile_definitions( - aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS + aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS ) - # Ensure proper DLL import/export library naming on Windows with - # config-specific paths + # Ensure proper DLL import/export library naming on Windows set_target_properties( - aoti_cuda + aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF ) -else() - add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) endif() target_include_directories( - aoti_cuda + aoti_cuda_shims PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ $ ) + target_compile_options( - aoti_cuda PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_shims PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) + # Ensure symbols are exported properly target_link_options( - aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic> + aoti_cuda_shims PUBLIC + $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch -# CUDA libraries +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker target_link_libraries( - aoti_cuda PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) if(NOT MSVC) - executorch_target_link_options_shared_lib(aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda_shims) endif() +install( + TARGETS aoti_cuda_shims + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# CUDA backend implementation +set(_aoti_cuda_backend_sources + runtime/cuda_backend.cpp +) + +# CUDA backend implementation +add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) + +target_include_directories( + aoti_cuda_backend + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ + $ +) +target_compile_options( + aoti_cuda_backend PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_cuda_backend PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against shims library and other dependencies +target_link_libraries( + aoti_cuda_backend PUBLIC aoti_cuda_shims extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} +) + +executorch_target_link_options_shared_lib(aoti_cuda_backend) + +install( + TARGETS aoti_cuda_backend + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# Alias for backward compatibility +add_library(aoti_cuda ALIAS 
aoti_cuda_backend) + if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( - multimodal_benchmark PUBLIC aoti_cuda extension_module_static + multimodal_benchmark PUBLIC aoti_cuda_backend extension_module_static extension_flat_tensor portable_ops_lib ) endif() - -install( - TARGETS aoti_cuda - EXPORT ExecuTorchTargets - DESTINATION lib -) diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..9472bedb70e 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -33,7 +34,7 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( int32_t device_index, CUDAGuardHandle* ret_guard); @@ -44,7 +45,7 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -54,7 +55,7 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( CUDAGuardHandle guard, int32_t device_index); @@ -69,7 +70,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +82,7 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
@@ -91,7 +92,7 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( int32_t device_index, void** ret_stream); diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..87a9916b0aa 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..8e0e133fe8a 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,7 +137,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 6b61b5bd3b8..cf65c79ac6a 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -24,12 +25,12 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTITorchError aoti_torch_get_device_type( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_type( Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t 
aoti_torch_device_type_cuda(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" From 6751c66d1715fef41318bc6aa31741d2aa25d182 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:52:16 -0800 Subject: [PATCH 03/19] shim linkage --- backends/cuda/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 893627302a1..bea1e812ad6 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -149,7 +149,8 @@ target_link_options( # Link against shims library and other dependencies target_link_libraries( - aoti_cuda_backend PUBLIC aoti_cuda_shims extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend PRIVATE aoti_cuda_shims + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) executorch_target_link_options_shared_lib(aoti_cuda_backend) From 2f78880ddd721126645e4f9f0715148fa2e16cfd Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:52:36 -0800 Subject: [PATCH 04/19] lint --- backends/aoti/common_shims.cpp | 6 ++-- backends/cuda/CMakeLists.txt | 35 +++++++++---------- backends/cuda/runtime/shims/cuda_guard.h | 21 ++++++----- .../cuda/runtime/shims/tensor_attribute.h | 5 ++- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 52d1f030ee5..6733a5e7afd 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -AOTI_SHIM_EXPORT std::unordered_map> tensor_to_sizes; -AOTI_SHIM_EXPORT std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index bea1e812ad6..0ce59dc547d 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -82,14 +82,11 @@ add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) # Define export macros for shared library if(MSVC) - target_compile_definitions( - aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS - ) + target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS) # Ensure proper DLL import/export library naming on Windows set_target_properties( - aoti_cuda_shims - PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF + aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF ) endif() @@ -100,19 +97,20 @@ target_include_directories( ) target_compile_options( - aoti_cuda_shims PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_shims + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) # Ensure symbols are exported properly target_link_options( - aoti_cuda_shims PUBLIC - $<$>:-Wl,--export-dynamic> + aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) # Link against CUDA::cudart, common AOTI library, cuda_tensor_maker target_link_libraries( - aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart + ${CMAKE_DL_LIBS} ) if(NOT MSVC) @@ -126,9 +124,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources - runtime/cuda_backend.cpp -) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC 
${_aoti_cuda_backend_sources}) @@ -139,18 +135,21 @@ target_include_directories( $ ) target_compile_options( - aoti_cuda_backend PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_backend + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) # Ensure symbols are exported properly target_link_options( - aoti_cuda_backend PUBLIC $<$>:-Wl,--export-dynamic> + aoti_cuda_backend PUBLIC + $<$>:-Wl,--export-dynamic> ) # Link against shims library and other dependencies target_link_libraries( - aoti_cuda_backend PRIVATE aoti_cuda_shims - PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend + PRIVATE aoti_cuda_shims + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) executorch_target_link_options_shared_lib(aoti_cuda_backend) diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index 9472bedb70e..83fceabb98f 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -34,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -45,7 +44,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. @@ -55,9 +55,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle gua * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -82,7 +81,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
@@ -92,9 +92,8 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGu * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index cf65c79ac6a..683f270ccda 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -25,9 +25,8 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); From 23dd27bcb703d33b462da3ba7eed3d35a3c703cc Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 14:36:51 -0800 Subject: [PATCH 05/19] preproc flag for aoti_common --- backends/aoti/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index bcff1d56769..2d1f5859d3c 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -38,6 +38,10 @@ target_compile_options( PUBLIC $<$:/EHsc /GR> $<$>:-fexceptions -frtti -fPIC> ) +target_compile_definitions( + aoti_common + PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> +) # Ensure symbols are exported properly if(APPLE) target_link_options(aoti_common PUBLIC -Wl,-export_dynamic) From 2af37f1e64147071b2fee2ec91f23cb90e550cfb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:06:28 -0800 Subject: [PATCH 06/19] build platform against runtime not shims --- backends/cuda/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 0ce59dc547d..7263b0abbb6 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -74,7 +74,6 @@ set(_aoti_cuda_shim_sources runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu - runtime/platform/platform.cpp ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) @@ -124,7 +123,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp runtime/platform/platform.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) From 717feaec2a5a294e6ee185ac23bbfb478b5c9d55 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:14:53 -0800 Subject: [PATCH 07/19] refactor platform layer --- backends/cuda/CMakeLists.txt | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 7263b0abbb6..d603c9215eb 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -67,6 +67,31 @@ install( DESTINATION lib ) +# Platform utilities (load_library, close_library, etc.) 
+set(_cuda_platform_sources runtime/platform/platform.cpp) +add_library(cuda_platform STATIC ${_cuda_platform_sources}) + +target_include_directories( + cuda_platform + PUBLIC $ $ + $ +) + +target_compile_options( + cuda_platform + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) + +# Link against ExecuTorch core libraries +target_link_libraries(cuda_platform PUBLIC executorch ${CMAKE_DL_LIBS}) + +install( + TARGETS cuda_platform + EXPORT ExecuTorchTargets + DESTINATION lib +) + # CUDA-specific AOTI shim symbols (dynamically linked) set(_aoti_cuda_shim_sources runtime/shims/memory.cpp @@ -106,10 +131,11 @@ target_link_options( aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and platform utilities target_link_libraries( - aoti_cuda_shims PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart - ${CMAKE_DL_LIBS} + aoti_cuda_shims + PRIVATE cuda_platform + PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) if(NOT MSVC) @@ -123,7 +149,7 @@ install( ) # CUDA backend implementation -set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp runtime/platform/platform.cpp) +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) # CUDA backend implementation add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) @@ -147,7 +173,7 @@ target_link_options( # Link against shims library and other dependencies target_link_libraries( aoti_cuda_backend - PRIVATE aoti_cuda_shims + PRIVATE aoti_cuda_shims cuda_platform PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) From 9b02f052e73f9a589900fe3d402a828f78f4b98a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:20:49 -0800 Subject: [PATCH 08/19] public symbol missing --- backends/cuda/runtime/shims/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 8e0e133fe8a..8d9d37037de 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -141,7 +141,7 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_SHIM_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda From 15bac110bd117ad1419cf5f041f100caebc09450 Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Thu, 6 Nov 2025 19:14:54 +0100 Subject: [PATCH 09/19] NXP backend: Replace move relu before concat optimization (#15394) ### Summary This PR replaces optimization in `move_relu_before_concat.py` by `MoveActivationBeforeConcat` aten pass. The pass moves selected activations that are supported for fusion on Neutron (Relu, Relu6, Sigmoid, Tanh) before the `concat` node if concat input nodes are either Conv 2D or Linear 2D. The whole node Logic is determined by target specs, now supporting Neutron-C. Tests updated. ### Test plan Unit tests provided (test_move_activation_before_concatenation.py). 
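For reference, a minimal sketch of the pattern the pass targets, mirroring the `ConvConcatActivationModule` used in the new unit tests (module/field names here are illustrative only):

```python
# Illustrative sketch: two conv branches feeding a concat, followed by one activation.
# MoveActivationBeforeConcat rewrites this so each conv branch gets its own relu
# and the activation after the concat is removed (see the ASCII diagram in the pass).
import torch
from torch import nn


class ConvConcatRelu(nn.Module):
    def __init__(self, channels: int = 4):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, (3, 3), padding=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x1 = self.conv(x)
        x2 = self.conv(x)
        return self.relu(torch.cat((x1, x2), dim=1))
```

The rewrite only fires when every concat input is a fusable Conv 2D / Linear 2D with a single user and the trailing activation is one Neutron can fuse, so quantization can later share parameters across the moved activations.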
cc @robert-kalmar --- .../move_activation_before_concat.py | 102 ++ .../aten_passes/neutron_aten_pass_manager.py | 9 +- .../optimizations/move_relu_before_concat.py | 107 -- .../backend/ir/tflite_optimizer/optimizer.py | 8 - backends/nxp/quantizer/neutron_quantizer.py | 10 +- backends/nxp/quantizer/patterns.py | 147 ++- backends/nxp/tests/test_batch_norm_fusion.py | 9 +- backends/nxp/tests/test_gru_splitting.py | 17 +- .../nxp/tests/test_linear_and_add_fusion.py | 55 +- ...st_move_activation_before_concatenation.py | 947 ++++++++++++++++++ .../test_removing_nodes_with_known_outputs.py | 13 +- .../nxp/tests/test_split_group_convolution.py | 20 +- 12 files changed, 1283 insertions(+), 161 deletions(-) create mode 100644 backends/nxp/aten_passes/move_activation_before_concat.py delete mode 100755 backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py create mode 100644 backends/nxp/tests/test_move_activation_before_concatenation.py diff --git a/backends/nxp/aten_passes/move_activation_before_concat.py b/backends/nxp/aten_passes/move_activation_before_concat.py new file mode 100644 index 00000000000..8ba306d42e2 --- /dev/null +++ b/backends/nxp/aten_passes/move_activation_before_concat.py @@ -0,0 +1,102 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch + +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec + +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + + +class MoveActivationBeforeConcat(PassBase): + """Move some operators around in the following pattern. + This is a common pattern that emerges from the conversion of separable convolutions. + This optimization works together with joint quantization of compute nodes and activations. Without it, + it is not beneficial. + + │ │ │ │ + ┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ + │ aten.conv2d │ ... │ aten.conv2d │ │ aten.conv2d │ ... │ aten.conv2d │ + └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + └───────┐ ┌──────┘ │ │ + ┌──▼─────▼─┐ replace with ┌─────▼─────┐ ┌─────▼─────┐ + │ aten.cat │ ──────────────► │ aten.relu │ ... │ aten.relu │ + └────┬─────┘ └─────┬─────┘ └─────┬─────┘ + │ └───────┐ ┌───────┘ + ┌─────▼─────┐ ┌──▼─────▼─┐ + │ aten.relu │ │ aten.cat │ + └─────┬─────┘ └────┬─────┘ + │ │ + """ + + def __init__(self, neutron_target_spec: NeutronTargetSpec): + self.neutron_target_spec = neutron_target_spec + + def call(self, module: GraphModule) -> bool: + def _is_concat(node_: Node) -> bool: + return ( + node_.op == "call_function" + and node_.target == torch.ops.aten.cat.default + ) + + made_changes = False + + for node in module.graph.nodes: + if not _is_concat(node): + continue # Not cat node. + + cat_node = node + activation = next(iter(cat_node.users)) + + # Check if all cat inputs nodes are conv 2D or linear 2D type and their only user is cat. + if not all( + self.neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ): + continue + + # Check if following activation is supported on Neutron as fused activation. + if not ( + len(cat_node.users) == 1 + and self.neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + activation + ) + ): + continue + + # Loop all Cat input nodes and insert new activation after node. 
+ for input_node in cat_node.all_input_nodes: + with module.graph.inserting_after(input_node): + new_activation = module.graph.call_function( + activation.target, + args=(), + kwargs=activation.kwargs, + ) + + new_activation.meta["source_fn_stack"] = [ + ( + new_activation.name, + activation.meta["source_fn_stack"][-1][-1], + ) + ] + new_activation.meta["val"] = input_node.meta["val"] + + # Replace the uses of the input node with the new activation node. + input_node.replace_all_uses_with(new_activation) + new_activation.args = (input_node, *activation.args[1:]) + + # Replace the uses of the activation node with the cat node. + activation.replace_all_uses_with(cat_node) + + module.graph.erase_node(activation) + + made_changes = True + + return PassResult(module, made_changes) diff --git a/backends/nxp/aten_passes/neutron_aten_pass_manager.py b/backends/nxp/aten_passes/neutron_aten_pass_manager.py index 407ebf5da61..35205c76c68 100644 --- a/backends/nxp/aten_passes/neutron_aten_pass_manager.py +++ b/backends/nxp/aten_passes/neutron_aten_pass_manager.py @@ -16,6 +16,9 @@ from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( FuseLinearAndAddPass, ) +from executorch.backends.nxp.aten_passes.move_activation_before_concat import ( + MoveActivationBeforeConcat, +) from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) @@ -25,6 +28,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.exir.pass_manager import PassManager from torch import nn from torch.fx.passes.infra.pass_base import PassResult @@ -34,7 +38,9 @@ class NeutronAtenPassManager(PassManager): - def __init__(self, passes: list[PassType] = None): + def __init__( + self, neutron_target_spec: NeutronTargetSpec, passes: list[PassType] = None + ): passes: list[PassType] = passes or [ FuseBatchNormWithConvPass(), FuseBatchNormWithLinearPass(), @@ -42,6 +48,7 @@ def __init__(self, passes: list[PassType] = None): SplitGRUBasedOnNumLayers(), RemoveNodesWithKnownOutputs(), FuseLinearAndAddPass(), + MoveActivationBeforeConcat(neutron_target_spec), ] super().__init__(passes) diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py deleted file mode 100755 index 4d10b7c80ae..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/move_relu_before_concat.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from collections import defaultdict -from copy import deepcopy - -from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model -from executorch.backends.nxp.backend.ir.tflite_optimizer.operator_rules import ( - AllInputsComeFrom, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - TensorHasOneConsumer, - TensorsHaveSameQuantization, -) - - -class MoveActivationBeforeConcatenation(BaseOptimization): - """ - Move some operators around in the following pattern. 
- This is a common pattern that emerges from the conversion of separable convolutions. - - │ │ │ │ - ┌───▼────┐ ┌───▼────┐ ┌───▼────┐ ┌───▼────┐ - │ Conv2D │ ... │ Conv2D │ │ Conv2D │ ... │ Conv2D │ - └───┬────┘ └───┬────┘ └───┬────┘ └───┬────┘ - └──┐ ┌──┘ │ │ - ┌──▼──────────▼─┐ ┌──▼───┐ ┌──▼───┐ - │ Concatenation │ ─────► │ Relu │ ... │ Relu │ - └───────┬───────┘ └──┬───┘ └──┬───┘ - │ 'x' └──┐ ┌──┘ - ┌──▼───┐ ┌──▼──────────▼─┐ - │ Relu │ │ Concatenation │ - └──┬───┘ └───────┬───────┘ - │ 'y' │ - """ - - activations = ["Relu", "ReluN1To1", "Relu6", "Tanh", "Sign"] - - def __call__(self) -> bool: - matcher = PatternMatcher( - self._builder, - [ - Op(["Concatenation"], None, ["x"], [AllInputsComeFrom("Conv2D")]), - Op(self.activations, ["x"], ["y"]), - ], - [ - TensorHasOneConsumer("x"), - # If the activation function is not changing the quantization parameters, it can be moved without - # messing with the quantization elsewhere. - TensorsHaveSameQuantization(["x", "y"]), - ], - ) - - to_remove = [] - - # Mapping an operator to a list of operators. These operators (value) will later be added into the TFLite - # model's `operators` in front of the specified operator (key). - to_add: dict[tflite_model.Operator, list[tflite_model.Operator]] = defaultdict( - lambda: [] - ) - - for [concat, activation], _, _, _ in matcher.match_patterns(): - new_concat_inputs = [] - for concat_input in concat.tmp_inputs: - # Create a new operator for the activation function. - new_activation = deepcopy(activation) - new_activation.tmp_inputs = [concat_input] - new_activation_output = self._builder.duplicate_tensor(concat_input) - new_activation.tmp_outputs = [new_activation_output] - - to_add[concat].append( - new_activation - ) # Insert the new activation into the model later. - - new_concat_inputs.append( - new_activation_output - ) # Connect the activation with the `Concatenation`. - - concat.tmp_inputs = new_concat_inputs - - # Tensor rule ensures that only the activation functions is using the output of the `Concatenation`. - # It is safe to bypass. - concat.tmp_outputs[0] = activation.tmp_outputs[0] - to_remove.append(activation) - - operators = self._builder.get_operators() - - # Add the new activations into the model. - for concat, activations in to_add.items(): - idx = operators.index(concat) - for activation in activations: - operators.insert(idx, activation) - - # Remove the old activations. 
- for activation in to_remove: - operators.remove(activation) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index 3611c55e995..52de6f224eb 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -11,9 +11,6 @@ from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.conversion_config import ConversionConfig -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( - MoveActivationBeforeConcatenation, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.permute_fully_connected_weights_after_reshape import ( PermuteFullyConnectedWeightsAfterReshape, ) @@ -29,8 +26,6 @@ class Optimization(Enum): PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE = 12 - MOVE_ACTIVATION_BEFORE_CONCAT = 15 - class Optimizer: """ @@ -68,9 +63,6 @@ def __init__( Optimization.PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE: PermuteFullyConnectedWeightsAfterReshape( builder, conversion_config ), - Optimization.MOVE_ACTIVATION_BEFORE_CONCAT: MoveActivationBeforeConcatenation( - builder, conversion_config - ), } def optimize( diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py index 6564c19d7b9..f476e16628e 100644 --- a/backends/nxp/quantizer/neutron_quantizer.py +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -12,6 +12,7 @@ from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.quantizer.patterns import ( AbsPattern, + ActivationsConcatClusterPattern, AdaptiveAvgPoolPattern, AddmmPattern, AddTensorPattern, @@ -225,13 +226,16 @@ def __init__(self, neutron_target_spec: NeutronTargetSpec): self.op_to_applied_quantizer = { pt: False for q in self.quantizers for pt in q.pattern.partition_types() } + self.cluster_quantizers = [ + NeutronAtenQuantizer(ActivationsConcatClusterPattern(self), static_qconfig) + ] def transform_for_annotation( self, model: torch.fx.GraphModule ) -> torch.fx.GraphModule: model.graph.eliminate_dead_code() # Remove dead code to simplify the graph for the passes. - model = NeutronAtenPassManager()(model).graph_module + model = NeutronAtenPassManager(self.neutron_target_spec)(model).graph_module model.graph.eliminate_dead_code() # Remove dead code again, in case it was created by the passes. 
@@ -240,6 +244,10 @@ def transform_for_annotation( def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: self._annotate_inputs(model) + # Annotate node clusters in model + for cluster_quantizer in self.cluster_quantizers: + cluster_quantizer.annotate(model) + nodes = list(model.graph.nodes) for node in nodes: if ( diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py index ccd579d5c52..ee92cd42ef1 100644 --- a/backends/nxp/quantizer/patterns.py +++ b/backends/nxp/quantizer/patterns.py @@ -13,6 +13,7 @@ from executorch.backends.nxp.quantizer.utils import get_bias_qparams from torch import fx from torch._ops import OpOverload +from torch.fx import Node from torchao.quantization.pt2e import PerChannelMinMaxObserver from torchao.quantization.pt2e.quantizer import ( DerivedQuantizationSpec, @@ -20,6 +21,7 @@ QuantizationSpec, SharedQuantizationSpec, ) + from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY @@ -199,7 +201,6 @@ def partition_types(self) -> list[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] bias_qspec = DerivedQuantizationSpec( @@ -745,3 +746,147 @@ def get_anchors( return get_anchors_for_fixed_quant_specs( fused_partition, scale=1.0 / 128.0, zero_point=0 ) + + +class ActivationsConcatClusterPattern(QuantizationPattern): + """ + Quantizer for activations concat cluster pattern. + + The quantizer matches a pattern where concat node is preceded by activation nodes preceded by Conv 2D or Linear. + All activation nodes quantization parameters must be the same. Only activations, that have support for fusion + to preceding compute node on Neutron are allowed. This cluster is usually produced by MoveActivationBeforeConcat + pass. Cluster schema: + + │ │ + ┌──────▼──────┐ ┌──────▼──────┐ + │ aten.conv2d │ ... │ aten.conv2d │ + └──────┬──────┘ └──────┬──────┘ + │ │ + ┌─────▼─────┐ ┌─────▼─────┐ + │ aten.relu │ ... 
│ aten.relu │ + └─────┬─────┘ └─────┬─────┘ + └───────┐ ┌───────┘ + ┌──▼─────▼─┐ + │ aten.cat │ + └────┬─────┘ + │ + """ + + def __init__(self, neutron_quantizer): + self.neutron_quantizer = neutron_quantizer + self.neutron_target_info = ( + self.neutron_quantizer.neutron_target_spec.neutron_target_info + ) + + @staticmethod + def _all_activations_are_equal(activations: list[Node]) -> bool: + first_input_node = activations[0] + hardtanh_t = [ + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ] + relu_t = [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ] + tanh_t = [ + torch.ops.aten.tanh.default, + torch.ops.aten.tanh_.default, + ] + + def _activations_are_equal(activation1: Node, activation2: Node) -> bool: + if ( # Targets are equal also with their inplace variants + (activation1.target in hardtanh_t and activation2.target in hardtanh_t) + or (activation1.target in relu_t and activation2.target in relu_t) + or (activation1.target in tanh_t and activation2.target in tanh_t) + or ( + activation1.target == torch.ops.aten.sigmoid.default + and activation2.target == torch.ops.aten.sigmoid.default + ) + ): + return True + elif ( # Hardtanh with min_val 0 and max_val 'inf' is equal to Relu + activation1.target in hardtanh_t + and activation1.args[1:] == (0.0, float("inf")) + and activation2.target in relu_t + ) or ( + activation1.target in relu_t + and activation2.target in hardtanh_t + and activation2.args[1:] == (0.0, float("inf")) + ): + return True + else: + return False + + return all( + _activations_are_equal(activation, first_input_node) + for activation in activations + ) + + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.cat.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: + cat_node = fused_partition[0].nodes[-1] + + # Check all cat inputs are supported activations + if not all( + self.neutron_target_info.is_supported_fused_activation__aten(input_node) + for input_node in cat_node.all_input_nodes + ): + return None + + # Check all cat inputs are equal activations + if not self._all_activations_are_equal(cat_node.all_input_nodes): + return None + + # Check compute nodes are Conv 2D or Linear + if not all( + self.neutron_target_info.is_fusable_conv_or_linear__aten(compute_node) + for input_node in cat_node.all_input_nodes + for compute_node in input_node.all_input_nodes + ): + return None + + # Annotate compute nodes + for input_node in cat_node.all_input_nodes: + for compute_node in input_node.all_input_nodes: + if compute_node.target not in self.neutron_quantizer.op_to_quantizer: + return None + compute_node_quantizer = self.neutron_quantizer.op_to_quantizer[ + compute_node.target + ] + compute_node_quantizer.annotate(gm) + del compute_node.meta["quantization_annotation"].output_qspec + + # Annotate activations + for input_node in cat_node.all_input_nodes: + if input_node.target not in self.neutron_quantizer.op_to_quantizer: + return None + activation_quantizer = self.neutron_quantizer.op_to_quantizer[ + input_node.target + ] + activation_quantizer.annotate(gm) + input_node.meta["quantization_annotation"].input_qspec_map = {} + + # Annotate cat node + inputs = [] + first_input_node = cat_node.all_input_nodes[0] + for idx in range(len(cat_node.all_input_nodes)): + inputs.append( + ( + cat_node, + NodeArgsIdx(0, idx), + SharedQuantizationSpec(first_input_node), + ) + ) + outputs = [(cat_node, SharedQuantizationSpec(first_input_node))] + + 
return PartitionAnchors( + inputs=inputs, + weights=[], + biases=[], + output=outputs, + ) diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index fce11ce5aa2..eeb4b03d7a6 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -18,7 +18,10 @@ from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.view_copy_converter import ( ViewCopyConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executorch_pipeline import ( + neutron_target_spec, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import OverrideTargetSupportCheck from torch import nn @@ -98,7 +101,7 @@ def test_batch_norm_conv_fusing(bias: bool, input_shape: list[int]): program = torch.export.export(module, example_input, strict=True) og_module = program.module() - pm = NeutronAtenPassManager() + pm = NeutronAtenPassManager(neutron_target_spec) graph_module_out = pm(deepcopy(program.module())).graph_module # Make sure the fusion worked. @@ -133,7 +136,7 @@ def test_batch_norm_linear_fusing(bias: bool): program = torch.export.export(module, example_input, strict=True) og_module = program.module() - pm = NeutronAtenPassManager() + pm = NeutronAtenPassManager(neutron_target_spec) graph_module_out = pm(deepcopy(program.module())).graph_module # Make sure the fusion worked. diff --git a/backends/nxp/tests/test_gru_splitting.py b/backends/nxp/tests/test_gru_splitting.py index a2e9d324f69..297f9677fb2 100644 --- a/backends/nxp/tests/test_gru_splitting.py +++ b/backends/nxp/tests/test_gru_splitting.py @@ -13,6 +13,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec @pytest.fixture(autouse=True) @@ -94,7 +95,9 @@ def test_gru_splitting__with_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) post_pass_output = [t.detach() for t in exir_program_aten(*example_input)] @@ -143,7 +146,9 @@ def test_gru_splitting__no_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) post_pass_output = [t.detach() for t in exir_program_aten(*example_input)] @@ -193,7 +198,9 @@ def test_gru_splitting__bidirectional__no_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. - pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) nodes = list(exir_program_aten.graph.nodes) @@ -239,7 +246,9 @@ def test_gru_splitting__bidirectional__with_bias(num_layers): ) # Just 1 `GRU` in the model. # Run pre-processing passes of the float32 aten dialect program. 
- pytorch_pass_manager = NeutronAtenPassManager([SplitGRUBasedOnNumLayers()]) + pytorch_pass_manager = NeutronAtenPassManager( + neutron_target_spec, [SplitGRUBasedOnNumLayers()] + ) pytorch_pass_manager(exir_program_aten) nodes = list(exir_program_aten.graph.nodes) diff --git a/backends/nxp/tests/test_linear_and_add_fusion.py b/backends/nxp/tests/test_linear_and_add_fusion.py index 16d3c4140a2..222d748001c 100644 --- a/backends/nxp/tests/test_linear_and_add_fusion.py +++ b/backends/nxp/tests/test_linear_and_add_fusion.py @@ -18,6 +18,7 @@ from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from parameterized import parameterized @@ -121,10 +122,11 @@ def test_linear_add_fusing__static__no_bias__valid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -167,10 +169,11 @@ def test_linear_add_fusing__static__no_bias__invalid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -209,10 +212,11 @@ def test_linear_add_fusing__static__bias__valid_shape( original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -253,10 +257,11 @@ def test_linear_add_fusing__static__no_bias__reverse_order(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -295,10 +300,11 @@ def test_linear_add_fusing__static__bias__reverse_order(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -340,10 +346,11 @@ def test_linear_add_fusing__static__alpha__no_bias(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -381,10 +388,11 @@ def test_linear_add_fusing__static__alpha__bias(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. 
@@ -424,10 +432,11 @@ def test_linear_add_fusing__static__alpha__reversed_add_inputs(self): original_module = program.module() modified_module = NeutronAtenPassManager( + neutron_target_spec, [ RemoveNodesWithKnownOutputs(), # Make the added tensor static. FuseLinearAndAddPass(), - ] + ], )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. @@ -474,9 +483,9 @@ def test_linear_add_fusing__dynamic__no_bias__valid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -513,9 +522,9 @@ def test_linear_add_fusing__dynamic__no_bias__invalid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -550,9 +559,9 @@ def test_linear_add_fusing__dynamic__bias__valid_shape( program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -584,9 +593,9 @@ def test_linear_add_fusing__dynamic__reverse_order(self): program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) @@ -618,9 +627,9 @@ def test_linear_add_fusing__dynamic__alpha(self): program = torch.export.export(module, example_input, strict=True) original_module = program.module() - modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( - deepcopy(program.module()) - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [FuseLinearAndAddPass()] + )(deepcopy(program.module())).graph_module # Make sure the module wasn't broken. original_nodes = list(original_module.graph.nodes) diff --git a/backends/nxp/tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/test_move_activation_before_concatenation.py new file mode 100644 index 00000000000..779c958c049 --- /dev/null +++ b/backends/nxp/tests/test_move_activation_before_concatenation.py @@ -0,0 +1,947 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math +import unittest + +import kgb +import numpy as np +import torch +from executorch.backends.nxp.aten_passes.move_activation_before_concat import ( + MoveActivationBeforeConcat, +) +from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( + NeutronAtenPassManager, +) +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from executorch.backends.nxp.tests.executorch_pipeline import ( + _quantize_model, + get_random_calibration_inputs, + neutron_target_spec, + to_model_input_spec, + to_quantized_edge_program, +) +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import get_activation +from executorch.exir.dialects._ops import ops as exir_ops +from parameterized import parameterized +from torch import nn +from torch.export import ExportedProgram +from torch.fx import GraphModule + +concat_cluster_ops = [ + exir_ops.edge.aten.addmm.default, + exir_ops.edge.aten.convolution.default, + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.cat.default, +] + + +class ConvConcatActivationModule(torch.nn.Module): + def __init__(self, activation: str, inplace: bool, in_channels: int): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + in_channels, + (3, 3), + padding=1, + ) + + self.activation = get_activation(activation, inplace) + self.eval() + + def forward(self, x): + x1 = self.conv(x) + x2 = self.conv(x) + x = torch.cat((x1, x2), dim=1) + return self.activation(x) + + +class LinearConcatActivationModule(nn.Module): + def __init__( + self, activation: str, inplace: bool, in_channels: int, mode: str = "linear" + ): + super().__init__() + self.mode = mode.lower() + assert self.mode in [ + "linear", + "addmm", + "mm", + ], "Mode must be 'linear', 'addmm', or 'mm'" + + if self.mode == "linear": + self.linear = nn.Linear(in_channels, in_channels) + else: + # Manual weight and bias for addmm/mm. 
+ self.weight = nn.Parameter(torch.empty(in_channels, in_channels)) + self.bias = nn.Parameter(torch.empty(in_channels)) + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias, -bound, bound) + + self.activation = get_activation(activation, inplace) + self.eval() + + def forward(self, x): + x1, x2 = None, None + + if self.mode == "linear": + x1 = self.linear(x) + x2 = self.linear(x) + if self.mode == "addmm": + x1 = torch.addmm(self.bias, x, self.weight) + x2 = torch.addmm(self.bias, x, self.weight) + elif self.mode == "mm": + x1 = torch.mm(x, self.weight) + x2 = torch.mm(x, self.weight) + + x = torch.cat((x1, x2), dim=1) + return self.activation(x) + + +class ConvActivationConcatModule(torch.nn.Module): + def __init__( + self, + activation1: str, + activation2: str, + act1_inplace: bool, + act2_inplace: bool, + in_channels: int, + ): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + in_channels, + (3, 3), + padding=1, + ) + + self.activation1 = get_activation(activation1, act1_inplace) + self.activation2 = get_activation(activation2, act2_inplace) + self.eval() + + def forward(self, x): + x1 = self.conv(x) + x1 = self.activation1(x1) + x2 = self.conv(x) + x2 = self.activation2(x2) + return torch.cat((x1, x2), dim=1) + + +class LinearActivationConcatModule(torch.nn.Module): + def __init__( + self, + activation1: str, + activation2: str, + act1_inplace: bool, + act2_inplace: bool, + in_channels: int, + ): + super().__init__() + self.linear = nn.Linear(in_channels, in_channels) + + self.activation1 = get_activation(activation1, act1_inplace) + self.activation2 = get_activation(activation2, act2_inplace) + self.eval() + + def forward(self, x): + x1 = self.linear(x) + x1 = self.activation1(x1) + x2 = self.linear(x) + x2 = self.activation2(x2) + return torch.cat((x1, x2), dim=1) + + +class TestMoveActivationBeforeConcat(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__conv(self, activation, inplace): + input_shape = (1, 3, 8, 8) + model = ConvConcatActivationModule( + activation=activation, inplace=inplace, in_channels=3 + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. 
+ assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check convolution and activation are in same QDQ cluster. + nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 26 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[18] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[19] + ) + ) + assert ( + nodes[20].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__linear(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="linear" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. 
+ assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check linear and activation are in same QDQ cluster. + nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__addmm(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="addmm" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 8 + cat_node = nodes[5] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[6] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 9 + cat_node = nodes[7] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[8].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check addmm and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat__mm(self, activation, inplace): + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="mm" + ) + + calibration_inputs = get_random_calibration_inputs( + to_model_input_spec(input_shape) + ) + example_input = calibration_inputs[0] + + exir_program_aten = torch.export.export( + model, example_input, strict=True + ).module() + + outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] + nodes = list(exir_program_aten.graph.nodes) + assert len(nodes) == 7 + cat_node = nodes[4] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[5] + ) + ) + + # Apply the optimization. + NeutronAtenPassManager( + neutron_target_spec, [MoveActivationBeforeConcat(neutron_target_spec)] + )(exir_program_aten) + + nodes = list(exir_program_aten.graph.nodes) + + # Make sure the optimization was applied. + assert len(nodes) == 8 + cat_node = nodes[6] + assert cat_node.target == torch.ops.aten.cat.default + assert all( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + input_node + ) + and len(input_node.users) == 1 + for input_node in cat_node.all_input_nodes + ) + assert nodes[7].target == "output" + + outputs_after = [o.detach().numpy() for o in exir_program_aten(*example_input)] + + # Make sure the model still produces the exact same output. + assert np.allclose(outputs_before[0], outputs_after[0]) + + # Run pre-processing passes of the float32 aten dialect program. + neutron_aten_pass_manager = NeutronAtenPassManager(neutron_target_spec) + neutron_aten_pass_manager(exir_program_aten) # All passes by default. + + exir_program_aten_quant = _quantize_model( + exir_program_aten, + NeutronQuantizer(neutron_target_spec), + calibration_inputs, + ) + + # Check mm and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 19 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[7] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[8] + ) + ) + assert ( + nodes[9].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[11] + ) + assert ( + neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[12] + ) + ) + assert ( + nodes[13].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__conv( + self, activation, inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8, 8, 8) + model = ConvConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__linear( + self, activation, inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="linear" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__addmm( + self, activation, inplace + ): + torch.manual_seed(23) + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="addmm" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + atol=1.0, + ) + + @parameterized.expand( + [ + ["relu", True], + ["relu", False], + ["relu6", True], + ["relu6", False], + ["tanh", True], + ["tanh", False], + ["sigmoid", False], + ] + ) + def test_move_activation_before_concat_quantization__mm(self, activation, inplace): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + input_shape = (1, 8) + model = LinearConcatActivationModule( + activation=activation, inplace=inplace, in_channels=8, mode="mm" + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=concat_cluster_ops + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + @parameterized.expand( + [ + ["relu", "relu", True, False], + ["relu6", "relu6", False, True], + ["tanh", "tanh", True, False], + ["sigmoid", "sigmoid", False, True], + ["relu", "relu_hardtanh", True, True], + ] + ) + def test_concat_cluster_quantization__conv( + self, activation1, activation2, act1_inplace, act2_inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy: + input_shape = (1, 8, 8, 8) + model = ConvActivationConcatModule( + activation1, activation2, act1_inplace, act2_inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, + ops=concat_cluster_ops, + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[ + -1 + ].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0] + + # Check convolution and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 26 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[18] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[19] + ) + assert ( + nodes[20].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + input_data = ( + np.random.random(input_shape).astype(np.float32) * 50 + ).astype(np.int8) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + @parameterized.expand( + [ + ["relu", "relu", True, False], + ["relu6", "relu6", False, True], + ["tanh", "tanh", True, False], + ["sigmoid", "sigmoid", False, True], + ["relu", "relu_hardtanh", True, True], + ] + ) + def test_concat_cluster_quantization__linear( + self, activation1, activation2, act1_inplace, act2_inplace + ): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, + call_original=True, + owner=EdgeProgramToIRConverter, + ) as converter_spy: + with kgb.spy_on(_quantize_model, call_original=True) as quantizer_spy: + input_shape = (1, 8) + model = LinearActivationConcatModule( + activation1, activation2, act1_inplace, act2_inplace, in_channels=8 + ) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, + ops=concat_cluster_ops, + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[ + -1 + ].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + exir_program_aten_quant: GraphModule = quantizer_spy.calls[-1].args[0] + + # Check linear and activation are in same QDQ cluster. 
+ nodes = list(exir_program_aten_quant.graph.nodes) + assert len(nodes) == 22 + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[10] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[11] + ) + assert ( + nodes[12].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + assert neutron_target_spec.neutron_target_info.is_fusable_conv_or_linear__aten( + nodes[14] + ) + assert neutron_target_spec.neutron_target_info.is_supported_fused_activation__aten( + nodes[15] + ) + assert ( + nodes[16].target + == torch.ops.quantized_decomposed.quantize_per_tensor.default + ) + + input_data = ( + np.random.random(input_shape).astype(np.float32) * 50 + ).astype(np.int8) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) diff --git a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py index 8f5549c8526..0c496356791 100644 --- a/backends/nxp/tests/test_removing_nodes_with_known_outputs.py +++ b/backends/nxp/tests/test_removing_nodes_with_known_outputs.py @@ -17,6 +17,7 @@ from executorch.backends.nxp.aten_passes.split_gru_based_on_num_layers import ( SplitGRUBasedOnNumLayers, ) +from executorch.backends.nxp.tests.executorch_pipeline import neutron_target_spec from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops from parameterized import parameterized from torch import nn @@ -57,7 +58,9 @@ def test_removing_nodes__zeros(self): outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] # Apply the optimization. - NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])( + exir_program_aten + ) # Make sure the `aten.zeros` is no longer in the model. assert not graph_contains_any_of_ops( @@ -81,7 +84,9 @@ def test_removing_nodes__split(self, num_layers): exir_program_aten = torch.export.export(model, example_input).module() # Apply the pass to split the `aten.gru.input` into multiple instances, and add a `split` node. - NeutronAtenPassManager([SplitGRUBasedOnNumLayers()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [SplitGRUBasedOnNumLayers()])( + exir_program_aten + ) # Make sure the `aten.zeros` and `torch.split` are in the model. assert graph_contains_any_of_ops( @@ -93,7 +98,9 @@ def test_removing_nodes__split(self, num_layers): outputs_before = [o.detach().numpy() for o in exir_program_aten(*example_input)] # Apply the optimization. - NeutronAtenPassManager([RemoveNodesWithKnownOutputs()])(exir_program_aten) + NeutronAtenPassManager(neutron_target_spec, [RemoveNodesWithKnownOutputs()])( + exir_program_aten + ) # Make sure the `aten.zeros` and `torch.split` are no longer in the model. 
assert not graph_contains_any_of_ops( diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py index 8b2d5723dbb..6e084699307 100644 --- a/backends/nxp/tests/test_split_group_convolution.py +++ b/backends/nxp/tests/test_split_group_convolution.py @@ -88,9 +88,9 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input, strict=True).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Make sure the fusion worked. original_nodes = list(original_module.graph.nodes) @@ -145,9 +145,9 @@ def test_split_group_convolution__1d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Make sure the fusion worked. original_nodes = list(original_module.graph.nodes) @@ -199,9 +199,9 @@ def test_split_group_convolution__3d(self, _, input_shape: list[int], group: int graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager([SplitGroupConvolution()])( - graph_module - ).graph_module + modified_module = NeutronAtenPassManager( + neutron_target_spec, [SplitGroupConvolution()] + )(graph_module).graph_module # Verify that the pass has NOT made any changes, as it is disabled for 3D convolution. original_nodes = list(original_module.graph.nodes) @@ -233,7 +233,7 @@ def test_split_group_convolution__applied_by_default(self): graph_module = torch.export.export(module, example_input).module() original_module = deepcopy(graph_module) - modified_module = NeutronAtenPassManager()( + modified_module = NeutronAtenPassManager(neutron_target_spec)( graph_module ).graph_module # Default passes. 
From e519de0d7609e4096d468d33634867ecd0a34409 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 10:59:16 -0800 Subject: [PATCH 10/19] voxtral cmake changes --- examples/models/voxtral/CMakeLists.txt | 32 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 866d17160ba..02d6cf6df7b 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) @@ -104,11 +102,21 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() +# Link CUDA backend +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda) + if(NOT MSVC) + # On non-MSVC, use shared lib options + executorch_target_link_options_shared_lib(aoti_cuda) + endif() +endif() + target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) From 762f06a79dac0549fc349081bfa4aaf1b0422cec Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 14:52:47 -0800 Subject: [PATCH 11/19] cuda_backend --- examples/models/voxtral/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 02d6cf6df7b..ea0be1f03ca 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -110,7 +110,7 @@ endif() # Link CUDA backend if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) + list(APPEND link_libraries aoti_cuda_backend) if(NOT MSVC) # On non-MSVC, use shared lib options executorch_target_link_options_shared_lib(aoti_cuda) From c9a3cd0dd2f349bddcb1ea793b9913f65d6a03b2 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:30:35 -0800 Subject: [PATCH 12/19] cuda shenanigans --- examples/models/voxtral/CMakeLists.txt | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index ea0be1f03ca..ea4c3b533c4 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -87,8 +87,11 @@ list( # Link CUDA backend if(EXECUTORCH_BUILD_CUDA) 
find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) + list(APPEND link_libraries aoti_cuda_backend) + if(NOT MSVC) + # On non-MSVC, use shared lib options + executorch_target_link_options_shared_lib(aoti_cuda) + endif() endif() if(EXECUTORCH_BUILD_METAL) @@ -107,16 +110,6 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") endif() endif() -# Link CUDA backend -if(EXECUTORCH_BUILD_CUDA) - find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda_backend) - if(NOT MSVC) - # On non-MSVC, use shared lib options - executorch_target_link_options_shared_lib(aoti_cuda) - endif() -endif() - target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) From f13ba262ef2c320da7999b5f475a3c8bcb122a64 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:31:32 -0800 Subject: [PATCH 13/19] cuda shenanigans2 --- examples/models/voxtral/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index ea4c3b533c4..80f76da7af9 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -90,7 +90,7 @@ if(EXECUTORCH_BUILD_CUDA) list(APPEND link_libraries aoti_cuda_backend) if(NOT MSVC) # On non-MSVC, use shared lib options - executorch_target_link_options_shared_lib(aoti_cuda) + executorch_target_link_options_shared_lib(aoti_cuda_backend) endif() endif() From 089dcc94e6922b64924c5928b479e848a34203b9 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 15:35:24 -0800 Subject: [PATCH 14/19] lint --- backends/aoti/CMakeLists.txt | 3 +-- backends/cuda/CMakeLists.txt | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 2d1f5859d3c..d5582dfe7c7 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -39,8 +39,7 @@ target_compile_options( $<$>:-fexceptions -frtti -fPIC> ) target_compile_definitions( - aoti_common - PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> + aoti_common PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> ) # Ensure symbols are exported properly if(APPLE) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index d603c9215eb..ac93621831e 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -94,11 +94,8 @@ install( # CUDA-specific AOTI shim symbols (dynamically linked) set(_aoti_cuda_shim_sources - runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp - runtime/guard.cpp - runtime/shims/cuda_guard.cpp - runtime/shims/int4mm.cu + runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp + runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) @@ -131,7 +128,8 @@ target_link_options( aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and platform utilities +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and +# platform utilities target_link_libraries( aoti_cuda_shims PRIVATE cuda_platform From bfefdeba40d54e799efc587ec8c39300053de7bb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:02:59 -0800 Subject: [PATCH 15/19] copy dll automatically --- 
examples/models/voxtral/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 80f76da7af9..85cc62aa8f6 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -113,3 +113,15 @@ endif() target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) + +# On Windows, copy required DLLs to the executable directory +if(MSVC AND EXECUTORCH_BUILD_CUDA) + add_custom_command( + TARGET voxtral_runner + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + COMMENT "Copying aoti_cuda_shims.dll to voxtral_runner directory" + ) +endif() From a6ba1767d3c0b7edaf1738c6c135b5d678575b7c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:18:27 -0800 Subject: [PATCH 16/19] remove alias --- backends/cuda/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index ac93621831e..ec7fc9b4fd3 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -183,9 +183,6 @@ install( DESTINATION lib ) -# Alias for backward compatibility -add_library(aoti_cuda ALIAS aoti_cuda_backend) - if(BUILD_TESTING) add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( From 5596c453fe8c1013b05ba0c2677205960ffed543 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 16:24:57 -0800 Subject: [PATCH 17/19] lint --- examples/models/voxtral/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 85cc62aa8f6..24a1096c889 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -119,8 +119,7 @@ if(MSVC AND EXECUTORCH_BUILD_CUDA) add_custom_command( TARGET voxtral_runner POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - $ + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ + $ COMMENT "Copying aoti_cuda_shims.dll to voxtral_runner directory" ) From a6b20e64b312f3f2cf61476aa077a076afccfd85 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 6 Nov 2025 17:54:07 -0800 Subject: [PATCH 18/19] fix test build and have some unintuitive linkage for now --- CMakeLists.txt | 5 +++-- backends/cuda/CMakeLists.txt | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6d6f26b41f..51573d276b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -591,8 +591,9 @@ endif() if(EXECUTORCH_BUILD_CUDA) # Build CUDA-specific AOTI functionality add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) - # Add aoti_cuda to backends - it already depends on aoti_common - list(APPEND _executorch_backends aoti_cuda) + # Add aoti_cuda_backend to backends - it transitively includes aoti_cuda_shims + # and cuda_platform + list(APPEND _executorch_backends aoti_cuda_backend) endif() if(EXECUTORCH_BUILD_METAL) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index ec7fc9b4fd3..2eb923f2ab9 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -168,13 +168,21 @@ target_link_options( $<$>:-Wl,--export-dynamic> ) -# Link against shims library and other dependencies +# Link against shims library and other dependencies. On
Windows (MSVC), use +# PRIVATE linkage for aoti_cuda_shims since the DLL is copied to the executable +# directory. On other platforms, use PUBLIC so the dependency propagates to +# consumers. target_link_libraries( - aoti_cuda_backend - PRIVATE aoti_cuda_shims cuda_platform - PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_backend PUBLIC cuda_platform extension_tensor cuda_tensor_maker + CUDA::cudart ${CMAKE_DL_LIBS} ) +if(MSVC) + target_link_libraries(aoti_cuda_backend PRIVATE aoti_cuda_shims) +else() + target_link_libraries(aoti_cuda_backend PUBLIC aoti_cuda_shims) +endif() + executorch_target_link_options_shared_lib(aoti_cuda_backend) install( From 5f73ad29aa28eed0af59098f0d38ea3c693af9d3 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 10 Nov 2025 10:12:08 -0800 Subject: [PATCH 19/19] update ET_platform deps --- backends/cuda/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 2eb923f2ab9..8f121bdbd32 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -58,7 +58,9 @@ else() endif() # Link against ExecuTorch core libraries -target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS}) +target_link_libraries( + cuda_tensor_maker PRIVATE executorch_core ${CMAKE_DL_LIBS} +) executorch_target_link_options_shared_lib(cuda_tensor_maker) install( @@ -84,7 +86,7 @@ target_compile_options( ) # Link against ExecuTorch core libraries -target_link_libraries(cuda_platform PUBLIC executorch ${CMAKE_DL_LIBS}) +target_link_libraries(cuda_platform PRIVATE executorch_core ${CMAKE_DL_LIBS}) install( TARGETS cuda_platform