diff --git a/CMakeLists.txt b/CMakeLists.txt index c6d6f26b41f..51573d276b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -591,8 +591,9 @@ endif() if(EXECUTORCH_BUILD_CUDA) # Build CUDA-specific AOTI functionality add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) - # Add aoti_cuda to backends - it already depends on aoti_common - list(APPEND _executorch_backends aoti_cuda) + # Add aoti_cuda_backend to backends - it transitively includes aoti_cuda_shims + # and cuda_platform + list(APPEND _executorch_backends aoti_cuda_backend) endif() if(EXECUTORCH_BUILD_METAL) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index bcff1d56769..d5582dfe7c7 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -38,6 +38,9 @@ target_compile_options( PUBLIC $<$:/EHsc /GR> $<$>:-fexceptions -frtti -fPIC> ) +target_compile_definitions( + aoti_common PRIVATE $<$:EXPORT_AOTI_FUNCTIONS> +) # Ensure symbols are exported properly if(APPLE) target_link_options(aoti_common PUBLIC -Wl,-export_dynamic) diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index deb10478778..6733a5e7afd 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -16,8 +16,10 @@ namespace aoti { namespace internal { // Global storage for tensor metadata -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_sizes; +AOTI_SHIM_EXPORT std::unordered_map> + tensor_to_strides; } // namespace internal extern "C" { @@ -204,6 +206,69 @@ void cleanup_tensor_metadata() { internal::tensor_to_strides.clear(); } +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { + (void)tensor; + (void)ret_size; + throw 
std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { + (void)self; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) { + (void)orig_handle; + (void)new_handle; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + (void)data_ptr; + (void)ndim; + (void)sizes; + (void)strides; + (void)storage_offset; + (void)dtype; + (void)device_type; + (void)device_index; + (void)ret_new_tensor; + throw std::runtime_error("Not implemented"); + return Error::Internal; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 91bb785b684..b2bac6f41cd 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -23,57 +24,89 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + extern "C" { // Common AOTI type aliases using AOTIRuntimeError = Error; using AOTITorchError = Error; -// Global storage for tensor metadata -extern std::unordered_map> tensor_to_sizes; -extern std::unordered_map> 
tensor_to_strides; - // Attribute-related operations (memory-irrelevant) -AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); -AOTITorchError aoti_torch_get_storage_offset( - Tensor* tensor, - int64_t* ret_storage_offset); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset); -AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); -AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); -AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); -AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_index( - Tensor* tensor, - int32_t* ret_device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index); -AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); // Utility functions for device and layout information -int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_layout_strided(); -int32_t aoti_torch_dtype_float32(); -int32_t aoti_torch_dtype_bfloat16(); -int32_t aoti_torch_dtype_int8(); -int32_t aoti_torch_dtype_int16(); -int32_t aoti_torch_dtype_int32(); -int32_t aoti_torch_dtype_int64(); -int32_t aoti_torch_dtype_bool(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu(); +AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided(); 
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32(); +AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64(); // Dtype utility function needed by Metal backend -size_t aoti_torch_dtype_element_size(int32_t dtype); +AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype); // Autograd mode functions -int32_t aoti_torch_grad_mode_is_enabled(); -void aoti_torch_grad_mode_set_enabled(bool enabled); +AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled(); +AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled); // Cleanup functions for clearing global state -void cleanup_tensor_metadata(); +AOTI_SHIM_EXPORT void cleanup_tensor_metadata(); + +AOTI_SHIM_EXPORT void aoti_torch_warn( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor); + +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle); + +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + void* data_ptr, + int64_t ndim, + const int64_t* sizes, + const int64_t* strides, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor); } // extern "C" diff --git a/backends/aoti/export.h b/backends/aoti/export.h new file mode 100644 index 00000000000..879aa942035 --- /dev/null +++ b/backends/aoti/export.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Define export macro for Windows DLL +// When building the aoti_cuda library, EXPORT_AOTI_FUNCTIONS is defined by +// CMake, which causes this macro to export symbols using __declspec(dllexport). +// When consuming the library, the macro imports symbols using +// __declspec(dllimport). On non-Windows platforms, the macro is empty and has +// no effect. +#ifdef _WIN32 +#ifdef EXPORT_AOTI_FUNCTIONS +#define AOTI_SHIM_EXPORT __declspec(dllexport) +#else +#define AOTI_SHIM_EXPORT __declspec(dllimport) +#endif +#else +#define AOTI_SHIM_EXPORT +#endif diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index c95d34247be..8f121bdbd32 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -58,7 +58,9 @@ else() endif() # Link against ExecuTorch core libraries -target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS}) +target_link_libraries( + cuda_tensor_maker PRIVATE executorch_core ${CMAKE_DL_LIBS} +) executorch_target_link_options_shared_lib(cuda_tensor_maker) install( @@ -67,50 +69,134 @@ install( DESTINATION lib ) -# CUDA-specific AOTI functionality -set(_aoti_cuda_sources - runtime/cuda_backend.cpp - runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp - runtime/guard.cpp - runtime/shims/cuda_guard.cpp - runtime/shims/int4mm.cu - runtime/platform/platform.cpp +# Platform utilities (load_library, close_library, etc.) 
+set(_cuda_platform_sources runtime/platform/platform.cpp) +add_library(cuda_platform STATIC ${_cuda_platform_sources}) + +target_include_directories( + cuda_platform + PUBLIC $ $ + $ +) + +target_compile_options( + cuda_platform + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) + +# Link against ExecuTorch core libraries +target_link_libraries(cuda_platform PRIVATE executorch_core ${CMAKE_DL_LIBS}) + +install( + TARGETS cuda_platform + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# CUDA-specific AOTI shim symbols (dynamically linked) +set(_aoti_cuda_shim_sources + runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp + runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu + ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp ) -add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) + +add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources}) + +# Define export macros for shared library +if(MSVC) + target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS) + + # Ensure proper DLL import/export library naming on Windows + set_target_properties( + aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF + ) +endif() + target_include_directories( - aoti_cuda + aoti_cuda_shims PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ $ ) + target_compile_options( - aoti_cuda PUBLIC $<$:/EHsc /GR> - $<$>:-fexceptions -frtti -fPIC> + aoti_cuda_shims + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> ) + # Ensure symbols are exported properly target_link_options( - aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic> + aoti_cuda_shims PUBLIC $<$>:-Wl,--export-dynamic> ) -# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch -# CUDA libraries +# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and +# platform utilities target_link_libraries( - aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} + aoti_cuda_shims + PRIVATE cuda_platform + PUBLIC extension_tensor 
cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS} ) -# If you need other CUDA libraries, link them similarly: -# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) -executorch_target_link_options_shared_lib(aoti_cuda) -if(BUILD_TESTING) - add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) - target_link_libraries( - multimodal_benchmark PUBLIC aoti_cuda extension_module_static - extension_flat_tensor portable_ops_lib - ) +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda_shims) +endif() + +install( + TARGETS aoti_cuda_shims + EXPORT ExecuTorchTargets + DESTINATION lib +) + +# CUDA backend implementation +set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) + +# CUDA backend implementation +add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) + +target_include_directories( + aoti_cuda_backend + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} $ + $ +) +target_compile_options( + aoti_cuda_backend + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_cuda_backend PUBLIC + $<$>:-Wl,--export-dynamic> +) + +# Link against shims library and other dependencies On Windows (MSVC), use +# PRIVATE linkage for aoti_cuda_shims since the DLL is copied to the executable +# directory. On other platforms, use PUBLIC so the dependency propagates to +# consumers. 
+target_link_libraries( + aoti_cuda_backend PUBLIC cuda_platform extension_tensor cuda_tensor_maker + CUDA::cudart ${CMAKE_DL_LIBS} +) + +if(MSVC) + target_link_libraries(aoti_cuda_backend PRIVATE aoti_cuda_shims) +else() + target_link_libraries(aoti_cuda_backend PUBLIC aoti_cuda_shims) endif() +executorch_target_link_options_shared_lib(aoti_cuda_backend) + install( - TARGETS aoti_cuda + TARGETS aoti_cuda_backend EXPORT ExecuTorchTargets DESTINATION lib ) + +if(BUILD_TESTING) + add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) + target_link_libraries( + multimodal_benchmark PUBLIC aoti_cuda_backend extension_module_static + extension_flat_tensor portable_ops_lib + ) +endif() diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h index f930f3df643..83fceabb98f 100644 --- a/backends/cuda/runtime/shims/cuda_guard.h +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -33,9 +34,8 @@ using CUDAStreamGuardHandle = CUDAStreamGuard*; * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_guard( - int32_t device_index, - CUDAGuardHandle* ret_guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_create_cuda_guard(int32_t device_index, CUDAGuardHandle* ret_guard); /** * Deletes a CUDA device guard and frees its associated resources. @@ -44,7 +44,8 @@ AOTITorchError aoti_torch_create_cuda_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); /** * Sets the CUDA device to a new index for an existing guard. 
@@ -54,9 +55,8 @@ AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_cuda_guard_set_index( - CUDAGuardHandle guard, - int32_t device_index); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_cuda_guard_set_index(CUDAGuardHandle guard, int32_t device_index); /** * Creates a CUDA stream guard that sets the current device and stream, @@ -69,7 +69,7 @@ AOTITorchError aoti_torch_cuda_guard_set_index( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_cuda_stream_guard( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_cuda_stream_guard( void* stream, int32_t device_index, CUDAStreamGuardHandle* ret_guard); @@ -81,7 +81,8 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); /** * Gets the current CUDA stream for a specified device. 
@@ -91,9 +92,8 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, - void** ret_stream); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream); } // extern "C" diff --git a/backends/cuda/runtime/shims/int4mm.h b/backends/cuda/runtime/shims/int4mm.h index 6bd2d9b3a79..87a9916b0aa 100644 --- a/backends/cuda/runtime/shims/int4mm.h +++ b/backends/cuda/runtime/shims/int4mm.h @@ -10,6 +10,7 @@ #include #include +#include namespace executorch::backends::cuda { @@ -69,7 +70,7 @@ extern "C" { * or invalid qGroupSize * - Error::Internal: CUDA kernel launch failure */ -AOTITorchError aoti_torch_cuda__weight_int4pack_mm( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda__weight_int4pack_mm( Tensor* self, Tensor* mat2, int64_t qGroupSize, diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h index 7a8d4c3609b..8d9d37037de 100644 --- a/backends/cuda/runtime/shims/memory.h +++ b/backends/cuda/runtime/shims/memory.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace executorch::backends::cuda { @@ -43,7 +44,7 @@ extern "C" { * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_create_tensor_from_blob_v2( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( void* data, int64_t ndim, const int64_t* sizes_ptr, @@ -71,7 +72,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( * @return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_empty_strided( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, const int64_t* strides_ptr, @@ -87,7 +88,7 @@ AOTITorchError aoti_torch_empty_strided( * 
@return AOTITorchError error code (Error::Ok on success, or an error code on * failure) */ -AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); /** * Creates a tensor view that reinterprets the same underlying memory with @@ -106,7 +107,7 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); * * @return Error::Ok on success, appropriate error code on failure */ -AOTITorchError aoti_torch__reinterpret_tensor( +AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor( Tensor* self, int64_t ndim, const int64_t* sizes_ptr, @@ -136,11 +137,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( * - Error::MemoryAllocationFailed: failed to allocate temporary memory * - Error::Internal: CUDA operation failures */ -AOTITorchError +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); // Function to clear all tensors from internal storage -void clear_all_tensors(); +AOTI_SHIM_EXPORT void clear_all_tensors(); } // extern "C" } // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 6b61b5bd3b8..683f270ccda 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -24,12 +25,11 @@ extern "C" { using AOTITorchError = Error; // Device type functions for tensor attributes -AOTITorchError aoti_torch_get_device_type( - Tensor* tensor, - int32_t* ret_device_type); +AOTI_SHIM_EXPORT AOTITorchError +aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type); // Device type constants -int32_t aoti_torch_device_type_cuda(); +AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); } // extern "C" diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 
866d17160ba..24a1096c889 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -39,18 +39,16 @@ executorch_target_link_options_shared_lib(executorch) set(link_libraries executorch gflags) set(_srcs multimodal.cpp) -list( - APPEND - link_libraries - optimized_native_cpu_ops_lib - quantized_ops_lib - custom_ops - cpublas - eigen_blas -) +# Common ops for all builds +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) -executorch_target_link_options_shared_lib(quantized_ops_lib) -executorch_target_link_options_shared_lib(custom_ops) + +# CPU-only builds need quantized and custom ops +if(NOT EXECUTORCH_BUILD_CUDA AND MSVC) + list(APPEND link_libraries quantized_ops_lib custom_ops) + executorch_target_link_options_shared_lib(quantized_ops_lib) + executorch_target_link_options_shared_lib(custom_ops) +endif() # XNNPACK if(TARGET xnnpack_backend) @@ -89,8 +87,11 @@ list( # Link CUDA backend if(EXECUTORCH_BUILD_CUDA) find_package(CUDAToolkit REQUIRED) - list(APPEND link_libraries aoti_cuda) - executorch_target_link_options_shared_lib(aoti_cuda) + list(APPEND link_libraries aoti_cuda_backend) + if(NOT MSVC) + # On non-MSVC, use shared lib options + executorch_target_link_options_shared_lib(aoti_cuda_backend) + endif() endif() if(EXECUTORCH_BUILD_METAL) @@ -104,7 +105,7 @@ list(APPEND link_libraries tokenizers::tokenizers) add_executable(voxtral_runner ${_srcs}) if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(voxtral_runner) - if(NOT APPLE) + if(NOT APPLE AND NOT MSVC) target_link_options(voxtral_runner PRIVATE "LINKER:-s") endif() endif() @@ -112,3 +113,14 @@ endif() target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) + +# On Windows, copy required 
DLLs to the executable directory +if(MSVC AND EXECUTORCH_BUILD_CUDA) + add_custom_command( + TARGET voxtral_runner + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims> + $<TARGET_FILE_DIR:voxtral_runner> + COMMENT "Copying aoti_cuda_shims.dll to voxtral_runner directory" + ) +endif()