Skip to content

Commit 465569d

Browse files
ikawrakowIwan Kawrakow
andauthored
Faster DeepSeek FA on CUDA (#408)
* New DeepSeek FlashMLA Does not work because the RoPE portion is stored at the end in our case, while in mainline it is stored at the beginning, and the FA kernel assumes that. * Rearrange MLA K cache so it first new CUDA FA implementation * constexpr and minor changes --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 8669c3d commit 465569d

File tree

4 files changed

+356
-130
lines changed

4 files changed

+356
-130
lines changed

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -986,7 +986,7 @@ endif()
986986
set(CUDA_CXX_FLAGS "")
987987

988988
if (GGML_CUDA)
989-
set(CUDA_FLAGS -use_fast_math)
989+
set(CUDA_FLAGS -use_fast_math -extended-lambda)
990990

991991
if (GGML_FATAL_WARNINGS)
992992
list(APPEND CUDA_FLAGS -Werror all-warnings)

ggml/src/ggml-cuda/cp-async.cuh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22

33
#include "common.cuh"
44

5+
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
6+
#ifdef CP_ASYNC_AVAILABLE
7+
return __cvta_generic_to_shared(generic_ptr);
8+
#else
9+
GGML_UNUSED(generic_ptr);
10+
NO_DEVICE_CODE;
11+
return 0;
12+
#endif // CP_ASYNC_AVAILABLE
13+
}
14+
515
// Copies data from global to shared memory, cg == cache global.
616
// Both the src and dst pointers must be aligned to 16 bit.
717
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.

0 commit comments

Comments
 (0)