Commit b82a9d6

Vendor DeepGEMM FP8 lightning indexer kernels. They're not wired up yet.
1 parent: 7289478

File tree

11 files changed (+2015, -0 lines)

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -218,3 +218,8 @@ endif()
 target_include_directories(ggml-cuda PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/vendors/cutlass/include
 )
+
+# DeepGEMM vendor includes for sm90/sm100 FP8 paged MQA logits kernel (DSA)
+target_include_directories(ggml-cuda PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/vendors/DeepGEMM
+)
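
Once this include path is on the target, ggml-cuda sources will be able to pull the vendored headers in directly. A minimal sketch of what that looks like (the including source file is hypothetical; the header paths are the ones the vendored code itself uses):

    // In a future ggml-cuda source that wires up the DSA kernels (not part of this commit):
    #include <deep_gemm/common/types.hpp>
    #include <deep_gemm/common/utils.cuh>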
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#pragma once

namespace cute {

struct ignore_t {
    template <typename T>
    constexpr const ignore_t& operator=(T&&) const noexcept {
        return *this;
    }
};

inline constexpr ignore_t ignore{};

} // namespace cute

#define CUTE_TIE_CONCAT_IMPL(A, B) A##B
#define CUTE_TIE_CONCAT(A, B) CUTE_TIE_CONCAT_IMPL(A, B)

#define CUTE_TIE_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
#define CUTE_TIE_COUNT_ARGS(...) \
    CUTE_TIE_GET_NTH_ARG(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)

#define CUTE_TIE_OP_DECL(I, TUPLE, VAR) auto VAR = ::cute::get<I>(TUPLE)
#define CUTE_TIE_OP_ASSIGN(I, TUPLE, VAR) VAR = ::cute::get<I>(TUPLE)

#define CUTE_TIE_APPLY_OP_1(OP, T, V1) OP(0, T, V1);
#define CUTE_TIE_APPLY_OP_2(OP, T, V1, V2) OP(0, T, V1); OP(1, T, V2);
#define CUTE_TIE_APPLY_OP_3(OP, T, V1, V2, V3) OP(0, T, V1); OP(1, T, V2); OP(2, T, V3);
#define CUTE_TIE_APPLY_OP_4(OP, T, V1, V2, V3, V4) OP(0, T, V1); OP(1, T, V2); OP(2, T, V3); OP(3, T, V4);
#define CUTE_TIE_APPLY_OP_5(OP, T, V1, V2, V3, V4, V5) OP(0, T, V1); OP(1, T, V2); OP(2, T, V3); OP(3, T, V4); OP(4, T, V5);

#define CUTE_TIE_DECL(TUPLE_EXPR, ...) \
    auto&& CUTE_TIE_CONCAT(cute_tie__temp_tuple_, __LINE__) = (TUPLE_EXPR); \
    CUTE_TIE_CONCAT(CUTE_TIE_APPLY_OP_, CUTE_TIE_COUNT_ARGS(__VA_ARGS__)) ( \
        CUTE_TIE_OP_DECL, \
        CUTE_TIE_CONCAT(cute_tie__temp_tuple_, __LINE__), \
        __VA_ARGS__ \
    )

#define CUTE_TIE(TUPLE_EXPR, ...) \
    do { \
        auto&& CUTE_TIE_CONCAT(cute_tie__temp_tuple_, __LINE__) = (TUPLE_EXPR); \
        CUTE_TIE_CONCAT(CUTE_TIE_APPLY_OP_, CUTE_TIE_COUNT_ARGS(__VA_ARGS__)) ( \
            CUTE_TIE_OP_ASSIGN, \
            CUTE_TIE_CONCAT(cute_tie__temp_tuple_, __LINE__), \
            __VA_ARGS__ \
        ); \
    } while (0)
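
A brief usage sketch of these macros, assuming CuTe's tuple utilities (cute::make_tuple, cute::get) are available; the variable names are illustrative only and not taken from the vendored kernels:

    // CUTE_TIE_DECL declares one local per tuple element; CUTE_TIE assigns into existing locals.
    auto shape = cute::make_tuple(128u, 64u);
    CUTE_TIE_DECL(shape, rows, cols);                         // auto rows = cute::get<0>(shape); auto cols = cute::get<1>(shape);
    CUTE_TIE(cute::make_tuple(rows * 2, cols), rows, cols);   // rows = 256; cols = 64;
    cute::ignore = rows;                                      // ignore_t discards any assigned value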
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
#pragma once

#include <deep_gemm/common/types.hpp>
#include <deep_gemm/common/utils.cuh>

namespace deep_gemm {

struct EpilogueIdentity {
    template <uint32_t STORE_BLOCK_N>
    __device__ __forceinline__ static uint32_t apply_index_n(const uint32_t &n_idx) {
        return n_idx;
    }
};

template <uint32_t kLeft, uint32_t kMid, uint32_t kRight>
struct EpilogueHeadSplits: EpilogueIdentity {
    template <uint32_t STORE_BLOCK_N>
    __device__ __forceinline__ static uint32_t apply_index_n(const uint32_t &n_idx) {
        DG_STATIC_ASSERT(kLeft % STORE_BLOCK_N == 0 and kMid % STORE_BLOCK_N == 0
                         and kRight % STORE_BLOCK_N == 0, "Invalid head splits config");
        return n_idx + (n_idx + kRight) / (kLeft + kRight) * kMid;
    }
};

#pragma clang diagnostic pop

} // namespace deep_gemm
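
To make the index math concrete, here is a small host-side check of the same mapping with illustrative parameters (kLeft = kMid = kRight = 64; these values are not taken from the commit): for each head group the epilogue writes the left split, skips a kMid-wide hole, writes the right split, then moves to the next group.

    // Minimal sketch replicating EpilogueHeadSplits<...>::apply_index_n on the host.
    #include <cstdint>

    constexpr uint32_t head_split_index(uint32_t n_idx, uint32_t kLeft, uint32_t kMid, uint32_t kRight) {
        return n_idx + (n_idx + kRight) / (kLeft + kRight) * kMid;
    }

    static_assert(head_split_index(0,   64, 64, 64) == 0);    // left split of group 0, unchanged
    static_assert(head_split_index(64,  64, 64, 64) == 128);  // right split of group 0, middle 64 columns skipped
    static_assert(head_split_index(128, 64, 64, 64) == 192);  // left split of group 1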
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
#pragma once

#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <cuda/std/cstdint>
#include <cuda/std/utility>

#include <deep_gemm/common/utils.cuh>

// Operation functors
template <typename T> struct ReduceSum { __device__ T operator()(T a, T b) const { return a + b; } };
template <typename T> struct ReduceMax { __device__ T operator()(T a, T b) const { return a > b ? a : b; } };
template <typename T> struct ReduceMin { __device__ T operator()(T a, T b) const { return a < b ? a : b; } };
template <typename T> struct ReduceAnd { __device__ T operator()(T a, T b) const { return a & b; } };
template <typename T> struct ReduceOr  { __device__ T operator()(T a, T b) const { return a | b; } };

// Unified reduction function
template <int kNumLanesPerGroup, bool kIntergroupReduce, typename T, typename Op>
__forceinline__ __device__ T warp_reduce(T value, Op op) {
    DG_STATIC_ASSERT(kNumLanesPerGroup == 32 or kNumLanesPerGroup == 16 or kNumLanesPerGroup == 8 or
                     kNumLanesPerGroup == 4 or kNumLanesPerGroup == 2 or kNumLanesPerGroup == 1,
                     "Invalid number of lanes");
    constexpr uint32_t mask = 0xffffffff;
    if constexpr (kIntergroupReduce) {
        if constexpr (kNumLanesPerGroup <= 1)  value = op(value, __shfl_xor_sync(mask, value, 1));
        if constexpr (kNumLanesPerGroup <= 2)  value = op(value, __shfl_xor_sync(mask, value, 2));
        if constexpr (kNumLanesPerGroup <= 4)  value = op(value, __shfl_xor_sync(mask, value, 4));
        if constexpr (kNumLanesPerGroup <= 8)  value = op(value, __shfl_xor_sync(mask, value, 8));
        if constexpr (kNumLanesPerGroup <= 16) value = op(value, __shfl_xor_sync(mask, value, 16));
    } else {
        if constexpr (kNumLanesPerGroup >= 32) value = op(value, __shfl_xor_sync(mask, value, 16));
        if constexpr (kNumLanesPerGroup >= 16) value = op(value, __shfl_xor_sync(mask, value, 8));
        if constexpr (kNumLanesPerGroup >= 8)  value = op(value, __shfl_xor_sync(mask, value, 4));
        if constexpr (kNumLanesPerGroup >= 4)  value = op(value, __shfl_xor_sync(mask, value, 2));
        if constexpr (kNumLanesPerGroup >= 2)  value = op(value, __shfl_xor_sync(mask, value, 1));
    }
    return value;
}

// Convenience aliases
template <int kNumLanesPerGroup = 32, bool kIntergroupReduce = false, typename T>
__forceinline__ __device__ T warp_reduce_sum(T value) {
    return warp_reduce<kNumLanesPerGroup, kIntergroupReduce, T>(value, ReduceSum<T>{});
}
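
For orientation, a sketch of how these helpers are typically called from a kernel (the kernel below is illustrative and not part of the commit, and assumes a launch with 32 threads per block): each lane contributes one partial value, and after the butterfly __shfl_xor_sync exchanges every lane of an 8-lane group holds that group's sum.

    // Illustrative only: per-8-lane sums using warp_reduce_sum.
    __global__ void group_sum_sketch(const float* __restrict__ in, float* __restrict__ out) {
        const unsigned lane  = threadIdx.x % 32u;   // lane within the warp
        const unsigned group = lane / 8u;           // 4 groups of 8 lanes per warp
        float partial = in[blockIdx.x * 32u + threadIdx.x];
        // Reduce within each group of 8 consecutive lanes (shuffle offsets 4, 2, 1).
        const float sum = warp_reduce_sum<8>(partial);
        if (lane % 8u == 0u)
            out[blockIdx.x * 4u + group] = sum;
    }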
Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
#pragma once

#include <deep_gemm/common/types.hpp>
#include <deep_gemm/common/utils.cuh>

namespace deep_gemm {

enum class IndexType {
    MN,
    K,
    SF_K,
};

template <GemmType kGemmType, uint32_t BLOCK_M, uint32_t BLOCK_N, uint32_t kNumSMs, bool kIsMulticastOnA>
static constexpr uint32_t get_num_1d_blocks_per_group() {
    // Select the best from candidates
    uint32_t num_best_blocks = 0, min_usage = cute::numeric_limits<uint32_t>::max();
    for (const auto& candidate: {8u, 16u}) {
        const auto& usage = kIsMulticastOnA ?
            candidate * BLOCK_N + constexpr_ceil_div(kNumSMs, candidate) * BLOCK_M: // Grouping on N
            candidate * BLOCK_M + constexpr_ceil_div(kNumSMs, candidate) * BLOCK_N; // Grouping on M
        if (usage < min_usage)
            min_usage = usage, num_best_blocks = candidate;
    }
    return num_best_blocks;
}

#pragma clang diagnostic push
#pragma ide diagnostic ignored "cppcoreguidelines-pro-type-member-init"
template <GemmType kGemmType,
          uint32_t BLOCK_M, uint32_t BLOCK_N,
          uint32_t kNumGroups,
          uint32_t kNumMulticast, bool kIsMulticastOnA,
          uint32_t kNumSMs,
          uint32_t SF_K_ALIGNMENT = 512u, // for k-grouped GEMM only: 128 (SM90 float SF) or 512 (SM100 UE8M0 SF)
          uint32_t kNum1DBlocksPerGroup = get_num_1d_blocks_per_group<kGemmType, BLOCK_M, BLOCK_N, kNumSMs, kIsMulticastOnA>()>
struct Scheduler {
    int current_iter = -1;

    // Block configs
    uint32_t num_blocks;
    uint32_t num_m_blocks;
    uint32_t num_n_blocks;

    // For SM90 multicast checks
    uint32_t num_blocks_in_group;
    bool is_peer_cta_alive = true;

    // For grouped GEMM
    int* grouped_layout;
    uint32_t current_group_idx = 0;
    // Only used for masked layout
    uint32_t current_m_cumsum = 0;
    // Only used for k-grouped layout
    uint32_t current_shape_k, current_num_valid_groups = 0, current_k_cumsum = 0, current_sf_k_cumsum = 0;
    uint32_t next_group_idx, next_shape_k;

    // Only used for k-grouped gemm
    __device__ __forceinline__ void get_next_k_group(uint32_t &group_idx, uint32_t &shape_k) const {
        for (; group_idx < kNumGroups; ++ group_idx) {
            shape_k = __ldg(grouped_layout + group_idx);
            if (shape_k > 0)
                break;
        }
    }

    // ReSharper disable once CppPossiblyUninitializedMember
    __device__ __forceinline__ explicit Scheduler(const uint32_t& shape_m, const uint32_t& shape_n, const uint32_t& shape_k,
                                                  int* grouped_layout = nullptr) {
        num_m_blocks = ceil_div(shape_m, BLOCK_M);
        num_n_blocks = ceil_div(shape_n, BLOCK_N);
        current_shape_k = shape_k;
        if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::Batched) {
            num_blocks = num_m_blocks * num_n_blocks;
        } else if (kGemmType == GemmType::MGroupedContiguous) {
            num_blocks = num_m_blocks * num_n_blocks;
            this->grouped_layout = grouped_layout;
        } else if (kGemmType == GemmType::MGroupedMasked) {
            this->grouped_layout = grouped_layout;
        } else if (kGemmType == GemmType::KGroupedContiguous) {
            this->grouped_layout = grouped_layout;
            get_next_k_group(current_group_idx, current_shape_k);
            next_group_idx = current_group_idx + 1;
            get_next_k_group(next_group_idx, next_shape_k);
        }
    }

    __device__ __forceinline__ void get_swizzled_block_idx(const uint32_t& block_idx, uint32_t& m_block_idx, uint32_t& n_block_idx) {
        DG_STATIC_ASSERT(kNum1DBlocksPerGroup % kNumMulticast == 0, "Invalid group size");

        // Swizzle for better L2 usages
        const auto& primary_num_blocks = kIsMulticastOnA ? num_n_blocks : num_m_blocks;
        const auto& secondary_num_blocks = kIsMulticastOnA ? num_m_blocks : num_n_blocks;
        const auto& num_blocks_per_group = secondary_num_blocks * kNum1DBlocksPerGroup;
        const auto& group_idx = block_idx / num_blocks_per_group;
        auto first_block_idx = group_idx * kNum1DBlocksPerGroup;
        auto in_group_idx = block_idx % num_blocks_per_group;
        num_blocks_in_group = min(kNum1DBlocksPerGroup, primary_num_blocks - first_block_idx);

        // Fix unaligned TMA multicast
        // NOTES: for SM90 only, as SM90 can dynamically disable TMA multicast
        // while SM100 uses 2-CTA, which can not be dynamically disabled
#if __CUDA_ARCH__ < 1000
        if (kNumMulticast > 1 and num_blocks_in_group % 2 != 0) {
            if (in_group_idx < (num_blocks_in_group ^ 1) * secondary_num_blocks) {
                num_blocks_in_group = num_blocks_in_group ^ 1;
            } else {
                in_group_idx = in_group_idx - (num_blocks_in_group ^ 1) * secondary_num_blocks;
                first_block_idx += num_blocks_in_group ^ 1;
                num_blocks_in_group = 1;
            }
        }
#endif

        // Convert to final M/N block indices
        // `kIsMulticastOnA == true` leads to groups on N
        if constexpr (kIsMulticastOnA) {
            m_block_idx = in_group_idx / num_blocks_in_group;
            n_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
        } else {
            m_block_idx = first_block_idx + in_group_idx % num_blocks_in_group;
            n_block_idx = in_group_idx / num_blocks_in_group;
        }
    }

    template <bool kWithGroupOffset, IndexType kIndexType = IndexType::MN>
    __device__ __forceinline__ uint32_t get_global_idx(const uint32_t shape_dim, const uint32_t block_size,
                                                       const uint32_t& block_idx, const uint32_t& m_block_idx = 0) {
        if constexpr (kGemmType == GemmType::Normal) {
            return block_idx * block_size;
        } else if constexpr (kGemmType == GemmType::MGroupedContiguous) {
            const auto offset = kWithGroupOffset ? cute::max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M)) : 0;
            return offset * shape_dim + block_idx * block_size;
        } else if constexpr (kGemmType == GemmType::MGroupedMasked) {
            const auto offset = kWithGroupOffset ? current_group_idx : 0;
            return offset * shape_dim + block_idx * block_size;
        } else if constexpr (kGemmType == GemmType::KGroupedContiguous) {
            auto offset = 0;
            if constexpr (kWithGroupOffset) {
                if constexpr (kIndexType == IndexType::MN)
                    offset = current_group_idx * shape_dim;
                else if constexpr (kIndexType == IndexType::K)
                    offset = current_k_cumsum;
                else if constexpr (kIndexType == IndexType::SF_K)
                    offset = current_sf_k_cumsum;
            }
            return offset + block_idx * block_size;
        } else if constexpr (kGemmType == GemmType::Batched) {
            // Ignore kWithGroupOffset, and apply offset for IndexType::SF_K
            const auto offset = kIndexType == IndexType::SF_K ? current_group_idx : 0;
            return offset * shape_dim + block_idx * block_size;
        }
    }

    __device__ __forceinline__ bool get_next_block(uint32_t& m_block_idx, uint32_t& n_block_idx) {
        const auto next_block_idx = (++ current_iter) * kNumSMs + blockIdx.x;

        if constexpr (kGemmType == GemmType::MGroupedMasked) {
            while (true) {
                // End of the task
                if (current_group_idx == kNumGroups)
                    return false;

                // Within current group
                num_m_blocks = ceil_div(static_cast<uint32_t>(__ldg(grouped_layout + current_group_idx)), BLOCK_M);
                const auto current_m_block_cumsum = current_m_cumsum + num_m_blocks;
                if (next_block_idx < current_m_block_cumsum * num_n_blocks)
                    break;

                // Move to check the next group
                current_group_idx ++, current_m_cumsum = current_m_block_cumsum;
            }

            get_swizzled_block_idx(next_block_idx - current_m_cumsum * num_n_blocks, m_block_idx, n_block_idx);
        } else if constexpr (kGemmType == GemmType::KGroupedContiguous) {
            while (true) {
                // End of the task
                if (current_group_idx == kNumGroups)
                    return false;

                // Within current group
                if (next_block_idx < (current_num_valid_groups + 1) * num_m_blocks * num_n_blocks)
                    break;

                // Move to check the next group
                current_k_cumsum += current_shape_k;
                current_sf_k_cumsum += ceil_div(current_shape_k, SF_K_ALIGNMENT);
                current_num_valid_groups ++;

                current_group_idx = next_group_idx ++;
                current_shape_k = next_shape_k;
                get_next_k_group(next_group_idx, next_shape_k);
            }

            get_swizzled_block_idx(next_block_idx - current_num_valid_groups * num_m_blocks * num_n_blocks, m_block_idx, n_block_idx);
        } else if constexpr (kGemmType == GemmType::Batched) {
            if (next_block_idx >= num_blocks * kNumGroups)
                return false;

            current_group_idx = next_block_idx / num_blocks;
            const auto& block_idx = next_block_idx - current_group_idx * num_blocks;
            if constexpr (kIsMulticastOnA) {
                m_block_idx = block_idx / num_n_blocks;
                n_block_idx = block_idx % num_n_blocks;
            } else {
                m_block_idx = block_idx % num_m_blocks;
                n_block_idx = block_idx / num_m_blocks;
            }
        } else {
            if (next_block_idx >= num_blocks)
                return false;

            // For SM90 only
            // NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned
            is_peer_cta_alive = num_n_blocks % kNumMulticast == 0 or  // Always aligned on N (constant bypass)
                                num_m_blocks % kNumMulticast == 0 or  // Always aligned on M (constant bypass)
                                (next_block_idx ^ 1) < num_blocks;    // Peer CTA in bound
            get_swizzled_block_idx(next_block_idx, m_block_idx, n_block_idx);
        }
        return true;
    }

    // For SM90 only
    __device__ __forceinline__ bool is_tma_multicast_valid(const uint32_t& m_block_idx) const {
        if (num_blocks_in_group == 1)
            return false;
        if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::MGroupedMasked or
                      kGemmType == GemmType::KGroupedContiguous or kGemmType == GemmType::Batched) {
            return true;
        } else {
            DG_STATIC_ASSERT(kGemmType == GemmType::MGroupedContiguous, "Invalid Gemm type");
            if constexpr (kIsMulticastOnA) {
                return true;
            } else {
                const auto& group_idx = __ldg(grouped_layout + m_block_idx * BLOCK_M);
                const auto& peer_group_idx = __ldg(grouped_layout + (m_block_idx ^ 1) * BLOCK_M);
                return group_idx == peer_group_idx;
            }
        }
    }

    // For SM90 only
    // ReSharper disable once CppNotAllPathsReturnValue
    __device__ __forceinline__ bool is_computation_valid(const uint32_t& m_block_idx, const uint32_t& m_offset) const {
        if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::Batched) {
            return true;
        } else if constexpr (kGemmType == GemmType::MGroupedContiguous) {
            return __ldg(grouped_layout + m_offset + m_block_idx * BLOCK_M) >= 0;
        } else if constexpr (kGemmType == GemmType::MGroupedMasked) {
            return m_offset + m_block_idx * BLOCK_M < __ldg(grouped_layout + current_group_idx);
        }
    }
};

#pragma clang diagnostic pop

} // namespace deep_gemm
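
Finally, a sketch of the persistent-kernel loop this scheduler is designed to drive (illustrative only; the template arguments and the compute_tile placeholder are not part of the commit): each CTA repeatedly asks the scheduler for its next (m, n) tile, handed out in the L2-friendly swizzled order, until get_next_block() returns false.

    // Assumes deep_gemm::GemmType and the Scheduler above are visible to this translation unit.
    template <uint32_t BLOCK_M, uint32_t BLOCK_N, uint32_t kNumSMs>
    __global__ void persistent_loop_sketch(uint32_t shape_m, uint32_t shape_n, uint32_t shape_k) {
        using SchedulerT = deep_gemm::Scheduler<deep_gemm::GemmType::Normal,
                                                BLOCK_M, BLOCK_N,
                                                /*kNumGroups=*/1u,
                                                /*kNumMulticast=*/1u, /*kIsMulticastOnA=*/false,
                                                kNumSMs>;
        auto scheduler = SchedulerT(shape_m, shape_n, shape_k);
        uint32_t m_block_idx, n_block_idx;
        while (scheduler.get_next_block(m_block_idx, n_block_idx)) {
            // compute_tile(m_block_idx * BLOCK_M, n_block_idx * BLOCK_N, shape_k);  // placeholder for the real kernel body
        }
    }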
