Skip to content

Commit 0679a60

Browse files
committed
Adds alternative memory copy implementation option
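Includes a commented-out alternative implementation of the shared-memory tiled copy `smem_tiled_copy_PdSt` that uses a warp-contiguous layout (`make_tiled_copy_A_warpcontiguousN<MMA_N_SdP>` in place of `make_tiled_copy_A`). Existing functionality is preserved because the active code path is unchanged; the commented line provides a development path for a potential performance optimization.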
1 parent 6c53edf commit 0679a60

File tree

1 file changed

+1
-0
lines changed

1 file changed

+1
-0
lines changed

csrc/src/flash_bwd_kernel.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -382,6 +382,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params &params, const in
382382
Tensor tdSsdS = smem_thr_copy_PdS.partition_D(sdS); // ((Atom, AtomNum), PIPE_M, PIPE_N)
383383

384384
auto smem_tiled_copy_PdSt = make_tiled_copy_A(typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma_dkv);
385+
// auto smem_tiled_copy_PdSt = make_tiled_copy_A_warpcontiguousN<MMA_N_SdP>(typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma_dkv);
385386
auto smem_thr_copy_PdSt = smem_tiled_copy_PdSt.get_thread_slice(tidx);
386387
Tensor tdVsPt = smem_thr_copy_PdSt.partition_S(sPt);
387388
Tensor tdKsdSt = smem_thr_copy_PdSt.partition_S(sdSt);

0 commit comments

Comments
 (0)