Commit 15d3989

Fix Q cache for TP mode

1 parent 4c6dc58
6 files changed: +60 -32 lines changed

exllamav2/attn.py

Lines changed: 3 additions & 5 deletions
@@ -614,8 +614,6 @@ def forward_paged_tp(
         cfg = self.model.config
         ctx = self.model.tp_context
 
-        assert cache.q_block != 1, \
-            "Models with odd key/value dims not supported in TP mode with quantized cache"
         assert not self.sliding_window, \
             "Sliding window not supported in TP mode"
 
@@ -631,7 +629,7 @@ def forward_paged_tp(
             self.layer_idx,
             batch_size,
             0,
-            attn_params.max_cache_seqlen if cache.q_block > 1 else 0,
+            attn_params.max_cache_seqlen,
             page_size,
             attn_params.cache_seqlens_tp,
             attn_params.block_index_tp
@@ -706,7 +704,7 @@ def forward_paged_tp_old(
             self.layer_idx,
             batch_size,
             0,
-            attn_params.max_cache_seqlen if cache.q_block > 1 else 0,
+            attn_params.max_cache_seqlen,
             page_size,
             attn_params.cache_seqlens_tp,
             attn_params.block_index_tp
@@ -1171,7 +1169,7 @@ def forward_tp(
         )
 
         if cache is not None:
-            cache.store_kv_state(self.layer_idx, batch_size, 0, q_len)
+            cache.store_kv_state(self.layer_idx, batch_size, past_len, q_len)
 
         return ctx.get_pinned(0, batch_size, q_len, cfg.hidden_size)

exllamav2/cache.py

Lines changed: 4 additions & 1 deletion
@@ -432,8 +432,11 @@ def __init__(
         # Models with odd key/value dims need to quantize/dequantize in multi-token blocks. Make sure the quant
         # blocksize aligns with a whole number of tokens
 
+        if not num_key_value_heads:
+            num_key_value_heads = cfg.num_key_value_heads
+
         Q_CACHE_BLOCKSIZE_Q = 512
-        kv_dim = cfg.num_key_value_heads * cfg.head_dim
+        kv_dim = num_key_value_heads * cfg.head_dim
         self.q_block = 1
         while (kv_dim * self.q_block) % Q_CACHE_BLOCKSIZE_Q:
             self.q_block += 1

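For reference, a standalone sketch of how the block size falls out of this loop; the head counts below are illustrative, and the num_key_value_heads override presumably lets a TP cache shard holding fewer heads per device compute its own block size:

Q_CACHE_BLOCKSIZE_Q = 512

def q_block_for(num_key_value_heads: int, head_dim: int) -> int:
    # Smallest whole number of tokens whose K/V elements fill complete 512-element quant blocks
    kv_dim = num_key_value_heads * head_dim
    q_block = 1
    while (kv_dim * q_block) % Q_CACHE_BLOCKSIZE_Q:
        q_block += 1
    return q_block

print(q_block_for(8, 128))  # kv_dim = 1024 -> q_block = 1, each token aligns on its own
print(q_block_for(5, 96))   # kv_dim = 480  -> q_block = 16, quantize in 16-token blocks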
exllamav2/exllamav2_ext/cuda/cache.cu

Lines changed: 5 additions & 9 deletions
@@ -172,12 +172,10 @@ __global__ void fp16_to_q_kv_paged_kernel
     int px_a = seqlen - vx_a;
     int px_b = px_a + q_len;
 
-    if (dim < BLOCKSIZE_Q)
+    if (dim % BLOCKSIZE_Q)
     {
-        int g = BLOCKSIZE_Q / dim;
-        // if (px_a > 0) DBGI4(px_a, px_b, px_a / g * g, DIVIDE(px_b, g) * g);
-        px_a = px_a / g * g;
-        px_b = DIVIDE(px_b, g) * g;
+        while ((px_a * dim) % BLOCKSIZE_Q) px_a--;
+        while ((px_b * dim) % BLOCKSIZE_Q) px_b++;
     }
 
     px_a = max(px_a, 0);
@@ -372,10 +370,8 @@ __global__ void q_to_fp16_kv_paged_kernel
 
     if (dim < BLOCKSIZE_Q)
     {
-        int g = BLOCKSIZE_Q / dim;
-        // if (vx_a > 0) DBGI4(vx_a, vx_b, vx_a / g * g, DIVIDE(vx_b, g) * g);
-        vx_a = vx_a / g * g;
-        vx_b = DIVIDE(vx_b, g) * g;
+        while ((vx_a * dim) % BLOCKSIZE_Q) vx_a--;
+        while ((vx_b * dim) % BLOCKSIZE_Q) vx_b++;
     }
 
     int vnum = max(vx_b - vx_a, 0);

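The replaced rounding (g = BLOCKSIZE_Q / dim) only lands on quant-block boundaries when dim divides the blocksize evenly; the new loops instead widen the token range until both ends line up. A standalone Python sketch of that widening, not the kernel code itself, with illustrative dim values:

BLOCKSIZE_Q = 512

def widen_token_range(a: int, b: int, dim: int) -> tuple[int, int]:
    # Widen [a, b) until both ends sit on element offsets that are multiples of the quant block
    if dim % BLOCKSIZE_Q:
        while (a * dim) % BLOCKSIZE_Q: a -= 1
        while (b * dim) % BLOCKSIZE_Q: b += 1
    return max(a, 0), b

print(widen_token_range(37, 38, 480))   # -> (32, 48): block boundaries fall every 16 tokens
print(widen_token_range(37, 38, 1024))  # -> (37, 38): already aligned, left untouched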
exllamav2/exllamav2_ext/ext_cache.cpp

Lines changed: 18 additions & 8 deletions
@@ -155,9 +155,14 @@ void fp16_to_q_kv
     int stride = k_in.size(1) * k_in.size(2) * k_in.size(3);
     int height = batch_size;
 
-    int tsize = k_in.size(2) * k_in.size(3);
-    offset *= tsize;
-    width *= tsize;
+    int dim = k_in.size(2) * k_in.size(3);
+    if (dim % Q_CACHE_BLOCKSIZE_Q)
+    {
+        while ((offset * dim) % Q_CACHE_BLOCKSIZE_Q) offset--;
+        while ((width * dim) % Q_CACHE_BLOCKSIZE_Q) width++;
+    }
+    offset *= dim;
+    width *= dim;
 
     array_fp16_to_q_kv_cuda
     (
@@ -168,7 +173,7 @@ void fp16_to_q_kv
         (const half*) v_in.data_ptr(),
         (unsigned char*) v_out.data_ptr(),
         (half*) v_scales.data_ptr(),
-        tsize,
+        dim,
         stride,
         height,
         offset,
@@ -257,9 +262,14 @@ void q_to_fp16_kv
     int stride = k_out.size(1) * k_out.size(2) * k_out.size(3);
     int height = batch_size;
 
-    int tsize = k_out.size(2) * k_out.size(3);
-    offset *= tsize;
-    width *= tsize;
+    int dim = k_out.size(2) * k_out.size(3);
+    if (dim % Q_CACHE_BLOCKSIZE_Q)
+    {
+        while ((offset * dim) % Q_CACHE_BLOCKSIZE_Q) offset--;
+        while ((width * dim) % Q_CACHE_BLOCKSIZE_Q) width++;
+    }
+    offset *= dim;
+    width *= dim;
 
     array_q_to_fp16_kv_cuda
     (
@@ -270,7 +280,7 @@ void q_to_fp16_kv
         (const unsigned char*) v_in.data_ptr(),
         (const half*) v_scales.data_ptr(),
         (half*) v_out.data_ptr(),
-        tsize,
+        dim,
         stride,
         height,
         offset,

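The same widening is applied host-side to the token offset and width before they are scaled to element counts for the kernels. A sketch of that bookkeeping, again with illustrative values:

Q_CACHE_BLOCKSIZE_Q = 512

def widen_and_scale(offset: int, width: int, dim: int) -> tuple[int, int]:
    # Expand the token window to whole quant blocks, then convert token counts to element counts
    if dim % Q_CACHE_BLOCKSIZE_Q:
        while (offset * dim) % Q_CACHE_BLOCKSIZE_Q: offset -= 1
        while (width * dim) % Q_CACHE_BLOCKSIZE_Q: width += 1
    return offset * dim, width * dim

print(widen_and_scale(37, 1, 480))   # -> (15360, 7680): a 16-token, block-aligned slice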
exllamav2/model.py

Lines changed: 10 additions & 5 deletions
@@ -221,7 +221,10 @@ def set_device_map(self,
 
         self.device_context = []
         for idx, scratch_bytes in enumerate(fixed_bytes):
-            self.device_context.append(ExLlamaV2DeviceContext(self, idx, scratch_bytes))
+            if scratch_bytes > 0:
+                self.device_context.append(ExLlamaV2DeviceContext(self, idx, scratch_bytes))
+            else:
+                self.device_context.append(None)
 
         # Create map for cache
 
@@ -300,7 +303,8 @@ def load_tp(
         callback: Callable[[int, int], None] | None = None,
         callback_gen: Callable[[int, int], None] | None = None,
         progress: bool = False,
-        expect_cache_tokens: int = 0
+        expect_cache_tokens: int = 0,
+        expect_cache_base: type = None
     ):
 
         if progress:
@@ -313,7 +317,7 @@ def callback_pb(a, b):
            assert callback is None, \
                "Cannot use callback function and console progress bar at the same time."
            callback = callback_pb
-        f = self.load_tp_gen(gpu_split, callback, callback_gen, expect_cache_tokens)
+        f = self.load_tp_gen(gpu_split, callback, callback_gen, expect_cache_tokens, expect_cache_base)
         for item in f:
             pass
         if progress:
@@ -325,10 +329,11 @@ def load_tp_gen(
         gpu_split: list[float] | None = None,
         callback: Callable[[int, int], None] | None = None,
         callback_gen: Callable[[int, int], None] | None = None,
-        expect_cache_tokens: int = 0
+        expect_cache_tokens: int = 0,
+        expect_cache_base: type = None
     ):
         self.config.no_graphs = True
-        self.tp_context = TPContext(self, gpu_split, expect_cache_tokens)
+        self.tp_context = TPContext(self, gpu_split, expect_cache_tokens, expect_cache_base)
 
         # Create device tensors

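With expect_cache_base threaded through, a caller can tell the TP splitter which cache class it plans to use, so VRAM is budgeted for quantized rather than FP16 storage. A minimal usage sketch: the model path and token count are placeholders, and the ExLlamaV2Cache_TP base argument follows the pattern in the repo's TP examples rather than anything shown in this diff:

from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache_Q4, ExLlamaV2Cache_TP

config = ExLlamaV2Config("/path/to/model")     # placeholder model directory
model = ExLlamaV2(config)
model.load_tp(
    progress = True,
    expect_cache_tokens = 65536,               # planned total cache capacity, in tokens
    expect_cache_base = ExLlamaV2Cache_Q4,     # budget ~4.5 bits per element instead of 16
)
cache = ExLlamaV2Cache_TP(model, base = ExLlamaV2Cache_Q4, max_seq_len = 65536)

If expect_cache_base is left unset, the splitter falls back to the FP16 budget of 2 bytes per element, as before.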
exllamav2/tensor_p.py

Lines changed: 20 additions & 4 deletions
@@ -50,7 +50,8 @@ def __init__(
         self,
         model: ExLlamaV2,
         gpu_split: list[float] | None,
-        expect_cache_tokens: int = 0
+        expect_cache_tokens: int = 0,
+        expect_cache_base: type = None
     ):
         self.model = model
         cfg = self.model.config
@@ -80,7 +81,7 @@ def __init__(
         self.sin = None
         self.cos = None
 
-        self.define_split(gpu_split, expect_cache_tokens)
+        self.define_split(gpu_split, expect_cache_tokens, expect_cache_base)
 
 
     def unload(self):
@@ -98,7 +99,12 @@ def all_devices(self) -> list[int]:
         return sorted(devs)
 
 
-    def define_split(self, gpu_split: list[float] | None, expect_cache_tokens):
+    def define_split(
+        self,
+        gpu_split: list[float] | None,
+        expect_cache_tokens: int,
+        expect_cache_base: type
+    ):
         cfg = self.model.config
 
         if gpu_split is None:
@@ -119,8 +125,18 @@ def define_split(self, gpu_split: list[float] | None, expect_cache_tokens):
 
         if not expect_cache_tokens:
             expect_cache_tokens = cfg.max_seq_len * cfg.max_batch_size
+        if expect_cache_base == sys.modules["exllamav2.cache"].ExLlamaV2Cache_8bit:
+            bytes_per_element = 1
+        elif expect_cache_base == sys.modules["exllamav2.cache"].ExLlamaV2Cache_Q8:
+            bytes_per_element = 8.5/8
+        elif expect_cache_base == sys.modules["exllamav2.cache"].ExLlamaV2Cache_Q6:
+            bytes_per_element = 6.5/8
+        elif expect_cache_base == sys.modules["exllamav2.cache"].ExLlamaV2Cache_Q4:
+            bytes_per_element = 4.5/8
+        else:
+            bytes_per_element = 2
 
-        cache_size = 2 * 2 * cfg.num_key_value_heads * cfg.head_dim * cfg.num_hidden_layers * expect_cache_tokens
+        cache_size = 2 * bytes_per_element * cfg.num_key_value_heads * cfg.head_dim * cfg.num_hidden_layers * expect_cache_tokens
         gpu_split = [max(0, gs - int(cache_size * r / 1024**2)) for gs, r in zip(gpu_split, attn_ratio)]
 
         # Subtract size of attn layers

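The revised estimate is easy to check by hand. A worked example with illustrative model dimensions, printing the cache budget each type reserves before the weight split is computed:

num_key_value_heads, head_dim, num_hidden_layers = 8, 128, 32    # illustrative model dims
expect_cache_tokens = 65536

for name, bytes_per_element in [("FP16", 2), ("Q8", 8.5/8), ("Q6", 6.5/8), ("Q4", 4.5/8)]:
    # 2x for K and V, times bytes per stored element, per token, per layer
    cache_size = 2 * bytes_per_element * num_key_value_heads * head_dim \
                 * num_hidden_layers * expect_cache_tokens
    print(f"{name}: {cache_size / 1024**2:.0f} MiB")

# FP16: 8192 MiB   Q8: 4352 MiB   Q6: 3328 MiB   Q4: 2304 MiB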