1- from exllamav2 import (
2- ExLlamaV2 ,
3- ExLlamaV2Tokenizer ,
4- )
1+ from __future__ import annotations
2+ from threading import Lock
3+ from concurrent .futures import ThreadPoolExecutor , Future
4+ from exllamav2 import ExLlamaV2 , ExLlamaV2Tokenizer
5+ from exllamav2 .ext import exllamav2_ext as ext_c , none_tensor
6+ import torch
57
68class ExLlamaV2Filter :
79
@@ -11,6 +13,10 @@ class ExLlamaV2Filter:
1113 tokenizer : ExLlamaV2Tokenizer
1214 sequence_str : str
1315
16+ background_result : Future | None = None
17+
18+ # For compatibility
19+ allow_return_type_list : bool = True
1420
1521 def __init__ (self ,
1622 model : ExLlamaV2 ,
@@ -31,13 +37,51 @@ def clone(self, c = None):
3137
3238
def begin(self, prefix_str):
    """
    Abstract hook: the base class provides no implementation and always raises.

    :param prefix_str:
        Passed through from the caller; presumably the text the filter should start from (confirm against
        concrete subclasses).

    :raises NotImplementedError:
        Always, unless overridden by a subclass.
    """
    raise NotImplementedError()
3541
3642
def feed(self, token):
    """
    Abstract hook: the base class provides no implementation and always raises.

    :param token:
        Passed through from the caller; presumably the most recently selected token (confirm against
        concrete subclasses).

    :raises NotImplementedError:
        Always, unless overridden by a subclass.
    """
    raise NotImplementedError()
3945
4046
def next(self):
    """
    Abstract hook: the base class provides no implementation and always raises.

    This is the callable that background_next() submits to the thread pool and that get_next() invokes
    directly when no background pass is pending.

    :raises NotImplementedError:
        Always, unless overridden by a subclass.
    """
    raise NotImplementedError()
49+
50+
def use_background_worker(self) -> bool:
    """
    Report whether this filter can/should be evaluated on a background thread.

    When a filter returns True here, next() is meant to be called asynchronously once the CUDA workload for
    the following forward pass has been scheduled, rather than immediately before sampling. CPU-heavy filters
    (e.g. grammar constraints) should override this to return True.

    :return:
        False in the base class, i.e. the filter runs synchronously by default.
    """
    run_in_background = False
    return run_in_background
58+
59+
def background_next(self, pool: ThreadPoolExecutor):
    """
    Submit next() to the given thread pool executor and remember the resulting future.

    :param pool:
        Executor used to run next() asynchronously.

    The future is stored in self.background_result, where get_next() or background_drop() will consume it.
    A previously scheduled pass must have been consumed first; otherwise the assertion below fails.
    """
    already_pending = self.background_result
    assert already_pending is None
    scheduled = pool.submit(self.next)
    self.background_result = scheduled
66+
67+
def background_drop(self):
    """
    Clear the result of an asynchronous filter pass.

    Used when a complex filter reaches an end state and forces the selection of eos_token_id. next() could
    still be scheduled after this selection, leaving a pending result that would break subsequent generations
    with the same filter.

    If a pass is pending, this blocks until it has finished. Any exception raised by the background next()
    is re-raised here, but the pending result is cleared in every case so the filter can be reused.
    """
    pending = self.background_result
    if pending is None:
        return
    try:
        # Block until the scheduled next() has completed; re-raises whatever it raised
        pending.result()
    finally:
        # Always detach the future, even on failure, so no stale result is left behind
        self.background_result = None
77+
4378
def get_next(self) -> tuple:
    """
    Return either next() or the result of any scheduled call to next().

    If no background pass is pending, next() is called synchronously. Otherwise this blocks on the pending
    future and returns its result. The pending future is cleared whether it succeeded or raised, so a failed
    background pass does not leave a stale result that would trip the assertion in background_next() or be
    re-raised by later calls.

    :return:
        Whatever next() returned.

    :raises:
        Any exception raised by next(), whether it ran synchronously or in the background.
    """
    pending = self.background_result
    if pending is None:
        return self.next()
    try:
        return pending.result()
    finally:
        # Detach the consumed future even if result() re-raised the worker's exception
        self.background_result = None
0 commit comments