"Fix" sequence breaker tokenization

p-e-w · p-e-w · commit a7ba4a2a7d0a · 2024-07-29T10:21:51.000+05:30
Most tokenizers encode punctuation tokens differently depending on where they occur in the input and which tokens surround them. With the default sequence breakers, the appropriate encoding usually corresponds to the one produced when the token occurs after a word, rather than by itself. To emulate this, prefix the sequence breaker with "a" before encoding, and take the final token of the result. This is a workaround rather than a complete fix; see LostRuins/koboldcpp#982 for a correct solution to this problem.
diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs
@@ -116,14 +116,19 @@ impl DrySamplingParamsInner {
                     .into_iter()
                     .map(|breaker| {
                         tokenizer
-                            .encode(breaker.clone(), true)
+                            // Prefix with 'a' to get the correct encoding of the token at the end of a text.
+                            //
+                            // FIXME: This is a hack. See https://github.com/LostRuins/koboldcpp/pull/982
+                            //        for the correct solution which covers multi-token sequence breakers
+                            //        and ambiguous encodings.
+                            .encode(format!("a{breaker}"), true)
                             .map_err(anyhow::Error::msg)
                             .map(|enc| {
                                 let ids = enc.get_ids();
                                 if !ids.is_empty() {
                                     None
                                 } else {
-                                    Some(ids[0])
+                                    Some(ids[ids.len() - 1])
                                 }
                             })
                     })