Skip to content

Commit 1be9783

Browse files
fix: prevent segfault in tokenizer on highly repetitive input (#17786)
Add the `nosubs | optimize` flags to the `std::regex` and `std::wregex` constructors to prevent catastrophic backtracking when processing prompts with repeated identical characters (e.g., 'A' * 10000). The `nosubs` flag disables sub-expression capture, significantly reducing memory usage and backtracking on uniform token sequences.
1 parent a6cfc21 commit 1be9783

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

src/unicode.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
499499

500500
// use std::wregex to split the text
501501
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
502-
std::wregex expr(regex_expr);
502+
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
503503
std::vector<size_t> bpe_offsets; // store the offset of each word
504504
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
505505
size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
529529

530530
// use std::regex to split the text
531531
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
532-
std::regex expr(regex_expr);
532+
std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
533533
std::vector<size_t> bpe_offsets; // store the offset of each word
534534
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
535535
size_t start = 0;

0 commit comments

Comments
 (0)