From 79b0c01998aff44667871119a22b46bf4a0892ea Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Thu, 4 Dec 2025 19:02:19 +0800
Subject: [PATCH 1/4] Avoid hard-coding vocabulary definitions to be
 compatible with different sizes

---
 src/whisper.cpp | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index f6793cb237b..da1d8064ceb 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -453,7 +453,7 @@ struct whisper_vocab {
     }
 
     int num_languages() const {
-        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
+        return n_vocab - token_to_id.size() - 1509;
     }
 };
 
@@ -1621,22 +1621,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
         }
 
-        vocab.n_vocab = model.hparams.n_vocab;
-        if (vocab.is_multilingual()) {
-            vocab.token_eot++;
-            vocab.token_sot++;
-
-            // account for variable number of language tokens
-            const int dt = vocab.num_languages() - 98;
-
-            vocab.token_translate  += dt;
-            vocab.token_transcribe += dt;
-            vocab.token_solm       += dt;
-            vocab.token_prev       += dt;
-            vocab.token_nosp       += dt;
-            vocab.token_not        += dt;
-            vocab.token_beg        += dt;
-        }
+        size_t common_vocab_size = vocab.token_to_id.size(); // common vocab size, excluding special tokens
+        vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens
+
+        vocab.token_eot = common_vocab_size; // <|endoftext|>
+        vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
+        // [common_vocab_size + 2, vocab.n_vocab - 1504) are language tokens
+        // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
+        vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
+        vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
+        vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>
+        vocab.token_prev = vocab.n_vocab - 1504; // <|startofprev|>
+        vocab.token_nosp = vocab.n_vocab - 1503; // <|nospeech|>
+        vocab.token_not = vocab.n_vocab - 1502; // <|notimestamps|>
+        vocab.token_beg = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens
 
         if (n_vocab < model.hparams.n_vocab) {
             WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);

From 43d873ca5c4703d630387a7329886355256c99e8 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Thu, 4 Dec 2025 20:04:50 +0800
Subject: [PATCH 2/4] modify comment

---
 src/whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index da1d8064ceb..0f64ca49521 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1626,7 +1626,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         vocab.token_eot = common_vocab_size; // <|endoftext|>
         vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
-        // [common_vocab_size + 2, vocab.n_vocab - 1504) are language tokens
+        // [common_vocab_size + 2, vocab.n_vocab - 1507) are language tokens
         // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
         vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
         vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
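Note on the next patch: num_languages() as defined in PATCH 1 is only correct
while token_to_id holds nothing but the plain-text vocabulary. Later in
whisper_model_load, the synthesized extra tokens are registered in
vocab.token_to_id as well, so once loading completes token_to_id.size() equals
hparams.n_vocab and the formula collapses to -1509. A minimal standalone
illustration (hypothetical token names, not code from the series):

    // Hypothetical sketch: why the PATCH 1 formula breaks after the loader
    // registers the special tokens.
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        const int n_vocab = 51865; // hparams.n_vocab of a standard multilingual model

        std::map<std::string, int> token_to_id;
        for (int i = 0; i < 50257; ++i) { // plain-text tokens read from the model file
            token_to_id["tok" + std::to_string(i)] = i;
        }
        // right after reading the text vocabulary: 51865 - 50257 - 1509 = 99
        printf("before specials: %d\n", n_vocab - (int) token_to_id.size() - 1509);

        for (int i = 50257; i < n_vocab; ++i) { // the "adding extra tokens" step
            token_to_id["[_extra_" + std::to_string(i) + "]"] = i;
        }
        // after the extra tokens are registered: 51865 - 51865 - 1509 = -1509
        printf("after specials:  %d\n", n_vocab - (int) token_to_id.size() - 1509);
        return 0;
    }

Deriving the count from the token IDs themselves, as PATCH 3 does with
token_translate - token_sot - 1, makes the result independent of when the map
is inspected.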
From ccd9b6ef80e9c3df5d843c67b584fbbfb35c5a96 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Sat, 6 Dec 2025 20:19:13 +0800
Subject: [PATCH 3/4] fix num_languages(): incorrect after loading special
 tokens

---
 src/whisper.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 0f64ca49521..d88e0ad4d9c 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -453,7 +453,7 @@ struct whisper_vocab {
     }
 
     int num_languages() const {
-        return n_vocab - token_to_id.size() - 1509;
+        return token_translate - token_sot - 1;
     }
 };
 
@@ -1621,13 +1621,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
         }
 
-        size_t common_vocab_size = vocab.token_to_id.size(); // common vocab size, excluding special tokens
         vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens
 
-        vocab.token_eot = common_vocab_size; // <|endoftext|>
-        vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
-        // [common_vocab_size + 2, vocab.n_vocab - 1507) are language tokens
-        // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
+        vocab.token_eot = n_vocab; // <|endoftext|> 50256 for en, 50257 for multilingual, others for custom model
+        vocab.token_sot = n_vocab + 1; // <|startoftext|>
+        // [n_vocab + 2, vocab.n_vocab - 1507) are language tokens
+        // num_language = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_vocab - 1509
         vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
         vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
         vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>

From 1ab1804e9708865ff3d76bbf729566c134a50bc7 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Sat, 6 Dec 2025 20:20:08 +0800
Subject: [PATCH 4/4] fix convert script: remove special token `<|endoftext|>`

---
 models/convert-h5-to-ggml.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py
index 80244d735e9..9f004d9bce5 100644
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@@ -107,6 +107,8 @@ def bytes_to_unicode():
 fname_out = dir_out / "ggml-model.bin"
 
 tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
+if "<|endoftext|>" in tokens:
+    del tokens["<|endoftext|>"]
 
 # use 16-bit or 32-bit floats
 use_f16 = True
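Taken together, the series makes the special-token layout a pure function of
two numbers: the count of plain-text tokens written by the convert script
(which PATCH 4 keeps free of <|endoftext|>) and hparams.n_vocab. The sketch
below is illustrative only; the vocabulary sizes are the standard OpenAI
reference values, assumed here rather than taken from the patches:

    // Hypothetical sketch: recompute the special-token IDs from the relative
    // offsets used in the patches, for the three published vocabulary sizes.
    #include <cstdio>

    int main() {
        // { plain-text tokens (excluding <|endoftext|>), hparams.n_vocab }
        const int layouts[3][2] = {
            { 50256, 51864 }, // English-only
            { 50257, 51865 }, // multilingual, up to large-v2
            { 50257, 51866 }, // multilingual large-v3 (one extra language)
        };
        for (const auto & layout : layouts) {
            const int n_text          = layout[0];
            const int n_vocab         = layout[1];
            const int token_eot       = n_text;         // <|endoftext|>
            const int token_sot       = n_text + 1;     // <|startoftext|>
            const int token_translate = n_vocab - 1507; // <|translate|>
            const int token_beg       = n_vocab - 1501; // <|0.00|>
            const int num_languages   = token_translate - token_sot - 1;
            printf("n_vocab=%d: eot=%d sot=%d translate=%d beg=%d languages=%d\n",
                   n_vocab, token_eot, token_sot, token_translate, token_beg, num_languages);
        }
        return 0;
    }

This prints 99 languages for the 51864 and 51865 layouts and 100 for 51866,
matching what the old hard-coded expression returned for the known sizes while
also covering custom vocabularies. The guard added in PATCH 4 matters here: if
<|endoftext|> stayed in the written token list, the loaded count would be one
too high and every derived ID would shift with it.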