From 79b0c01998aff44667871119a22b46bf4a0892ea Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Thu, 4 Dec 2025 19:02:19 +0800
Subject: [PATCH 1/4] Avoid hard-coding vocabulary definitions to be
 compatible with different sizes

---
 src/whisper.cpp | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index f6793cb237b..da1d8064ceb 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -453,7 +453,7 @@ struct whisper_vocab {
     }
 
     int num_languages() const {
-        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
+        return n_vocab - token_to_id.size() - 1509;
     }
 };
 
@@ -1621,22 +1621,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
         }
 
-        vocab.n_vocab = model.hparams.n_vocab;
-        if (vocab.is_multilingual()) {
-            vocab.token_eot++;
-            vocab.token_sot++;
-
-            // account for variable number of language tokens
-            const int dt = vocab.num_languages() - 98;
-
-            vocab.token_translate  += dt;
-            vocab.token_transcribe += dt;
-            vocab.token_solm       += dt;
-            vocab.token_prev       += dt;
-            vocab.token_nosp       += dt;
-            vocab.token_not        += dt;
-            vocab.token_beg        += dt;
-        }
+        size_t common_vocab_size = vocab.token_to_id.size(); // common vocab size, excluding special tokens
+        vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens
+
+        vocab.token_eot = common_vocab_size; // <|endoftext|>
+        vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
+        // [common_vocab_size + 2, vocab.n_vocab - 1504) are language tokens
+        // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
+        vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
+        vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
+        vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>
+        vocab.token_prev = vocab.n_vocab - 1504; // <|startofprev|>
+        vocab.token_nosp = vocab.n_vocab - 1503; // <|nospeech|>
+        vocab.token_not = vocab.n_vocab - 1502; // <|notimestamps|>
+        vocab.token_beg = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens
 
         if (n_vocab < model.hparams.n_vocab) {
             WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);

From 43d873ca5c4703d630387a7329886355256c99e8 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Thu, 4 Dec 2025 20:04:50 +0800
Subject: [PATCH 2/4] modify comment

---
 src/whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index da1d8064ceb..0f64ca49521 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1626,7 +1626,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         vocab.token_eot = common_vocab_size; // <|endoftext|>
         vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
-        // [common_vocab_size + 2, vocab.n_vocab - 1504) are language tokens
+        // [common_vocab_size + 2, vocab.n_vocab - 1507) are language tokens
         // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
         vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
         vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
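Note on the next patch: num_languages() as defined in PATCH 1 is only correct
while token_to_id holds nothing but the plain-text vocabulary. Later in
whisper_model_load, the synthesized extra tokens are registered in
vocab.token_to_id as well, so once loading completes token_to_id.size() equals
hparams.n_vocab and the formula collapses to -1509. A minimal standalone
illustration (hypothetical token names, not code from the series):

    // Hypothetical sketch: why the PATCH 1 formula breaks after the loader
    // registers the special tokens.
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        const int n_vocab = 51865; // hparams.n_vocab of a standard multilingual model

        std::map<std::string, int> token_to_id;
        for (int i = 0; i < 50257; ++i) { // plain-text tokens read from the model file
            token_to_id["tok" + std::to_string(i)] = i;
        }
        // right after reading the text vocabulary: 51865 - 50257 - 1509 = 99
        printf("before specials: %d\n", n_vocab - (int) token_to_id.size() - 1509);

        for (int i = 50257; i < n_vocab; ++i) { // the "adding extra tokens" step
            token_to_id["[_extra_" + std::to_string(i) + "]"] = i;
        }
        // after the extra tokens are registered: 51865 - 51865 - 1509 = -1509
        printf("after specials:  %d\n", n_vocab - (int) token_to_id.size() - 1509);
        return 0;
    }

Deriving the count from the token IDs themselves, as PATCH 3 does with
token_translate - token_sot - 1, makes the result independent of when the map
is inspected.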
From ccd9b6ef80e9c3df5d843c67b584fbbfb35c5a96 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Sat, 6 Dec 2025 20:19:13 +0800
Subject: [PATCH 3/4] fix num_languages(): incorrect after loading special
 tokens

---
 src/whisper.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 0f64ca49521..d88e0ad4d9c 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -453,7 +453,7 @@ struct whisper_vocab {
     }
 
     int num_languages() const {
-        return n_vocab - token_to_id.size() - 1509;
+        return token_translate - token_sot - 1;
     }
 };
 
@@ -1621,13 +1621,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
         }
 
-        size_t common_vocab_size = vocab.token_to_id.size(); // common vocab size, excluding special tokens
         vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens
 
-        vocab.token_eot = common_vocab_size; // <|endoftext|>
-        vocab.token_sot = common_vocab_size + 1; // <|startoftext|>
-        // [common_vocab_size + 2, vocab.n_vocab - 1507) are language tokens
-        // num_language = vocab.token_translate - vocab.token_sot = vocab.n_vocab - vocab.token_to_id.size() - 1509
+        vocab.token_eot = n_vocab; // <|endoftext|> 50256 for en, 50257 for multilingual, others for custom model
+        vocab.token_sot = n_vocab + 1; // <|startoftext|>
+        // [n_vocab + 2, vocab.n_vocab - 1507) are language tokens
+        // num_language = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_vocab - 1509
         vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
         vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
         vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>

From 1ab1804e9708865ff3d76bbf729566c134a50bc7 Mon Sep 17 00:00:00 2001
From: Jaffe2718
Date: Sat, 6 Dec 2025 20:20:08 +0800
Subject: [PATCH 4/4] fix convert script: remove special token `<|endoftext|>`

---
 models/convert-h5-to-ggml.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py
index 80244d735e9..9f004d9bce5 100644
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@@ -107,6 +107,8 @@ def bytes_to_unicode():
 fname_out = dir_out / "ggml-model.bin"
 
 tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
+if "<|endoftext|>" in tokens:
+    del tokens["<|endoftext|>"]
 
 # use 16-bit or 32-bit floats
 use_f16 = True
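Taken together, the series makes the special-token layout a pure function of
two numbers: the count of plain-text tokens written by the convert script
(which PATCH 4 keeps free of <|endoftext|>) and hparams.n_vocab. The sketch
below is illustrative only; the vocabulary sizes are the standard OpenAI
reference values, assumed here rather than taken from the patches:

    // Hypothetical sketch: recompute the special-token IDs from the relative
    // offsets used in the patches, for the three published vocabulary sizes.
    #include <cstdio>

    int main() {
        // { plain-text tokens (excluding <|endoftext|>), hparams.n_vocab }
        const int layouts[3][2] = {
            { 50256, 51864 }, // English-only
            { 50257, 51865 }, // multilingual, up to large-v2
            { 50257, 51866 }, // multilingual large-v3 (one extra language)
        };
        for (const auto & layout : layouts) {
            const int n_text          = layout[0];
            const int n_vocab         = layout[1];
            const int token_eot       = n_text;         // <|endoftext|>
            const int token_sot       = n_text + 1;     // <|startoftext|>
            const int token_translate = n_vocab - 1507; // <|translate|>
            const int token_beg       = n_vocab - 1501; // <|0.00|>
            const int num_languages   = token_translate - token_sot - 1;
            printf("n_vocab=%d: eot=%d sot=%d translate=%d beg=%d languages=%d\n",
                   n_vocab, token_eot, token_sot, token_translate, token_beg, num_languages);
        }
        return 0;
    }

This prints 99 languages for the 51864 and 51865 layouts and 100 for 51866,
matching what the old hard-coded expression returned for the known sizes while
also covering custom vocabularies. The guard added in PATCH 4 matters here: if
<|endoftext|> stayed in the written token list, the loaded count would be one
too high and every derived ID would shift with it.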