Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions models/convert-h5-to-ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def bytes_to_unicode():
fname_out = dir_out / "ggml-model.bin"

tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
if "<|endoftext|>" in tokens:
del tokens["<|endoftext|>"]
Comment on lines +110 to +111
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I use ggml-tiny.en.bin for recognition, the result is also empty — the same reason the tests fail.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the last commit was "Migrate from HG dataset into HG model", then any models last generated with this script need to be reconverted; otherwise, <|endoftext|> will be written into the common tokens.


# use 16-bit or 32-bit floats
use_f16 = True
Expand Down
29 changes: 13 additions & 16 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ struct whisper_vocab {
}

int num_languages() const {
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
return token_translate - token_sot - 1;
}
};

Expand Down Expand Up @@ -1621,22 +1621,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
//printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
}

vocab.n_vocab = model.hparams.n_vocab;
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;
vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens

// account for variable number of language tokens
const int dt = vocab.num_languages() - 98;

vocab.token_translate += dt;
vocab.token_transcribe += dt;
vocab.token_solm += dt;
vocab.token_prev += dt;
vocab.token_nosp += dt;
vocab.token_not += dt;
vocab.token_beg += dt;
}
vocab.token_eot = n_vocab; // <|endoftext|> 50256 for en, 50257 for multilingual, others for custom model
vocab.token_sot = n_vocab + 1; // <|startoftext|>
// [n_vocab + 2, vocab.n_vocab - 1507) are language tokens
// num_language = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_vocab - 1509
vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>
vocab.token_prev = vocab.n_vocab - 1504; // <|startofprev|>
vocab.token_nosp = vocab.n_vocab - 1503; // <|nospeech|>
vocab.token_not = vocab.n_vocab - 1502; // <|notimestamps|>
vocab.token_beg = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens

if (n_vocab < model.hparams.n_vocab) {
WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
Expand Down