Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions sherpa-onnx/csrc/silero-vad-model-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
po->Register(
"silero-vad-max-speech-duration", &max_speech_duration,
"In seconds. If a speech segment is longer than this value, then we "
"increase the threshold to 0.9. After finishing detecting the segment, "
"the threshold value is reset to its original value.");
"cut a segment.");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't remove it.


po->Register(
"silero-vad-window-size", &window_size,
Expand Down Expand Up @@ -102,12 +101,12 @@ bool SileroVadModelConfig::Validate() const {
std::string SileroVadModelConfig::ToString() const {
std::ostringstream os;

os << "SileroVadModelConfig(";
os << "SilerVadModelConfig(";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't change it.

os << "model=\"" << model << "\", ";
os << "threshold=" << threshold << ", ";
os << "min_silence_duration=" << min_silence_duration << ", ";
os << "min_speech_duration=" << min_speech_duration << ", ";
os << "max_speech_duration=" << max_speech_duration << ", ";
os << "max_speech_duration=" << max_speech_duration << ", ";
os << "window_size=" << window_size << ")";

return os.str();
Expand Down
5 changes: 1 addition & 4 deletions sherpa-onnx/csrc/silero-vad-model-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ struct SileroVadModelConfig {
// 256, 512, 768 samples for 8000 Hz
int32_t window_size = 512; // in samples

// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
// the threshold value is reset to its original value.
float max_speech_duration = 20; // in seconds
float max_speech_duration = 20; // in seconds
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't remove the comments.


SileroVadModelConfig() = default;

Expand Down
66 changes: 53 additions & 13 deletions sherpa-onnx/csrc/silero-vad-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/session.h"
#include "silero-vad-model.h"

namespace sherpa_onnx {

Expand All @@ -32,9 +33,13 @@ class SileroVadModel::Impl {
}

min_silence_samples_ =
sample_rate_ * config_.silero_vad.min_silence_duration;
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration);

min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
min_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration);

max_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration);
Comment on lines 35 to +42
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to make such changes?

}

#if __ANDROID_API__ >= 9
Expand All @@ -54,9 +59,13 @@ class SileroVadModel::Impl {
}

min_silence_samples_ =
sample_rate_ * config_.silero_vad.min_silence_duration;
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration);

min_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration);

min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
max_speech_samples_ =
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration);
}
#endif

Expand Down Expand Up @@ -155,14 +164,34 @@ class SileroVadModel::Impl {

// Returns min_speech_samples_, i.e. the minimum speech duration expressed
// as a number of samples.
int32_t MinSpeechDurationSamples() const {
  return min_speech_samples_;
}

// Returns max_speech_samples_, i.e. the maximum speech duration expressed
// as a number of samples.
int32_t MaxSpeechDurationSamples() const {
  return max_speech_samples_;
}

// Returns the threshold currently stored in the silero_vad section of
// config_.
float Threshold() {
  const auto &silero_config = config_.silero_vad;
  return silero_config.threshold;
}

// Updates the minimum silence duration.
//
// @param s Duration in seconds. It is multiplied by sample_rate_ and the
//          product is truncated toward zero to obtain a sample count.
void SetMinSilenceDuration(float s) {
  // A single explicit conversion: the float product is narrowed to int32_t
  // on purpose, so spell it with static_cast instead of a C-style cast.
  min_silence_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Updates the minimum speech duration.
//
// @param s Duration in seconds. It is multiplied by sample_rate_ and the
//          product is truncated toward zero to obtain a sample count.
void SetMinSpeechDuration(float s) {
  // The narrowing from float to int32_t is intentional; static_cast makes
  // that explicit and greppable, unlike a C-style cast.
  min_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Updates the maximum speech duration.
//
// @param s Duration in seconds. It is multiplied by sample_rate_ and the
//          product is truncated toward zero to obtain a sample count.
void SetMaxSpeechDuration(float s) {
  // The narrowing from float to int32_t is intentional; static_cast makes
  // that explicit and greppable, unlike a C-style cast.
  max_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
}

// Overwrites the threshold stored in the silero_vad section of config_.
//
// @param threshold The new threshold value; it is stored as given.
void SetThreshold(float threshold) {
  auto &silero_config = config_.silero_vad;
  silero_config.threshold = threshold;
}

// Runs the model on a buffer of n samples.
//
// Dispatches on is_v5_: RunV5 is called when it is set, RunV4 otherwise.
// The value returned by the selected function is passed through unchanged.
float Run(const float *samples, int32_t n) {
  return is_v5_ ? RunV5(samples, n) : RunV4(samples, n);
}

private:
void Init(void *model_data, size_t model_data_length) {
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
Expand Down Expand Up @@ -335,14 +364,6 @@ class SileroVadModel::Impl {
}
}

// Runs the model on a buffer of n samples and returns the result of the
// version-specific implementation selected by is_v5_.
float Run(const float *samples, int32_t n) {
  if (!is_v5_) {
    return RunV4(samples, n);
  }

  return RunV5(samples, n);
}

float RunV5(const float *samples, int32_t n) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
Expand Down Expand Up @@ -418,6 +439,7 @@ class SileroVadModel::Impl {
int64_t sample_rate_;
int32_t min_silence_samples_;
int32_t min_speech_samples_;
int32_t max_speech_samples_;

bool triggered_ = false;
int32_t current_sample_ = 0;
Expand Down Expand Up @@ -457,12 +479,30 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const {
return impl_->MinSpeechDurationSamples();
}

// Returns the maximum speech duration in samples by forwarding to the
// implementation object held in impl_.
int32_t SileroVadModel::MaxSpeechDurationSamples() {
  const int32_t max_samples = impl_->MaxSpeechDurationSamples();
  return max_samples;
}

// Returns the current threshold by forwarding to the implementation object
// held in impl_.
float SileroVadModel::Threshold() {
  const float current_threshold = impl_->Threshold();
  return current_threshold;
}

void SileroVadModel::SetMinSilenceDuration(float s) {
impl_->SetMinSilenceDuration(s);
}

void SileroVadModel::SetMinSpeechDuration(float s) {
impl_->SetMinSpeechDuration(s);
}

void SileroVadModel::SetThreshold(float threshold) {
impl_->SetThreshold(threshold);
}

void SileroVadModel::SetMaxSpeechDuration(float s) {
impl_->SetMaxSpeechDuration(s);
}

// Runs the model on a buffer of n samples by forwarding to the
// implementation object held in impl_, and returns its result unchanged.
float SileroVadModel::Run(const float *samples, int32_t n) {
  const float result = impl_->Run(samples, n);
  return result;
}

} // namespace sherpa_onnx
8 changes: 7 additions & 1 deletion sherpa-onnx/csrc/silero-vad-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class SileroVadModel : public VadModel {
*/
bool IsSpeech(const float *samples, int32_t n) override;

float Run(const float *samples, int32_t n);

// For silero vad V4, it is WindowShift().
// For silero vad V5, it is WindowShift()+64 for 16kHz and
// WindowShift()+32 for 8kHz
Expand All @@ -47,9 +49,13 @@ class SileroVadModel : public VadModel {

int32_t MinSilenceDurationSamples() const override;
int32_t MinSpeechDurationSamples() const override;
int32_t MaxSpeechDurationSamples();
float Threshold();

void SetMinSilenceDuration(float s) override;
void SetThreshold(float threshold) override;
void SetMinSpeechDuration(float s);
void SetMaxSpeechDuration(float s);
void SetThreshold(float threshold) override;

private:
class Impl;
Expand Down
Loading