ggml-org
diff --git a/‎bindings/go/params.go‎
Lines changed: 38 additions & 0 deletions b/‎bindings/go/params.go‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎bindings/go/pkg/whisper/consts.go‎
Lines changed: 7 additions & 0 deletions b/‎bindings/go/pkg/whisper/consts.go‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎bindings/go/pkg/whisper/context.go‎
Lines changed: 9 additions & 8 deletions b/‎bindings/go/pkg/whisper/context.go‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎bindings/go/pkg/whisper/context_test.go‎
Lines changed: 94 additions & 15 deletions b/‎bindings/go/pkg/whisper/context_test.go‎
Lines changed: 94 additions & 15 deletions
diff --git a/‎bindings/go/pkg/whisper/interface.go‎
Lines changed: 31 additions & 1 deletion b/‎bindings/go/pkg/whisper/interface.go‎
Lines changed: 31 additions & 1 deletion
@@ -47,6 +47,44 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }
 
+// SetDiarize sets the tdrz_enable flag, which turns tinydiarize speaker turn
+// detection on (true) or off (false).
+func (p *Params) SetDiarize(v bool) {
+	p.tdrz_enable = toBool(v)
+}
+
+// SetVAD sets the vad flag, which switches Voice Activity Detection (VAD) on
+// (true) or off (false).
+func (p *Params) SetVAD(v bool) {
+	p.vad = toBool(v)
+}
+
+// SetVADModelPath stores path in vad_model_path as a newly allocated C string.
+//
+// NOTE(review): C.CString allocates on the C heap and this setter does not
+// free the pointer it overwrites, so each call appears to leak the previous
+// path unless it is released elsewhere — confirm who owns this memory.
+func (p *Params) SetVADModelPath(path string) {
+	p.vad_model_path = C.CString(path)
+}
+
+// SetVADThreshold sets vad_params.threshold to t.
+// Presumably this is the speech-probability cutoff used by the VAD — confirm
+// the exact meaning and valid range against whisper.h.
+func (p *Params) SetVADThreshold(t float32) {
+	p.vad_params.threshold = C.float(t)
+}
+
+// SetVADMinSpeechMs sets vad_params.min_speech_duration_ms to ms (a value in
+// milliseconds, going by the field name). ms is converted to C.int.
+func (p *Params) SetVADMinSpeechMs(ms int) {
+	p.vad_params.min_speech_duration_ms = C.int(ms)
+}
+
+// SetVADMinSilenceMs sets vad_params.min_silence_duration_ms to ms (a value in
+// milliseconds, going by the field name). ms is converted to C.int.
+func (p *Params) SetVADMinSilenceMs(ms int) {
+	p.vad_params.min_silence_duration_ms = C.int(ms)
+}
+
+// SetVADMaxSpeechSec sets vad_params.max_speech_duration_s to s (a value in
+// seconds, going by the field name).
+func (p *Params) SetVADMaxSpeechSec(s float32) {
+	p.vad_params.max_speech_duration_s = C.float(s)
+}
+
+// SetVADSpeechPadMs sets vad_params.speech_pad_ms to ms (a value in
+// milliseconds, going by the field name). ms is converted to C.int.
+func (p *Params) SetVADSpeechPadMs(ms int) {
+	p.vad_params.speech_pad_ms = C.int(ms)
+}
+
+// SetVADSamplesOverlap sets vad_params.samples_overlap to sec.
+// NOTE(review): the parameter name suggests the value is in seconds — confirm
+// the unit against whisper.h.
+func (p *Params) SetVADSamplesOverlap(sec float32) {
+	p.vad_params.samples_overlap = C.float(sec)
+}
+
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
 
@@ -28,3 +28,10 @@ const SampleRate = whisper.SampleRate
 
 // SampleBits is the number of bytes per sample.
 const SampleBits = whisper.SampleBits
+
+// SamplingStrategy is this package's named type for a sampling strategy; its
+// underlying type is SamplingStrategy from the imported whisper package.
+type SamplingStrategy whisper.SamplingStrategy
+
+// Sampling strategies, taken from the imported whisper package and converted
+// to this package's SamplingStrategy type. The names mirror the underlying
+// constants rather than following Go's MixedCaps convention.
+const (
+	SAMPLING_GREEDY      SamplingStrategy = SamplingStrategy(whisper.SAMPLING_GREEDY)
+	SAMPLING_BEAM_SEARCH SamplingStrategy = SamplingStrategy(whisper.SAMPLING_BEAM_SEARCH)
+)
@@ -19,11 +19,11 @@ type context struct {
 	Parameters
 }
 
-func newContext(model Model, params whisper.Params) (Context, error) {
+func newContext(model Model, params Parameters) (Context, error) {
 	c := new(context)
 	c.model = model
 
-	c.params = newParameters(&params)
+	c.params = params
 	c.Parameters = c.params
 
 	// allocate isolated state per context
@@ -132,7 +132,7 @@ func (context *context) Process(
 		context.params.SetSingleSegment(true)
 	}
 
-	lowLevelParams := context.params.WhisperParams()
+	lowLevelParams := context.params.UnsafeParams()
 	if lowLevelParams == nil {
 		return fmt.Errorf("lowLevelParams is nil: %w", ErrInternalAppError)
 	}
@@ -249,11 +249,12 @@ func (context *context) IsLANG(t Token, lang string) bool {
 // State-backed helper functions
 func toSegmentFromState(ctx *whisper.Context, st *whisper.State, n int) Segment {
+	// Assemble segment n from state st: the text is whitespace-trimmed, the
+	// t0/t1 values are scaled by 10ms to become Start/End durations, and
+	// SpeakerTurnNext is read from the state's speaker-turn-next value.
 	return Segment{
-		Num:    n,
-		Text:   strings.TrimSpace(ctx.Whisper_full_get_segment_text_from_state(st, n)),
-		Start:  time.Duration(ctx.Whisper_full_get_segment_t0_from_state(st, n)) * time.Millisecond * 10,
-		End:    time.Duration(ctx.Whisper_full_get_segment_t1_from_state(st, n)) * time.Millisecond * 10,
-		Tokens: toTokensFromState(ctx, st, n),
+		Num:             n,
+		Text:            strings.TrimSpace(ctx.Whisper_full_get_segment_text_from_state(st, n)),
+		Start:           time.Duration(ctx.Whisper_full_get_segment_t0_from_state(st, n)) * time.Millisecond * 10,
+		End:             time.Duration(ctx.Whisper_full_get_segment_t1_from_state(st, n)) * time.Millisecond * 10,
+		Tokens:          toTokensFromState(ctx, st, n),
+		SpeakerTurnNext: ctx.Whisper_full_get_segment_speaker_turn_next_from_state(st, n),
 	}
 }
 
 
@@ -17,7 +17,7 @@ func TestSetLanguage(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	context, err := model.NewContext()
 	assert.NoError(err)
@@ -35,7 +35,7 @@ func TestContextModelIsMultilingual(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	context, err := model.NewContext()
 	assert.NoError(err)
@@ -54,7 +54,7 @@ func TestLanguage(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	context, err := model.NewContext()
 	assert.NoError(err)
@@ -72,7 +72,7 @@ func TestProcess(t *testing.T) {
 
 	fh, err := os.Open(SamplePath)
 	assert.NoError(err)
-	defer fh.Close()
+	defer func() { _ = fh.Close() }()
 
 	// Decode the WAV file - load the full buffer
 	dec := wav.NewDecoder(fh)
@@ -85,7 +85,7 @@ func TestProcess(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	context, err := model.NewContext()
 	assert.NoError(err)
@@ -99,7 +99,7 @@ func TestDetectedLanguage(t *testing.T) {
 
 	fh, err := os.Open(SamplePath)
 	assert.NoError(err)
-	defer fh.Close()
+	defer func() { _ = fh.Close() }()
 
 	// Decode the WAV file - load the full buffer
 	dec := wav.NewDecoder(fh)
@@ -112,7 +112,7 @@ func TestDetectedLanguage(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	context, err := model.NewContext()
 	assert.NoError(err)
@@ -139,7 +139,7 @@ func TestContext_ConcurrentProcessing(t *testing.T) {
 
 	fh, err := os.Open(SamplePath)
 	assert.NoError(err)
-	defer fh.Close()
+	defer func() { _ = fh.Close() }()
 
 	dec := wav.NewDecoder(fh)
 	buf, err := dec.FullPCMBuffer()
@@ -150,12 +150,12 @@ func TestContext_ConcurrentProcessing(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	ctx, err := model.NewContext()
 	assert.NoError(err)
 	assert.NotNil(ctx)
-	defer ctx.Close()
+	defer func() { _ = ctx.Close() }()
 
 	err = ctx.Process(data, nil, nil, nil)
 	assert.NoError(err)
@@ -179,7 +179,7 @@ func TestContext_Parallel_DifferentInputs(t *testing.T) {
 
 	fh, err := os.Open(SamplePath)
 	assert.NoError(err)
-	defer fh.Close()
+	defer func() { _ = fh.Close() }()
 
 	dec := wav.NewDecoder(fh)
 	buf, err := dec.FullPCMBuffer()
@@ -195,14 +195,14 @@ func TestContext_Parallel_DifferentInputs(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	ctx1, err := model.NewContext()
 	assert.NoError(err)
-	defer ctx1.Close()
+	defer func() { _ = ctx1.Close() }()
 	ctx2, err := model.NewContext()
 	assert.NoError(err)
-	defer ctx2.Close()
+	defer func() { _ = ctx2.Close() }()
 
 	// Run in parallel - each context has isolated whisper_state
 	var wg sync.WaitGroup
@@ -258,7 +258,7 @@ func TestContext_Close(t *testing.T) {
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
-	defer model.Close()
+	defer func() { _ = model.Close() }()
 
 	ctx, err := model.NewContext()
 	assert.NoError(err)
@@ -294,3 +294,82 @@ func Test_Close_Context_of_Closed_Model(t *testing.T) {
 	require.NoError(t, model.Close())
 	require.NoError(t, ctx.Close())
 }
+
+// TestContext_VAD_And_Diarization_Params_DoNotPanic enables diarization and
+// VAD, sets every VAD tuning parameter except the model path, and checks that
+// Process returns no error on the sample audio. The test is skipped when the
+// model or the sample file is missing.
+func TestContext_VAD_And_Diarization_Params_DoNotPanic(t *testing.T) {
+	assert := assert.New(t)
+
+	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
+		t.Skip("Skipping test, model not found:", ModelPath)
+	}
+	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
+		t.Skip("Skipping test, sample not found:", SamplePath)
+	}
+
+	// Setup errors stop the test immediately (require) so that later steps
+	// never run against a nil file, buffer, model or context.
+	fh, err := os.Open(SamplePath)
+	require.NoError(t, err)
+	defer func() { _ = fh.Close() }()
+
+	dec := wav.NewDecoder(fh)
+	buf, err := dec.FullPCMBuffer()
+	require.NoError(t, err)
+	assert.Equal(uint16(1), dec.NumChans)
+	data := buf.AsFloat32Buffer().Data
+
+	model, err := whisper.New(ModelPath)
+	require.NoError(t, err)
+	defer func() { _ = model.Close() }()
+
+	ctx, err := model.NewContext()
+	require.NoError(t, err)
+	defer func() { _ = ctx.Close() }()
+
+	p := ctx.Params()
+	p.SetDiarize(true)
+	p.SetVAD(true)
+	p.SetVADThreshold(0.5)
+	p.SetVADMinSpeechMs(200)
+	p.SetVADMinSilenceMs(100)
+	p.SetVADMaxSpeechSec(10)
+	p.SetVADSpeechPadMs(30)
+	p.SetVADSamplesOverlap(0.02)
+
+	err = ctx.Process(data, nil, nil, nil)
+	assert.NoError(err)
+}
+
+// TestContext_SpeakerTurnNext_Field_Present processes the sample audio with
+// default parameters, fetches the first segment and logs its SpeakerTurnNext
+// value. The test is skipped when the model or the sample file is missing.
+func TestContext_SpeakerTurnNext_Field_Present(t *testing.T) {
+	assert := assert.New(t)
+
+	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
+		t.Skip("Skipping test, model not found:", ModelPath)
+	}
+	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
+		t.Skip("Skipping test, sample not found:", SamplePath)
+	}
+
+	// Setup errors stop the test immediately (require) so that later steps
+	// never run against a nil file, buffer, model or context.
+	fh, err := os.Open(SamplePath)
+	require.NoError(t, err)
+	defer func() { _ = fh.Close() }()
+
+	dec := wav.NewDecoder(fh)
+	buf, err := dec.FullPCMBuffer()
+	require.NoError(t, err)
+	assert.Equal(uint16(1), dec.NumChans)
+	data := buf.AsFloat32Buffer().Data
+
+	model, err := whisper.New(ModelPath)
+	require.NoError(t, err)
+	defer func() { _ = model.Close() }()
+
+	ctx, err := model.NewContext()
+	require.NoError(t, err)
+	defer func() { _ = ctx.Close() }()
+
+	err = ctx.Process(data, nil, nil, nil)
+	require.NoError(t, err)
+
+	seg, err := ctx.NextSegment()
+	require.NoError(t, err)
+	// Reading the field here is the whole check: the test only compiles if
+	// Segment exposes SpeakerTurnNext.
+	t.Logf("SpeakerTurnNext: %v", seg.SpeakerTurnNext)
+}
@@ -48,6 +48,8 @@ type TokenIdentifier interface {
 	IsText(Token) (bool, error)
 }
 
+// ParamsConfigure is a callback that is handed a Parameters value. It is the
+// type of the configure argument of Model.NewParams.
+type ParamsConfigure func(Parameters)
+
 // Model is the interface to a whisper model. Create a new model with the
 // function whisper.New(string)
 type Model interface {
@@ -56,6 +58,11 @@ type Model interface {
 	// Return a new speech-to-text context.
 	NewContext() (Context, error)
 
+	NewParams(
+		sampling SamplingStrategy,
+		configure ParamsConfigure,
+	) (Parameters, error)
+
 	// Return true if the model is multilingual.
 	IsMultilingual() bool
 
@@ -94,6 +101,25 @@ type Parameters interface {
 	SetEntropyThold(t float32)
 	SetInitialPrompt(prompt string)
 
+	SetNoContext(bool)
+	SetPrintSpecial(bool)
+	SetPrintProgress(bool)
+	SetPrintRealtime(bool)
+	SetPrintTimestamps(bool)
+
+	// Diarization (tinydiarize)
+	SetDiarize(bool)
+
+	// Voice Activity Detection (VAD)
+	SetVAD(bool)
+	SetVADModelPath(string)
+	SetVADThreshold(float32)
+	SetVADMinSpeechMs(int)
+	SetVADMinSilenceMs(int)
+	SetVADMaxSpeechSec(float32)
+	SetVADSpeechPadMs(int)
+	SetVADSamplesOverlap(float32)
+
 	// Set the temperature
 	SetTemperature(t float32)
 
@@ -108,7 +134,8 @@ type Parameters interface {
 	// Getter methods
 	Language() string
 	Threads() int
-	WhisperParams() *whisper.Params
+
+	UnsafeParams() *whisper.Params
 }
 
 // Context is the speech recognition context.
@@ -231,6 +258,9 @@ type Segment struct {
 
 	// The tokens of the segment.
 	Tokens []Token
+
+	// True if the next segment is predicted as a speaker turn (tinydiarize)
+	SpeakerTurnNext bool
 }
 
 // Token is a text or special token