@@ -31,72 +31,131 @@ extern "C" {
31
31
// C interface
32
32
//
33
33
34
- // TODO: documentation will come soon
34
+ //
35
+ // Basic usage:
36
+ //
37
+ // #include "whisper.h"
38
+ //
39
+ // ...
40
+ //
41
+ // struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
42
+ //
+ // struct whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
+ //
43
+ // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
44
+ // fprintf(stderr, "failed to process audio\n");
45
+ // return 7;
46
+ // }
47
+ //
48
+ // const int n_segments = whisper_full_n_segments(ctx);
49
+ // for (int i = 0; i < n_segments; ++i) {
50
+ // const char * text = whisper_full_get_segment_text(ctx, i);
51
+ // printf("%s", text);
52
+ // }
53
+ //
54
+ // whisper_free(ctx);
55
+ //
56
+ // ...
57
+ //
58
+ // This is a demonstration of the most straightforward usage of the library.
59
+ // "pcmf32" contains the RAW audio data in 32-bit floating point format.
60
+ //
61
+ // The interface also allows for more fine-grained control over the computation, but it requires a deeper
62
+ // understanding of how the model works.
63
+ //
35
64
36
65
struct whisper_context ;
37
66
38
67
typedef int whisper_token ;
39
68
69
+ // Allocates all memory needed for the model and loads the model from the given file.
70
+ // Returns NULL on failure.
40
71
WHISPER_API struct whisper_context * whisper_init (const char * path_model );
72
+
73
+ // Frees all memory allocated by the model.
41
74
WHISPER_API void whisper_free (struct whisper_context * ctx );
42
75
76
+ // Convert RAW PCM audio to log mel spectrogram.
77
+ // The resulting spectrogram is stored inside the provided whisper context.
78
+ // Returns 0 on success
43
79
WHISPER_API int whisper_pcm_to_mel (
44
80
struct whisper_context * ctx ,
45
81
const float * samples ,
46
82
int n_samples ,
47
83
int n_threads );
48
84
85
+ // This can be used to set a custom log mel spectrogram inside the provided whisper context.
86
+ // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
49
87
// n_mel must be 80
88
+ // Returns 0 on success
50
89
WHISPER_API int whisper_set_mel (
51
90
struct whisper_context * ctx ,
52
91
const float * data ,
53
92
int n_len ,
54
93
int n_mel );
55
94
95
+ // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
96
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
97
+ // offset can be used to specify the offset of the first frame in the spectrogram.
98
+ // Returns 0 on success
56
99
WHISPER_API int whisper_encode (
57
100
struct whisper_context * ctx ,
58
101
int offset ,
59
102
int n_threads );
60
103
104
+ // Run the Whisper decoder to obtain the logits and probabilities for the next token.
105
+ // Make sure to call whisper_encode() first.
106
+ // The tokens array (of length n_tokens) is the provided context for the decoder.
107
+ // n_past is the number of tokens to use from previous decoder calls.
108
+ // Returns 0 on success
61
109
WHISPER_API int whisper_decode (
62
110
struct whisper_context * ctx ,
63
111
const whisper_token * tokens ,
64
112
int n_tokens ,
65
113
int n_past ,
66
114
int n_threads );
67
115
116
+ // Token sampling methods.
117
+ // These are provided for convenience and can be used after each call to whisper_decode().
118
+ // You can also implement your own sampling method using the whisper_get_probs() function.
119
+ // whisper_sample_best() returns the token with the highest probability
120
+ // whisper_sample_timestamp() returns the most probable timestamp token
68
121
WHISPER_API whisper_token whisper_sample_best (struct whisper_context * ctx , bool need_timestamp );
69
122
WHISPER_API whisper_token whisper_sample_timestamp (struct whisper_context * ctx );
70
123
71
- // return the id of the specified language, returns -1 if not found
124
+ // Returns the id of the specified language, or -1 if not found
72
125
WHISPER_API int whisper_lang_id (const char * lang );
73
126
74
127
WHISPER_API int whisper_n_len (struct whisper_context * ctx ); // mel length
75
128
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx );
76
129
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx );
77
130
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx );
78
131
132
+ // The probabilities for the next token
79
133
WHISPER_API float * whisper_get_probs (struct whisper_context * ctx );
80
134
135
+ // Token Id -> String. Uses the vocabulary in the provided context
81
136
WHISPER_API const char * whisper_token_to_str (struct whisper_context * ctx , whisper_token token );
82
137
138
+ // Special tokens
83
139
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx );
84
140
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx );
85
141
WHISPER_API whisper_token whisper_token_prev (struct whisper_context * ctx );
86
142
WHISPER_API whisper_token whisper_token_solm (struct whisper_context * ctx );
87
143
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx );
88
144
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx );
89
145
146
+ // Task tokens
90
147
// Task token for the "translate" task.
// Declared with (void) so that C callers get a real prototype; an empty
// parameter list in C leaves the arguments unspecified and unchecked.
WHISPER_API whisper_token whisper_token_translate (void);
91
148
// Task token for the "transcribe" task.
// Declared with (void) so that C callers get a real prototype; an empty
// parameter list in C leaves the arguments unspecified and unchecked.
WHISPER_API whisper_token whisper_token_transcribe (void);
92
149
150
+ // Performance information
93
151
WHISPER_API void whisper_print_timings (struct whisper_context * ctx );
94
152
95
153
////////////////////////////////////////////////////////////////////////////
96
154
155
+ // Available decoding strategies
97
156
// Strategy selector passed to whisper_full_default_params().
// Enumerators keep their implicit values (GREEDY = 0, BEAM_SEARCH = 1).
enum whisper_decode_strategy {
    WHISPER_DECODE_GREEDY,      // Always select the most probable token
    WHISPER_DECODE_BEAM_SEARCH, // TODO: not implemented yet!
};
101
160
102
161
struct whisper_full_params {
@@ -129,18 +188,23 @@ extern "C" {
129
188
130
189
WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_decode_strategy strategy );
131
190
132
- // full whisper run - encode + decode
191
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
192
+ // Uses the specified decoding strategy to obtain the text.
133
193
WHISPER_API int whisper_full (
134
194
struct whisper_context * ctx ,
135
195
struct whisper_full_params params ,
136
196
const float * samples ,
137
197
int n_samples );
138
198
199
+ // Number of generated text segments.
200
+ // A segment can be a few words, a sentence, or even a paragraph.
139
201
WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx );
140
202
203
+ // Get the start and end time of the specified segment.
141
204
WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx , int i_segment );
142
205
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx , int i_segment );
143
206
207
+ // Get the text of the specified segment.
144
208
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx , int i_segment );
145
209
146
210
#ifdef __cplusplus
0 commit comments