Skip to content

Commit 9bbca31

Browse files
committed
ref #9 : add API documentation in whisper.h
1 parent 5e563ef commit 9bbca31

File tree

2 files changed

+73
-9
lines changed

2 files changed

+73
-9
lines changed

main.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,25 +230,25 @@ int main(int argc, char ** argv) {
230230

231231
// print result
232232
if (!wparams.print_realtime) {
233-
fprintf(stderr, "\n");
233+
printf("\n");
234234

235235
const int n_segments = whisper_full_n_segments(ctx);
236236
for (int i = 0; i < n_segments; ++i) {
237237
const char * text = whisper_full_get_segment_text(ctx, i);
238238

239239
if (params.no_timestamps) {
240-
fprintf(stderr, "%s", text);
240+
printf("%s", text);
241241
fflush(stdout);
242242
} else {
243243
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
244244
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
245245

246-
fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
246+
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
247247
}
248248
}
249249
}
250250

251-
fprintf(stderr, "\n");
251+
printf("\n");
252252

253253
// output to text file
254254
if (params.output_txt) {

whisper.h

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,72 +31,131 @@ extern "C" {
3131
// C interface
3232
//
3333

34-
// TODO: documentation will come soon
34+
//
35+
// Basic usage:
36+
//
37+
// #include "whisper.h"
38+
//
39+
// ...
40+
//
41+
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
42+
//
43+
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
44+
// fprintf(stderr, "failed to process audio\n");
45+
// return 7;
46+
// }
47+
//
48+
// const int n_segments = whisper_full_n_segments(ctx);
49+
// for (int i = 0; i < n_segments; ++i) {
50+
// const char * text = whisper_full_get_segment_text(ctx, i);
51+
// printf("%s", text);
52+
// }
53+
//
54+
// whisper_free(ctx);
55+
//
56+
// ...
57+
//
58+
// This is a demonstration of the most straightforward usage of the library.
59+
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
60+
//
61+
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
62+
// understanding of how the model works.
63+
//
3564

3665
struct whisper_context;
3766

3867
typedef int whisper_token;
3968

69+
// Allocates all memory needed for the model and loads the model from the given file.
70+
// Returns NULL on failure.
4071
WHISPER_API struct whisper_context * whisper_init(const char * path_model);
72+
73+
// Frees all memory allocated by the model.
4174
WHISPER_API void whisper_free(struct whisper_context * ctx);
4275

76+
// Convert RAW PCM audio to a log mel spectrogram.
77+
// The resulting spectrogram is stored inside the provided whisper context.
78+
// Returns 0 on success
4379
WHISPER_API int whisper_pcm_to_mel(
4480
struct whisper_context * ctx,
4581
const float * samples,
4682
int n_samples,
4783
int n_threads);
4884

85+
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
86+
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
4987
// n_mel must be 80
88+
// Returns 0 on success
5089
WHISPER_API int whisper_set_mel(
5190
struct whisper_context * ctx,
5291
const float * data,
5392
int n_len,
5493
int n_mel);
5594

95+
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
96+
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
97+
// offset can be used to specify the offset of the first frame in the spectrogram.
98+
// Returns 0 on success
5699
WHISPER_API int whisper_encode(
57100
struct whisper_context * ctx,
58101
int offset,
59102
int n_threads);
60103

104+
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
105+
// Make sure to call whisper_encode() first.
106+
// tokens and n_tokens together specify the context provided to the decoder.
107+
// n_past is the number of tokens to use from previous decoder calls.
108+
// Returns 0 on success
61109
WHISPER_API int whisper_decode(
62110
struct whisper_context * ctx,
63111
const whisper_token * tokens,
64112
int n_tokens,
65113
int n_past,
66114
int n_threads);
67115

116+
// Token sampling methods.
117+
// These are provided for convenience and can be used after each call to whisper_decode().
118+
// You can also implement your own sampling method using the whisper_get_probs() function.
119+
// whisper_sample_best() returns the token with the highest probability
120+
// whisper_sample_timestamp() returns the most probable timestamp token
68121
WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
69122
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
70123

71-
// return the id of the specified language, returns -1 if not found
124+
// Returns the id of the specified language, or -1 if not found
72125
WHISPER_API int whisper_lang_id(const char * lang);
73126

74127
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
75128
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
76129
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
77130
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
78131

132+
// The probabilities for the next token
79133
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
80134

135+
// Token Id -> String. Uses the vocabulary in the provided context
81136
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
82137

138+
// Special tokens
83139
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
84140
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
85141
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
86142
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
87143
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
88144
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
89145

146+
// Task tokens
90147
WHISPER_API whisper_token whisper_token_translate ();
91148
WHISPER_API whisper_token whisper_token_transcribe();
92149

150+
// Performance information
93151
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
94152

95153
////////////////////////////////////////////////////////////////////////////
96154

155+
// Available decoding strategies
97156
enum whisper_decode_strategy {
98-
WHISPER_DECODE_GREEDY,
99-
WHISPER_DECODE_BEAM_SEARCH,
157+
WHISPER_DECODE_GREEDY, // Always select the most probable token
158+
WHISPER_DECODE_BEAM_SEARCH, // TODO: not implemented yet!
100159
};
101160

102161
struct whisper_full_params {
@@ -129,18 +188,23 @@ extern "C" {
129188

130189
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);
131190

132-
// full whisper run - encode + decode
191+
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
192+
// Uses the specified decoding strategy to obtain the text.
133193
WHISPER_API int whisper_full(
134194
struct whisper_context * ctx,
135195
struct whisper_full_params params,
136196
const float * samples,
137197
int n_samples);
138198

199+
// Number of generated text segments.
200+
// A segment can be a few words, a sentence, or even a paragraph.
139201
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
140202

203+
// Get the start and end time of the specified segment.
141204
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
142205
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
143206

207+
// Get the text of the specified segment.
144208
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
145209

146210
#ifdef __cplusplus

0 commit comments

Comments
 (0)