Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .ci/scripts/export_model_cuda_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
- google/gemma-3-4b-it

quant_name Quantization type (optional, default: non-quantized)
Expand Down Expand Up @@ -62,13 +62,17 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE="128"
PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
;;
openai/whisper-small)
openai/whisper-*)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PREPROCESSOR_FEATURE_SIZE changes depending on the whisper model

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing that out! Fixed it.

MODEL_NAME="whisper"
TASK="automatic-speech-recognition"
MAX_SEQ_LEN=""
EXTRA_PIP="librosa"
PREPROCESSOR_FEATURE_SIZE="80"
PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
if [[ "$HF_MODEL" == *"large-v3"* ]]; then
PREPROCESSOR_FEATURE_SIZE="128"
else
PREPROCESSOR_FEATURE_SIZE="80"
fi
;;
google/gemma-3-4b-it)
MODEL_NAME="gemma3"
Expand All @@ -80,7 +84,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it"
exit 1
;;
esac
Expand Down
16 changes: 8 additions & 8 deletions .ci/scripts/test_model_cuda_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
- google/gemma-3-4b-it

quant_name Quantization type (required)
Expand Down Expand Up @@ -91,13 +91,13 @@ case "$HF_MODEL" in
AUDIO_FILE="poem.wav"
IMAGE_PATH=""
;;
openai/whisper-small)
MODEL_NAME="whisper"
openai/whisper-*)
MODEL_NAME="${HF_MODEL#openai/}"
RUNNER_TARGET="whisper_runner"
RUNNER_PATH="whisper"
EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
PREPROCESSOR="whisper_preprocessor.pte"
TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
TOKENIZER_URL="https://huggingface.co/${HF_MODEL}/resolve/main" # @lint-ignore
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE="output.wav"
Expand All @@ -117,7 +117,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it"
exit 1
;;
esac
Expand All @@ -142,7 +142,7 @@ fi
# Download test files
if [ "$AUDIO_URL" != "" ]; then
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
elif [ "$MODEL_NAME" = "whisper" ]; then
elif [[ "$MODEL_NAME" == *whisper* ]]; then
conda install -y -c conda-forge "ffmpeg<8"
pip install datasets soundfile torchcodec
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
Expand Down Expand Up @@ -179,8 +179,8 @@ case "$MODEL_NAME" in
voxtral)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
;;
whisper)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
whisper-*)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR --model_name ${MODEL_NAME}"
;;
gemma3)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ jobs:
name: "Voxtral-Mini-3B-2507"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
name: "whisper-large-v3-turbo"
- repo: "google"
name: "gemma-3-4b-it"
quant:
Expand Down Expand Up @@ -223,6 +225,8 @@ jobs:
name: "Voxtral-Mini-3B-2507"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
name: "whisper-large-v3-turbo"
- repo: "google"
name: "gemma-3-4b-it"
quant:
Expand Down
11 changes: 10 additions & 1 deletion examples/models/whisper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ This command generates:
Export a preprocessor to convert raw audio to mel-spectrograms:

```bash
# Use --feature_size 128 for whisper-large-v3 and whisper-large-v3-turbo
python -m executorch.extension.audio.mel_spectrogram \
--feature_size 80 \
--stack_output \
Expand Down Expand Up @@ -90,14 +91,22 @@ optimum-cli export executorch \

### Download Tokenizer

Download the tokenizer files required for inference:
Download the tokenizer files required for inference according to your model version:

**For Whisper Small:**
```bash
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
```

**For Whisper Large v2:**
```bash
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/special_tokens_map.json -o special_tokens_map.json
```

### Prepare Audio

Generate test audio or use an existing WAV file. The model expects 16kHz mono audio.
Expand Down
21 changes: 20 additions & 1 deletion examples/models/whisper/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ DEFINE_string(
audio_path,
"",
"Path to input audio file. Accepts .wav or raw float .bin.");
DEFINE_string(
model_name,
"base",
"Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
DEFINE_double(
temperature,
0.0,
Expand Down Expand Up @@ -109,7 +113,22 @@ int main(int argc, char** argv) {
executorch::extension::asr::AsrTranscribeConfig config;
config.max_new_tokens = FLAGS_max_new_tokens;
config.temperature = static_cast<float>(FLAGS_temperature);
config.decoder_start_token_id = 50257;

// Set decoder_start_token_id based on model version
if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
FLAGS_model_name == "large-v3-turbo") {
config.decoder_start_token_id = 50258;
ET_LOG(
Info,
"Using decoder_start_token_id=50258 for model: %s",
FLAGS_model_name.c_str());
} else {
config.decoder_start_token_id = 50257;
ET_LOG(
Info,
"Using decoder_start_token_id=50257 for model: %s",
FLAGS_model_name.c_str());
}

auto result =
runner.transcribe(features, config, [&](const std::string& piece) {
Expand Down
7 changes: 2 additions & 5 deletions extension/asr/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
Info,
"Conversion complete, first value = %f",
static_cast<float>(
preprocessed_features
->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
preprocessed_features->mutable_data_ptr<float>()[0]));
}
}

Expand Down Expand Up @@ -223,9 +222,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
ET_LOG(
Info,
"Encoder first value: %f",
static_cast<float>(
encoder_output_tensor
.mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
static_cast<float>(encoder_output_tensor.mutable_data_ptr<float>()[0]));

auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
std::move(encoder_output_tensor));
Expand Down
Loading