diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_cuda_artifact.sh
index 85e34ae5b80..3ff27fc2bd0 100755
--- a/.ci/scripts/export_model_cuda_artifact.sh
+++ b/.ci/scripts/export_model_cuda_artifact.sh
@@ -17,7 +17,7 @@ Arguments:
   hf_model     HuggingFace model ID (required)
                Supported models:
                  - mistralai/Voxtral-Mini-3B-2507
-                 - openai/whisper-small
+                 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                  - google/gemma-3-4b-it
 
   quant_name   Quantization type (optional, default: non-quantized)
@@ -62,13 +62,17 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE="128"
     PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
     ;;
-  openai/whisper-small)
+  openai/whisper-*)
     MODEL_NAME="whisper"
     TASK="automatic-speech-recognition"
     MAX_SEQ_LEN=""
     EXTRA_PIP="librosa"
-    PREPROCESSOR_FEATURE_SIZE="80"
     PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
+    if [[ "$HF_MODEL" == *"large-v3"* ]]; then
+      PREPROCESSOR_FEATURE_SIZE="128"
+    else
+      PREPROCESSOR_FEATURE_SIZE="80"
+    fi
     ;;
   google/gemma-3-4b-it)
     MODEL_NAME="gemma3"
@@ -80,7 +84,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it"
     exit 1
     ;;
 esac
diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_cuda_e2e.sh
index 02845bf4b96..dc577dfc753 100755
--- a/.ci/scripts/test_model_cuda_e2e.sh
+++ b/.ci/scripts/test_model_cuda_e2e.sh
@@ -17,7 +17,7 @@ Arguments:
   hf_model     HuggingFace model ID (required)
                Supported models:
                  - mistralai/Voxtral-Mini-3B-2507
-                 - openai/whisper-small
+                 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                  - google/gemma-3-4b-it
 
   quant_name   Quantization type (required)
@@ -91,13 +91,13 @@ case "$HF_MODEL" in
     AUDIO_FILE="poem.wav"
     IMAGE_PATH=""
     ;;
-  openai/whisper-small)
-    MODEL_NAME="whisper"
+  openai/whisper-*)
+    MODEL_NAME="${HF_MODEL#openai/}"
     RUNNER_TARGET="whisper_runner"
     RUNNER_PATH="whisper"
     EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
     PREPROCESSOR="whisper_preprocessor.pte"
-    TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
+    TOKENIZER_URL="https://huggingface.co/${HF_MODEL}/resolve/main" # @lint-ignore
     TOKENIZER_FILE=""
     AUDIO_URL=""
     AUDIO_FILE="output.wav"
@@ -117,7 +117,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it"
     exit 1
     ;;
 esac
@@ -142,7 +142,7 @@ fi
 # Download test files
 if [ "$AUDIO_URL" != "" ]; then
   curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
-elif [ "$MODEL_NAME" = "whisper" ]; then
+elif [[ "$MODEL_NAME" == *whisper* ]]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile torchcodec
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
@@ -179,8 +179,8 @@ case "$MODEL_NAME" in
   voxtral)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
     ;;
-  whisper)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
+  whisper-*)
+    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR --model_name ${MODEL_NAME}"
     ;;
   gemma3)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 356180772c4..80d5484ff15 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -104,6 +104,8 @@ jobs:
             name: "Voxtral-Mini-3B-2507"
           - repo: "openai"
             name: "whisper-small"
+          - repo: "openai"
+            name: "whisper-large-v3-turbo"
           - repo: "google"
             name: "gemma-3-4b-it"
         quant:
@@ -223,6 +225,8 @@ jobs:
             name: "Voxtral-Mini-3B-2507"
           - repo: "openai"
             name: "whisper-small"
+          - repo: "openai"
+            name: "whisper-large-v3-turbo"
           - repo: "google"
             name: "gemma-3-4b-it"
         quant:
diff --git a/examples/models/whisper/README.md b/examples/models/whisper/README.md
index a4025441f7e..2bd47663305 100644
--- a/examples/models/whisper/README.md
+++ b/examples/models/whisper/README.md
@@ -61,6 +61,7 @@ This command generates:
 Export a preprocessor to convert raw audio to mel-spectrograms:
 
 ```bash
+# Use --feature_size 128 for whisper-large-v3 and whisper-large-v3-turbo
 python -m executorch.extension.audio.mel_spectrogram \
     --feature_size 80 \
     --stack_output \
@@ -90,14 +91,22 @@ optimum-cli export executorch \
 
 ### Download Tokenizer
 
-Download the tokenizer files required for inference:
+Download the tokenizer files required for inference according to your model version:
 
+**For Whisper Small:**
 ```bash
 curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
 curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
 curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
 ```
 
+**For Whisper Large v2:**
+```bash
+curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
+curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
+curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/special_tokens_map.json -o special_tokens_map.json
+```
+
 ### Prepare Audio
 
 Generate test audio or use an existing WAV file. The model expects 16kHz mono audio.
diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
index b4462e2c39a..d4b156a413a 100644
--- a/examples/models/whisper/main.cpp
+++ b/examples/models/whisper/main.cpp
@@ -39,6 +39,10 @@ DEFINE_string(
     audio_path,
     "",
     "Path to input audio file. Accepts .wav or raw float .bin.");
+DEFINE_string(
+    model_name,
+    "base",
+    "Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
 DEFINE_double(
     temperature,
     0.0,
@@ -109,7 +113,22 @@ int main(int argc, char** argv) {
   executorch::extension::asr::AsrTranscribeConfig config;
   config.max_new_tokens = FLAGS_max_new_tokens;
   config.temperature = static_cast<float>(FLAGS_temperature);
-  config.decoder_start_token_id = 50257;
+
+  // Set decoder_start_token_id based on model version
+  if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
+      FLAGS_model_name == "large-v3-turbo") {
+    config.decoder_start_token_id = 50258;
+    ET_LOG(
+        Info,
+        "Using decoder_start_token_id=50258 for model: %s",
+        FLAGS_model_name.c_str());
+  } else {
+    config.decoder_start_token_id = 50257;
+    ET_LOG(
+        Info,
+        "Using decoder_start_token_id=50257 for model: %s",
+        FLAGS_model_name.c_str());
+  }
 
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
index 6bbb44e4faa..4f2523989c1 100644
--- a/extension/asr/runner/runner.cpp
+++ b/extension/asr/runner/runner.cpp
@@ -192,8 +192,7 @@ Result<std::vector<uint64_t>> AsrRunner::transcribe(
           Info,
           "Conversion complete, first value = %f",
           static_cast<float>(
-              preprocessed_features
-                  ->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+              preprocessed_features->mutable_data_ptr<float>()[0]));
     }
   }
 
@@ -223,9 +222,7 @@
   ET_LOG(
       Info,
       "Encoder first value: %f",
-      static_cast<float>(
-          encoder_output_tensor
-              .mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+      static_cast<float>(encoder_output_tensor.mutable_data_ptr<float>()[0]));
 
   auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
       std::move(encoder_output_tensor));
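With these changes, any of the listed whisper variants can be exercised through the two CI scripts. A minimal sketch, assuming only the documented positional arguments (model ID, then quant name, where `non-quantized` is the export script's stated default) and no extra arguments such as a custom output directory:

```bash
# Sketch: export CUDA artifacts for one of the newly supported variants,
# then run the end-to-end CUDA test against them. Any additional positional
# arguments follow each script's own usage text.
bash .ci/scripts/export_model_cuda_artifact.sh "openai/whisper-large-v3-turbo" "non-quantized"
bash .ci/scripts/test_model_cuda_e2e.sh "openai/whisper-large-v3-turbo" "non-quantized"
```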