Skip to content

Commit 8a8af36

Browse files
authored
Support whisper-large-{v2, v3, v3-turbo} (#15680)
This PR supports whisper-large-{v2, v3, v3-turbo} on the et-cuda backend, and updates the README and CI.
1 parent 24c6961 commit 8a8af36

File tree

6 files changed

+52
-19
lines changed

6 files changed

+52
-19
lines changed

.ci/scripts/export_model_cuda_artifact.sh

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Arguments:
1717
hf_model HuggingFace model ID (required)
1818
Supported models:
1919
- mistralai/Voxtral-Mini-3B-2507
20-
- openai/whisper-small
20+
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2121
- google/gemma-3-4b-it
2222
2323
quant_name Quantization type (optional, default: non-quantized)
@@ -62,13 +62,17 @@ case "$HF_MODEL" in
6262
PREPROCESSOR_FEATURE_SIZE="128"
6363
PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
6464
;;
65-
openai/whisper-small)
65+
openai/whisper-*)
6666
MODEL_NAME="whisper"
6767
TASK="automatic-speech-recognition"
6868
MAX_SEQ_LEN=""
6969
EXTRA_PIP="librosa"
70-
PREPROCESSOR_FEATURE_SIZE="80"
7170
PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
71+
if [[ "$HF_MODEL" == *"large-v3"* ]]; then
72+
PREPROCESSOR_FEATURE_SIZE="128"
73+
else
74+
PREPROCESSOR_FEATURE_SIZE="80"
75+
fi
7276
;;
7377
google/gemma-3-4b-it)
7478
MODEL_NAME="gemma3"
@@ -80,7 +84,7 @@ case "$HF_MODEL" in
8084
;;
8185
*)
8286
echo "Error: Unsupported model '$HF_MODEL'"
83-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
87+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it"
8488
exit 1
8589
;;
8690
esac

.ci/scripts/test_model_cuda_e2e.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Arguments:
1717
hf_model HuggingFace model ID (required)
1818
Supported models:
1919
- mistralai/Voxtral-Mini-3B-2507
20-
- openai/whisper-small
20+
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2121
- google/gemma-3-4b-it
2222
2323
quant_name Quantization type (required)
@@ -91,13 +91,13 @@ case "$HF_MODEL" in
9191
AUDIO_FILE="poem.wav"
9292
IMAGE_PATH=""
9393
;;
94-
openai/whisper-small)
95-
MODEL_NAME="whisper"
94+
openai/whisper-*)
95+
MODEL_NAME="${HF_MODEL#openai/}"
9696
RUNNER_TARGET="whisper_runner"
9797
RUNNER_PATH="whisper"
9898
EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
9999
PREPROCESSOR="whisper_preprocessor.pte"
100-
TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
100+
TOKENIZER_URL="https://huggingface.co/${HF_MODEL}/resolve/main" # @lint-ignore
101101
TOKENIZER_FILE=""
102102
AUDIO_URL=""
103103
AUDIO_FILE="output.wav"
@@ -117,7 +117,7 @@ case "$HF_MODEL" in
117117
;;
118118
*)
119119
echo "Error: Unsupported model '$HF_MODEL'"
120-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
120+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it"
121121
exit 1
122122
;;
123123
esac
@@ -142,7 +142,7 @@ fi
142142
# Download test files
143143
if [ "$AUDIO_URL" != "" ]; then
144144
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
145-
elif [ "$MODEL_NAME" = "whisper" ]; then
145+
elif [[ "$MODEL_NAME" == *whisper* ]]; then
146146
conda install -y -c conda-forge "ffmpeg<8"
147147
pip install datasets soundfile torchcodec
148148
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
@@ -179,8 +179,8 @@ case "$MODEL_NAME" in
179179
voxtral)
180180
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
181181
;;
182-
whisper)
183-
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
182+
whisper-*)
183+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR --model_name ${MODEL_NAME}"
184184
;;
185185
gemma3)
186186
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"

.github/workflows/cuda.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ jobs:
104104
name: "Voxtral-Mini-3B-2507"
105105
- repo: "openai"
106106
name: "whisper-small"
107+
- repo: "openai"
108+
name: "whisper-large-v3-turbo"
107109
- repo: "google"
108110
name: "gemma-3-4b-it"
109111
quant:
@@ -223,6 +225,8 @@ jobs:
223225
name: "Voxtral-Mini-3B-2507"
224226
- repo: "openai"
225227
name: "whisper-small"
228+
- repo: "openai"
229+
name: "whisper-large-v3-turbo"
226230
- repo: "google"
227231
name: "gemma-3-4b-it"
228232
quant:

examples/models/whisper/README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ This command generates:
6161
Export a preprocessor to convert raw audio to mel-spectrograms:
6262

6363
```bash
64+
# Use --feature_size 128 for whisper-large-v3 and whisper-large-v3-turbo
6465
python -m executorch.extension.audio.mel_spectrogram \
6566
--feature_size 80 \
6667
--stack_output \
@@ -90,14 +91,22 @@ optimum-cli export executorch \
9091

9192
### Download Tokenizer
9293

93-
Download the tokenizer files required for inference:
94+
Download the tokenizer files required for inference according to your model version:
9495

96+
**For Whisper Small:**
9597
```bash
9698
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
9799
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
98100
curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
99101
```
100102

103+
**For Whisper Large v2:**
104+
```bash
105+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
106+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
107+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/special_tokens_map.json -o special_tokens_map.json
108+
```
109+
101110
### Prepare Audio
102111

103112
Generate test audio or use an existing WAV file. The model expects 16kHz mono audio.

examples/models/whisper/main.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ DEFINE_string(
3939
audio_path,
4040
"",
4141
"Path to input audio file. Accepts .wav or raw float .bin.");
42+
DEFINE_string(
43+
model_name,
44+
"base",
45+
"Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
4246
DEFINE_double(
4347
temperature,
4448
0.0,
@@ -109,7 +113,22 @@ int main(int argc, char** argv) {
109113
executorch::extension::asr::AsrTranscribeConfig config;
110114
config.max_new_tokens = FLAGS_max_new_tokens;
111115
config.temperature = static_cast<float>(FLAGS_temperature);
112-
config.decoder_start_token_id = 50257;
116+
117+
// Set decoder_start_token_id based on model version
118+
if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
119+
FLAGS_model_name == "large-v3-turbo") {
120+
config.decoder_start_token_id = 50258;
121+
ET_LOG(
122+
Info,
123+
"Using decoder_start_token_id=50258 for model: %s",
124+
FLAGS_model_name.c_str());
125+
} else {
126+
config.decoder_start_token_id = 50257;
127+
ET_LOG(
128+
Info,
129+
"Using decoder_start_token_id=50257 for model: %s",
130+
FLAGS_model_name.c_str());
131+
}
113132

114133
auto result =
115134
runner.transcribe(features, config, [&](const std::string& piece) {

extension/asr/runner/runner.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
192192
Info,
193193
"Conversion complete, first value = %f",
194194
static_cast<float>(
195-
preprocessed_features
196-
->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
195+
preprocessed_features->mutable_data_ptr<float>()[0]));
197196
}
198197
}
199198

@@ -223,9 +222,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
223222
ET_LOG(
224223
Info,
225224
"Encoder first value: %f",
226-
static_cast<float>(
227-
encoder_output_tensor
228-
.mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
225+
static_cast<float>(encoder_output_tensor.mutable_data_ptr<float>()[0]));
229226

230227
auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
231228
std::move(encoder_output_tensor));

0 commit comments

Comments
 (0)