Skip to content

Commit 8a8af36

Browse files
authored
Support whisper-large-{v2, v3, v3-turbo} (#15680)
This PR supports whisper-large-{v2, v3, v3-turbo} on the et-cuda backend, and updates the README and CI.
1 parent 24c6961 commit 8a8af36

File tree

6 files changed

+52
-19
lines changed

6 files changed

+52
-19
lines changed

.ci/scripts/export_model_cuda_artifact.sh

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Arguments:
1717
hf_model HuggingFace model ID (required)
1818
Supported models:
1919
- mistralai/Voxtral-Mini-3B-2507
20-
- openai/whisper-small
20+
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2121
- google/gemma-3-4b-it
2222
2323
quant_name Quantization type (optional, default: non-quantized)
@@ -62,13 +62,17 @@ case "$HF_MODEL" in
6262
PREPROCESSOR_FEATURE_SIZE="128"
6363
PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
6464
;;
65-
openai/whisper-small)
65+
openai/whisper-*)
6666
MODEL_NAME="whisper"
6767
TASK="automatic-speech-recognition"
6868
MAX_SEQ_LEN=""
6969
EXTRA_PIP="librosa"
70-
PREPROCESSOR_FEATURE_SIZE="80"
7170
PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
71+
if [[ "$HF_MODEL" == *"large-v3"* ]]; then
72+
PREPROCESSOR_FEATURE_SIZE="128"
73+
else
74+
PREPROCESSOR_FEATURE_SIZE="80"
75+
fi
7276
;;
7377
google/gemma-3-4b-it)
7478
MODEL_NAME="gemma3"
@@ -80,7 +84,7 @@ case "$HF_MODEL" in
8084
;;
8185
*)
8286
echo "Error: Unsupported model '$HF_MODEL'"
83-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
87+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it"
8488
exit 1
8589
;;
8690
esac

.ci/scripts/test_model_cuda_e2e.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Arguments:
1717
hf_model HuggingFace model ID (required)
1818
Supported models:
1919
- mistralai/Voxtral-Mini-3B-2507
20-
- openai/whisper-small
20+
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2121
- google/gemma-3-4b-it
2222
2323
quant_name Quantization type (required)
@@ -91,13 +91,13 @@ case "$HF_MODEL" in
9191
AUDIO_FILE="poem.wav"
9292
IMAGE_PATH=""
9393
;;
94-
openai/whisper-small)
95-
MODEL_NAME="whisper"
94+
openai/whisper-*)
95+
MODEL_NAME="${HF_MODEL#openai/}"
9696
RUNNER_TARGET="whisper_runner"
9797
RUNNER_PATH="whisper"
9898
EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
9999
PREPROCESSOR="whisper_preprocessor.pte"
100-
TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
100+
TOKENIZER_URL="https://huggingface.co/${HF_MODEL}/resolve/main" # @lint-ignore
101101
TOKENIZER_FILE=""
102102
AUDIO_URL=""
103103
AUDIO_FILE="output.wav"
@@ -117,7 +117,7 @@ case "$HF_MODEL" in
117117
;;
118118
*)
119119
echo "Error: Unsupported model '$HF_MODEL'"
120-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
120+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it"
121121
exit 1
122122
;;
123123
esac
@@ -142,7 +142,7 @@ fi
142142
# Download test files
143143
if [ "$AUDIO_URL" != "" ]; then
144144
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
145-
elif [ "$MODEL_NAME" = "whisper" ]; then
145+
elif [[ "$MODEL_NAME" == *whisper* ]]; then
146146
conda install -y -c conda-forge "ffmpeg<8"
147147
pip install datasets soundfile torchcodec
148148
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
@@ -179,8 +179,8 @@ case "$MODEL_NAME" in
179179
voxtral)
180180
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
181181
;;
182-
whisper)
183-
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
182+
whisper-*)
183+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR --model_name ${MODEL_NAME}"
184184
;;
185185
gemma3)
186186
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"

.github/workflows/cuda.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ jobs:
104104
name: "Voxtral-Mini-3B-2507"
105105
- repo: "openai"
106106
name: "whisper-small"
107+
- repo: "openai"
108+
name: "whisper-large-v3-turbo"
107109
- repo: "google"
108110
name: "gemma-3-4b-it"
109111
quant:
@@ -223,6 +225,8 @@ jobs:
223225
name: "Voxtral-Mini-3B-2507"
224226
- repo: "openai"
225227
name: "whisper-small"
228+
- repo: "openai"
229+
name: "whisper-large-v3-turbo"
226230
- repo: "google"
227231
name: "gemma-3-4b-it"
228232
quant:

examples/models/whisper/README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ This command generates:
6161
Export a preprocessor to convert raw audio to mel-spectrograms:
6262

6363
```bash
64+
# Use --feature_size 128 for whisper-large-v3 and whisper-large-v3-turbo
6465
python -m executorch.extension.audio.mel_spectrogram \
6566
--feature_size 80 \
6667
--stack_output \
@@ -90,14 +91,22 @@ optimum-cli export executorch \
9091

9192
### Download Tokenizer
9293

93-
Download the tokenizer files required for inference:
94+
Download the tokenizer files required for inference according to your model version:
9495

96+
**For Whisper Small:**
9597
```bash
9698
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
9799
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
98100
curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
99101
```
100102

103+
**For Whisper Large v2:**
104+
```bash
105+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
106+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
107+
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/special_tokens_map.json -o special_tokens_map.json
108+
```
109+
101110
### Prepare Audio
102111

103112
Generate test audio or use an existing WAV file. The model expects 16kHz mono audio.

examples/models/whisper/main.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ DEFINE_string(
3939
audio_path,
4040
"",
4141
"Path to input audio file. Accepts .wav or raw float .bin.");
42+
DEFINE_string(
43+
model_name,
44+
"base",
45+
"Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
4246
DEFINE_double(
4347
temperature,
4448
0.0,
@@ -109,7 +113,22 @@ int main(int argc, char** argv) {
109113
executorch::extension::asr::AsrTranscribeConfig config;
110114
config.max_new_tokens = FLAGS_max_new_tokens;
111115
config.temperature = static_cast<float>(FLAGS_temperature);
112-
config.decoder_start_token_id = 50257;
116+
117+
// Set decoder_start_token_id based on model version
118+
if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
119+
FLAGS_model_name == "large-v3-turbo") {
120+
config.decoder_start_token_id = 50258;
121+
ET_LOG(
122+
Info,
123+
"Using decoder_start_token_id=50258 for model: %s",
124+
FLAGS_model_name.c_str());
125+
} else {
126+
config.decoder_start_token_id = 50257;
127+
ET_LOG(
128+
Info,
129+
"Using decoder_start_token_id=50257 for model: %s",
130+
FLAGS_model_name.c_str());
131+
}
113132

114133
auto result =
115134
runner.transcribe(features, config, [&](const std::string& piece) {

extension/asr/runner/runner.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
192192
Info,
193193
"Conversion complete, first value = %f",
194194
static_cast<float>(
195-
preprocessed_features
196-
->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
195+
preprocessed_features->mutable_data_ptr<float>()[0]));
197196
}
198197
}
199198

@@ -223,9 +222,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
223222
ET_LOG(
224223
Info,
225224
"Encoder first value: %f",
226-
static_cast<float>(
227-
encoder_output_tensor
228-
.mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
225+
static_cast<float>(encoder_output_tensor.mutable_data_ptr<float>()[0]));
229226

230227
auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
231228
std::move(encoder_output_tensor));

0 commit comments

Comments
 (0)