Skip to content

Commit a47c3c3

Browse files
authored
[GuideLLM Refactor] Replace librosa, pydub, and soundfile with torchcodec (#411)
## TODO - [ ] ~~More flexible version locking in multimodal extras group~~ - Goal with this was to add locking for different torchcodec/torch versions but honestly it's not worth the hassle - [x] Check for multi-modal libs being installed - [ ] More testing on `encode_audio` ## Summary <!-- Include a short paragraph of the changes introduced in this PR. If this PR requires additional context or rationale, explain why the changes are necessary. --> Replaces audio processing libraries with `torchcodec` which eliminates 19 dependencies and brings us in line with what HuggingFace `datasets` is doing. ## Details <!-- Provide a detailed list of all changes introduced in this pull request. --> - ## Test Plan <!-- List the steps needed to test this PR. --> - Run against audio server with ```bash guidellm benchmark run \ --target http://localhost:8000 \ --profile "synchronous" \ --max-requests 20 \ --request-type "audio_transcriptions" \ --data "openslr/librispeech_asr" \ --data-args '{"name": "clean", "split": "test"}' ``` --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [ ] Includes code generated by an AI application - [ ] Includes AI-generated tests (NOTE: AI written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 57683a2 + cf5a2e3 commit a47c3c3

File tree

7 files changed

+1614
-1275
lines changed

7 files changed

+1614
-1275
lines changed

pylock.toml

Lines changed: 1124 additions & 880 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,26 +62,37 @@ dependencies = [
6262
"httpx[http2]<1.0.0",
6363
"loguru",
6464
"msgpack",
65-
"numpy<2.0.0",
66-
"pillow",
65+
"numpy>=2.0.0",
6766
"protobuf",
6867
"pydantic>=2.11.7",
6968
"pydantic-settings>=2.0.0",
70-
"pydub",
7169
"pyyaml>=6.0.0",
7270
"rich",
7371
"sanic",
7472
"transformers",
7573
"uvloop>=0.18",
76-
"librosa>=0.11.0",
7774
"torch",
7875
]
7976

8077
[project.optional-dependencies]
78+
# Meta Extras
79+
all = ["guidellm[perf,openai,multimodal]"]
80+
recommended = ["guidellm[perf,openai]"]
81+
# Feature Extras
8182
perf = ["orjson", "msgpack", "msgspec", "uvloop"]
8283
openai = ["tiktoken>=0.11.0", "blobfile>=3.1.0"]
83-
recommended = ["guidellm[perf,openai]"]
84+
multimodal = [
85+
"datasets[audio,vision]>=4.1.0",
86+
"pillow",
87+
# Torchcodec needs specific torch version
88+
"torch==2.9.*",
89+
"torchcodec==0.8",
90+
]
91+
# Dev Tooling
8492
dev = [
93+
# Install all optional dependencies
94+
"guidellm[all]",
95+
8596
# build
8697
"build>=1.0.0",
8798
"setuptools>=61.0",

src/guidellm/data/preprocessors/formatters.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from __future__ import annotations
22

3+
from abc import ABCMeta
34
from typing import Any
45

56
from guidellm.data.preprocessors.preprocessor import (
67
DatasetPreprocessor,
78
PreprocessorRegistry,
89
)
910
from guidellm.data.schemas import GenerativeDatasetColumnType
10-
from guidellm.data.utils import encode_audio, encode_image, encode_video, text_stats
11+
from guidellm.data.utils import text_stats
1112
from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
1213

1314
__all__ = [
@@ -18,8 +19,28 @@
1819
]
1920

2021

22+
class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta):
23+
@staticmethod
24+
def encode_audio(*args, **kwargs):
25+
from guidellm.extras.multimodal import encode_audio
26+
27+
return encode_audio(*args, **kwargs)
28+
29+
@staticmethod
30+
def encode_image(*args, **kwargs):
31+
from guidellm.extras.multimodal import encode_image
32+
33+
return encode_image(*args, **kwargs)
34+
35+
@staticmethod
36+
def encode_video(*args, **kwargs):
37+
from guidellm.extras.multimodal import encode_video
38+
39+
return encode_video(*args, **kwargs)
40+
41+
2142
@PreprocessorRegistry.register("text_completions")
22-
class GenerativeTextCompletionsRequestFormatter(DatasetPreprocessor):
43+
class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
2344
def __init__(
2445
self,
2546
model: str,
@@ -92,7 +113,7 @@ def __call__(
92113

93114

94115
@PreprocessorRegistry.register("chat_completions")
95-
class GenerativeChatCompletionsRequestFormatter(DatasetPreprocessor):
116+
class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
96117
def __init__(
97118
self,
98119
model: str,
@@ -120,7 +141,7 @@ def __init__(
120141
encode_kwargs.get("audio", {}) if encode_kwargs else {}
121142
)
122143

123-
def __call__(
144+
def __call__( # noqa: C901, PLR0912, PLR0915
124145
self, columns: dict[GenerativeDatasetColumnType, list[Any]]
125146
) -> GenerationRequest:
126147
arguments = GenerationRequestArguments(body={})
@@ -200,7 +221,7 @@ def __call__(
200221
if not image:
201222
continue
202223

203-
image_dict = encode_image(image, **self.encode_image_kwargs)
224+
image_dict = self.encode_image(image, **self.encode_image_kwargs)
204225
if (image_pixels := image_dict.get("image_pixels")) is not None:
205226
input_metrics.image_pixels = (
206227
input_metrics.image_pixels or 0
@@ -223,7 +244,7 @@ def __call__(
223244
if not video:
224245
continue
225246

226-
video_dict = encode_video(video, **self.encode_video_kwargs)
247+
video_dict = self.encode_video(video, **self.encode_video_kwargs)
227248
if (video_frames := video_dict.get("video_frames")) is not None:
228249
input_metrics.video_frames = (
229250
input_metrics.video_frames or 0
@@ -250,7 +271,9 @@ def __call__(
250271
if not audio:
251272
continue
252273

253-
audio_dict = encode_audio(audio, b64encode=True, **self.encode_audio_kwargs)
274+
audio_dict = self.encode_audio(
275+
audio, b64encode=True, **self.encode_audio_kwargs
276+
)
254277
if (audio_samples := audio_dict.get("audio_samples")) is not None:
255278
input_metrics.audio_samples = (
256279
input_metrics.audio_samples or 0
@@ -288,7 +311,7 @@ def __call__(
288311

289312

290313
@PreprocessorRegistry.register("audio_transcriptions")
291-
class GenerativeAudioTranscriptionRequestFormatter(DatasetPreprocessor):
314+
class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
292315
def __init__(
293316
self,
294317
model: str,
@@ -345,7 +368,7 @@ def __call__( # noqa: C901
345368
f"one audio column, but got {len(audio_columns)}."
346369
)
347370

348-
audio_dict = encode_audio(
371+
audio_dict = self.encode_audio(
349372
audio_columns[0], b64encode=False, **self.encode_audio_kwargs
350373
)
351374
input_metrics.audio_samples = audio_dict.get("audio_samples")
Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,10 @@
11
from .dataset import DEFAULT_SPLITS, resolve_dataset_split
22
from .functions import (
3-
encode_audio,
4-
encode_image,
5-
encode_video,
6-
get_file_format,
7-
is_url,
8-
resize_image,
93
text_stats,
104
)
115

126
__all__ = [
137
"DEFAULT_SPLITS",
14-
"encode_audio",
15-
"encode_image",
16-
"encode_video",
17-
"get_file_format",
18-
"is_url",
19-
"resize_image",
208
"resolve_dataset_split",
219
"text_stats",
2210
]

0 commit comments

Comments
 (0)