Skip to content

Commit d9c2528

Browse files
Add benchmarks scripts for Torch MultiQueue benchmarks
1 parent ceae49b commit d9c2528

File tree

2 files changed

+97
-1
lines changed

2 files changed

+97
-1
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def git_url(self) -> str:
6262

6363
def git_hash(self) -> str:
6464
# Nov 17, 2025
65-
return "932ae79f7cca7e156285fc10a59610927c769e89"
65+
return "ec6710ff85cb6bd9232ca67237e782618b4d8382"
6666

6767
def setup(self) -> None:
6868
if options.sycl is None:
@@ -269,6 +269,7 @@ def benchmarks(self) -> list[Benchmark]:
269269
)
270270
)
271271

272+
# Add RecordAndReplay benchmarks
272273
record_and_replay_params = product([0, 1], [0, 1])
273274
for emulate, instantiate in record_and_replay_params:
274275

@@ -315,6 +316,39 @@ def createRrBench(variant_name: str, **kwargs):
315316
),
316317
]
317318

319+
# Add TorchMultiQueue benchmarks
320+
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
321+
322+
def createTorchMultiQueueBench(variant_name: str, **kwargs):
323+
return TorchMultiQueue(
324+
self,
325+
runtime,
326+
variant_name,
327+
PROFILERS.TIMER,
328+
**kwargs,
329+
)
330+
331+
benches += [
332+
createTorchMultiQueueBench(
333+
"large",
334+
workgroupCount=4096,
335+
workgroupSize=512,
336+
kernelsPerQueue=20,
337+
),
338+
createTorchMultiQueueBench(
339+
"medium",
340+
workgroupCount=512,
341+
workgroupSize=256,
342+
kernelsPerQueue=10,
343+
),
344+
createTorchMultiQueueBench(
345+
"small",
346+
workgroupCount=256,
347+
workgroupSize=124,
348+
kernelsPerQueue=4,
349+
),
350+
]
351+
318352
# Add UR-specific benchmarks
319353
benches += [
320354
# TODO: multithread_benchmark_ur fails with segfault
@@ -735,6 +769,48 @@ def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
735769
return [f"--{k}={v}" for k, v in self._rr_params.items()]
736770

737771

772+
class TorchMultiQueue(ComputeBenchmark):
    """Compute benchmark wrapper for the ``KernelSubmitMultiQueue`` test.

    The base ``ComputeBenchmark`` is initialised with the bench name
    ``torch_benchmark_<runtime.value>`` and the test name
    ``KernelSubmitMultiQueue``. Any extra keyword arguments supplied at
    construction are kept and later rendered as ``--<key>=<value>``
    command-line arguments.
    """

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        """Record the variant configuration, then initialise the base class.

        Args:
            suite: Passed through unchanged to ``ComputeBenchmark.__init__``.
            runtime: Runtime under test; ``runtime.value`` becomes part of
                the bench name.
            variant_name: Label of this parameter set, used by
                ``explicit_group``.
            profiler_type: Passed through unchanged to
                ``ComputeBenchmark.__init__``.
            **kwargs: Benchmark parameters, emitted by ``_bin_args`` and
                listed in ``name``.
        """
        # All instance attributes are assigned before the base initialiser
        # runs, matching the original ordering.
        # NOTE(review): the iteration counts are presumably consumed by
        # ``_get_iters`` in the base class -- confirm.
        self._iterations_trace = 10
        self._iterations_regular = 1000
        self._smq_params = kwargs
        self._variant_name = variant_name

        bench_name = f"torch_benchmark_{runtime.value}"
        super().__init__(
            suite,
            bench_name,
            "KernelSubmitMultiQueue",
            runtime,
            profiler_type,
        )

    def name(self):
        """Return the bench name followed by the sorted ``key value`` pairs."""
        # Sorting makes the name independent of keyword-argument order.
        pairs = sorted(f"{key} {value}" for key, value in self._smq_params.items())
        return self._bench_name + " " + ", ".join(pairs)

    def display_name(self) -> str:
        """Return the explicit group followed by the runtime's value."""
        group = self.explicit_group()
        runtime_value = self._runtime.value
        return f"{group} {runtime_value}"

    def explicit_group(self):
        """Return the test name and variant name separated by a space."""
        return f"{self._test} {self._variant_name}"

    def get_tags(self):
        """Return a single tag: ``pytorch_`` plus the runtime's tag name."""
        tag_suffix = runtime_to_tag_name(self._runtime)
        return ["pytorch_" + tag_suffix]

    def _supported_runtimes(self) -> list[RUNTIMES]:
        """Return the base class's runtimes with ``SYCL_PREVIEW`` appended."""
        inherited = super()._supported_runtimes()
        return inherited + [RUNTIMES.SYCL_PREVIEW]

    def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
        """Build the argument list: iteration count first, then each parameter.

        Args:
            run_trace: Tracing mode forwarded to ``_get_iters`` to pick the
                iteration count.
        """
        args = [f"--iterations={self._get_iters(run_trace)}"]
        args.extend(f"--{key}={value}" for key, value in self._smq_params.items())
        return args
812+
813+
738814
class QueueInOrderMemcpy(ComputeBenchmark):
739815
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
740816
self._is_copy_only = isCopyOnly

devops/scripts/benchmarks/tests/test_integration.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,26 @@ def test_submit_kernel(self):
188188
{"L0", "latency", "micro", "submit"},
189189
)
190190

191+
def test_torch_l0(self):
    """Run ``_checkCase`` for the L0 ``large`` KernelSubmitMultiQueue case."""
    # NOTE(review): arguments are presumably (benchmark name, group, expected
    # tags) -- confirm against ``_checkCase``.
    bench_name = (
        "torch_benchmark_l0 kernelsPerQueue 20, workgroupCount 4096, workgroupSize 512"
    )
    group = "KernelSubmitMultiQueue large"
    expected_tags = {"pytorch_L0"}
    self._checkCase(bench_name, group, expected_tags)
197+
198+
def test_torch_sycl(self):
    """Run ``_checkCase`` for the SYCL ``medium`` KernelSubmitMultiQueue case."""
    # NOTE(review): arguments are presumably (benchmark name, group, expected
    # tags) -- confirm against ``_checkCase``.
    bench_name = (
        "torch_benchmark_sycl kernelsPerQueue 10, workgroupCount 512, workgroupSize 256"
    )
    group = "KernelSubmitMultiQueue medium"
    expected_tags = {"pytorch_SYCL"}
    self._checkCase(bench_name, group, expected_tags)
204+
205+
def test_torch_syclpreview(self):
    """Run ``_checkCase`` for the SYCL preview ``small`` KernelSubmitMultiQueue case."""
    # NOTE(review): arguments are presumably (benchmark name, group, expected
    # tags) -- confirm against ``_checkCase``.
    bench_name = (
        "torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 124"
    )
    group = "KernelSubmitMultiQueue small"
    expected_tags = {"pytorch_SYCL"}
    self._checkCase(bench_name, group, expected_tags)
191211

192212
if __name__ == "__main__":
193213
unittest.main()

0 commit comments

Comments
 (0)