
Commit 03e4181

Add benchmark scripts for Torch MultiQueue benchmarks

1 parent: 2752822

File tree: 4 files changed (+124, −2)

devops/actions/run-tests/benchmark/action.yml

Lines changed: 18 additions & 0 deletions
@@ -166,6 +166,11 @@ runs:
       with:
         ref: ${{ env.BENCHMARK_RESULTS_BRANCH }}
         path: llvm-ci-perf-results
+
+    # - name: Show compute-benchmarks version
+    #   shell: bash
+    #   run: |
+
     - name: Build and run benchmarks
       env:
         # Need to append "_<device>_<backend>" to save name in order to follow
@@ -274,6 +279,19 @@ runs:
           export COMPUTE_BENCHMARKS_BUILD_PATH=$WORKDIR/compute-benchmarks-build
           python3 ./devops/scripts/benchmarks/tests/test_integration.py
         fi
+
+    - name: Show compute-benchmarks version
+      shell: bash
+      run: |
+        echo "Compute-benchmarks version info:"
+        python3 -c "
+        import sys
+        sys.path.append('./devops/scripts/benchmarks')
+        from benches.compute import ComputeBench
+        bench = ComputeBench()
+        print(f'Git hash: {bench.git_hash()}')
+        print(f'Git URL: {bench.git_url()}')
+        "
     - name: Cache changes and upload github summary
       if: always()
       shell: bash
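For local debugging, the same check can be run outside CI. A minimal sketch, assuming the repository root as the working directory and that ComputeBench() really can be constructed with no arguments, as the step above assumes:

    # Local equivalent of the "Show compute-benchmarks version" step above.
    # Assumptions: run from the repo root; ComputeBench() takes no arguments.
    import sys

    sys.path.append("./devops/scripts/benchmarks")

    from benches.compute import ComputeBench

    bench = ComputeBench()
    print(f"Git hash: {bench.git_hash()}")  # pinned compute-benchmarks revision
    print(f"Git URL: {bench.git_url()}")    # repository the suite clones from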

devops/scripts/benchmarks/benches/compute.py

Lines changed: 80 additions & 1 deletion
@@ -62,7 +62,10 @@ def git_url(self) -> str:

     def git_hash(self) -> str:
         # Nov 17, 2025
-        return "932ae79f7cca7e156285fc10a59610927c769e89"
+        git_hash_value = "ec6710ff85cb6bd9232ca67237e782618b4d8382"
+        log.info(f"ComputeBench git hash: {git_hash_value}")
+
+        return git_hash_value

     def setup(self) -> None:
         if options.sycl is None:
@@ -269,6 +272,7 @@ def benchmarks(self) -> list[Benchmark]:
             )
         )

+        # Add RecordAndReplay benchmarks
         record_and_replay_params = product([0, 1], [0, 1])
         for emulate, instantiate in record_and_replay_params:

@@ -315,6 +319,39 @@ def createRrBench(variant_name: str, **kwargs):
             ),
         ]

+        # Add TorchMultiQueue benchmarks
+        for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
+
+            def createTorchMultiQueueBench(variant_name: str, **kwargs):
+                return TorchMultiQueue(
+                    self,
+                    runtime,
+                    variant_name,
+                    PROFILERS.TIMER,
+                    **kwargs,
+                )
+
+            benches += [
+                createTorchMultiQueueBench(
+                    "large",
+                    workgroupCount=4096,
+                    workgroupSize=512,
+                    kernelsPerQueue=20,
+                ),
+                createTorchMultiQueueBench(
+                    "medium",
+                    workgroupCount=512,
+                    workgroupSize=256,
+                    kernelsPerQueue=10,
+                ),
+                createTorchMultiQueueBench(
+                    "small",
+                    workgroupCount=256,
+                    workgroupSize=124,
+                    kernelsPerQueue=4,
+                ),
+            ]
+
         # Add UR-specific benchmarks
         benches += [
             # TODO: multithread_benchmark_ur fails with segfault
@@ -735,6 +772,48 @@ def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         return [f"--{k}={v}" for k, v in self._rr_params.items()]


+class TorchMultiQueue(ComputeBenchmark):
+    def __init__(
+        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
+    ):
+        self._variant_name = variant_name
+        self._smq_params = kwargs
+        self._iterations_regular = 1000
+        self._iterations_trace = 10
+        super().__init__(
+            suite,
+            f"torch_benchmark_{runtime.value}",
+            "KernelSubmitMultiQueue",
+            runtime,
+            profiler_type,
+        )
+
+    def name(self):
+        ret = []
+        for k, v in self._smq_params.items():
+            ret.append(f"{k} {v}")
+        ret.sort()
+        return self._bench_name + " " + ", ".join(ret)
+
+    def display_name(self) -> str:
+        return f"{self.explicit_group()} {self._runtime.value}"
+
+    def explicit_group(self):
+        return f"{self._test} {self._variant_name}"
+
+    def get_tags(self):
+        return ["pytorch_" + runtime_to_tag_name(self._runtime)]
+
+    def _supported_runtimes(self) -> list[RUNTIMES]:
+        return super()._supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
+
+    def _bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
+        iters = self._get_iters(run_trace)
+        return [f"--iterations={iters}"] + [
+            f"--{k}={v}" for k, v in self._smq_params.items()
+        ]
+
+
 class QueueInOrderMemcpy(ComputeBenchmark):
     def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self._is_copy_only = isCopyOnly
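To see how the kwargs above become benchmark names and binary flags, here is a minimal standalone sketch of the string-building in TorchMultiQueue.name() and _bin_args(); the RUNTIMES stub is reduced for illustration, and the parameter values are the "large" variant from this diff:

    # Standalone sketch of the TorchMultiQueue name() and _bin_args() logic;
    # the framework classes are stubbed out, only the string-building remains.
    from enum import Enum


    class RUNTIMES(Enum):  # reduced stub of the real enum
        L0 = "l0"


    params = {"workgroupCount": 4096, "workgroupSize": 512, "kernelsPerQueue": 20}
    bench_name = f"torch_benchmark_{RUNTIMES.L0.value}"

    # name(): bench name plus the sorted "key value" pairs, comma-separated
    print(bench_name + " " + ", ".join(sorted(f"{k} {v}" for k, v in params.items())))
    # -> torch_benchmark_l0 kernelsPerQueue 20, workgroupCount 4096, workgroupSize 512

    # _bin_args(): an --iterations flag plus one --<key>=<value> flag per kwarg
    iterations = 1000  # _iterations_regular for non-traced runs
    print([f"--iterations={iterations}"] + [f"--{k}={v}" for k, v in params.items()])

These are exactly the names asserted in test_integration.py below.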

devops/scripts/benchmarks/main.py

Lines changed: 6 additions & 1 deletion
@@ -108,7 +108,6 @@ def run_iterations(
     """

     for iter in range(iters):
-        log.info(f"running {benchmark.name()}, iteration {iter}... ")
         try:
             bench_results = benchmark.run(
                 env_vars, run_trace=run_trace, force_trace=force_trace
@@ -207,7 +206,9 @@ def process_results(
         if stddev_threshold_override is not None
         else options.stddev_threshold
     )
+    print("Threshold", threshold)
     threshold_scaled = threshold * mean_value
+    print(threshold_scaled)

     if stddev > threshold_scaled:
         log.warning(
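The two added print calls sit inside the stability check, which scales the relative stddev threshold by the mean before comparing. A minimal sketch of that logic with made-up numbers (the rest of process_results is omitted):

    # Illustrative sketch of the stddev check; only the scaling mirrors
    # process_results, the input values are invented.
    import statistics

    values = [10.0, 10.4, 9.8, 10.2]           # per-iteration results
    mean_value = statistics.mean(values)       # 10.1
    stddev = statistics.stdev(values)          # ~0.258

    threshold = 0.02                           # relative threshold, e.g. 2%
    threshold_scaled = threshold * mean_value  # ~0.202, in absolute units

    if stddev > threshold_scaled:              # 0.258 > 0.202 -> flagged unstable
        print(f"unstable: stddev {stddev:.3f} > {threshold_scaled:.3f}")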
@@ -293,6 +294,9 @@ def main(directory, additional_env_vars, compare_names, filter, execution_stats)
     # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

     for s in suites:
+        if isinstance(s, ComputeBench):
+            log.info(f"Benchmarks version - {s.name()}: {s.git_hash()}")
+
         if s.name() not in enabled_suites(options.preset):
             continue

@@ -887,6 +891,7 @@ def validate_and_parse_env_args(env_args):
         execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")
+    log.info("Benchmarks version")

     main(
         args.benchmark_directory,

devops/scripts/benchmarks/tests/test_integration.py

Lines changed: 20 additions & 0 deletions
@@ -188,6 +188,26 @@ def test_submit_kernel(self):
             {"L0", "latency", "micro", "submit"},
         )

+    def test_torch_l0(self):
+        self._checkCase(
+            "torch_benchmark_l0 kernelsPerQueue 20, workgroupCount 4096, workgroupSize 512",
+            "KernelSubmitMultiQueue large",
+            {"pytorch_L0"},
+        )
+
+    def test_torch_sycl(self):
+        self._checkCase(
+            "torch_benchmark_sycl kernelsPerQueue 10, workgroupCount 512, workgroupSize 256",
+            "KernelSubmitMultiQueue medium",
+            {"pytorch_SYCL"},
+        )
+
+    def test_torch_syclpreview(self):
+        self._checkCase(
+            "torch_benchmark_syclpreview kernelsPerQueue 4, workgroupCount 256, workgroupSize 124",
+            "KernelSubmitMultiQueue small",
+            {"pytorch_SYCL"},
+        )

 if __name__ == "__main__":
     unittest.main()
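_checkCase itself is defined earlier in test_integration.py and is not part of this diff. A hypothetical sketch of its shape, shown only to make the argument order above readable; the helper names here are invented:

    # Hypothetical: (expected benchmark name, expected explicit group,
    # expected subset of tags). The real lookup and assertions may differ.
    def _checkCase(self, name, explicit_group, tags):
        bench = self._benchmarks_by_name[name]  # invented index of suite benches
        self.assertEqual(bench.explicit_group(), explicit_group)
        self.assertTrue(tags.issubset(set(bench.get_tags())))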
