vllm-project · sjmonson · Nov 4, 2025
diff --git a/src/guidellm/benchmark/scenarios/chat.json b/src/guidellm/benchmark/scenarios/chat.json
diff --git a/src/guidellm/benchmark/scenarios/concurrent-1k-1k-equal.json b/src/guidellm/benchmark/scenarios/concurrent-1k-1k-equal.json
@@ -0,0 +1,11 @@
+{
+  "description": "Prefill/Decode balanced scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.",
+  "profile": "concurrent",
+  "request-type": "text_completions",
+  "data": {
+    "prompt_tokens": 1000,
+    "output_tokens": 1000
+  },
+  "rate": [1, 50, 100, 200, 300, 500, 650],
+  "max-seconds": 600
+}
diff --git a/src/guidellm/benchmark/scenarios/concurrent-2ki-128-equal.json b/src/guidellm/benchmark/scenarios/concurrent-2ki-128-equal.json
@@ -0,0 +1,11 @@
+{
+  "description": "Prefill heavy scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.",
+  "profile": "concurrent",
+  "request-type": "text_completions",
+  "data": {
+    "prompt_tokens": 2048,
+    "output_tokens": 128
+  },
+  "rate": [1, 50, 100, 200, 300, 500, 650],
+  "max-seconds": 600
+}
diff --git a/src/guidellm/benchmark/scenarios/concurrent-512-2ki-norm.json b/src/guidellm/benchmark/scenarios/concurrent-512-2ki-norm.json
@@ -0,0 +1,17 @@
+{
+  "description": "Generation heavy scenario with sequence length variance. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.",
+  "profile": "concurrent",
+  "request-type": "text_completions",
+  "data": {
+    "prompt_tokens": 512,
+    "prompt_tokens_stdev": 128,
+    "prompt_tokens_min": 1,
+    "prompt_tokens_max": 1024,
+    "output_tokens": 2048,
+    "output_tokens_stdev": 512,
+    "output_tokens_min": 1,
+    "output_tokens_max": 4096
+  },
+  "rate": [1, 5, 25, 50, 100, 150, 200, 250, 300, 400, 500, 650],
+  "max-seconds": 600
+}
diff --git a/src/guidellm/benchmark/scenarios/concurrent-8k-1k-equal.json b/src/guidellm/benchmark/scenarios/concurrent-8k-1k-equal.json
@@ -0,0 +1,11 @@
+{
+  "description": "Large context scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.",
+  "profile": "concurrent",
+  "request-type": "text_completions",
+  "data": {
+    "prompt_tokens": 8000,
+    "output_tokens": 1000
+  },
+  "rate": [1, 50, 100, 200, 300, 500, 650],
+  "max-seconds": 600
+}
diff --git a/src/guidellm/benchmark/scenarios/rag.json b/src/guidellm/benchmark/scenarios/rag.json