diff --git a/src/guidellm/benchmark/scenarios/chat.json b/src/guidellm/benchmark/scenarios/chat.json deleted file mode 100644 index 58fd18e2..00000000 --- a/src/guidellm/benchmark/scenarios/chat.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "profile": "sweep", - "data": [ - "prompt_tokens=512,prompt_tokens_stdev=128,prompt_tokens_min=1,prompt_tokens_max=1024,output_tokens=256,output_tokens_stdev=64,output_tokens_min=1,output_tokens_max=1024" - ] -} \ No newline at end of file diff --git a/src/guidellm/benchmark/scenarios/concurrent-1k-1k-equal.json b/src/guidellm/benchmark/scenarios/concurrent-1k-1k-equal.json new file mode 100644 index 00000000..efeca564 --- /dev/null +++ b/src/guidellm/benchmark/scenarios/concurrent-1k-1k-equal.json @@ -0,0 +1,11 @@ +{ + "description": "Prefill/Decode balanced scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.", + "profile": "concurrent", + "request-type": "text_completions", + "data": { + "prompt_tokens": 1000, + "output_tokens": 1000 + }, + "rate": [1, 50, 100, 200, 300, 500, 650], + "max-seconds": 600 +} diff --git a/src/guidellm/benchmark/scenarios/concurrent-2ki-128-equal.json b/src/guidellm/benchmark/scenarios/concurrent-2ki-128-equal.json new file mode 100644 index 00000000..dd207b33 --- /dev/null +++ b/src/guidellm/benchmark/scenarios/concurrent-2ki-128-equal.json @@ -0,0 +1,11 @@ +{ + "description": "Prefill heavy scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.", + "profile": "concurrent", + "request-type": "text_completions", + "data": { + "prompt_tokens": 2048, + "output_tokens": 128 + }, + "rate": [1, 50, 100, 200, 300, 500, 650], + "max-seconds": 600 +} diff --git a/src/guidellm/benchmark/scenarios/concurrent-512-2ki-norm.json b/src/guidellm/benchmark/scenarios/concurrent-512-2ki-norm.json new file mode 100644 index 00000000..3b16904b --- /dev/null +++ b/src/guidellm/benchmark/scenarios/concurrent-512-2ki-norm.json @@ -0,0 +1,17 @@ +{ + "description": "Generation heavy scenario with sequence length variance. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.", + "profile": "concurrent", + "request-type": "text_completions", + "data": { + "prompt_tokens": 512, + "prompt_tokens_stdev": 128, + "prompt_tokens_min": 1, + "prompt_tokens_max": 1024, + "output_tokens": 2048, + "output_tokens_stdev": 512, + "output_tokens_min": 1, + "output_tokens_max": 4096 + }, + "rate": [1, 5, 25, 50, 100, 150, 200, 250, 300, 400, 500, 650], + "max-seconds": 600 +} diff --git a/src/guidellm/benchmark/scenarios/concurrent-8k-1k-equal.json b/src/guidellm/benchmark/scenarios/concurrent-8k-1k-equal.json new file mode 100644 index 00000000..1170854f --- /dev/null +++ b/src/guidellm/benchmark/scenarios/concurrent-8k-1k-equal.json @@ -0,0 +1,11 @@ +{ + "description": "Large context scenario. Note: This scenario is optimized for NVIDIA H200s and may need to be adjusted for other hardware.", + "profile": "concurrent", + "request-type": "text_completions", + "data": { + "prompt_tokens": 8000, + "output_tokens": 1000 + }, + "rate": [1, 50, 100, 200, 300, 500, 650], + "max-seconds": 600 +} diff --git a/src/guidellm/benchmark/scenarios/rag.json b/src/guidellm/benchmark/scenarios/rag.json deleted file mode 100644 index ea38d76e..00000000 --- a/src/guidellm/benchmark/scenarios/rag.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "profile": "sweep", - "data": [ - "prompt_tokens=4096,prompt_tokens_stdev=512,prompt_tokens_min=2048,prompt_tokens_max=6144,output_tokens=512,output_tokens_stdev=128,output_tokens_min=1,output_tokens_max=1024" - ] -} \ No newline at end of file