Commit 7488c2b

Add prompt_cached_tokens metrics from each response. (#1814)
* Add prompt_cached_tokens metrics from each response.
* Take zero cache hits into account.
1 parent 95fd944 commit 7488c2b

File tree

5 files changed: +108 -3 lines changed

pkg/epp/handlers/response.go
pkg/epp/handlers/response_test.go
pkg/epp/handlers/server.go
pkg/epp/metrics/metrics.go
pkg/epp/metrics/metrics_test.go


pkg/epp/handlers/response.go

Lines changed: 21 additions & 3 deletions
@@ -49,6 +49,14 @@ func (s *StreamingServer) HandleResponseBody(ctx context.Context, reqCtx *Reques
         CompletionTokens: int(usg["completion_tokens"].(float64)),
         TotalTokens:      int(usg["total_tokens"].(float64)),
     }
+    if usg["prompt_token_details"] != nil {
+        detailsMap := usg["prompt_token_details"].(map[string]any)
+        if cachedTokens, ok := detailsMap["cached_tokens"]; ok {
+            usage.PromptTokenDetails = &PromptTokenDetails{
+                CachedTokens: int(cachedTokens.(float64)),
+            }
+        }
+    }
     reqCtx.Usage = usage
     logger.V(logutil.VERBOSE).Info("Response generated", "usage", reqCtx.Usage)
 }
@@ -78,6 +86,11 @@ func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context,
     reqCtx.Usage = resp.Usage
     metrics.RecordInputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.PromptTokens)
     metrics.RecordOutputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.CompletionTokens)
+    cachedToken := 0
+    if resp.Usage.PromptTokenDetails != nil {
+        cachedToken = resp.Usage.PromptTokenDetails.CachedTokens
+    }
+    metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, cachedToken)
     _, err := s.director.HandleResponseBodyComplete(ctx, reqCtx)
     if err != nil {
         logger.Error(err, "error in HandleResponseBodyComplete")
@@ -193,7 +206,12 @@ type ResponseBody struct {
 }
 
 type Usage struct {
-    PromptTokens     int `json:"prompt_tokens"`
-    CompletionTokens int `json:"completion_tokens"`
-    TotalTokens      int `json:"total_tokens"`
+    PromptTokens       int                 `json:"prompt_tokens"`
+    CompletionTokens   int                 `json:"completion_tokens"`
+    TotalTokens        int                 `json:"total_tokens"`
+    PromptTokenDetails *PromptTokenDetails `json:"prompt_token_details,omitempty"`
+}
+
+type PromptTokenDetails struct {
+    CachedTokens int `json:"cached_tokens"`
 }
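Note that the non-streaming path above pulls usage out of a generic map[string]any with single-value type assertions, so a backend that ever sent prompt_token_details as something other than an object, or cached_tokens as something other than a number, would panic the handler. A panic-safe drop-in variant of that block is sketched below; this is an editor's sketch reusing the diff's own names (usg, usage), not what the commit ships.

// Sketch only: two-value assertions yield ok=false instead of panicking
// when a field arrives with an unexpected type.
if details, ok := usg["prompt_token_details"].(map[string]any); ok {
    if cachedTokens, ok := details["cached_tokens"].(float64); ok {
        usage.PromptTokenDetails = &PromptTokenDetails{
            CachedTokens: int(cachedTokens),
        }
    }
}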

pkg/epp/handlers/response_test.go

Lines changed: 57 additions & 0 deletions
@@ -51,11 +51,40 @@ const (
         }
     }
     `
+    bodyWithCachedTokens = `
+    {
+        "id": "cmpl-573498d260f2423f9e42817bbba3743a",
+        "object": "text_completion",
+        "created": 1732563765,
+        "model": "meta-llama/Llama-3.1-8B-Instruct",
+        "choices": [
+            {
+                "index": 0,
+                "text": " Chronicle\nThe San Francisco Chronicle has a new book review section, and it's a good one. The reviews are short, but they're well-written and well-informed. The Chronicle's book review section is a good place to start if you're looking for a good book review.\nThe Chronicle's book review section is a good place to start if you're looking for a good book review. The Chronicle's book review section",
+                "logprobs": null,
+                "finish_reason": "length",
+                "stop_reason": null,
+                "prompt_logprobs": null
+            }
+        ],
+        "usage": {
+            "prompt_tokens": 11,
+            "total_tokens": 111,
+            "completion_tokens": 100,
+            "prompt_token_details": {
+                "cached_tokens": 10
+            }
+        }
+    }
+    `
 
     streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":null}
     `
 
     streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+    data: [DONE]
+    `
+    streamingBodyWithUsageAndCachedTokens = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10,"prompt_token_details":{"cached_tokens":5}}}
     data: [DONE]
     `
 )
@@ -100,6 +129,18 @@ func TestHandleResponseBody(t *testing.T) {
                 CompletionTokens: 100,
             },
         },
+        {
+            name: "success with cached tokens",
+            body: []byte(bodyWithCachedTokens),
+            want: Usage{
+                PromptTokens:     11,
+                TotalTokens:      111,
+                CompletionTokens: 100,
+                PromptTokenDetails: &PromptTokenDetails{
+                    CachedTokens: 10,
+                },
+            },
+        },
     }
 
     for _, test := range tests {
@@ -161,6 +202,22 @@ func TestHandleStreamedResponseBody(t *testing.T) {
                 CompletionTokens: 10,
             },
         },
+        {
+            name: "streaming request with usage and cached tokens",
+            body: streamingBodyWithUsageAndCachedTokens,
+            reqCtx: &RequestContext{
+                modelServerStreaming: true,
+            },
+            wantErr: false,
+            want: Usage{
+                PromptTokens:     7,
+                TotalTokens:      17,
+                CompletionTokens: 10,
+                PromptTokenDetails: &PromptTokenDetails{
+                    CachedTokens: 5,
+                },
+            },
+        },
     }
 
     for _, test := range tests {

pkg/epp/handlers/server.go

Lines changed: 5 additions & 0 deletions
@@ -316,6 +316,11 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
             metrics.RecordResponseSizes(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.ResponseSize)
             metrics.RecordInputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.Usage.PromptTokens)
             metrics.RecordOutputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.Usage.CompletionTokens)
+            cachedToken := 0
+            if reqCtx.Usage.PromptTokenDetails != nil {
+                cachedToken = reqCtx.Usage.PromptTokenDetails.CachedTokens
+            }
+            metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, cachedToken)
         }
     }
 }
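This nil-guard is now written out twice: here and in the streaming path in response.go. A small helper, hypothetical and not part of the commit, could centralize the default. Recording an explicit zero is deliberate per the commit message, so requests with no cache hits still land in the histogram rather than being dropped from the distribution.

// Hypothetical helper, not in this commit: one place to encode
// "missing prompt_token_details means zero cached tokens".
func promptCachedTokenCount(u Usage) int {
    if u.PromptTokenDetails != nil {
        return u.PromptTokenDetails.CachedTokens
    }
    return 0
}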

pkg/epp/metrics/metrics.go

Lines changed: 18 additions & 0 deletions
@@ -125,6 +125,17 @@ var (
         []string{"model_name", "target_model_name"},
     )
 
+    promptCachedTokens = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Subsystem: InferenceObjectiveComponent,
+            Name:      "prompt_cached_tokens",
+            Help:      metricsutil.HelpMsgWithStability("Inference objective prompt cached token count distribution for requests in each model.", compbasemetrics.ALPHA),
+            // Most models have an input context window of less than 1 million tokens.
+            Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576},
+        },
+        []string{"model_name", "target_model_name"},
+    )
+
     runningRequests = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Subsystem: InferenceObjectiveComponent,
@@ -278,6 +289,7 @@ func Register(customCollectors ...prometheus.Collector) {
     metrics.Registry.MustRegister(responseSizes)
     metrics.Registry.MustRegister(inputTokens)
     metrics.Registry.MustRegister(outputTokens)
+    metrics.Registry.MustRegister(promptCachedTokens)
     metrics.Registry.MustRegister(runningRequests)
     metrics.Registry.MustRegister(NormalizedTimePerOutputToken)
     metrics.Registry.MustRegister(inferencePoolAvgKVCache)
@@ -306,6 +318,7 @@ func Reset() {
     responseSizes.Reset()
     inputTokens.Reset()
     outputTokens.Reset()
+    promptCachedTokens.Reset()
     runningRequests.Reset()
     NormalizedTimePerOutputToken.Reset()
     inferencePoolAvgKVCache.Reset()
@@ -369,6 +382,11 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
     }
 }
 
+// RecordPromptCachedTokens records the prompt cached token count.
+func RecordPromptCachedTokens(modelName, targetModelName string, size int) {
+    promptCachedTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
 // RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
 func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool {
     if !complete.After(received) {
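Because the metric is a histogram, every response contributes exactly one observation; a zero observation raises the sample count without changing the sum, which is what keeps a per-request cache-hit-rate query (sum over count) honest. The following minimal, self-contained sketch demonstrates that behavior with a throwaway histogram and simplified buckets, not the real registration above:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    dto "github.com/prometheus/client_model/go"
)

func main() {
    h := prometheus.NewHistogram(prometheus.HistogramOpts{
        Name:    "prompt_cached_tokens",
        Help:    "Prompt cached token count distribution.",
        Buckets: []float64{1, 8, 16, 32},
    })
    h.Observe(10) // a response that reported prompt_token_details.cached_tokens = 10
    h.Observe(0)  // a response with no cache hits still counts as a sample

    // Dump the histogram state to show both observations were recorded.
    m := &dto.Metric{}
    if err := h.Write(m); err != nil {
        panic(err)
    }
    // Prints: count=2 sum=10
    fmt.Printf("count=%d sum=%g\n", m.Histogram.GetSampleCount(), m.Histogram.GetSampleSum())
}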

pkg/epp/metrics/metrics_test.go

Lines changed: 7 additions & 0 deletions
@@ -42,6 +42,7 @@ const (
     OutputTokensMetric                 = InferenceObjectiveComponent + "_output_tokens"
     NormalizedTimePerOutputTokenMetric = InferenceObjectiveComponent + "_normalized_time_per_output_token_seconds"
     RunningRequestsMetric              = InferenceObjectiveComponent + "_running_requests"
+    PromptCachedTokensMetric           = InferenceObjectiveComponent + "_prompt_cached_tokens"
     KVCacheAvgUsageMetric              = InferencePoolComponent + "_average_kv_cache_utilization"
     QueueAvgSizeMetric                 = InferencePoolComponent + "_average_queue_size"
     PerPodQueueSizeMetrics             = InferencePoolComponent + "_per_pod_queue_size"
@@ -373,6 +374,7 @@ func TestRecordResponseMetrics(t *testing.T) {
         inputToken  int
         outputToken int
         respSize    int
+        cachedToken int
     }
     scenarios := []struct {
         name string
@@ -386,27 +388,31 @@ func TestRecordResponseMetrics(t *testing.T) {
                 respSize:        1200,
                 inputToken:      10,
                 outputToken:     100,
+                cachedToken:     5,
             },
             {
                 modelName:       "m10",
                 targetModelName: "t10",
                 respSize:        500,
                 inputToken:      20,
                 outputToken:     200,
+                cachedToken:     10,
             },
             {
                 modelName:       "m10",
                 targetModelName: "t11",
                 respSize:        2480,
                 inputToken:      30,
                 outputToken:     300,
+                cachedToken:     15,
             },
             {
                 modelName:       "m20",
                 targetModelName: "t20",
                 respSize:        80,
                 inputToken:      40,
                 outputToken:     400,
+                cachedToken:     20,
             },
         },
     }}
@@ -416,6 +422,7 @@ func TestRecordResponseMetrics(t *testing.T) {
         RecordInputTokens(resp.modelName, resp.targetModelName, resp.inputToken)
         RecordOutputTokens(resp.modelName, resp.targetModelName, resp.outputToken)
         RecordResponseSizes(resp.modelName, resp.targetModelName, resp.respSize)
+        RecordPromptCachedTokens(resp.modelName, resp.targetModelName, resp.cachedToken)
     }
     wantResponseSize, err := os.Open("testdata/response_sizes_metric")
     defer func() {
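Beyond the golden-file comparison used for response sizes, the new series could also be sanity-checked directly. A hypothetical assertion, not in the commit, assuming github.com/prometheus/client_golang/prometheus/testutil is imported as promtestutil:

// Hypothetical check: the four recorded responses span three label pairs,
// since (m10, t10) receives two observations on the same series.
if got := promtestutil.CollectAndCount(promptCachedTokens, PromptCachedTokensMetric); got != 3 {
    t.Errorf("prompt_cached_tokens series: got %d, want 3", got)
}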
