Skip to content

Commit 869e5a4

Browse files
refactor!: restructure metrics response for clear abstraction separation
BREAKING CHANGE: Major restructuring of metrics API responses to separate abstraction layers, time windows, and current vs lifetime metrics.

## Server Metrics Restructure

Separated server metrics into 4 clear tiers:
1. Application tier: queue_workers (count, current/lifetime utilization)
2. Application tier: job_processing (lifetime totals, current performance)
3. Process tier: worker_processes (per-worker resource averages)
4. System tier: server_resources (actual server CPU/memory)

Breaking changes:
- Renamed system_limits → server_resources for clarity
- Split flat utilization_rate into current_busy_percent and lifetime_busy_percent
- Restructured workers/performance/utilization into nested hierarchy

## Queue Metrics Separation

Separated queue metrics by time scope:
- depth: Instantaneous queue state (current snapshot)
- performance_60s: Windowed metrics with explicit window_seconds
- lifetime: Lifetime metrics (failure_rate_percent since first job)
- workers: Current vs lifetime busy percentages

Breaking changes:
- Flat depth/pending/etc moved into depth object
- throughput_per_minute moved to performance_60s.throughput_per_minute
- utilization_rate split into workers.current_busy_percent and lifetime_busy_percent

## Trend Analysis Enhancement

Added comprehensive time_window context to all trend methods:
- window_seconds: Duration of analysis window
- window_start/window_end: ISO8601 timestamps
- analyzed_at: When analysis was performed
- sample_count: Number of data points analyzed

Breaking changes:
- Added time_window object wrapper to all trend responses
- Moved period_seconds into time_window structure

## Critical Bug Fix

Fixed jobs/minute calculation error (5x multiplication):
- Changed from cumulative worker uptime to actual elapsed time
- Used oldest worker uptime as proxy for wall-clock elapsed time
- Example: 5 workers × 10min = 50min cumulative → now 10min elapsed

## Dashboard Filtering Updates

Updated all dashboard filter methods to:
- Map new hierarchical structures correctly
- Maintain separation of current vs lifetime metrics
- Preserve time window context

Files modified:
- src/Services/WorkerMetricsQueryService.php
- src/Services/QueueMetricsQueryService.php
- src/Services/OverviewQueryService.php
- src/Services/TrendAnalysisService.php
1 parent 7f5b48a commit 869e5a4

File tree

4 files changed

+221
-114
lines changed

4 files changed

+221
-114
lines changed

src/Services/OverviewQueryService.php

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -134,15 +134,21 @@ public function getOverview(bool $slim = true): array
134134
private function filterQueuesForDashboard(array $queues): array
135135
{
136136
return array_map(function ($queue) {
137+
$depth = is_array($queue['depth'] ?? null) ? $queue['depth'] : [];
138+
$performance60s = is_array($queue['performance_60s'] ?? null) ? $queue['performance_60s'] : [];
139+
$lifetime = is_array($queue['lifetime'] ?? null) ? $queue['lifetime'] : [];
140+
$workers = is_array($queue['workers'] ?? null) ? $queue['workers'] : [];
141+
137142
return [
138143
'connection' => $queue['connection'] ?? '',
139144
'queue' => $queue['queue'] ?? '',
140-
'depth' => $queue['depth'] ?? 0,
141-
'pending' => $queue['pending'] ?? 0,
142-
'active_workers' => $queue['active_workers'] ?? 0,
143-
'throughput_per_minute' => $queue['throughput_per_minute'] ?? 0,
144-
'failure_rate' => $queue['failure_rate'] ?? 0,
145-
'utilization_rate' => $queue['utilization_rate'] ?? 0,
145+
'depth' => $depth['total'] ?? 0,
146+
'pending' => $depth['pending'] ?? 0,
147+
'active_workers' => $workers['active_count'] ?? 0,
148+
'throughput_per_minute' => $performance60s['throughput_per_minute'] ?? 0,
149+
'failure_rate' => $lifetime['failure_rate_percent'] ?? 0,
150+
'current_busy_percent' => $workers['current_busy_percent'] ?? 0,
151+
'lifetime_busy_percent' => $workers['lifetime_busy_percent'] ?? 0,
146152
];
147153
}, $queues);
148154
}
@@ -177,41 +183,48 @@ private function filterJobsForDashboard(array $jobs): array
177183
* Filter server metrics to essential dashboard fields only.
178184
*
179185
* Returns simplified server data with clear separation between:
180-
* - Worker metrics: Job counts, worker utilization (from queue workers)
181-
* - System limits: CPU cores, total memory (physical server resources)
186+
* - Worker metrics: Worker count, utilization (from queue workers)
187+
* - Job metrics: Jobs processed (from queue workers)
188+
* - System metrics: Actual server CPU/memory from SystemMetrics (physical server resources)
189+
*
190+
* Note: Worker process CPU/memory metrics are NOT included in dashboard as they're not
191+
* useful for server-level overview. Use server_resources for actual server resource usage.
182192
*
183193
* @param array<string, array<string, mixed>> $servers
184194
* @return array<string, array<string, mixed>>
185195
*/
186196
private function filterServersForDashboard(array $servers): array
187197
{
188198
return array_map(function ($server) {
189-
$workers = is_array($server['workers'] ?? null) ? $server['workers'] : [];
190-
$utilization = is_array($server['utilization'] ?? null) ? $server['utilization'] : [];
191-
$performance = is_array($server['performance'] ?? null) ? $server['performance'] : [];
192-
$systemLimits = is_array($server['system_limits'] ?? null) ? $server['system_limits'] : null;
193-
194-
$serverUtilization = $utilization['server_utilization'] ?? 0;
195-
$utilizationPercent = is_numeric($serverUtilization) ? round((float) $serverUtilization * 100, 2) : 0;
199+
$queueWorkers = is_array($server['queue_workers'] ?? null) ? $server['queue_workers'] : [];
200+
$workerCount = is_array($queueWorkers['count'] ?? null) ? $queueWorkers['count'] : [];
201+
$workerUtilization = is_array($queueWorkers['utilization'] ?? null) ? $queueWorkers['utilization'] : [];
202+
$jobProcessing = is_array($server['job_processing'] ?? null) ? $server['job_processing'] : [];
203+
$jobLifetime = is_array($jobProcessing['lifetime'] ?? null) ? $jobProcessing['lifetime'] : [];
204+
$serverResources = is_array($server['server_resources'] ?? null) ? $server['server_resources'] : null;
196205

197206
$result = [
198207
'hostname' => $server['hostname'] ?? '',
199208
// Worker-level metrics (from queue workers)
200209
'workers' => [
201-
'total' => $workers['total'] ?? 0,
202-
'active' => $workers['active'] ?? 0,
203-
'idle' => $workers['idle'] ?? 0,
204-
'utilization_percent' => $utilizationPercent,
210+
'total' => $workerCount['total'] ?? 0,
211+
'active' => $workerCount['active'] ?? 0,
212+
'idle' => $workerCount['idle'] ?? 0,
213+
'current_busy_percent' => $workerUtilization['current_busy_percent'] ?? 0.0,
214+
'lifetime_busy_percent' => $workerUtilization['lifetime_busy_percent'] ?? 0.0,
205215
],
206216
// Job processing metrics (from queue workers)
207217
'jobs' => [
208-
'processed' => $performance['total_jobs_processed'] ?? 0,
218+
'total_processed' => $jobLifetime['total_processed'] ?? 0,
219+
'total_failed' => $jobLifetime['total_failed'] ?? 0,
220+
'failure_rate_percent' => $jobLifetime['failure_rate_percent'] ?? 0.0,
209221
],
210222
];
211223

212-
// System resource limits (physical server capacity)
213-
if ($systemLimits !== null) {
214-
$result['system_limits'] = $systemLimits;
224+
// System resource metrics (actual server CPU/memory from SystemMetrics)
225+
// This is the REAL server usage, not worker process usage
226+
if ($serverResources !== null) {
227+
$result['server_resources'] = $serverResources;
215228
}
216229

217230
return $result;
@@ -226,11 +239,17 @@ private function filterServersForDashboard(array $servers): array
226239
*/
227240
private function filterWorkersForDashboard(array $workers): array
228241
{
242+
$count = is_array($workers['count'] ?? null) ? $workers['count'] : [];
243+
$utilization = is_array($workers['utilization'] ?? null) ? $workers['utilization'] : [];
244+
$performance = is_array($workers['performance'] ?? null) ? $workers['performance'] : [];
245+
229246
return [
230-
'total' => $workers['total'] ?? 0,
231-
'active' => $workers['active'] ?? 0,
232-
'idle' => $workers['idle'] ?? 0,
233-
'total_jobs_processed' => $workers['total_jobs_processed'] ?? 0,
247+
'total' => $count['total'] ?? 0,
248+
'active' => $count['active'] ?? 0,
249+
'idle' => $count['idle'] ?? 0,
250+
'current_busy_percent' => $utilization['current_busy_percent'] ?? 0.0,
251+
'lifetime_busy_percent' => $utilization['lifetime_busy_percent'] ?? 0.0,
252+
'total_jobs_processed' => $performance['total_jobs_processed'] ?? 0,
234253
];
235254
}
236255
}

src/Services/QueueMetricsQueryService.php

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public function getAllQueuesWithMetrics(): array
143143

144144
$activeWorkers = $workers->count();
145145

146-
// Calculate queue utilization rate from worker busy/idle time
146+
// Calculate worker utilization from busy/idle time
147147
$totalBusyTime = 0;
148148
$totalIdleTime = 0;
149149
foreach ($workers as $worker) {
@@ -152,7 +152,11 @@ public function getAllQueuesWithMetrics(): array
152152
}
153153

154154
$totalTime = $totalBusyTime + $totalIdleTime;
155-
$utilizationRate = $totalTime > 0 ? ($totalBusyTime / $totalTime) * 100 : 0;
155+
$lifetimeBusyPercent = $totalTime > 0 ? ($totalBusyTime / $totalTime) * 100 : 0;
156+
157+
// Calculate current worker state (% busy right now)
158+
$busyWorkers = $workers->filter(fn ($w) => $w->state->value === 'busy')->count();
159+
$currentBusyPercent = $activeWorkers > 0 ? ($busyWorkers / $activeWorkers) * 100 : 0;
156160

157161
// Get trend data
158162
$trends = $this->getQueueTrends($connection, $queue);
@@ -161,17 +165,31 @@ public function getAllQueuesWithMetrics(): array
161165
'connection' => $connection,
162166
'queue' => $queue,
163167
'driver' => $connection,
164-
'depth' => $depth->totalJobs(),
165-
'pending' => $depth->pendingJobs,
166-
'scheduled' => $depth->delayedJobs,
167-
'reserved' => $depth->reservedJobs,
168-
'oldest_job_age_seconds' => $depth->secondsOldestPendingJob() ?? 0,
169-
'oldest_job_age_status' => $depth->oldestPendingJobAge?->toIso8601String() ?? 'unknown',
170-
'throughput_per_minute' => $metrics->throughputPerMinute,
171-
'avg_duration_ms' => $metrics->avgDuration,
172-
'failure_rate' => $metrics->failureRate,
173-
'utilization_rate' => round($utilizationRate, 2),
174-
'active_workers' => $activeWorkers,
168+
// Instantaneous queue state (current snapshot)
169+
'depth' => [
170+
'total' => $depth->totalJobs(),
171+
'pending' => $depth->pendingJobs,
172+
'scheduled' => $depth->delayedJobs,
173+
'reserved' => $depth->reservedJobs,
174+
'oldest_job_age_seconds' => $depth->secondsOldestPendingJob() ?? 0,
175+
'oldest_job_age_status' => $depth->oldestPendingJobAge?->toIso8601String() ?? 'unknown',
176+
],
177+
// Windowed performance metrics (60-second window from CalculateQueueMetricsAction)
178+
'performance_60s' => [
179+
'throughput_per_minute' => $metrics->throughputPerMinute,
180+
'avg_duration_ms' => $metrics->avgDuration,
181+
'window_seconds' => 60,
182+
],
183+
// Lifetime metrics (since first job)
184+
'lifetime' => [
185+
'failure_rate_percent' => $metrics->failureRate,
186+
],
187+
// Worker metrics for this queue
188+
'workers' => [
189+
'active_count' => $activeWorkers,
190+
'current_busy_percent' => round($currentBusyPercent, 2),
191+
'lifetime_busy_percent' => round($lifetimeBusyPercent, 2),
192+
],
175193
'baseline' => $baseline ? $baseline->toArray() : null,
176194
'trends' => $trends,
177195
'timestamp' => now()->toIso8601String(),

src/Services/TrendAnalysisService.php

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,23 +96,33 @@ public function analyzeQueueDepthTrend(
9696
'available' => true,
9797
'connection' => $connection,
9898
'queue' => $queue,
99-
'period_seconds' => $periodSeconds,
100-
'data_points' => $count,
99+
// Time window context
100+
'time_window' => [
101+
'window_seconds' => $periodSeconds,
102+
'window_start' => $startTime->toIso8601String(),
103+
'window_end' => $now->toIso8601String(),
104+
'analyzed_at' => $now->toIso8601String(),
105+
'sample_count' => $count,
106+
'sample_interval_seconds' => $intervalSeconds,
107+
],
108+
// Current and historical statistics
101109
'statistics' => [
102110
'current' => end($depths) ?: 0,
103111
'average' => round($avg, 2),
104112
'min' => $min,
105113
'max' => $max,
106114
'std_dev' => round($stdDev, 2),
107115
],
116+
// Trend direction and confidence
108117
'trend' => [
109118
'slope' => round($trend['slope'], 4),
110119
'direction' => $trend['slope'] > 0.1 ? 'increasing' : ($trend['slope'] < -0.1 ? 'decreasing' : 'stable'),
111120
'confidence' => round($trend['r_squared'], 3),
112121
],
122+
// Forecast for next interval
113123
'forecast' => [
114124
'next_value' => round($forecast, 2),
115-
'next_timestamp' => $now->copy()->addSeconds($intervalSeconds)->timestamp,
125+
'next_timestamp' => $now->copy()->addSeconds($intervalSeconds)->toIso8601String(),
116126
],
117127
];
118128
}
@@ -185,13 +195,22 @@ public function analyzeThroughputTrend(
185195
'available' => true,
186196
'connection' => $connection,
187197
'queue' => $queue,
188-
'period_seconds' => $periodSeconds,
198+
// Time window context
199+
'time_window' => [
200+
'window_seconds' => $periodSeconds,
201+
'window_start' => $startTime->toIso8601String(),
202+
'window_end' => $now->toIso8601String(),
203+
'analyzed_at' => $now->toIso8601String(),
204+
'sample_count' => $count,
205+
],
206+
// Throughput statistics
189207
'statistics' => [
190208
'total_jobs' => $totalJobs,
191209
'average_per_interval' => round($avg, 2),
192210
'jobs_per_minute' => round($jobsPerMinute, 2),
193211
'jobs_per_hour' => round($jobsPerMinute * 60, 2),
194212
],
213+
// Trend direction
195214
'trend' => [
196215
'slope' => round($trend['slope'], 4),
197216
'direction' => $trend['slope'] > 0 ? 'increasing' : ($trend['slope'] < 0 ? 'decreasing' : 'stable'),
@@ -256,14 +275,22 @@ public function analyzeWorkerEfficiencyTrend(int $periodSeconds = 3600): array
256275

257276
return [
258277
'available' => true,
259-
'period_seconds' => $periodSeconds,
260-
'data_points' => $count,
278+
// Time window context
279+
'time_window' => [
280+
'window_seconds' => $periodSeconds,
281+
'window_start' => $startTime->toIso8601String(),
282+
'window_end' => $now->toIso8601String(),
283+
'analyzed_at' => $now->toIso8601String(),
284+
'sample_count' => $count,
285+
],
286+
// Worker efficiency statistics
261287
'efficiency' => [
262288
'current' => round(end($efficiencies) ?: 0, 2),
263289
'average' => round(array_sum($efficiencies) / max($count, 1), 2),
264290
'min' => round(min($efficiencies), 2),
265291
'max' => round(max($efficiencies), 2),
266292
],
293+
// Resource usage statistics
267294
'resource_usage' => [
268295
'avg_memory_mb' => round(array_sum($memoryUsages) / max($count, 1), 2),
269296
'avg_cpu_percent' => round(array_sum($cpuUsages) / max($count, 1), 2),

0 commit comments

Comments
 (0)