diff --git a/docs/perfmon-live-data-audit.md b/docs/perfmon-live-data-audit.md new file mode 100644 index 00000000..6e287d33 --- /dev/null +++ b/docs/perfmon-live-data-audit.md @@ -0,0 +1,207 @@ +# PerfMon Live-Data Audit + +Date: 2026-02-08 +Scope: Read-only audit of current PerfMon implementation and minimal path to live telemetry. + +## 1) Current PerfMon Rendering And Toggle + +- PerfMon is mounted in the top header controls in `ui/src/App.tsx:369` as the `PerfMonPanel` component. +- The header is sticky and establishes a high stacking context (`ui/src/App.tsx:265`), with a shared tooltip provider wrapping controls (`ui/src/App.tsx:267`, `ui/src/App.tsx:394`). +- `isAgentLive` is derived from websocket agent state in `ui/src/App.tsx:87`: + - `wsState.agentStatus === 'running' || wsState.agentStatus === 'paused'` +- The panel open state is local in `ui/src/components/PerfMonPanel.tsx:25`. +- Open/close behavior: + - Button toggles state (`ui/src/components/PerfMonPanel.tsx:58`) + - Outside click closes (`ui/src/components/PerfMonPanel.tsx:32`) + - `Escape` closes (`ui/src/components/PerfMonPanel.tsx:38`) +- Panel mount behavior: + - Not portaled; rendered as absolute element under button (`ui/src/components/PerfMonPanel.tsx:80`) + - Uses `z-40` inside header context. +- Tooltip behavior: + - PerfMon button uses Radix tooltip (`ui/src/components/PerfMonPanel.tsx:69`) + - Tooltip content is portaled in shared wrapper (`ui/src/components/ui/tooltip.tsx:42`, `ui/src/components/ui/tooltip.tsx:44`). + +## 2) Current Data Flow + +### `usePerfMonMock` shape and cadence + +`ui/src/hooks/usePerfMonMock.ts` returns: + +- `tokens` + - `currentRun` + - `totalSession` + - `usagePercent` +- `cpu` + - `percent` +- `memory` + - `used` + - `total` + - `percent` +- `gpu` + - `available` + - `percent` + - `used` + - `total` + +Generation model: + +- Deterministic smooth wave + clamp (`const wave`, `const clamp`) in `ui/src/hooks/usePerfMonMock.ts:3`, `ui/src/hooks/usePerfMonMock.ts:6`. +- No random noise spikes. 
+- GPU availability flips periodically (`ui/src/hooks/usePerfMonMock.ts:53`). + +Update cadence: + +- `1s` while running, `3s` while idle (`ui/src/hooks/usePerfMonMock.ts:85`). +- Interval-based updates (`ui/src/hooks/usePerfMonMock.ts:94`). + +### PerfMonPanel consumption + +- `PerfMonPanel` consumes `usePerfMonMock(isLive)` directly (`ui/src/components/PerfMonPanel.tsx:27`). +- `isLive` affects: + - mock update rate (through hook) + - button/panel badge variant and label (`Live`/`Idle`) (`ui/src/components/PerfMonPanel.tsx:72`, `ui/src/components/PerfMonPanel.tsx:83`). + +Assumption: + +- “Agent running” is inferred only from websocket `agentStatus` in `App`. + +## 3) Existing Live Sources In Repo + +### WebSocket channel already used in app + +- Client hook: `useProjectWebSocket(projectName)` in `ui/src/hooks/useWebSocket.ts:61`. +- Connects to `/ws/projects/{project_name}` (`ui/src/hooks/useWebSocket.ts:88`). +- Current state already tracked in UI hook: + - `progress`, `agentStatus`, `isConnected`, `activeAgents`, `orchestratorStatus`, `devServerStatus` (`ui/src/hooks/useWebSocket.ts:33`, `ui/src/hooks/useWebSocket.ts:39`, `ui/src/hooks/useWebSocket.ts:41`, `ui/src/hooks/useWebSocket.ts:46`, `ui/src/hooks/useWebSocket.ts:54`). +- Message handlers exist for: + - `progress` (`ui/src/hooks/useWebSocket.ts:104`) + - `agent_status` (`ui/src/hooks/useWebSocket.ts:116`) + - `orchestrator_update` (`ui/src/hooks/useWebSocket.ts:284`) + - `dev_server_status` (`ui/src/hooks/useWebSocket.ts:321`) + +### Backend websocket wiring + +- WebSocket endpoint in `server/main.py:167`, delegated to `project_websocket` (`server/main.py:170`). +- Handler in `server/websocket.py:719`. 
+- Existing push sources: + - progress polling task (`server/websocket.py:685`, `server/websocket.py:843`) + - agent output callback (`server/websocket.py:758`, registered at `server/websocket.py:810`) + - agent status callback (`server/websocket.py:795`, registered at `server/websocket.py:811`) + - dev server status callback (`server/websocket.py:827`, registered at `server/websocket.py:840`) + +### REST status endpoints and polling patterns + +- Agent status endpoint: + - UI call `getAgentStatus` at `ui/src/lib/api.ts:232` + - Backend route `server/routers/agent.py:70` + - Polling hook every 3s at `ui/src/hooks/useProjects.ts:140`, `ui/src/hooks/useProjects.ts:145` +- Features polling every 5s at `ui/src/hooks/useProjects.ts:82`, `ui/src/hooks/useProjects.ts:87` +- Project stats endpoint exists (`/stats`) for pass/in-progress totals only (`server/routers/projects.py:367`). + +### Current WS contract does not include perf metrics + +- `WSMessageType` and `WSMessage` in `ui/src/lib/types.ts:243`, `ui/src/lib/types.ts:318` do not include perf telemetry fields. + +## 4) Minimal Live-Data Integration Plan + +### Preferred transport + +- Prefer existing project WebSocket path over new polling endpoint. +- Reason: + - Current page already keeps one WS open. + - Existing architecture already emits multiple live message types over this channel. + - Lower surface area than adding new endpoint + polling hook + cache invalidation path. 
+ +### Proposed API contract + +Message type: `perf_metrics` + +```json +{ + "type": "perf_metrics", + "timestamp": "2026-02-08T20:15:30.123Z", + "project": "my-project", + "run": { + "status": "running", + "pid": 12345, + "started_at": "2026-02-08T20:10:00Z", + "run_id": "12345-2026-02-08T20:10:00Z" + }, + "tokens": { + "current_run": 1420, + "total_session": 9180, + "available": false + }, + "cpu": { + "percent": 37.2 + }, + "memory": { + "used_gb": 6.2, + "total_gb": 16.0, + "percent": 38.8 + }, + "gpu": { + "available": false, + "percent": null, + "vram_used_gb": null, + "vram_total_gb": null + } +} +``` + +### Error/empty states and dev fallback + +- UI behavior: + - If no live payload yet, show empty placeholders (`Not available`) without errors. + - If stale payload (e.g. >10s old), mark as stale and dim values. +- Dev fallback to mock only: + - `import.meta.env.DEV && import.meta.env.VITE_PERFMON_MOCK !== '0'` + - Use mock when live payload absent or disabled by backend. +- Production: + - No automatic fake fallback; show unavailable states instead. + +## 5) Risks / Maintainer Concerns + +### Security + +- Host-level metrics may expose machine characteristics. +- In remote mode (`AUTOFORGE_ALLOW_REMOTE` in `server/main.py:97`), exposure risk is higher. +- Keep payload coarse and project-scoped, avoid file paths/usernames/process cmdline leakage. +- Ensure project-name validation and existing WS auth/path checks remain the gate. + +### Performance + +- Target 1s updates while running, 3s when idle to match current UI cadence expectations. +- Avoid rerender churn: + - do not update PerfMon state if values change minimally + - optionally pause updates when panel is closed (or keep lower-frequency store updates). + +### Cross-platform + +- CPU/memory are feasible with existing server dependency `psutil`. +- GPU/VRAM portability is inconsistent across OS/vendors. +- Contract should support `gpu.available=false` and null GPU values. 
+ +## 6) Implementation Checklist (5-8 Small Steps) + +1. Add perf telemetry message types to shared UI contracts in `ui/src/lib/types.ts` (`WSPerfMetricsMessage`, union update). +2. Extend websocket UI state in `ui/src/hooks/useWebSocket.ts` with `perfMetrics` and add `case 'perf_metrics'`. +3. Add backend schema class(es) in `server/schemas.py` for perf payload (optional but recommended for contract clarity). +4. In `server/websocket.py`, add a lightweight perf sampling loop inside `project_websocket` that emits `perf_metrics` at 1s/3s cadence. +5. Source initial live fields from existing manager state: + - `status`, `pid`, `started_at` from process manager already used in `server/routers/agent.py:78`. + - CPU/memory from `psutil`. + - GPU optional/null when unavailable. +6. Update `ui/src/components/PerfMonPanel.tsx` to consume live metrics from WS first, with mock fallback in dev only. +7. Add UI stale/unavailable states and keep existing `Live`/`Idle` badge behavior. +8. Add a focused test for panel rendering with telemetry payload shape (or hook-level test), without adding a new test framework. + +## Recommended Next PR Scope + +Preferred scope: **add server websocket perf message + UI wiring** in one small PR. + +Rationale: + +- Delivers true live data immediately. +- Reuses established transport and state patterns already central to this screen. +- Keeps diff localized to websocket contract + PerfMon consumption code, avoiding new endpoint and polling complexity. 
diff --git a/server/websocket.py b/server/websocket.py
index e6600643..9aa6d017 100644
--- a/server/websocket.py
+++ b/server/websocket.py
@@ -14,6 +14,7 @@ from typing import Set
 
 
 from fastapi import WebSocket, WebSocketDisconnect
+import psutil
 
 from .schemas import AGENT_MASCOTS
 from .services.chat_constants import ROOT_DIR
@@ -716,6 +717,60 @@ async def poll_progress(websocket: WebSocket, project_name: str, project_dir: Pa
             break
 
 
+def _collect_perf_metrics(agent_manager) -> dict:
+    """Build one ``perf_metrics`` WebSocket payload from psutil readings and ``agent_manager`` run state."""
+    # Host-wide readings only (no paths or usernames). interval=None is non-blocking, but
+    # psutil's first such call in a process returns a meaningless 0.0 (no prior sample yet).
+    cpu_percent = float(psutil.cpu_percent(interval=None))
+    memory = psutil.virtual_memory()
+    started = agent_manager.started_at
+
+    return {
+        "type": "perf_metrics",
+        # Offset-aware, millisecond precision: the UI passes this string to Date.parse()
+        # for its staleness check, and an offset-less date-time is read as browser-local.
+        "timestamp": datetime.now().astimezone().isoformat(timespec="milliseconds"),
+        "run": {
+            "status": agent_manager.status,
+            "pid": agent_manager.pid,
+            "started_at": started.isoformat() if started is not None else None,
+        },
+        "tokens": {
+            "available": False,
+            "current_run": None,
+            "total_session": None,
+        },
+        "cpu": {
+            "percent": round(cpu_percent, 1),
+        },
+        "memory": {
+            "used_gb": round(memory.used / (1024 ** 3), 1),
+            "total_gb": round(memory.total / (1024 ** 3), 1),
+            "percent": round(float(memory.percent), 1),
+        },
+        "gpu": {
+            "available": False,
+            "percent": None,
+            "vram_used_gb": None,
+            "vram_total_gb": None,
+        },
+    }
+
+
+async def poll_perf_metrics(websocket: WebSocket, agent_manager):
+    """Push ``perf_metrics`` every 1s while the agent is running/paused (else 3s); log and stop on the first error."""
+    while True:
+        try:
+            await websocket.send_json(_collect_perf_metrics(agent_manager))
+            interval = 1 if agent_manager.status in ("running", "paused") else 3
+            await asyncio.sleep(interval)
+        except asyncio.CancelledError:
+            raise
+        except Exception as e:
+            logger.warning(f"Perf metrics polling error: {e}")
+            break
+
+
 async def project_websocket(websocket: WebSocket, project_name: str):
     """
     WebSocket endpoint 
for project updates. @@ -841,6 +896,7 @@ async def on_dev_status_change(status: str): # Start progress polling task poll_task = asyncio.create_task(poll_progress(websocket, project_name, project_dir)) + perf_task = asyncio.create_task(poll_perf_metrics(websocket, agent_manager)) try: # Send initial agent status @@ -889,10 +945,15 @@ async def on_dev_status_change(status: str): finally: # Clean up poll_task.cancel() + perf_task.cancel() try: await poll_task except asyncio.CancelledError: pass + try: + await perf_task + except asyncio.CancelledError: + pass # Unregister agent callbacks agent_manager.remove_output_callback(on_output) diff --git a/ui/src/App.tsx b/ui/src/App.tsx index f1c0970a..7f7f52f2 100644 --- a/ui/src/App.tsx +++ b/ui/src/App.tsx @@ -27,6 +27,7 @@ import { KeyboardShortcutsHelp } from './components/KeyboardShortcutsHelp' import { ThemeSelector } from './components/ThemeSelector' import { ResetProjectModal } from './components/ResetProjectModal' import { ProjectSetupRequired } from './components/ProjectSetupRequired' +import { PerfMonPanel } from './components/PerfMonPanel' import { getDependencyGraph, startAgent } from './lib/api' import { Loader2, Settings, Moon, Sun, RotateCcw, BookOpen } from 'lucide-react' import type { Feature } from './lib/types' @@ -83,6 +84,7 @@ function App() { useAgentStatus(selectedProject) // Keep polling for status updates const wsState = useProjectWebSocket(selectedProject) const { theme, setTheme, darkMode, toggleDarkMode, themes } = useTheme() + const isAgentLive = wsState.agentStatus === 'running' || wsState.agentStatus === 'paused' // Get has_spec from the selected project const selectedProjectData = projects?.find(p => p.name === selectedProject) @@ -364,6 +366,8 @@ function App() { Docs + + {/* Theme selector */} + {value !== null && ( +
+ )} +
+  )
+}
+/** Formats a metric with a fixed number of decimals, or returns 'Not available' when the value is null. */
+function formatValue(value: number | null, decimals = 0) {
+  if (value === null) return 'Not available'
+  return value.toLocaleString(undefined, {
+    minimumFractionDigits: decimals,
+    maximumFractionDigits: decimals,
+  })
+}
+/** PerfMon panel: shows live `perfMetrics` when provided, mock values in dev builds, otherwise null placeholders. */
+export function PerfMonPanel({ isLive, perfMetrics }: PerfMonPanelProps) {
+  const [isOpen, setIsOpen] = useState(false)
+  const containerRef = useRef(null)
+  const mockMetrics = usePerfMonMock(isLive)
+  const allowMockFallback = import.meta.env.DEV && import.meta.env.VITE_PERFMON_MOCK !== '0'
+  const usingLive = !!perfMetrics
+  const usingMock = !usingLive && allowMockFallback
+  // A live payload is stale once its parsed timestamp is more than 10s behind the browser clock.
+  const lastUpdateMs = perfMetrics ? Date.parse(perfMetrics.timestamp) : null
+  const isStale = lastUpdateMs !== null && Number.isFinite(lastUpdateMs) && (Date.now() - lastUpdateMs > 10_000)
+  // Token figures are shown only when the payload flags them available; with no payload, mock or null.
+  const tokenCurrentRun = perfMetrics
+    ? (perfMetrics.tokens.available ? perfMetrics.tokens.current_run : null)
+    : (usingMock ? mockMetrics.tokens.currentRun : null)
+
+  const tokenTotalSession = perfMetrics
+    ? (perfMetrics.tokens.available ? perfMetrics.tokens.total_session : null)
+    : (usingMock ? mockMetrics.tokens.totalSession : null)
+
+  const tokenUsagePercent = perfMetrics
+    ? (perfMetrics.tokens.available && perfMetrics.tokens.current_run !== null
+      ? Math.min(100, Math.max(0, (perfMetrics.tokens.current_run / 5000) * 100))
+      : null)
+    : (usingMock ? mockMetrics.tokens.usagePercent : null)
+
+  const cpuPercent = perfMetrics ? perfMetrics.cpu.percent : (usingMock ? mockMetrics.cpu.percent : null)
+  const memoryUsedGb = perfMetrics ? perfMetrics.memory.used_gb : (usingMock ? mockMetrics.memory.used : null)
+  const memoryTotalGb = perfMetrics ? perfMetrics.memory.total_gb : (usingMock ? mockMetrics.memory.total : null)
+  const memoryPercent = perfMetrics ? perfMetrics.memory.percent : (usingMock ? mockMetrics.memory.percent : null)
+  // Honour the mock's own availability flag so its GPU-unavailable phase is rendered in dev too.
+  const gpuAvailable = perfMetrics ? perfMetrics.gpu.available : (usingMock && mockMetrics.gpu.available)
+  const gpuPercent = perfMetrics
+    ? 
(perfMetrics.gpu.available ? perfMetrics.gpu.percent : null) + : (usingMock ? mockMetrics.gpu.percent : null) + const gpuVramUsed = perfMetrics + ? (perfMetrics.gpu.available ? perfMetrics.gpu.vram_used_gb : null) + : (usingMock ? mockMetrics.gpu.used : null) + const gpuVramTotal = perfMetrics + ? (perfMetrics.gpu.available ? perfMetrics.gpu.vram_total_gb : null) + : (usingMock ? mockMetrics.gpu.total : null) + + useEffect(() => { + if (!isOpen) return + + const handleClickOutside = (event: MouseEvent) => { + if (containerRef.current && !containerRef.current.contains(event.target as Node)) { + setIsOpen(false) + } + } + + const handleEscape = (event: KeyboardEvent) => { + if (event.key === 'Escape') { + setIsOpen(false) + } + } + + document.addEventListener('mousedown', handleClickOutside) + document.addEventListener('keydown', handleEscape) + + return () => { + document.removeEventListener('mousedown', handleClickOutside) + document.removeEventListener('keydown', handleEscape) + } + }, [isOpen]) + + return ( +
+
+ + + + + PerfMon + + + {isLive ? 'Live' : 'Idle'} + +
+ + {isOpen && ( + + +
PerfMon
+
+ {isStale && ( + + Stale + + )} + + {isLive ? 'Live' : 'Idle'} + +
+
+ + {!usingLive && !usingMock && ( +
+ Live telemetry not available. +
+ )} +
+
+ Tokens · Current run + + {tokenCurrentRun === null ? 'Not available' : tokenCurrentRun.toLocaleString()} + +
+ +
+ Total session + + {tokenTotalSession === null ? 'Not available' : tokenTotalSession.toLocaleString()} + +
+
+ +
+
+
+ CPU + + {cpuPercent === null ? 'Not available' : `${Math.round(cpuPercent)}%`} + +
+ +
+
+
+ Memory + + {memoryUsedGb === null || memoryTotalGb === null + ? 'Not available' + : `${formatValue(memoryUsedGb, 1)} / ${formatValue(memoryTotalGb, 1)} GB`} + +
+ +
+
+ +
+
+ GPU + {gpuAvailable ? ( + + {gpuPercent === null ? 'Not available' : `${Math.round(gpuPercent)}%`} + + ) : ( + Not available + )} +
+ {gpuAvailable ? ( + <> + +
+ {gpuVramUsed === null || gpuVramTotal === null + ? 'VRAM Not available' + : `VRAM ${formatValue(gpuVramUsed, 1)} / ${formatValue(gpuVramTotal, 1)} GB`} +
+ + ) : null} +
+
+
+ )} +
+ ) +} diff --git a/ui/src/hooks/usePerfMonMock.ts b/ui/src/hooks/usePerfMonMock.ts new file mode 100644 index 00000000..b3257448 --- /dev/null +++ b/ui/src/hooks/usePerfMonMock.ts @@ -0,0 +1,100 @@ +import { useEffect, useMemo, useRef, useState } from 'react' + +const clamp = (value: number, min: number, max: number) => + Math.min(max, Math.max(min, value)) + +const wave = ( + t: number, + period: number, + min: number, + max: number, + phase = 0 +) => { + const angle = (t / period) * Math.PI * 2 + phase + const normalized = (Math.sin(angle) + 1) / 2 + return min + (max - min) * normalized +} + +export interface PerfMonMetrics { + tokens: { + currentRun: number + totalSession: number + usagePercent: number + } + cpu: { + percent: number + } + memory: { + used: number + total: number + percent: number + } + gpu: { + available: boolean + percent: number + used: number + total: number + } +} + +function buildMetrics(t: number): PerfMonMetrics { + const currentRun = Math.round(wave(t, 12, 900, 3200)) + const totalSession = Math.round( + clamp(12000 + t * 28 + wave(t, 22, -300, 300, 0.6), 0, 120000) + ) + const tokensUsagePercent = clamp((currentRun / 5000) * 100, 0, 100) + + const cpuPercent = Math.round(wave(t, 8, 18, 82, 0.4)) + + const memoryTotal = 16 + const memoryUsed = Number(wave(t, 18, 5.4, 12.6, 1.1).toFixed(1)) + const memoryPercent = clamp((memoryUsed / memoryTotal) * 100, 0, 100) + + const gpuAvailable = Math.floor(t / 90) % 2 === 0 + const gpuTotal = 12 + const gpuPercent = Math.round(wave(t, 10, 12, 78, 0.2)) + const gpuUsed = Number(wave(t, 14, 2.8, 7.4, 0.9).toFixed(1)) + + return { + tokens: { + currentRun, + totalSession, + usagePercent: tokensUsagePercent, + }, + cpu: { + percent: cpuPercent, + }, + memory: { + used: memoryUsed, + total: memoryTotal, + percent: memoryPercent, + }, + gpu: { + available: gpuAvailable, + percent: gpuPercent, + used: gpuUsed, + total: gpuTotal, + }, + } +} + +export function usePerfMonMock(isRunning: boolean) { + const 
startRef = useRef(Date.now()) + const [metrics, setMetrics] = useState(() => buildMetrics(0)) + + const intervalMs = useMemo(() => (isRunning ? 1000 : 3000), [isRunning]) + + useEffect(() => { + const tick = () => { + const elapsedSeconds = (Date.now() - startRef.current) / 1000 + setMetrics(buildMetrics(elapsedSeconds)) + } + + tick() + const interval = setInterval(tick, intervalMs) + + return () => clearInterval(interval) + }, [intervalMs]) + + return metrics +} diff --git a/ui/src/hooks/useWebSocket.ts b/ui/src/hooks/useWebSocket.ts index b9c0a3fe..66452331 100644 --- a/ui/src/hooks/useWebSocket.ts +++ b/ui/src/hooks/useWebSocket.ts @@ -12,6 +12,7 @@ import type { AgentLogEntry, OrchestratorStatus, OrchestratorEvent, + PerfMetrics, } from '../lib/types' // Activity item for the feed @@ -52,6 +53,8 @@ interface WebSocketState { celebration: CelebrationTrigger | null // Orchestrator state for Mission Control orchestratorStatus: OrchestratorStatus | null + // Perf metrics for PerfMon panel + perfMetrics: PerfMetrics | null } const MAX_LOGS = 100 // Keep last 100 log lines @@ -73,6 +76,7 @@ export function useProjectWebSocket(projectName: string | null) { celebrationQueue: [], celebration: null, orchestratorStatus: null, + perfMetrics: null, }) const wsRef = useRef(null) @@ -326,6 +330,20 @@ export function useProjectWebSocket(projectName: string | null) { })) break + case 'perf_metrics': + setState(prev => ({ + ...prev, + perfMetrics: { + timestamp: message.timestamp, + run: message.run, + tokens: message.tokens, + cpu: message.cpu, + memory: message.memory, + gpu: message.gpu, + }, + })) + break + case 'pong': // Heartbeat response break @@ -398,6 +416,7 @@ export function useProjectWebSocket(projectName: string | null) { celebrationQueue: [], celebration: null, orchestratorStatus: null, + perfMetrics: null, }) if (!projectName) { diff --git a/ui/src/lib/types.ts b/ui/src/lib/types.ts index ba8eab94..6e13b18a 100644 --- a/ui/src/lib/types.ts +++ 
b/ui/src/lib/types.ts @@ -240,7 +240,7 @@ export interface OrchestratorStatus { } // WebSocket message types -export type WSMessageType = 'progress' | 'feature_update' | 'log' | 'agent_status' | 'pong' | 'dev_log' | 'dev_server_status' | 'agent_update' | 'orchestrator_update' +export type WSMessageType = 'progress' | 'feature_update' | 'log' | 'agent_status' | 'pong' | 'dev_log' | 'dev_server_status' | 'agent_update' | 'orchestrator_update' | 'perf_metrics' export interface WSProgressMessage { type: 'progress' @@ -315,6 +315,38 @@ export interface WSOrchestratorUpdateMessage { featureName?: string } +export interface PerfMetrics { + timestamp: string + run: { + status: AgentStatus + pid: number | null + started_at: string | null + } + tokens: { + available: boolean + current_run: number | null + total_session: number | null + } + cpu: { + percent: number | null + } + memory: { + used_gb: number | null + total_gb: number | null + percent: number | null + } + gpu: { + available: boolean + percent: number | null + vram_used_gb: number | null + vram_total_gb: number | null + } +} + +export interface WSPerfMetricsMessage extends PerfMetrics { + type: 'perf_metrics' +} + export type WSMessage = | WSProgressMessage | WSFeatureUpdateMessage @@ -325,6 +357,7 @@ export type WSMessage = | WSDevLogMessage | WSDevServerStatusMessage | WSOrchestratorUpdateMessage + | WSPerfMetricsMessage // ============================================================================ // Spec Chat Types