from __future__ import annotations

from typing import Any

from core.tools.base import BaseTool, ToolContext
from core.tools.registry import registry
from core.events import bus
from core.subprocess import run_command


class GPUTool(BaseTool):
    """
    GPU introspection tool.

    Uses nvidia-smi when available. Every query method returns a plain dict:
    either the parsed data, or ``{"status": "error", "error": <message>}``
    when the nvidia-smi invocation did not succeed. Field values are returned
    as the (whitespace-stripped) strings printed by nvidia-smi; no numeric
    conversion is performed.
    """

    name = "gpu"
    description = "GPU usage, memory, and process inspection"

    # Error text used whenever nvidia-smi fails without saying why.
    _UNAVAILABLE = "nvidia-smi not available"

    # =========================================================
    # EXECUTE
    # =========================================================

    def execute(self, payload: dict[str, Any], ctx: ToolContext) -> dict[str, Any]:
        """Dispatch ``payload["action"]`` to the matching query method.

        Args:
            payload: Request dict. ``"action"`` selects the query and is one
                of ``"info"`` (the default), ``"usage"``, ``"processes"`` or
                ``"full"``. Surrounding whitespace is ignored.
            ctx: Tool context; not used by this tool.

        Returns:
            The dict produced by the selected query method.

        Raises:
            ValueError: If the action is not one of the supported names.
        """
        # Tolerate a missing payload the same way as a missing "action" key.
        action = str((payload or {}).get("action", "info")).strip()

        # Logged before validation so rejected actions are visible too.
        bus.log("GPU", "gpu_execute", "INFO", {"action": action})

        handlers = {
            "info": self.gpu_info,
            "usage": self.gpu_usage,
            "processes": self.gpu_processes,
            "full": self.full_snapshot,
        }
        handler = handlers.get(action)
        if handler is None:
            raise ValueError(f"Unknown gpu action: {action}")
        return handler()

    # =========================================================
    # HELPERS
    # =========================================================

    def _query(
        self, query: str, fmt: str
    ) -> tuple[list[str], dict[str, Any] | None]:
        """Run one nvidia-smi CSV query.

        Args:
            query: The ``--query-...=`` argument, passed through verbatim.
            fmt: The value for ``--format=``.

        Returns:
            ``(lines, None)`` on success, where ``lines`` are the stripped
            stdout split into rows, or ``([], error_dict)`` on failure.
        """
        try:
            result = run_command(cmd=["nvidia-smi", query, f"--format={fmt}"])
        except OSError as exc:
            # NOTE(review): run_command may surface a missing binary as an
            # exception instead of a non-zero return code - confirm. Either
            # way the caller gets the same error-dict shape.
            return [], {"status": "error", "error": str(exc) or self._UNAVAILABLE}

        if result.get("return_code") != 0:
            # `or` also covers an empty / None stderr, not just a missing key.
            message = result.get("stderr") or self._UNAVAILABLE
            return [], {"status": "error", "error": message}

        stdout = result.get("stdout") or ""
        return stdout.strip().splitlines(), None

    @staticmethod
    def _split_fields(
        line: str, count: int, free_text: int | None = None
    ) -> list[str] | None:
        """Split one CSV row into exactly ``count`` stripped fields.

        Args:
            line: One row of nvidia-smi CSV output.
            count: Number of columns that were requested.
            free_text: Index of a column whose value may itself contain
                commas (a GPU or process name). Surplus commas are folded
                back into that column. When ``None``, surplus trailing
                fields are dropped.

        Returns:
            The list of fields, or ``None`` if the row has too few columns
            (for example a blank line).
        """
        pieces = line.split(",")
        if len(pieces) < count:
            return None

        surplus = len(pieces) - count
        if surplus and free_text is not None:
            end = free_text + surplus + 1
            # Re-join with the original separator so the name is unchanged.
            merged = ",".join(pieces[free_text:end])
            pieces = pieces[:free_text] + [merged] + pieces[end:]

        return [piece.strip() for piece in pieces[:count]]

    @classmethod
    def _parse_rows(
        cls,
        lines: list[str],
        keys: tuple[str, ...],
        free_text: int | None = None,
    ) -> list[dict[str, str]]:
        """Turn CSV rows into dicts keyed by ``keys``, skipping short rows."""
        rows = []
        for line in lines:
            fields = cls._split_fields(line, len(keys), free_text)
            if fields is not None:
                rows.append(dict(zip(keys, fields)))
        return rows

    # =========================================================
    # GPU INFO
    # =========================================================

    def gpu_info(self) -> dict[str, Any]:
        """Return name, driver version and total memory for each GPU.

        Returns:
            ``{"gpu_count": int, "gpus": [{"name", "driver",
            "memory_total"}, ...]}`` or an error dict.
        """
        lines, error = self._query(
            "--query-gpu=name,driver_version,memory.total",
            "csv,noheader",
        )
        if error is not None:
            return error

        # The name is the only free-text column; let it absorb stray commas.
        gpus = self._parse_rows(
            lines, ("name", "driver", "memory_total"), free_text=0
        )
        return {"gpu_count": len(gpus), "gpus": gpus}

    # =========================================================
    # GPU USAGE
    # =========================================================

    def gpu_usage(self) -> dict[str, Any]:
        """Return utilization, memory use and temperature for each GPU.

        The query uses ``nounits``, so values are bare numbers (as strings).

        Returns:
            ``{"gpus": [{"gpu_util_percent", "memory_used_mb",
            "memory_total_mb", "temperature_c"}, ...]}`` or an error dict.
        """
        lines, error = self._query(
            "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu",
            "csv,noheader,nounits",
        )
        if error is not None:
            return error

        usage = self._parse_rows(
            lines,
            (
                "gpu_util_percent",
                "memory_used_mb",
                "memory_total_mb",
                "temperature_c",
            ),
        )
        return {"gpus": usage}

    # =========================================================
    # GPU PROCESSES
    # =========================================================

    def gpu_processes(self) -> dict[str, Any]:
        """Return the compute processes currently using a GPU.

        Returns:
            ``{"count": int, "processes": [{"pid", "name", "memory"}, ...]}``
            or an error dict.
        """
        lines, error = self._query(
            "--query-compute-apps=pid,process_name,used_memory",
            "csv,noheader",
        )
        if error is not None:
            return error

        # A process name may contain commas; keep pid and memory anchored
        # to the first and last columns.
        processes = self._parse_rows(
            lines, ("pid", "name", "memory"), free_text=1
        )
        return {"count": len(processes), "processes": processes}

    # =========================================================
    # FULL SNAPSHOT
    # =========================================================

    def full_snapshot(self) -> dict[str, Any]:
        """Return info, usage and processes together in one dict.

        Each section is queried independently, so one section may hold an
        error dict while the others hold data.
        """
        return {
            "info": self.gpu_info(),
            "usage": self.gpu_usage(),
            "processes": self.gpu_processes(),
        }


# =========================================================
# REGISTER
# =========================================================

registry.register(GPUTool())