175 lines
4.7 KiB
Python
175 lines
4.7 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from core.tools.base import BaseTool, ToolContext
|
|
from core.tools.registry import registry
|
|
from core.events import bus
|
|
from core.subprocess import run_command
|
|
|
|
|
|
class GPUTool(BaseTool):
    """
    GPU introspection tool backed by the ``nvidia-smi`` command.

    Every action runs ``nvidia-smi`` through ``run_command`` and converts its
    comma-separated output into plain dictionaries. Field values are kept as
    the strings printed by the command; nothing is converted to numbers.

    When the command finishes with a non-zero return code, the action returns
    ``{"status": "error", "error": <message>}`` instead of raising.

    Actions accepted by ``execute``: ``info``, ``usage``, ``processes``,
    ``full``.
    """

    name = "gpu"
    description = "GPU usage, memory, and process inspection"

    # Reported when the command fails without leaving anything on stderr, so
    # callers never receive an empty error message.
    _DEFAULT_ERROR = "nvidia-smi not available"

    # =========================================================
    # EXECUTE
    # =========================================================

    def execute(self, payload: dict[str, Any], ctx: ToolContext):
        """
        Run the GPU action named in ``payload``.

        Args:
            payload: Request mapping. ``payload["action"]`` selects the
                operation and defaults to ``"info"``; the value is converted
                to ``str`` and stripped of surrounding whitespace.
            ctx: Tool context passed by the caller; not used by this tool.

        Returns:
            The dictionary produced by the selected action.

        Raises:
            ValueError: If the action is not one of ``info``, ``usage``,
                ``processes`` or ``full``.
        """
        action = str(payload.get("action", "info")).strip()

        # Log before dispatching so unknown actions are recorded as well.
        bus.log("GPU", "gpu_execute", "INFO", {"action": action})

        handlers = {
            "info": self.gpu_info,
            "usage": self.gpu_usage,
            "processes": self.gpu_processes,
            "full": self.full_snapshot,
        }
        handler = handlers.get(action)
        if handler is None:
            raise ValueError(f"Unknown gpu action: {action}")
        return handler()

    # =========================================================
    # HELPERS
    # =========================================================

    def _query_lines(
        self,
        query_flag: str,
        format_flag: str,
    ) -> tuple[list[str], dict[str, Any] | None]:
        """
        Run one ``nvidia-smi`` query and return its output lines.

        Args:
            query_flag: Complete ``--query-...=...`` argument.
            format_flag: Complete ``--format=...`` argument.

        Returns:
            ``(lines, None)`` when the return code is 0, otherwise
            ``([], error_dict)`` where ``error_dict`` is the
            ``{"status": "error", "error": ...}`` payload to hand back to
            the caller.
        """
        result = run_command(cmd=["nvidia-smi", query_flag, format_flag])

        if result.get("return_code") != 0:
            # `or` also covers a present-but-empty stderr value.
            message = result.get("stderr") or self._DEFAULT_ERROR
            return [], {"status": "error", "error": message}

        # Guard against a stdout key that is present but None.
        stdout = result.get("stdout") or ""
        return stdout.strip().splitlines(), None

    @staticmethod
    def _parse_rows(
        lines: list[str],
        keys: tuple[str, ...],
    ) -> list[dict[str, str]]:
        """
        Turn comma-separated lines into dictionaries keyed by ``keys``.

        Lines with fewer fields than ``keys`` are skipped; fields beyond
        ``len(keys)`` are ignored.
        """
        # NOTE(review): a plain comma split assumes no field value contains
        # a comma (e.g. an unusual process path) - confirm against real
        # nvidia-smi output.
        rows = []
        for line in lines:
            fields = [field.strip() for field in line.split(",")]
            if len(fields) >= len(keys):
                rows.append(dict(zip(keys, fields)))
        return rows

    # =========================================================
    # GPU INFO
    # =========================================================

    def gpu_info(self) -> dict[str, Any]:
        """
        Describe each GPU reported by ``nvidia-smi``.

        Returns:
            ``{"gpu_count": int, "gpus": [...]}`` where every entry has the
            keys ``name``, ``driver`` and ``memory_total``, or the error
            dictionary when the command fails.
        """
        lines, error = self._query_lines(
            "--query-gpu=name,driver_version,memory.total",
            "--format=csv,noheader",
        )
        if error is not None:
            return error

        gpus = self._parse_rows(lines, ("name", "driver", "memory_total"))
        return {
            "gpu_count": len(gpus),
            "gpus": gpus,
        }

    # =========================================================
    # GPU USAGE
    # =========================================================

    def gpu_usage(self) -> dict[str, Any]:
        """
        Report utilization, memory and temperature for each GPU.

        The query passes ``nounits`` in its format flag.

        Returns:
            ``{"gpus": [...]}`` where every entry has the keys
            ``gpu_util_percent``, ``memory_used_mb``, ``memory_total_mb``
            and ``temperature_c``, or the error dictionary when the command
            fails.
        """
        lines, error = self._query_lines(
            "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu",
            "--format=csv,noheader,nounits",
        )
        if error is not None:
            return error

        usage = self._parse_rows(
            lines,
            (
                "gpu_util_percent",
                "memory_used_mb",
                "memory_total_mb",
                "temperature_c",
            ),
        )
        return {
            "gpus": usage,
        }

    # =========================================================
    # GPU PROCESSES
    # =========================================================

    def gpu_processes(self) -> dict[str, Any]:
        """
        List the compute processes reported by ``nvidia-smi``.

        Returns:
            ``{"count": int, "processes": [...]}`` where every entry has the
            keys ``pid``, ``name`` and ``memory``, or the error dictionary
            when the command fails.
        """
        lines, error = self._query_lines(
            "--query-compute-apps=pid,process_name,used_memory",
            "--format=csv,noheader",
        )
        if error is not None:
            return error

        processes = self._parse_rows(lines, ("pid", "name", "memory"))
        return {
            "count": len(processes),
            "processes": processes,
        }

    # =========================================================
    # FULL SNAPSHOT
    # =========================================================

    def full_snapshot(self) -> dict[str, Any]:
        """
        Combine the ``info``, ``usage`` and ``processes`` results.

        Each section is produced independently, so a section may hold an
        error dictionary while the others hold data.
        """
        return {
            "info": self.gpu_info(),
            "usage": self.gpu_usage(),
            "processes": self.gpu_processes(),
        }
|
|
|
|
|
|
# =========================================================
|
|
# REGISTER
|
|
# =========================================================
|
|
|
|
# Instantiate the tool and hand it to the shared registry at import time.
registry.register(GPUTool())
