Add dashboard summary view, server detail routing, and VM listing

Rearchitect to pure aggregator dashboard (no /proc) with bare agents
on each host. Agents report VM data via sudo virsh (cached 10s).
UI rewritten with hash-based routing: summary card grid and per-server
detail view with CPU grid, memory, load, uptime, and VM table.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 19:28:07 -06:00
parent 3a9ba28552
commit fb79f81527
4 changed files with 310 additions and 162 deletions

97
app.py
View File

@@ -1,5 +1,6 @@
import json
import os
import subprocess
import threading
import time
import urllib.request
@@ -10,7 +11,7 @@ app = Flask(__name__)
PROC = os.environ.get("SYSMON_PROC", "/host/proc")
# Server configuration: "name:url,name:url" where "local" means read from /proc
# Server configuration: "name:url,name:url"
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
if SYSMON_SERVERS:
@@ -18,13 +19,20 @@ if SYSMON_SERVERS:
name, url = entry.strip().split(":", 1)
servers.append({"name": name.strip(), "url": url.strip()})
# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
IS_DASHBOARD = bool(servers)
# Shared state for per-core CPU usage
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
# VM cache
_vm_cache = {"data": [], "ts": 0}
_vm_lock = threading.Lock()
VM_CACHE_TTL = 10
def parse_proc_stat():
"""Parse /proc/stat and return per-cpu jiffies as list of (id, user+nice+system, total)."""
cores = []
overall = None
with open(f"{PROC}/stat") as f:
@@ -45,7 +53,6 @@ def parse_proc_stat():
def cpu_sampler():
"""Background thread: sample /proc/stat every 1s, compute deltas."""
global cpu_snapshot
prev_cores, prev_overall = parse_proc_stat()
prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}
@@ -79,7 +86,6 @@ def cpu_sampler():
def get_memory():
"""Parse /proc/meminfo, return dict with MB values."""
info = {}
with open(f"{PROC}/meminfo") as f:
for line in f:
@@ -103,7 +109,6 @@ def get_memory():
def get_load():
"""Parse /proc/loadavg."""
with open(f"{PROC}/loadavg") as f:
parts = f.read().split()
return {
@@ -114,7 +119,6 @@ def get_load():
def get_uptime():
"""Parse /proc/uptime, return human-readable string."""
with open(f"{PROC}/uptime") as f:
secs = float(f.read().split()[0])
days = int(secs // 86400)
@@ -129,22 +133,72 @@ def get_uptime():
return {"seconds": round(secs), "human": " ".join(parts)}
def get_vms():
    """Return the VM list via ``sudo virsh``, cached for VM_CACHE_TTL seconds.

    Returns a list of dicts with keys: name, state, vcpus, memory_mb,
    autostart. On any virsh failure an empty list is returned — and the
    failure is cached too, so an unavailable virsh is not re-invoked through
    sudo on every single request.
    """
    with _vm_lock:
        # Serve from cache while fresh. The lock is released before the slow
        # subprocess calls below so readers are never blocked behind virsh.
        if time.time() - _vm_cache["ts"] < VM_CACHE_TTL:
            return _vm_cache["data"]
    vms = []
    try:
        result = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            names = [n.strip() for n in result.stdout.strip().split("\n") if n.strip()]
            for name in names:
                vm = _query_dominfo(name)
                if vm is not None:
                    vms.append(vm)
    except (subprocess.SubprocessError, OSError):
        # virsh missing, sudo denied, or a timeout: report no VMs, but still
        # fall through to cache the empty result for the TTL window.
        vms = []
    with _vm_lock:
        _vm_cache["data"] = vms
        _vm_cache["ts"] = time.time()
    return vms


def _query_dominfo(name):
    """Run ``sudo virsh dominfo <name>`` and parse it into a VM dict.

    Returns None when the command fails. Malformed individual fields are
    skipped (keeping their defaults) so one bad line cannot discard the
    rest of the VM list.
    """
    info = subprocess.run(
        ["sudo", "virsh", "dominfo", name],
        capture_output=True, text=True, timeout=5
    )
    if info.returncode != 0:
        return None
    vm = {"name": name, "state": "unknown", "vcpus": 0, "memory_mb": 0, "autostart": False}
    for line in info.stdout.split("\n"):
        if ":" not in line:
            continue
        key, val = line.split(":", 1)
        key = key.strip()
        val = val.strip()
        if key == "State":
            vm["state"] = val
        elif key == "CPU(s)":
            try:
                vm["vcpus"] = int(val)
            except ValueError:
                pass  # keep default 0 rather than aborting the whole listing
        elif key == "Max memory":
            # virsh reports memory in KiB
            try:
                vm["memory_mb"] = int(val.split()[0]) // 1024
            except (ValueError, IndexError):
                pass
        elif key == "Autostart":
            vm["autostart"] = val.lower() in ("enable", "enabled")
    return vm
def get_local_stats():
    """Build the stats dict for this host from local /proc readings.

    Combines the background sampler's CPU snapshot with freshly parsed
    memory, load, uptime, and VM data. Used in agent mode only; dashboard
    mode aggregates remote agents instead.
    """
    with cpu_lock:
        # Copy under the lock so the sampler thread cannot mutate mid-read.
        snap = cpu_snapshot.copy()
    stats = {
        "cores": snap["cores"],
        "overall_cpu": snap["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(snap["cores"]),
        "vms": get_vms(),
    }
    return stats
def fetch_remote_stats(url, timeout=2):
"""Fetch /api/stats from a remote agent. Returns dict or None on failure."""
def fetch_remote_stats(url, timeout=3):
try:
req = urllib.request.Request(url.rstrip("/") + "/api/stats")
with urllib.request.urlopen(req, timeout=timeout) as resp:
@@ -160,6 +214,8 @@ def index():
@app.route("/api/stats")
def stats():
    """Agent endpoint: return this host's stats; rejected in dashboard mode."""
    if not IS_DASHBOARD:
        return jsonify(get_local_stats())
    # Dashboard-only instances have no local /proc data to serve.
    return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
@@ -167,23 +223,18 @@ def stats():
def all_servers():
    """Dashboard endpoint: poll every configured agent and aggregate results.

    Each reachable agent contributes its stats dict tagged with name and
    status "online"; unreachable agents are still reported (status
    "unreachable") so the UI can render an offline card instead of silently
    dropping the server.
    """
    results = []
    for srv in servers:
        data = fetch_remote_stats(srv["url"])
        if data:
            data["name"] = srv["name"]
            data["status"] = "online"
            # Older agents may omit num_cores; derive it from the cores list.
            data.setdefault("num_cores", len(data.get("cores", [])))
            results.append(data)
        else:
            results.append({"name": srv["name"], "status": "unreachable"})
    return jsonify(servers=results)
# Only start the CPU sampler in agent mode (not dashboard-only): dashboard
# instances have no /proc to read, so the sampler thread has nothing to do.
if not IS_DASHBOARD:
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()