Add dashboard summary view, server detail routing, and VM listing

Rearchitect to pure aggregator dashboard (no /proc) with bare agents
on each host. Agents report VM data via sudo virsh (cached 10s).
UI rewritten with hash-based routing: summary card grid and per-server
detail view with CPU grid, memory, load, uptime, and VM table.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 19:28:07 -06:00
parent 3a9ba28552
commit fb79f81527
4 changed files with 310 additions and 162 deletions

97
app.py
View File

@@ -1,5 +1,6 @@
import json
import os
import subprocess
import threading
import time
import urllib.request
@@ -10,7 +11,7 @@ app = Flask(__name__)
PROC = os.environ.get("SYSMON_PROC", "/host/proc")
# Server configuration: "name:url,name:url" where "local" means read from /proc
# Server configuration: "name:url,name:url"
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
if SYSMON_SERVERS:
@@ -18,13 +19,20 @@ if SYSMON_SERVERS:
name, url = entry.strip().split(":", 1)
servers.append({"name": name.strip(), "url": url.strip()})
# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
IS_DASHBOARD = bool(servers)
# Shared state for per-core CPU usage
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
# VM cache
_vm_cache = {"data": [], "ts": 0}
_vm_lock = threading.Lock()
VM_CACHE_TTL = 10
def parse_proc_stat():
"""Parse /proc/stat and return per-cpu jiffies as list of (id, user+nice+system, total)."""
cores = []
overall = None
with open(f"{PROC}/stat") as f:
@@ -45,7 +53,6 @@ def parse_proc_stat():
def cpu_sampler():
"""Background thread: sample /proc/stat every 1s, compute deltas."""
global cpu_snapshot
prev_cores, prev_overall = parse_proc_stat()
prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}
@@ -79,7 +86,6 @@ def cpu_sampler():
def get_memory():
"""Parse /proc/meminfo, return dict with MB values."""
info = {}
with open(f"{PROC}/meminfo") as f:
for line in f:
@@ -103,7 +109,6 @@ def get_memory():
def get_load():
"""Parse /proc/loadavg."""
with open(f"{PROC}/loadavg") as f:
parts = f.read().split()
return {
@@ -114,7 +119,6 @@ def get_load():
def get_uptime():
"""Parse /proc/uptime, return human-readable string."""
with open(f"{PROC}/uptime") as f:
secs = float(f.read().split()[0])
days = int(secs // 86400)
@@ -129,22 +133,72 @@ def get_uptime():
return {"seconds": round(secs), "human": " ".join(parts)}
def get_vms():
    """Return the VM list via ``sudo virsh``, cached for VM_CACHE_TTL seconds.

    Returns a list of dicts with keys: name, state, vcpus, memory_mb,
    autostart. On any virsh failure an empty list is returned — and the
    failure is cached too, so an unavailable virsh is not re-invoked through
    sudo on every single request.
    """
    with _vm_lock:
        # Serve from cache while fresh. The lock is released before the slow
        # subprocess calls below so readers are never blocked behind virsh.
        if time.time() - _vm_cache["ts"] < VM_CACHE_TTL:
            return _vm_cache["data"]
    vms = []
    try:
        result = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            names = [n.strip() for n in result.stdout.strip().split("\n") if n.strip()]
            for name in names:
                vm = _query_dominfo(name)
                if vm is not None:
                    vms.append(vm)
    except (subprocess.SubprocessError, OSError):
        # virsh missing, sudo denied, or a timeout: report no VMs, but still
        # fall through to cache the empty result for the TTL window.
        vms = []
    with _vm_lock:
        _vm_cache["data"] = vms
        _vm_cache["ts"] = time.time()
    return vms


def _query_dominfo(name):
    """Run ``sudo virsh dominfo <name>`` and parse it into a VM dict.

    Returns None when the command fails. Malformed individual fields are
    skipped (keeping their defaults) so one bad line cannot discard the
    rest of the VM list.
    """
    info = subprocess.run(
        ["sudo", "virsh", "dominfo", name],
        capture_output=True, text=True, timeout=5
    )
    if info.returncode != 0:
        return None
    vm = {"name": name, "state": "unknown", "vcpus": 0, "memory_mb": 0, "autostart": False}
    for line in info.stdout.split("\n"):
        if ":" not in line:
            continue
        key, val = line.split(":", 1)
        key = key.strip()
        val = val.strip()
        if key == "State":
            vm["state"] = val
        elif key == "CPU(s)":
            try:
                vm["vcpus"] = int(val)
            except ValueError:
                pass  # keep default 0 rather than aborting the whole listing
        elif key == "Max memory":
            # virsh reports memory in KiB
            try:
                vm["memory_mb"] = int(val.split()[0]) // 1024
            except (ValueError, IndexError):
                pass
        elif key == "Autostart":
            vm["autostart"] = val.lower() in ("enable", "enabled")
    return vm
def get_local_stats():
    """Build the stats dict for this host from local /proc readings.

    Combines the background sampler's CPU snapshot with freshly parsed
    memory, load, uptime, and VM data. Used in agent mode only; dashboard
    mode aggregates remote agents instead.
    """
    with cpu_lock:
        # Copy under the lock so the sampler thread cannot mutate mid-read.
        snap = cpu_snapshot.copy()
    stats = {
        "cores": snap["cores"],
        "overall_cpu": snap["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(snap["cores"]),
        "vms": get_vms(),
    }
    return stats
def fetch_remote_stats(url, timeout=2):
"""Fetch /api/stats from a remote agent. Returns dict or None on failure."""
def fetch_remote_stats(url, timeout=3):
try:
req = urllib.request.Request(url.rstrip("/") + "/api/stats")
with urllib.request.urlopen(req, timeout=timeout) as resp:
@@ -160,6 +214,8 @@ def index():
@app.route("/api/stats")
def stats():
    """Agent endpoint: return this host's stats; rejected in dashboard mode."""
    if not IS_DASHBOARD:
        return jsonify(get_local_stats())
    # Dashboard-only instances have no local /proc data to serve.
    return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
@@ -167,23 +223,18 @@ def stats():
def all_servers():
    """Dashboard endpoint: poll every configured agent and aggregate results.

    Each reachable agent contributes its stats dict tagged with name and
    status "online"; unreachable agents are still reported (status
    "unreachable") so the UI can render an offline card instead of silently
    dropping the server.
    """
    results = []
    for srv in servers:
        data = fetch_remote_stats(srv["url"])
        if data:
            data["name"] = srv["name"]
            data["status"] = "online"
            # Older agents may omit num_cores; derive it from the cores list.
            data.setdefault("num_cores", len(data.get("cores", [])))
            results.append(data)
        else:
            results.append({"name": srv["name"], "status": "unreachable"})
    return jsonify(servers=results)
# Only start the CPU sampler in agent mode (not dashboard-only): dashboard
# instances have no /proc to read, so the sampler thread has nothing to do.
if not IS_DASHBOARD:
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()