Add multi-server system monitor with agent/dashboard architecture
Single Flask codebase runs in agent mode (serves /api/stats from local /proc) or dashboard mode (aggregates local + remote agents). Currently monitors compute1 (64-core, podman container) and console (16-core, bare systemd service). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
189
app.py
Normal file
189
app.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
from flask import Flask, jsonify, render_template
|
||||
|
||||
app = Flask(__name__)

# Root of the proc filesystem to read stats from.  Defaults to /host/proc so
# the agent works inside a container with the host's /proc bind-mounted;
# set SYSMON_PROC=/proc when running directly on the host.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")

# Server configuration: comma-separated "name:url" pairs, where a url of
# "local" means read from this host's /proc instead of a remote agent,
# e.g. SYSMON_SERVERS="compute1:local,console:http://console:5000".
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
for entry in SYSMON_SERVERS.split(","):
    entry = entry.strip()
    if not entry or ":" not in entry:
        # Skip blank or malformed entries (no "name:url" separator) instead
        # of raising ValueError at import time and killing the whole app.
        continue
    name, url = entry.split(":", 1)
    servers.append({"name": name.strip(), "url": url.strip()})

# Latest per-core CPU sample published by the background sampler thread.
# Replaced wholesale under cpu_lock so readers always see a consistent pair.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
|
||||
|
||||
|
||||
def parse_proc_stat(proc=None):
    """Parse <proc>/stat and return per-CPU jiffy counts.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        (cores, overall) where cores is a list of (core_id, busy, total)
        tuples and overall is the (busy, total) pair for the aggregate
        "cpu" line (None if that line is absent).
    """
    proc = PROC if proc is None else proc
    cores = []
    overall = None
    with open(f"{proc}/stat") as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            parts = line.split()
            name = parts[0]
            vals = list(map(int, parts[1:]))
            # Fields: user nice system idle iowait irq softirq steal
            # [guest guest_nice].  guest/guest_nice are already included in
            # user/nice, so only the first 8 fields contribute to the total;
            # idle time counts both idle and iowait.
            idle = vals[3] + vals[4]
            total = sum(vals[:8])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                cores.append((int(name[3:]), busy, total))
    return cores, overall
|
||||
|
||||
|
||||
def cpu_sampler():
    """Background thread body: sample /proc/stat every 1s and compute deltas.

    Publishes results into the module-level cpu_snapshot dict (guarded by
    cpu_lock). Never returns; intended to run as a daemon thread.
    """
    global cpu_snapshot
    # Take an initial sample so the first loop iteration has a baseline
    # to diff against.
    prev_cores, prev_overall = parse_proc_stat()
    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}

    while True:
        time.sleep(1)
        cur_cores, cur_overall = parse_proc_stat()
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}

        result = []
        for cid in sorted(cur_map.keys()):
            if cid in prev_map:
                # Usage = busy-jiffy delta over total-jiffy delta since the
                # previous sample; dt guard avoids division by zero when the
                # counters did not advance between reads.
                db = cur_map[cid][0] - prev_map[cid][0]
                dt = cur_map[cid][1] - prev_map[cid][1]
                pct = (db / dt * 100) if dt > 0 else 0.0
            else:
                # Core appeared since the last sample (e.g. CPU hotplug);
                # no baseline yet, so report 0 until the next iteration.
                pct = 0.0
            result.append({"id": cid, "usage_percent": round(pct, 1)})

        overall_pct = 0.0
        if prev_overall and cur_overall:
            db = cur_overall[0] - prev_overall[0]
            dt = cur_overall[1] - prev_overall[1]
            overall_pct = (db / dt * 100) if dt > 0 else 0.0

        # Replace the snapshot wholesale under the lock so readers always
        # see a consistent cores/overall pair.
        with cpu_lock:
            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}

        prev_map = cur_map
        prev_overall = cur_overall
|
||||
|
||||
|
||||
def get_memory(proc=None):
    """Parse <proc>/meminfo and return memory usage in MB.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with total_mb, used_mb, available_mb, cached_mb and percent
        (used memory as a percentage of total, one decimal place).
    """
    proc = PROC if proc is None else proc
    info = {}
    with open(f"{proc}/meminfo") as f:
        for line in f:
            # Lines look like "MemTotal:  16384 kB"; keep raw kB values.
            parts = line.split()
            info[parts[0].rstrip(":")] = int(parts[1])

    total = info["MemTotal"]
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    # MemAvailable exists on kernels >= 3.14; on older kernels fall back to
    # a rough estimate (free + reclaimable page cache) instead of crashing.
    available = info.get("MemAvailable", info.get("MemFree", 0) + cached)
    used = total - available

    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
|
||||
|
||||
|
||||
def get_load(proc=None):
    """Parse <proc>/loadavg and return the 1/5/15-minute load averages.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with float keys load1, load5 and load15.
    """
    proc = PROC if proc is None else proc
    with open(f"{proc}/loadavg") as f:
        parts = f.read().split()
    return {
        "load1": float(parts[0]),
        "load5": float(parts[1]),
        "load15": float(parts[2]),
    }
|
||||
|
||||
|
||||
def get_uptime(proc=None):
    """Parse <proc>/uptime and return the host uptime.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with "seconds" (rounded int) and "human" (e.g. "3d 4h 12m";
        zero day/hour components are omitted, minutes always shown).
    """
    proc = PROC if proc is None else proc
    with open(f"{proc}/uptime") as f:
        # First field is uptime in seconds; second (idle time) is ignored.
        secs = float(f.read().split()[0])
    days = int(secs // 86400)
    hours = int((secs % 86400) // 3600)
    mins = int((secs % 3600) // 60)
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}
|
||||
|
||||
|
||||
def get_local_stats():
    """Assemble the full stats payload for this host from local /proc data."""
    # Copy under the lock so the sampler can't swap the snapshot mid-read.
    with cpu_lock:
        snap = cpu_snapshot.copy()
    cores = snap["cores"]
    stats = {
        "cores": cores,
        "overall_cpu": snap["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(cores),
    }
    return stats
|
||||
|
||||
|
||||
def fetch_remote_stats(url, timeout=2):
    """Fetch and decode /api/stats from a remote agent.

    Any failure (connection error, timeout, bad JSON) is treated as
    "agent unreachable" and reported by returning None.
    """
    endpoint = url.rstrip("/") + "/api/stats"
    try:
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            payload = resp.read().decode()
        return json.loads(payload)
    except Exception:
        return None
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Serve the dashboard HTML page (templates/index.html)."""
    return render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/stats")
def stats():
    """Agent endpoint: JSON stats for this host, polled by dashboard mode."""
    return jsonify(get_local_stats())
|
||||
|
||||
|
||||
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: aggregated stats for every configured server.

    Local entries are read straight from /proc; remote entries are proxied
    from their agents. An unreachable agent yields a stub entry with
    status "unreachable" instead of failing the whole response.
    """
    results = []
    for srv in servers:
        name, url = srv["name"], srv["url"]
        if url == "local":
            data = get_local_stats()
        else:
            data = fetch_remote_stats(url)
            if not data:
                results.append({"name": name, "status": "unreachable"})
                continue
            # Older agents may not report num_cores; derive it from the
            # per-core list when missing.
            data.setdefault("num_cores", len(data.get("cores", [])))
        data["name"] = name
        data["status"] = "online"
        results.append(data)
    return jsonify(servers=results)
|
||||
|
||||
|
||||
# Start the CPU sampler as a daemon thread at import time so both agent and
# dashboard modes publish fresh per-core usage; daemon=True lets the process
# exit without waiting for the infinite sampling loop.
t = threading.Thread(target=cpu_sampler, daemon=True)
t.start()
|
||||
Reference in New Issue
Block a user