Add multi-server system monitor with agent/dashboard architecture

Single Flask codebase runs in agent mode (serves /api/stats from local /proc) or dashboard mode (aggregates local + remote agents). Currently monitors compute1 (64-core, podman container) and console (16-core, bare systemd service).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 19:09:34 -06:00
commit 3a9ba28552
5 changed files with 569 additions and 0 deletions
--- a/11
+++ b/11
@@ -0,0 +1,11 @@
+FROM python:3.11-slim
+# Runtime dependencies: Flask for the app, gunicorn as the server launched by CMD.
+RUN pip install --no-cache-dir flask gunicorn
+# Only the application module and its templates directory are copied into the image.
+WORKDIR /app
+COPY app.py .
+COPY templates/ templates/
+# Port that gunicorn binds below.
+EXPOSE 8083
+# One worker with two threads, serving the `app` object from app.py on all interfaces.
+CMD ["gunicorn", "-b", "0.0.0.0:8083", "-w", "1", "--threads", "2", "app:app"]
--- a/app.py
+++ b/app.py
@@ -0,0 +1,189 @@
+import json
+import os
+import threading
+import time
+import urllib.request
+
+from flask import Flask, jsonify, render_template
+
+app = Flask(__name__)
+
+PROC = os.environ.get("SYSMON_PROC", "/host/proc")
+
+# Server configuration: "name:url,name:url" where "local" means read from /proc
+SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
+servers = []
+if SYSMON_SERVERS:
+    for entry in SYSMON_SERVERS.split(","):
+        name, url = entry.strip().split(":", 1)
+        servers.append({"name": name.strip(), "url": url.strip()})
+
+# Shared state for per-core CPU usage
+cpu_snapshot = {"cores": [], "overall": 0.0}
+cpu_lock = threading.Lock()
+
+
+def parse_proc_stat():
+    """Parse {PROC}/stat; return (cores, overall): cores is [(core_id, busy, total)], overall is (busy, total) or None."""
+    cores = []
+    overall = None
+    with open(f"{PROC}/stat") as f:
+        for line in f:
+            if line.startswith("cpu"):  # the aggregate "cpu" line and each per-core "cpuN" line
+                parts = line.split()
+                name = parts[0]
+                vals = list(map(int, parts[1:]))
+                idle = vals[3] + vals[4]  # 4th and 5th counters (idle and iowait in proc(5)) count as idle
+                total = sum(vals[:8])  # only the first eight counters are summed; later ones are ignored
+                busy = total - idle
+                if name == "cpu":
+                    overall = (busy, total)
+                else:
+                    core_id = int(name[3:])  # numeric suffix of "cpuN"
+                    cores.append((core_id, busy, total))
+    return cores, overall
+
+
+def cpu_sampler():
+    """Run forever: re-read {PROC}/stat every second and publish busy/total deltas as percentages in cpu_snapshot."""
+    global cpu_snapshot
+    prev_cores, prev_overall = parse_proc_stat()
+    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}
+
+    while True:
+        time.sleep(1)
+        cur_cores, cur_overall = parse_proc_stat()
+        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}
+
+        result = []
+        for cid in sorted(cur_map.keys()):
+            if cid in prev_map:
+                db = cur_map[cid][0] - prev_map[cid][0]  # busy delta since the previous sample
+                dt = cur_map[cid][1] - prev_map[cid][1]  # total delta since the previous sample
+                pct = (db / dt * 100) if dt > 0 else 0.0
+            else:
+                pct = 0.0  # core was absent from the previous sample, so there is no delta yet
+            result.append({"id": cid, "usage_percent": round(pct, 1)})
+
+        overall_pct = 0.0
+        if prev_overall and cur_overall:
+            db = cur_overall[0] - prev_overall[0]
+            dt = cur_overall[1] - prev_overall[1]
+            overall_pct = (db / dt * 100) if dt > 0 else 0.0
+
+        with cpu_lock:  # publish a brand-new dict; readers copy it under the same lock
+            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}
+
+        prev_map = cur_map
+        prev_overall = cur_overall
+
+
+def get_memory():
+    """Parse {PROC}/meminfo; return total, used, available and cached sizes in MB plus the used percentage."""
+    info = {}
+    with open(f"{PROC}/meminfo") as f:
+        for line in f:
+            parts = line.split()
+            key = parts[0].rstrip(":")
+            val_kb = int(parts[1])  # second column is the numeric value (kB, going by the name)
+            info[key] = val_kb
+
+    total = info["MemTotal"]
+    available = info["MemAvailable"]
+    cached = info.get("Cached", 0) + info.get("Buffers", 0)  # page cache and buffers reported together
+    used = total - available  # "used" means everything not counted as MemAvailable
+
+    return {
+        "total_mb": round(total / 1024),
+        "used_mb": round(used / 1024),
+        "available_mb": round(available / 1024),
+        "cached_mb": round(cached / 1024),
+        "percent": round(used / total * 100, 1),
+    }
+
+
+def get_load():
+    """Parse {PROC}/loadavg; return its first three fields as floats under load1, load5 and load15."""
+    with open(f"{PROC}/loadavg") as f:
+        parts = f.read().split()  # whitespace-separated fields; only the first three are used
+    return {
+        "load1": float(parts[0]),
+        "load5": float(parts[1]),
+        "load15": float(parts[2]),
+    }
+
+
+def get_uptime():
+    """Parse {PROC}/uptime; return {"seconds": rounded int, "human": "Nd Nh Nm"} with zero days/hours omitted."""
+    with open(f"{PROC}/uptime") as f:
+        secs = float(f.read().split()[0])  # only the first field of the file is used
+    days = int(secs // 86400)
+    hours = int((secs % 86400) // 3600)
+    mins = int((secs % 3600) // 60)
+    parts = []
+    if days:
+        parts.append(f"{days}d")
+    if hours:
+        parts.append(f"{hours}h")
+    parts.append(f"{mins}m")  # minutes are always shown, even when 0
+    return {"seconds": round(secs), "human": " ".join(parts)}
+
+
+def get_local_stats():
+    """Build the stats payload for this host: latest CPU snapshot plus freshly read memory, load and uptime."""
+    with cpu_lock:
+        snap = cpu_snapshot.copy()  # shallow copy suffices: the sampler rebinds the dict, it never mutates it
+    return {
+        "cores": snap["cores"],
+        "overall_cpu": snap["overall"],
+        "memory": get_memory(),
+        "load": get_load(),
+        "uptime": get_uptime(),
+        "num_cores": len(snap["cores"]),
+    }
+
+
+def fetch_remote_stats(url, timeout=2):
+    """Fetch /api/stats from a remote agent. Returns dict or None on failure."""
+    try:
+        req = urllib.request.Request(url.rstrip("/") + "/api/stats")
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read().decode())
+    except Exception:
+        return None
+
+
+@app.route("/")
+def index():  # dashboard page; the script in index.html polls /api/servers
+    return render_template("index.html")
+
+
+@app.route("/api/stats")
+def stats():  # agent endpoint: this host's stats as JSON (what fetch_remote_stats requests)
+    return jsonify(get_local_stats())
+
+
+@app.route("/api/servers")
+def all_servers():  # dashboard endpoint: one entry per configured server, in configuration order
+    results = []
+    for srv in servers:
+        if srv["url"] == "local":  # this host: read stats directly instead of over HTTP
+            data = get_local_stats()
+            data["name"] = srv["name"]
+            data["status"] = "online"
+            results.append(data)
+        else:
+            data = fetch_remote_stats(srv["url"])  # remotes are queried one after another
+            if data:  # None (fetch failed) and an empty payload both count as unreachable
+                data["name"] = srv["name"]
+                data["status"] = "online"
+                data.setdefault("num_cores", len(data.get("cores", [])))  # fill in num_cores if the agent omitted it
+                results.append(data)
+            else:
+                results.append({"name": srv["name"], "status": "unreachable"})
+    return jsonify(servers=results)
+
+
+# Start the background sampler when the module is imported; as a daemon thread it does not keep the process alive.
+t = threading.Thread(target=cpu_sampler, daemon=True)
+t.start()
--- a/deploy-agent.sh
+++ b/deploy-agent.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+HOST="${1:-console}"
+
+echo "Deploying sysmon agent to $HOST..."
+
+# Create directory and copy app
+ssh "$HOST" 'mkdir -p ~/sysmon-agent/templates'
+scp ~/sysmon/app.py "$HOST":~/sysmon-agent/
+scp ~/sysmon/templates/index.html "$HOST":~/sysmon-agent/templates/
+
+# Install dependencies and set up systemd service
+ssh "$HOST" bash <<'REMOTE'
+set -e
+
+pip3 install --break-system-packages --quiet flask gunicorn 2>/dev/null || \
+  pip3 install --quiet flask gunicorn
+
+# Create systemd service
+sudo tee /etc/systemd/system/sysmon-agent.service > /dev/null <<EOF
+[Unit]
+Description=Sysmon Agent
+After=network.target
+
+[Service]
+Type=simple
+User=kamaji
+WorkingDirectory=/home/kamaji/sysmon-agent
+Environment=SYSMON_PROC=/proc
+ExecStart=/home/kamaji/.local/bin/gunicorn -b 0.0.0.0:8083 -w 1 --threads 2 app:app
+Restart=always
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable sysmon-agent
+sudo systemctl restart sysmon-agent
+
+echo "Agent status:"
+sudo systemctl status sysmon-agent --no-pager -l
+REMOTE
+
+echo ""
+echo "Verify: curl http://$HOST:8083/api/stats"
--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+# Rebuild the sysmon image from ~/sysmon and replace the running dashboard container.
+cd ~/sysmon
+
+echo "Building sysmon container..."
+podman build -t sysmon .
+# "|| true" below keeps set -e from aborting when no container named sysmon exists yet.
+echo "Stopping existing container (if any)..."
+podman stop sysmon 2>/dev/null || true
+podman rm sysmon 2>/dev/null || true
+# Host /proc is mounted read-only at /host/proc; SYSMON_SERVERS lists one local and one remote agent.
+echo "Starting sysmon dashboard on port 8083..."
+podman run -d \
+  --name sysmon \
+  -p 8083:8083 \
+  --security-opt label=disable \
+  -v /proc:/host/proc:ro \
+  -e SYSMON_SERVERS="compute1:local,console:http://192.168.88.5:8083" \
+  --restart unless-stopped \
+  sysmon
+
+echo "Done. Dashboard at http://$(hostname):8083"
--- a/templates/index.html
+++ b/templates/index.html
@@ -0,0 +1,298 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>System Monitor</title>
+<style>
+  * { margin: 0; padding: 0; box-sizing: border-box; }
+  body {
+    background: #0d1117; color: #c9d1d9;
+    font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', monospace;
+    font-size: 14px; padding: 20px;
+  }
+  h1 { font-size: 18px; color: #58a6ff; margin-bottom: 4px; }
+  .subtitle { color: #484f58; font-size: 12px; margin-bottom: 20px; }
+  .section { margin-bottom: 24px; }
+  .section-title {
+    font-size: 13px; color: #8b949e; text-transform: uppercase;
+    letter-spacing: 1px; margin-bottom: 10px; border-bottom: 1px solid #21262d;
+    padding-bottom: 6px;
+  }
+
+  /* Server sections */
+  .server-section {
+    margin-bottom: 32px; padding: 16px;
+    border: 1px solid #21262d; border-radius: 8px;
+  }
+  .server-header {
+    display: flex; align-items: center; gap: 10px; margin-bottom: 16px;
+  }
+  .server-name { font-size: 16px; color: #58a6ff; font-weight: bold; }
+  .server-badge {
+    font-size: 11px; padding: 2px 8px; border-radius: 10px; font-weight: bold;
+  }
+  .server-badge.online { background: #238636; color: #fff; }
+  .server-badge.unreachable { background: #da3633; color: #fff; }
+  .server-unreachable {
+    color: #484f58; font-style: italic; padding: 20px; text-align: center;
+  }
+
+  /* CPU grid */
+  .cpu-grid { display: grid; gap: 4px; }
+  .core-cell {
+    background: #161b22; border: 1px solid #21262d; border-radius: 4px;
+    padding: 4px; text-align: center; position: relative; overflow: hidden;
+    min-height: 44px; display: flex; flex-direction: column; justify-content: center;
+  }
+  .core-id { font-size: 10px; color: #484f58; }
+  .core-pct { font-size: 13px; font-weight: bold; }
+  .core-bar {
+    position: absolute; bottom: 0; left: 0; right: 0; height: 3px;
+    background: #21262d; border-radius: 0 0 3px 3px;
+  }
+  .core-bar-fill {
+    height: 100%; border-radius: 0 0 3px 3px;
+    transition: width 0.3s ease, background-color 0.3s ease;
+  }
+
+  /* Bars */
+  .bar-container {
+    background: #161b22; border: 1px solid #21262d; border-radius: 6px;
+    height: 32px; position: relative; overflow: hidden;
+  }
+  .bar-fill {
+    height: 100%; transition: width 0.3s ease;
+    display: flex; align-items: center; padding-left: 10px;
+    font-size: 13px; font-weight: bold; white-space: nowrap;
+  }
+
+  /* Memory bar */
+  .mem-bar { display: flex; height: 100%; }
+  .mem-bar > div {
+    height: 100%; display: flex; align-items: center; justify-content: center;
+    font-size: 11px; font-weight: bold; white-space: nowrap; overflow: hidden;
+    transition: width 0.3s ease;
+  }
+  .mem-used { background: #da3633; }
+  .mem-cached { background: #d29922; }
+  .mem-free { background: #238636; }
+  .mem-details {
+    display: flex; gap: 20px; margin-top: 6px; font-size: 12px; color: #8b949e;
+    flex-wrap: wrap;
+  }
+  .mem-details span { display: flex; align-items: center; gap: 4px; }
+  .mem-dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
+
+  /* Load */
+  .load-row { display: flex; gap: 16px; flex-wrap: wrap; }
+  .load-item {
+    background: #161b22; border: 1px solid #21262d; border-radius: 6px;
+    padding: 12px 20px; text-align: center; flex: 1; min-width: 120px;
+  }
+  .load-label { font-size: 11px; color: #484f58; margin-bottom: 4px; }
+  .load-value { font-size: 24px; font-weight: bold; }
+
+  /* Uptime */
+  .uptime {
+    background: #161b22; border: 1px solid #21262d; border-radius: 6px;
+    padding: 10px 16px; display: inline-block; font-size: 15px;
+  }
+
+  /* Status indicator */
+  .status {
+    position: fixed; top: 12px; right: 20px; font-size: 11px; color: #484f58;
+  }
+  .status-dot {
+    display: inline-block; width: 6px; height: 6px; border-radius: 50%;
+    background: #238636; margin-right: 4px; vertical-align: middle;
+  }
+
+  @media (max-width: 700px) {
+    body { padding: 12px; }
+    .server-section { padding: 10px; }
+  }
+</style>
+</head>
+<body>
+<h1>System Monitor</h1>
+<div class="subtitle" id="subtitle">connecting...</div>
+<div class="status"><span class="status-dot" id="status-dot"></span><span id="status-text">connecting...</span></div>
+
+<div id="servers-container"></div>
+
+<script>
+const container = document.getElementById('servers-container');  // parent element of every server section
+const serverSections = {};  // server name -> { built, coreCount } for sections already in the DOM
+
+function usageColor(pct) {  // usage percent -> colour used for the core text and bar
+  if (pct < 30) return '#238636';  // green
+  if (pct < 60) return '#d29922';  // amber
+  if (pct < 85) return '#da3633';  // red
+  return '#f85149';  // brighter red for 85 and above
+}
+
+function loadColor(val, numCores) {  // same palette as usageColor, keyed on load per core
+  const ratio = val / numCores;  // not finite when numCores is 0; that case falls through to the last return
+  if (ratio < 0.3) return '#238636';
+  if (ratio < 0.6) return '#d29922';
+  if (ratio < 0.9) return '#da3633';
+  return '#f85149';
+}
+
+function formatMB(mb) {  // MB count -> display string
+  if (mb >= 1024) return (mb / 1024).toFixed(1) + ' GB';  // one decimal once the value reaches 1024 MB
+  return mb + ' MB';  // smaller values are printed unchanged
+}
+
+function gridCols(numCores) {  // number of CSS grid columns for a CPU grid of numCores cells
+  if (numCores <= 4) return 2;
+  if (numCores <= 8) return 4;  // same result as the next branch
+  if (numCores <= 16) return 4;
+  return 8;  // anything above 16 cores
+}
+
+function buildServerSection(name) {  // create the empty DOM skeleton for one server and register it
+  const section = document.createElement('div');
+  section.className = 'server-section';
+  section.id = 'srv-' + name;  // every id below is derived from the server name
+  section.innerHTML =  // NOTE: name is concatenated unescaped, so it has to be plain, trusted text
+    '<div class="server-header">' +
+      '<div class="server-name">' + name + '</div>' +
+      '<div class="server-badge" id="badge-' + name + '">...</div>' +
+    '</div>' +
+    '<div id="content-' + name + '">' +
+      '<div class="section">' +
+        '<div class="section-title">CPU Cores \u2014 <span id="overall-' + name + '">0</span>% overall</div>' +
+        '<div class="cpu-grid" id="grid-' + name + '"></div>' +
+      '</div>' +
+      '<div class="section">' +
+        '<div class="section-title">Memory</div>' +
+        '<div class="bar-container"><div class="mem-bar" id="mem-' + name + '"></div></div>' +
+        '<div class="mem-details" id="memd-' + name + '"></div>' +
+      '</div>' +
+      '<div class="section">' +
+        '<div class="section-title">Load Average</div>' +
+        '<div class="load-row" id="load-' + name + '"></div>' +
+      '</div>' +
+      '<div class="section">' +
+        '<div class="section-title">Uptime</div>' +
+        '<div class="uptime" id="up-' + name + '">\u2014</div>' +
+      '</div>' +
+    '</div>';
+  container.appendChild(section);  // new sections always go at the end of the container
+  serverSections[name] = { built: true, coreCount: 0 };  // coreCount 0 makes ensureCores build the grid
+}
+
+function ensureCores(name, numCores) {  // (re)build the CPU grid of server `name` when its core count changed
+  if (serverSections[name].coreCount === numCores) return;  // grid already has the right number of cells
+  const grid = document.getElementById('grid-' + name);
+  grid.style.gridTemplateColumns = 'repeat(' + gridCols(numCores) + ', 1fr)';
+  grid.innerHTML = '';  // drop any existing cells before recreating them
+  for (let i = 0; i < numCores; i++) {  // cells are numbered 0..numCores-1; updateServer looks them up by core.id
+    const cell = document.createElement('div');
+    cell.className = 'core-cell';
+    cell.innerHTML =
+      '<div class="core-id">' + i + '</div>' +
+      '<div class="core-pct" id="cpct-' + name + '-' + i + '">\u2014</div>' +
+      '<div class="core-bar"><div class="core-bar-fill" id="cbar-' + name + '-' + i + '"></div></div>';
+    grid.appendChild(cell);
+  }
+  serverSections[name].coreCount = numCores;  // remember the size so the next call can return early
+}
+
+function updateServer(srv) {
+  if (!serverSections[srv.name]) buildServerSection(srv.name);
+
+  const badge = document.getElementById('badge-' + srv.name);
+  const content = document.getElementById('content-' + srv.name);
+
+  if (srv.status === 'unreachable') {
+    badge.textContent = 'unreachable';
+    badge.className = 'server-badge unreachable';
+    content.innerHTML = '<div class="server-unreachable">Server unreachable</div>';
+    serverSections[srv.name].coreCount = 0;  // force rebuild on reconnect
+    return;
+  }
+
+  badge.textContent = 'online';
+  badge.className = 'server-badge online';
+
+  // Restore content if it was showing unreachable
+  if (content.querySelector('.server-unreachable')) {
+    const section = document.getElementById('srv-' + srv.name);
+    section.remove();
+    delete serverSections[srv.name];
+    buildServerSection(srv.name);
+  }
+
+  const numCores = srv.num_cores || srv.cores.length;
+  ensureCores(srv.name, numCores);
+
+  // CPU cores
+  for (const core of srv.cores) {
+    const pctEl = document.getElementById('cpct-' + srv.name + '-' + core.id);
+    const barEl = document.getElementById('cbar-' + srv.name + '-' + core.id);
+    if (pctEl) {
+      pctEl.textContent = core.usage_percent + '%';
+      pctEl.style.color = usageColor(core.usage_percent);
+    }
+    if (barEl) {
+      barEl.style.width = core.usage_percent + '%';
+      barEl.style.backgroundColor = usageColor(core.usage_percent);
+    }
+  }
+
+  // Overall CPU
+  document.getElementById('overall-' + srv.name).textContent = srv.overall_cpu;
+
+  // Memory
+  const mem = srv.memory;
+  const usedPct = (mem.used_mb - mem.cached_mb) / mem.total_mb * 100;
+  const cachedPct = mem.cached_mb / mem.total_mb * 100;
+  const freePct = 100 - usedPct - cachedPct;
+  document.getElementById('mem-' + srv.name).innerHTML =
+    '<div class="mem-used" style="width:' + usedPct + '%">' + (usedPct > 5 ? formatMB(mem.used_mb - mem.cached_mb) : '') + '</div>' +
+    '<div class="mem-cached" style="width:' + cachedPct + '%">' + (cachedPct > 5 ? formatMB(mem.cached_mb) : '') + '</div>' +
+    '<div class="mem-free" style="width:' + freePct + '%">' + (freePct > 5 ? formatMB(mem.available_mb) : '') + '</div>';
+  document.getElementById('memd-' + srv.name).innerHTML =
+    '<span><span class="mem-dot" style="background:#da3633"></span>Used: ' + formatMB(mem.used_mb - mem.cached_mb) + '</span>' +
+    '<span><span class="mem-dot" style="background:#d29922"></span>Cached: ' + formatMB(mem.cached_mb) + '</span>' +
+    '<span><span class="mem-dot" style="background:#238636"></span>Available: ' + formatMB(mem.available_mb) + '</span>' +
+    '<span style="color:#484f58">Total: ' + formatMB(mem.total_mb) + '</span>' +
+    '<span style="color:#484f58">' + mem.percent + '%</span>';
+
+  // Load
+  const load = srv.load;
+  document.getElementById('load-' + srv.name).innerHTML =
+    '<div class="load-item"><div class="load-label">1 min</div><div class="load-value" style="color:' + loadColor(load.load1, numCores) + '">' + load.load1.toFixed(2) + '</div></div>' +
+    '<div class="load-item"><div class="load-label">5 min</div><div class="load-value" style="color:' + loadColor(load.load5, numCores) + '">' + load.load5.toFixed(2) + '</div></div>' +
+    '<div class="load-item"><div class="load-label">15 min</div><div class="load-value" style="color:' + loadColor(load.load15, numCores) + '">' + load.load15.toFixed(2) + '</div></div>';
+
+  // Uptime
+  document.getElementById('up-' + srv.name).textContent = srv.uptime.human;
+}
+
+let fails = 0;  // consecutive failed polls, shown in the status text
+async function poll() {  // one refresh: fetch /api/servers, redraw every server, update subtitle and status
+  try {
+    const r = await fetch('/api/servers');
+    const data = await r.json();
+    for (const srv of data.servers) updateServer(srv);
+    const online = data.servers.filter(s => s.status === 'online').length;
+    document.getElementById('subtitle').textContent = online + '/' + data.servers.length + ' servers online';
+    fails = 0;  // any successful refresh resets the failure counter
+    document.getElementById('status-dot').style.background = '#238636';
+    document.getElementById('status-text').textContent = 'live';
+  } catch (e) {  // network errors, invalid JSON and exceptions thrown while rendering all end up here
+    fails++;
+    document.getElementById('status-dot').style.background = '#da3633';
+    document.getElementById('status-text').textContent = 'error (' + fails + ')';
+  }
+}
+
+// Chain polls (next one 1s after the previous finishes) so a slow /api/servers reply cannot cause overlapping requests.
+(async function loop() { try { await poll(); } finally { setTimeout(loop, 1000); } })();
+</script>
+</body>
+</html>