commit 3a9ba28552332e1683d8039482f34bc348fbf964
Author: kamaji
Date:   Tue Feb 24 19:09:34 2026 -0600

    Add multi-server system monitor with agent/dashboard architecture

    Single Flask codebase runs in agent mode (serves /api/stats from local
    /proc) or dashboard mode (aggregates local + remote agents). Currently
    monitors compute1 (64-core, podman container) and console (16-core,
    bare systemd service).

    Co-Authored-By: Claude Opus 4.6

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..dcd7c6a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.11-slim
+
+RUN pip install --no-cache-dir flask gunicorn
+
+WORKDIR /app
+COPY app.py .
+COPY templates/ templates/
+
+EXPOSE 8083
+
+CMD ["gunicorn", "-b", "0.0.0.0:8083", "-w", "1", "--threads", "2", "app:app"]
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..502fe2e
--- /dev/null
+++ b/app.py
@@ -0,0 +1,189 @@
+import json
+import os
+import threading
+import time
+import urllib.request
+
+from flask import Flask, jsonify, render_template
+
+app = Flask(__name__)
+
+PROC = os.environ.get("SYSMON_PROC", "/host/proc")
+
+# Server configuration: "name:url,name:url" where "local" means read from /proc
+SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
+servers = []
+if SYSMON_SERVERS:
+    for entry in SYSMON_SERVERS.split(","):
+        name, url = entry.strip().split(":", 1)
+        servers.append({"name": name.strip(), "url": url.strip()})
+
+# Shared state for per-core CPU usage
+cpu_snapshot = {"cores": [], "overall": 0.0}
+cpu_lock = threading.Lock()
+
+
+def parse_proc_stat():
+    """Parse /proc/stat and return per-cpu jiffies as list of (id, user+nice+system, total)."""
+    cores = []
+    overall = None
+    with open(f"{PROC}/stat") as f:
+        for line in f:
+            if line.startswith("cpu"):
+                parts = line.split()
+                name = parts[0]
+                vals = list(map(int, parts[1:]))
+                idle = vals[3] + vals[4]
+                total = sum(vals[:8])
+                busy = total - idle
+                if name == "cpu":
+                    overall = (busy, total)
+                else:
+                    core_id = int(name[3:])
+                    cores.append((core_id, busy, total))
+    return cores, overall
+
+
+def cpu_sampler():
+    """Background thread: sample /proc/stat every 1s, compute deltas."""
+    global cpu_snapshot
+    prev_cores, prev_overall = parse_proc_stat()
+    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}
+
+    while True:
+        time.sleep(1)
+        cur_cores, cur_overall = parse_proc_stat()
+        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}
+
+        result = []
+        for cid in sorted(cur_map.keys()):
+            if cid in prev_map:
+                db = cur_map[cid][0] - prev_map[cid][0]
+                dt = cur_map[cid][1] - prev_map[cid][1]
+                pct = (db / dt * 100) if dt > 0 else 0.0
+            else:
+                pct = 0.0
+            result.append({"id": cid, "usage_percent": round(pct, 1)})
+
+        overall_pct = 0.0
+        if prev_overall and cur_overall:
+            db = cur_overall[0] - prev_overall[0]
+            dt = cur_overall[1] - prev_overall[1]
+            overall_pct = (db / dt * 100) if dt > 0 else 0.0
+
+        with cpu_lock:
+            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}
+
+        prev_map = cur_map
+        prev_overall = cur_overall
+
+
+def get_memory():
+    """Parse /proc/meminfo, return dict with MB values."""
+    info = {}
+    with open(f"{PROC}/meminfo") as f:
+        for line in f:
+            parts = line.split()
+            key = parts[0].rstrip(":")
+            val_kb = int(parts[1])
+            info[key] = val_kb
+
+    total = info["MemTotal"]
+    available = info["MemAvailable"]
+    cached = info.get("Cached", 0) + info.get("Buffers", 0)
+    used = total - available
+
+    return {
+        "total_mb": round(total / 1024),
+        "used_mb": round(used / 1024),
+        "available_mb": round(available / 1024),
+        "cached_mb": round(cached / 1024),
+        "percent": round(used / total * 100, 1),
+    }
+
+
+def get_load():
+    """Parse /proc/loadavg."""
+    with open(f"{PROC}/loadavg") as f:
+        parts = f.read().split()
+    return {
+        "load1": float(parts[0]),
+        "load5": float(parts[1]),
+        "load15": float(parts[2]),
+    }
+
+
+def get_uptime():
+    """Parse /proc/uptime, return human-readable string."""
+    with open(f"{PROC}/uptime") as f:
+        secs = float(f.read().split()[0])
+    days = int(secs // 86400)
+    hours = int((secs % 86400) // 3600)
+    mins = int((secs % 3600) // 60)
+    parts = []
+    if days:
+        parts.append(f"{days}d")
+    if hours:
+        parts.append(f"{hours}h")
+    parts.append(f"{mins}m")
+    return {"seconds": round(secs), "human": " ".join(parts)}
+
+
+def get_local_stats():
+    """Build stats dict from local /proc."""
+    with cpu_lock:
+        snap = cpu_snapshot.copy()
+    return {
+        "cores": snap["cores"],
+        "overall_cpu": snap["overall"],
+        "memory": get_memory(),
+        "load": get_load(),
+        "uptime": get_uptime(),
+        "num_cores": len(snap["cores"]),
+    }
+
+
+def fetch_remote_stats(url, timeout=2):
+    """Fetch /api/stats from a remote agent. Returns dict or None on failure."""
+    try:
+        req = urllib.request.Request(url.rstrip("/") + "/api/stats")
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read().decode())
+    except Exception:
+        return None
+
+
+@app.route("/")
+def index():
+    return render_template("index.html")
+
+
+@app.route("/api/stats")
+def stats():
+    return jsonify(get_local_stats())
+
+
+@app.route("/api/servers")
+def all_servers():
+    results = []
+    for srv in servers:
+        if srv["url"] == "local":
+            data = get_local_stats()
+            data["name"] = srv["name"]
+            data["status"] = "online"
+            results.append(data)
+        else:
+            data = fetch_remote_stats(srv["url"])
+            if data:
+                data["name"] = srv["name"]
+                data["status"] = "online"
+                data.setdefault("num_cores", len(data.get("cores", [])))
+                results.append(data)
+            else:
+                results.append({"name": srv["name"], "status": "unreachable"})
+    return jsonify(servers=results)
+
+
+# Start background sampler
+t = threading.Thread(target=cpu_sampler, daemon=True)
+t.start()
diff --git a/deploy-agent.sh b/deploy-agent.sh
new file mode 100755
index 0000000..ebb786a
--- /dev/null
+++ b/deploy-agent.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+HOST="${1:-console}"
+
+echo "Deploying sysmon agent to $HOST..."
+
+# Create directory and copy app
+ssh "$HOST" 'mkdir -p ~/sysmon-agent/templates'
+scp ~/sysmon/app.py "$HOST":~/sysmon-agent/
+scp ~/sysmon/templates/index.html "$HOST":~/sysmon-agent/templates/
+
+# Install dependencies and set up systemd service
+ssh "$HOST" bash <<'REMOTE'
+set -e
+
+pip3 install --break-system-packages --quiet flask gunicorn 2>/dev/null || \
+    pip3 install --quiet flask gunicorn
+
+# Create systemd service
+sudo tee /etc/systemd/system/sysmon-agent.service > /dev/null </dev/null || true
+podman rm sysmon 2>/dev/null || true
+
+echo "Starting sysmon dashboard on port 8083..."
+podman run -d \
+    --name sysmon \
+    -p 8083:8083 \
+    --security-opt label=disable \
+    -v /proc:/host/proc:ro \
+    -e SYSMON_SERVERS="compute1:local,console:http://192.168.88.5:8083" \
+    --restart unless-stopped \
+    sysmon
+
+echo "Done. Dashboard at http://$(hostname):8083"
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000..707c8b7
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,298 @@
+
+
+System Monitor
+
+
+
+connecting...
+
+connecting...
+
+
+