Files
sysmon/app.py
kamaji 3a9ba28552 Add multi-server system monitor with agent/dashboard architecture
Single Flask codebase runs in agent mode (serves /api/stats from local
/proc) or dashboard mode (aggregates local + remote agents). Currently
monitors compute1 (64-core, podman container) and console (16-core,
bare systemd service).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 19:09:34 -06:00

190 lines
5.4 KiB
Python

import json
import os
import threading
import time
import urllib.request
from flask import Flask, jsonify, render_template
# Single Flask app; the same codebase serves both agent and dashboard roles.
app = Flask(__name__)

# Root of the procfs to read host stats from. Defaults to /host/proc so a
# containerised agent can read the host's /proc via a bind mount.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")

# Server configuration: "name:url,name:url" where "local" means read from /proc
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
if SYSMON_SERVERS:
    for entry in SYSMON_SERVERS.split(","):
        # Split on the first ":" only so URLs like "http://host:8080" survive.
        name, url = entry.strip().split(":", 1)
        servers.append({"name": name.strip(), "url": url.strip()})

# Shared state for per-core CPU usage: written by the sampler thread,
# read by request handlers; every access goes through cpu_lock.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
def parse_proc_stat():
    """Parse PROC/stat and return raw CPU jiffy counters.

    Returns:
        (cores, overall): ``cores`` is a list of
        ``(core_id, busy_jiffies, total_jiffies)`` tuples, one per "cpuN"
        line; ``overall`` is a ``(busy, total)`` pair from the aggregate
        "cpu" line, or None if that line is absent/unparseable.

    Busy time is total minus (idle + iowait), summed over the first eight
    fields (user..steal).  Guest fields are excluded from the total because
    the kernel already accounts guest time under user.
    """
    cores = []
    overall = None
    with open(f"{PROC}/stat") as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            parts = line.split()
            name = parts[0]
            vals = list(map(int, parts[1:]))
            if len(vals) < 5:
                # Very old kernels omit iowait and later fields; skip the
                # line rather than crash on an unexpected format.
                continue
            idle = vals[3] + vals[4]  # idle + iowait both count as not-busy
            total = sum(vals[:8])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                core_id = int(name[3:])
                cores.append((core_id, busy, total))
    return cores, overall
def cpu_sampler():
    """Background thread body: sample /proc/stat every second and publish
    per-core and overall CPU usage percentages into cpu_snapshot.

    Percentages are computed from the delta of busy/total jiffies between
    two consecutive samples; a core seen for the first time reports 0.0.
    """
    global cpu_snapshot
    last_cores, last_overall = parse_proc_stat()
    last_by_id = {cid: (busy, total) for cid, busy, total in last_cores}
    while True:
        time.sleep(1)
        now_cores, now_overall = parse_proc_stat()
        now_by_id = {cid: (busy, total) for cid, busy, total in now_cores}

        per_core = []
        for cid in sorted(now_by_id):
            pct = 0.0
            prev = last_by_id.get(cid)
            if prev is not None:
                d_busy = now_by_id[cid][0] - prev[0]
                d_total = now_by_id[cid][1] - prev[1]
                if d_total > 0:
                    pct = d_busy / d_total * 100
            per_core.append({"id": cid, "usage_percent": round(pct, 1)})

        overall_pct = 0.0
        if last_overall and now_overall:
            d_busy = now_overall[0] - last_overall[0]
            d_total = now_overall[1] - last_overall[1]
            if d_total > 0:
                overall_pct = d_busy / d_total * 100

        # Replace the snapshot atomically under the lock; readers copy it.
        with cpu_lock:
            cpu_snapshot = {"cores": per_core, "overall": round(overall_pct, 1)}
        last_by_id = now_by_id
        last_overall = now_overall
def get_memory():
    """Parse PROC/meminfo and return memory usage in MB.

    Returns:
        dict with total_mb, used_mb, available_mb, cached_mb and percent
        (used as a percentage of total).  "Used" is total minus available,
        so reclaimable page cache does not count as used memory.
    """
    info = {}
    with open(f"{PROC}/meminfo") as f:
        for line in f:
            # Lines look like "MemTotal:   16384 kB"; values are in kB.
            parts = line.split()
            key = parts[0].rstrip(":")
            info[key] = int(parts[1])
    total = info["MemTotal"]
    # MemAvailable appeared in kernel 3.14; approximate it on older kernels
    # with the classic free + buffers + cached estimate.
    available = info.get(
        "MemAvailable",
        info.get("MemFree", 0) + info.get("Buffers", 0) + info.get("Cached", 0),
    )
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    used = total - available
    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        # Guard against a pathological zero MemTotal rather than crash.
        "percent": round(used / total * 100, 1) if total else 0.0,
    }
def get_load():
    """Return the 1/5/15-minute load averages parsed from PROC/loadavg."""
    with open(f"{PROC}/loadavg") as f:
        fields = f.read().split()
    one, five, fifteen = (float(v) for v in fields[:3])
    return {"load1": one, "load5": five, "load15": fifteen}
def get_uptime():
    """Read PROC/uptime and return total seconds plus a human-readable
    "Nd Nh Nm" string (zero-valued day/hour components are omitted)."""
    with open(f"{PROC}/uptime") as f:
        seconds = float(f.read().split()[0])
    days, remainder = divmod(int(seconds), 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes = remainder // 60
    pieces = []
    if days:
        pieces.append(f"{days}d")
    if hours:
        pieces.append(f"{hours}h")
    pieces.append(f"{minutes}m")
    return {"seconds": round(seconds), "human": " ".join(pieces)}
def get_local_stats():
    """Assemble this host's full stats payload from local /proc readings."""
    # Take a consistent copy of the sampler's snapshot under the lock.
    with cpu_lock:
        snapshot = dict(cpu_snapshot)
    core_list = snapshot["cores"]
    return {
        "cores": core_list,
        "overall_cpu": snapshot["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(core_list),
    }
def fetch_remote_stats(url, timeout=2):
    """Fetch /api/stats from a remote agent.

    Args:
        url: Base URL of the agent (trailing slash optional).
        timeout: Socket timeout in seconds.

    Returns:
        The decoded stats dict, or None if the agent is unreachable, times
        out, or returns malformed data.  Best-effort by design: the
        dashboard marks the server "unreachable" instead of erroring out.
    """
    try:
        req = urllib.request.Request(url.rstrip("/") + "/api/stats")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError/socket timeouts; ValueError
        # covers malformed URLs and bad JSON (JSONDecodeError subclasses it).
        return None
@app.route("/")
def index():
    """Serve the dashboard page (templates/index.html)."""
    return render_template("index.html")
@app.route("/api/stats")
def stats():
    """Agent endpoint: return this host's local stats as JSON."""
    return jsonify(get_local_stats())
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: aggregate stats for every configured server.

    Servers whose url is "local" are read from this host's /proc; others
    are polled over HTTP.  An unreachable agent yields a stub entry with
    status "unreachable" instead of failing the whole response.
    """
    payload = []
    for srv in servers:
        name = srv["name"]
        target = srv["url"]
        data = get_local_stats() if target == "local" else fetch_remote_stats(target)
        if not data:
            payload.append({"name": name, "status": "unreachable"})
            continue
        data["name"] = name
        data["status"] = "online"
        # Older agents may not report num_cores; derive it from the core list.
        data.setdefault("num_cores", len(data.get("cores", [])))
        payload.append(data)
    return jsonify(servers=payload)
# Start background sampler at import time so the snapshot is populated
# regardless of how the app is launched.  Daemon thread: it loops forever
# and must not block interpreter shutdown.
t = threading.Thread(target=cpu_sampler, daemon=True)
t.start()