Add multi-server system monitor with agent/dashboard architecture
Single Flask codebase runs in agent mode (serves /api/stats from local /proc) or dashboard mode (aggregates local + remote agents). Currently monitors compute1 (64-core, podman container) and console (16-core, bare systemd service). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
189
app.py
Normal file
189
app.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
from flask import Flask, jsonify, render_template
|
||||
|
||||
app = Flask(__name__)

# Root of the proc filesystem to read stats from.  Defaults to /host/proc so
# the agent works inside a container with the host's /proc bind-mounted;
# set SYSMON_PROC=/proc when running directly on the host.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")

# Server configuration: comma-separated "name:url" pairs, where a url of
# "local" means read from this host's /proc instead of a remote agent,
# e.g. SYSMON_SERVERS="compute1:local,console:http://console:5000".
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
for entry in SYSMON_SERVERS.split(","):
    entry = entry.strip()
    if not entry or ":" not in entry:
        # Skip blank or malformed entries (no "name:url" separator) instead
        # of raising ValueError at import time and killing the whole app.
        continue
    name, url = entry.split(":", 1)
    servers.append({"name": name.strip(), "url": url.strip()})

# Latest per-core CPU sample published by the background sampler thread.
# Replaced wholesale under cpu_lock so readers always see a consistent pair.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
|
||||
|
||||
|
||||
def parse_proc_stat(proc=None):
    """Parse <proc>/stat and return per-CPU jiffy counts.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        (cores, overall) where cores is a list of (core_id, busy, total)
        tuples and overall is the (busy, total) pair for the aggregate
        "cpu" line (None if that line is absent).
    """
    proc = PROC if proc is None else proc
    cores = []
    overall = None
    with open(f"{proc}/stat") as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            parts = line.split()
            name = parts[0]
            vals = list(map(int, parts[1:]))
            # Fields: user nice system idle iowait irq softirq steal
            # [guest guest_nice].  guest/guest_nice are already included in
            # user/nice, so only the first 8 fields contribute to the total;
            # idle time counts both idle and iowait.
            idle = vals[3] + vals[4]
            total = sum(vals[:8])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                cores.append((int(name[3:]), busy, total))
    return cores, overall
|
||||
|
||||
|
||||
def cpu_sampler():
    """Background thread body: sample /proc/stat every 1s and compute deltas.

    Publishes results into the module-level cpu_snapshot dict (guarded by
    cpu_lock). Never returns; intended to run as a daemon thread.
    """
    global cpu_snapshot
    # Take an initial sample so the first loop iteration has a baseline
    # to diff against.
    prev_cores, prev_overall = parse_proc_stat()
    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}

    while True:
        time.sleep(1)
        cur_cores, cur_overall = parse_proc_stat()
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}

        result = []
        for cid in sorted(cur_map.keys()):
            if cid in prev_map:
                # Usage = busy-jiffy delta over total-jiffy delta since the
                # previous sample; dt guard avoids division by zero when the
                # counters did not advance between reads.
                db = cur_map[cid][0] - prev_map[cid][0]
                dt = cur_map[cid][1] - prev_map[cid][1]
                pct = (db / dt * 100) if dt > 0 else 0.0
            else:
                # Core appeared since the last sample (e.g. CPU hotplug);
                # no baseline yet, so report 0 until the next iteration.
                pct = 0.0
            result.append({"id": cid, "usage_percent": round(pct, 1)})

        overall_pct = 0.0
        if prev_overall and cur_overall:
            db = cur_overall[0] - prev_overall[0]
            dt = cur_overall[1] - prev_overall[1]
            overall_pct = (db / dt * 100) if dt > 0 else 0.0

        # Replace the snapshot wholesale under the lock so readers always
        # see a consistent cores/overall pair.
        with cpu_lock:
            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}

        prev_map = cur_map
        prev_overall = cur_overall
|
||||
|
||||
|
||||
def get_memory(proc=None):
    """Parse <proc>/meminfo and return memory usage in MB.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with total_mb, used_mb, available_mb, cached_mb and percent
        (used memory as a percentage of total, one decimal place).
    """
    proc = PROC if proc is None else proc
    info = {}
    with open(f"{proc}/meminfo") as f:
        for line in f:
            # Lines look like "MemTotal:  16384 kB"; keep raw kB values.
            parts = line.split()
            info[parts[0].rstrip(":")] = int(parts[1])

    total = info["MemTotal"]
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    # MemAvailable exists on kernels >= 3.14; on older kernels fall back to
    # a rough estimate (free + reclaimable page cache) instead of crashing.
    available = info.get("MemAvailable", info.get("MemFree", 0) + cached)
    used = total - available

    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
|
||||
|
||||
|
||||
def get_load(proc=None):
    """Parse <proc>/loadavg and return the 1/5/15-minute load averages.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with float keys load1, load5 and load15.
    """
    proc = PROC if proc is None else proc
    with open(f"{proc}/loadavg") as f:
        parts = f.read().split()
    return {
        "load1": float(parts[0]),
        "load5": float(parts[1]),
        "load15": float(parts[2]),
    }
|
||||
|
||||
|
||||
def get_uptime(proc=None):
    """Parse <proc>/uptime and return the host uptime.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path.

    Returns:
        Dict with "seconds" (rounded int) and "human" (e.g. "3d 4h 12m";
        zero day/hour components are omitted, minutes always shown).
    """
    proc = PROC if proc is None else proc
    with open(f"{proc}/uptime") as f:
        # First field is uptime in seconds; second (idle time) is ignored.
        secs = float(f.read().split()[0])
    days = int(secs // 86400)
    hours = int((secs % 86400) // 3600)
    mins = int((secs % 3600) // 60)
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}
|
||||
|
||||
|
||||
def get_local_stats():
    """Assemble the full stats payload for this host from local /proc data."""
    # Copy under the lock so the sampler can't swap the snapshot mid-read.
    with cpu_lock:
        snap = cpu_snapshot.copy()
    cores = snap["cores"]
    stats = {
        "cores": cores,
        "overall_cpu": snap["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(cores),
    }
    return stats
|
||||
|
||||
|
||||
def fetch_remote_stats(url, timeout=2):
    """Fetch and decode /api/stats from a remote agent.

    Any failure (connection error, timeout, bad JSON) is treated as
    "agent unreachable" and reported by returning None.
    """
    endpoint = url.rstrip("/") + "/api/stats"
    try:
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            payload = resp.read().decode()
        return json.loads(payload)
    except Exception:
        return None
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Serve the dashboard HTML page (templates/index.html)."""
    return render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/stats")
def stats():
    """Agent endpoint: JSON stats for this host, polled by dashboard mode."""
    return jsonify(get_local_stats())
|
||||
|
||||
|
||||
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: aggregated stats for every configured server.

    Local entries are read straight from /proc; remote entries are proxied
    from their agents. An unreachable agent yields a stub entry with
    status "unreachable" instead of failing the whole response.
    """
    results = []
    for srv in servers:
        name, url = srv["name"], srv["url"]
        if url == "local":
            data = get_local_stats()
        else:
            data = fetch_remote_stats(url)
            if not data:
                results.append({"name": name, "status": "unreachable"})
                continue
            # Older agents may not report num_cores; derive it from the
            # per-core list when missing.
            data.setdefault("num_cores", len(data.get("cores", [])))
        data["name"] = name
        data["status"] = "online"
        results.append(data)
    return jsonify(servers=results)
|
||||
|
||||
|
||||
# Start the CPU sampler as a daemon thread at import time so both agent and
# dashboard modes publish fresh per-core usage; daemon=True lets the process
# exit without waiting for the infinite sampling loop.
t = threading.Thread(target=cpu_sampler, daemon=True)
t.start()
|
||||
Reference in New Issue
Block a user