# sysmon: a single Flask codebase that runs in agent mode (serves /api/stats
# from the local /proc) or dashboard mode (aggregates local + remote agents).
# Currently monitors compute1 (64-core, podman container) and console
# (16-core, bare systemd service).
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
import json
|
|
import os
|
|
import threading
|
|
import time
|
|
import urllib.request
|
|
|
|
from flask import Flask, jsonify, render_template
|
|
|
|
app = Flask(__name__)

# Root of the proc filesystem to read. Defaults to /host/proc — presumably a
# bind-mount of the host's /proc inside the container (confirm deployment);
# set SYSMON_PROC=/proc for a bare-metal run.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")
|
|
|
|
# Server configuration: "name:url,name:url" where "local" means read from /proc
|
|
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
|
|
servers = []
|
|
if SYSMON_SERVERS:
|
|
for entry in SYSMON_SERVERS.split(","):
|
|
name, url = entry.strip().split(":", 1)
|
|
servers.append({"name": name.strip(), "url": url.strip()})
|
|
|
|
# Shared state for per-core CPU usage.
# Latest sample published by the cpu_sampler background thread and read by
# get_local_stats. The dict is replaced wholesale (never mutated in place),
# always under cpu_lock.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()
|
|
|
|
|
|
def parse_proc_stat(proc=None):
    """Parse <proc>/stat and return per-CPU jiffy counters.

    Args:
        proc: proc filesystem root to read from; defaults to the
            module-level PROC path. (Parameter added so the parser can be
            exercised against a fixture directory.)

    Returns:
        (cores, overall): cores is a list of (core_id, busy, total) jiffy
        tuples, one per "cpuN" line; overall is a (busy, total) tuple for
        the aggregate "cpu" line, or None if that line is absent.
    """
    root = PROC if proc is None else proc
    cores = []
    overall = None
    with open(f"{root}/stat") as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            parts = line.split()
            name = parts[0]
            vals = list(map(int, parts[1:]))
            # idle time = idle + iowait; total sums the first 8 fields
            # (user nice system idle iowait irq softirq steal) — guest time
            # is already included in user, so it is not counted again.
            idle = vals[3] + vals[4]
            total = sum(vals[:8])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                cores.append((int(name[3:]), busy, total))
    return cores, overall
|
def cpu_sampler():
    """Background thread body: sample /proc/stat once per second.

    Computes per-core and overall busy percentages from the jiffy deltas
    between consecutive samples and publishes them into the shared
    cpu_snapshot under cpu_lock. Runs forever.

    A transient OSError while reading /proc/stat now skips that tick
    instead of propagating — previously any single read failure killed
    the daemon thread permanently and the dashboard froze on stale data.
    """
    global cpu_snapshot
    prev_cores, prev_overall = parse_proc_stat()
    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}

    while True:
        time.sleep(1)
        try:
            cur_cores, cur_overall = parse_proc_stat()
        except OSError:
            # /proc hiccup (e.g. remount): drop this sample, keep running.
            continue
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}

        result = []
        for cid in sorted(cur_map):
            if cid in prev_map:
                db = cur_map[cid][0] - prev_map[cid][0]
                dt = cur_map[cid][1] - prev_map[cid][1]
                pct = (db / dt * 100) if dt > 0 else 0.0
            else:
                # Core appeared since the previous sample: no delta yet.
                pct = 0.0
            result.append({"id": cid, "usage_percent": round(pct, 1)})

        overall_pct = 0.0
        if prev_overall and cur_overall:
            db = cur_overall[0] - prev_overall[0]
            dt = cur_overall[1] - prev_overall[1]
            overall_pct = (db / dt * 100) if dt > 0 else 0.0

        with cpu_lock:
            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}

        prev_map = cur_map
        prev_overall = cur_overall
|
def get_memory(proc=None):
    """Parse <proc>/meminfo and return memory usage in MB.

    Args:
        proc: proc filesystem root; defaults to the module-level PROC path
            (parameter added for testability).

    Returns:
        Dict with total_mb, used_mb, available_mb, cached_mb and percent.
        "used" is total minus MemAvailable (what free(1) reports); on old
        kernels (< 3.14) without MemAvailable it falls back to MemFree
        instead of raising KeyError.
    """
    root = PROC if proc is None else proc
    info = {}
    with open(f"{root}/meminfo") as f:
        for line in f:
            # Lines look like "MemTotal:  16384 kB"; values are kB.
            parts = line.split()
            info[parts[0].rstrip(":")] = int(parts[1])

    total = info["MemTotal"]
    available = info.get("MemAvailable", info.get("MemFree", 0))
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    used = total - available

    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
|
def get_load(proc=None):
    """Parse <proc>/loadavg and return the 1/5/15-minute load averages.

    Args:
        proc: proc filesystem root; defaults to the module-level PROC path
            (parameter added for testability).

    Returns:
        Dict with float keys "load1", "load5", "load15".
    """
    root = PROC if proc is None else proc
    with open(f"{root}/loadavg") as f:
        load1, load5, load15 = (float(v) for v in f.read().split()[:3])
    return {"load1": load1, "load5": load5, "load15": load15}
|
def get_uptime(proc=None):
    """Parse <proc>/uptime and return seconds plus a human-readable string.

    Args:
        proc: proc filesystem root; defaults to the module-level PROC path
            (parameter added for testability).

    Returns:
        {"seconds": <rounded total>, "human": e.g. "1d 2h 3m"} — the days
        and hours components are omitted when zero; minutes always appear.
    """
    root = PROC if proc is None else proc
    with open(f"{root}/uptime") as f:
        # First field is uptime in seconds (second is aggregate idle time).
        secs = float(f.read().split()[0])
    days = int(secs // 86400)
    hours = int((secs % 86400) // 3600)
    mins = int((secs % 3600) // 60)
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}
|
def get_local_stats():
    """Assemble the full stats payload for this host from local /proc."""
    # Take a shallow copy under the lock so we never hold it during the
    # /proc reads below; the sampler replaces the dict, never mutates it.
    with cpu_lock:
        snapshot = dict(cpu_snapshot)
    core_list = snapshot["cores"]
    payload = {
        "cores": core_list,
        "overall_cpu": snapshot["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(core_list),
    }
    return payload
|
def fetch_remote_stats(url, timeout=2):
    """GET <url>/api/stats from a remote agent and decode the JSON body.

    Best-effort: any failure (bad URL, timeout, HTTP error, bad JSON)
    yields None rather than an exception.
    """
    try:
        endpoint = url.rstrip("/") + "/api/stats"
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        return json.loads(payload.decode())
    except Exception:
        return None
|
@app.route("/")
def index():
    """Serve the dashboard HTML page."""
    page = render_template("index.html")
    return page
|
@app.route("/api/stats")
def stats():
    """Agent endpoint: JSON stats for this host, read from local /proc."""
    local = get_local_stats()
    return jsonify(local)
|
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: stats for every configured server.

    Returns {"servers": [...]} in configuration order. Each entry carries
    "name" and "status" ("online"/"unreachable"); online entries also
    carry the agent's stats payload. Remote agents are polled in
    parallel — previously they were fetched sequentially, so with N
    remotes each allowed a 2 s timeout this endpoint could stall N*2 s.
    """
    from concurrent.futures import ThreadPoolExecutor

    def _collect(srv):
        """Build the result entry for a single configured server."""
        if srv["url"] == "local":
            data = get_local_stats()
            data["name"] = srv["name"]
            data["status"] = "online"
            return data
        data = fetch_remote_stats(srv["url"])
        if data is None:
            return {"name": srv["name"], "status": "unreachable"}
        data["name"] = srv["name"]
        data["status"] = "online"
        # Older agents may omit num_cores; derive it from the core list.
        data.setdefault("num_cores", len(data.get("cores", [])))
        return data

    if servers:
        with ThreadPoolExecutor(max_workers=min(8, len(servers))) as pool:
            results = list(pool.map(_collect, servers))
    else:
        results = []
    return jsonify(servers=results)
|
# Start background sampler.
# Daemon thread so it never blocks interpreter shutdown; started at module
# import time so both agent and dashboard modes sample CPU continuously.
t = threading.Thread(target=cpu_sampler, daemon=True)
t.start()