Rearchitect to pure aggregator dashboard (no /proc) with bare agents on each host. Agents report VM data via sudo virsh (cached 10s). UI rewritten with hash-based routing: summary card grid and per-server detail view with CPU grid, memory, load, uptime, and VM table. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
241 lines
6.9 KiB
Python
241 lines
6.9 KiB
Python
import json
|
|
import os
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import urllib.request
|
|
|
|
from flask import Flask, jsonify, render_template
|
|
|
|
app = Flask(__name__)
|
|
|
|
PROC = os.environ.get("SYSMON_PROC", "/host/proc")


def parse_servers(spec):
    """Parse a "name:url,name:url" spec into a list of server dicts.

    Malformed entries (empty, or missing the ":" separator) are skipped
    instead of raising, so a trailing comma or a typo in SYSMON_SERVERS
    cannot prevent the app from starting.

    Args:
        spec: comma-separated "name:url" pairs; may be empty.

    Returns:
        List of {"name": ..., "url": ...} dicts, whitespace stripped.
    """
    parsed = []
    for entry in spec.split(","):
        entry = entry.strip()
        # Skip blanks (e.g. trailing comma) and entries without a separator.
        if not entry or ":" not in entry:
            continue
        name, url = entry.split(":", 1)
        parsed.append({"name": name.strip(), "url": url.strip()})
    return parsed


# Server configuration: "name:url,name:url"
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = parse_servers(SYSMON_SERVERS)

# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
IS_DASHBOARD = bool(servers)

# Shared state for per-core CPU usage; written by the sampler thread and
# read by request handlers, guarded by cpu_lock.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()

# VM cache: "data" holds the last virsh result, "ts" its collection time.
_vm_cache = {"data": [], "ts": 0}
_vm_lock = threading.Lock()
VM_CACHE_TTL = 10  # seconds before virsh is invoked again
def parse_proc_stat():
    """Read {PROC}/stat once and return raw CPU tick counters.

    Returns:
        (per_core, aggregate) where per_core is a list of
        (core_id, busy_ticks, total_ticks) tuples and aggregate is a
        (busy_ticks, total_ticks) pair for the "cpu" summary line
        (None if that line is absent).
    """
    per_core = []
    aggregate = None
    with open(f"{PROC}/stat") as stat_file:
        for raw in stat_file:
            if not raw.startswith("cpu"):
                continue
            label, *fields = raw.split()
            ticks = [int(v) for v in fields]
            # Fields 4 and 5 (idle, iowait) count as idle time; the total
            # is the sum of the first 8 fields only.
            idle_ticks = ticks[3] + ticks[4]
            total_ticks = sum(ticks[:8])
            busy_ticks = total_ticks - idle_ticks
            if label == "cpu":
                aggregate = (busy_ticks, total_ticks)
            else:
                per_core.append((int(label[3:]), busy_ticks, total_ticks))
    return per_core, aggregate
def cpu_sampler():
    """Background loop: recompute per-core and overall CPU usage every second.

    Publishes each result into the module-level cpu_snapshot under
    cpu_lock. Runs forever; intended to be started as a daemon thread.
    """
    global cpu_snapshot

    last_cores, last_overall = parse_proc_stat()
    last_by_id = {cid: (busy, total) for cid, busy, total in last_cores}

    while True:
        time.sleep(1)
        now_cores, now_overall = parse_proc_stat()
        now_by_id = {cid: (busy, total) for cid, busy, total in now_cores}

        per_core = []
        for cid in sorted(now_by_id):
            usage = 0.0
            prev = last_by_id.get(cid)
            if prev is not None:
                delta_busy = now_by_id[cid][0] - prev[0]
                delta_total = now_by_id[cid][1] - prev[1]
                if delta_total > 0:
                    usage = delta_busy / delta_total * 100
            per_core.append({"id": cid, "usage_percent": round(usage, 1)})

        overall = 0.0
        if last_overall and now_overall:
            delta_busy = now_overall[0] - last_overall[0]
            delta_total = now_overall[1] - last_overall[1]
            if delta_total > 0:
                overall = delta_busy / delta_total * 100

        with cpu_lock:
            cpu_snapshot = {"cores": per_core, "overall": round(overall, 1)}

        last_by_id = now_by_id
        last_overall = now_overall
def get_memory(path=None):
    """Return memory usage in MiB parsed from a meminfo file.

    Args:
        path: optional meminfo file to read; defaults to f"{PROC}/meminfo".

    Returns:
        Dict with total_mb, used_mb, available_mb, cached_mb and percent
        (used / total * 100, one decimal).

    On kernels older than 3.14 that lack the MemAvailable field, it is
    estimated as MemFree + Buffers + Cached instead of raising KeyError.
    """
    if path is None:
        path = f"{PROC}/meminfo"

    info = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue  # tolerate blank or malformed lines
            # Values are reported in kB.
            info[parts[0].rstrip(":")] = int(parts[1])

    total = info["MemTotal"]
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    # MemAvailable was added in Linux 3.14; fall back to a rough estimate.
    available = info.get("MemAvailable", info.get("MemFree", 0) + cached)
    used = total - available

    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
def get_load():
    """Return the 1/5/15-minute load averages read from {PROC}/loadavg."""
    with open(f"{PROC}/loadavg") as f:
        fields = f.read().split()
    one, five, fifteen = (float(v) for v in fields[:3])
    return {"load1": one, "load5": five, "load15": fifteen}
def get_uptime():
    """Return system uptime from {PROC}/uptime.

    Returns:
        Dict with "seconds" (rounded total) and "human", a compact string
        like "3d 4h 7m" (days/hours omitted when zero, minutes always shown).
    """
    with open(f"{PROC}/uptime") as f:
        secs = float(f.read().split()[0])

    days, remainder = divmod(int(secs), 86400)
    hours, remainder = divmod(remainder, 3600)
    mins = remainder // 60

    pieces = []
    if days:
        pieces.append(f"{days}d")
    if hours:
        pieces.append(f"{hours}h")
    pieces.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(pieces)}
def _parse_dominfo(text):
    """Parse `virsh dominfo` output into a partial VM dict.

    Only State / CPU(s) / Max memory / Autostart lines are consumed; the
    "name" key is filled in by the caller.
    """
    vm = {"state": "unknown", "vcpus": 0, "memory_mb": 0, "autostart": False}
    for line in text.split("\n"):
        if ":" not in line:
            continue
        key, val = line.split(":", 1)
        key = key.strip()
        val = val.strip()
        if key == "State":
            vm["state"] = val
        elif key == "CPU(s)":
            vm["vcpus"] = int(val)
        elif key == "Max memory":
            # virsh reports in KiB
            vm["memory_mb"] = int(val.split()[0]) // 1024
        elif key == "Autostart":
            vm["autostart"] = val.lower() in ("enable", "enabled")
    return vm


def _fetch_vms():
    """Enumerate all libvirt domains via `sudo virsh`. Returns [] on any failure."""
    try:
        listing = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5
        )
        if listing.returncode != 0:
            return []

        vms = []
        for name in (n.strip() for n in listing.stdout.strip().split("\n")):
            if not name:
                continue
            info = subprocess.run(
                ["sudo", "virsh", "dominfo", name],
                capture_output=True, text=True, timeout=5
            )
            if info.returncode != 0:
                continue  # domain may have vanished between the two calls
            vm = _parse_dominfo(info.stdout)
            vm["name"] = name
            vms.append(vm)
        return vms
    except Exception:
        # Best-effort: virsh/sudo missing, permission denied, timeout,
        # or unexpected output — report no VMs rather than failing the request.
        return []


def get_vms():
    """Get VM list via sudo virsh. Returns list of dicts. Cached for VM_CACHE_TTL seconds.

    The lock is held across the whole refresh so concurrent requests can
    never run virsh in parallel (previously the lock was dropped during the
    fetch, allowing a stampede). Failed fetches are cached as [] too, so a
    broken libvirt is re-probed at most once per VM_CACHE_TTL instead of on
    every request.
    """
    with _vm_lock:
        now = time.time()
        if now - _vm_cache["ts"] < VM_CACHE_TTL:
            return _vm_cache["data"]

        vms = _fetch_vms()
        _vm_cache["data"] = vms
        _vm_cache["ts"] = time.time()
        return vms
def get_local_stats():
    """Assemble this host's full stats payload: CPU, memory, load, uptime, VMs."""
    # Copy the snapshot under the lock so the sampler can't swap it mid-read.
    with cpu_lock:
        cpu = cpu_snapshot.copy()
    return {
        "cores": cpu["cores"],
        "overall_cpu": cpu["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(cpu["cores"]),
        "vms": get_vms(),
    }
def fetch_remote_stats(url, timeout=3):
    """Fetch and decode /api/stats from a remote agent.

    Returns:
        The parsed JSON payload, or None if the agent is unreachable,
        times out, or returns a malformed response.
    """
    try:
        endpoint = url.rstrip("/") + "/api/stats"
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read().decode()
        return json.loads(payload)
    except Exception:
        return None
@app.route("/")
def index():
    """Serve the single-page dashboard UI."""
    template_name = "index.html"
    return render_template(template_name)
@app.route("/api/stats")
def stats():
    """Agent endpoint: this host's stats as JSON.

    Returns HTTP 400 in dashboard-only mode, where no local stats exist.
    """
    if not IS_DASHBOARD:
        return jsonify(get_local_stats())
    return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: poll every configured agent and return the merged list.

    Agents that fail to respond are reported with status "unreachable"
    instead of failing the whole request.
    """
    collected = []
    for srv in servers:
        payload = fetch_remote_stats(srv["url"])
        if not payload:
            collected.append({"name": srv["name"], "status": "unreachable"})
            continue
        payload["name"] = srv["name"]
        payload["status"] = "online"
        payload.setdefault("num_cores", len(payload.get("cores", [])))
        collected.append(payload)
    return jsonify(servers=collected)
# Only start CPU sampler in agent mode (not dashboard-only)
# Daemon thread: it never blocks interpreter shutdown.
if not IS_DASHBOARD:
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()