"""System monitor agent/dashboard.

Parses ``virsh domstats --cpu-total --balloon`` for all VMs in a single call,
tracking CPU-time deltas between samples to compute per-VM CPU %. Guest memory
usage is derived from balloon stats (available - unused). VM caching is split:
base info (dominfo) has a 30s TTL, live stats (domstats) a 5s TTL. The UI shows
a color-coded CPU % column and memory used/total bars.
"""
import json
|
|
import os
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import urllib.request
|
|
|
|
from flask import Flask, jsonify, render_template
|
|
|
|
app = Flask(__name__)
|
|
|
|
PROC = os.environ.get("SYSMON_PROC", "/host/proc")
|
|
|
|
# Server configuration: "name:url,name:url"
|
|
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
|
|
servers = []
|
|
if SYSMON_SERVERS:
|
|
for entry in SYSMON_SERVERS.split(","):
|
|
name, url = entry.strip().split(":", 1)
|
|
servers.append({"name": name.strip(), "url": url.strip()})
|
|
|
|
# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
|
|
IS_DASHBOARD = bool(servers)
|
|
|
|
# Shared state for per-core CPU usage
|
|
cpu_snapshot = {"cores": [], "overall": 0.0}
|
|
cpu_lock = threading.Lock()
|
|
|
|
# VM base info cache (dominfo — slow, 30s TTL)
|
|
_vm_base_cache = {"data": [], "ts": 0}
|
|
_vm_base_lock = threading.Lock()
|
|
VM_BASE_TTL = 30
|
|
|
|
# VM live stats cache (domstats — fast, 5s TTL)
|
|
_vm_live_cache = {"data": {}, "ts": 0}
|
|
_vm_live_lock = threading.Lock()
|
|
VM_LIVE_TTL = 5
|
|
|
|
# CPU delta tracking for VM CPU %
|
|
_prev_domstats = {"by_name": {}, "ts": 0}
|
|
_prev_domstats_lock = threading.Lock()
|
|
|
|
|
|
def parse_proc_stat():
    """Parse CPU counter lines from /proc/stat.

    Returns ``(cores, overall)`` where ``cores`` is a list of
    ``(core_id, busy, total)`` jiffy counters and ``overall`` is the
    ``(busy, total)`` pair from the aggregate "cpu" line (or None if absent).
    """
    per_core = []
    aggregate = None
    with open(f"{PROC}/stat") as stat_file:
        for raw in stat_file:
            if not raw.startswith("cpu"):
                continue
            fields = raw.split()
            counters = [int(v) for v in fields[1:]]
            # idle + iowait count as idle; the first 8 fields are the classic
            # user/nice/system/idle/iowait/irq/softirq/steal set.
            idle_time = counters[3] + counters[4]
            total_time = sum(counters[:8])
            busy_time = total_time - idle_time
            if fields[0] == "cpu":
                aggregate = (busy_time, total_time)
            else:
                per_core.append((int(fields[0][3:]), busy_time, total_time))
    return per_core, aggregate
|
|
|
|
|
|
def cpu_sampler():
    """Background daemon-thread body: sample per-core CPU usage once per second.

    Computes usage as the delta of busy jiffies over the delta of total
    jiffies between consecutive /proc/stat reads, and publishes the result
    into the module-level ``cpu_snapshot`` under ``cpu_lock``. Never returns.
    """
    global cpu_snapshot
    # Prime the previous sample so the first loop iteration has a baseline.
    prev_cores, prev_overall = parse_proc_stat()
    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}

    while True:
        time.sleep(1)
        cur_cores, cur_overall = parse_proc_stat()
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}

        result = []
        for cid in sorted(cur_map.keys()):
            if cid in prev_map:
                # busy/total jiffy deltas since the previous sample.
                db = cur_map[cid][0] - prev_map[cid][0]
                dt = cur_map[cid][1] - prev_map[cid][1]
                pct = (db / dt * 100) if dt > 0 else 0.0
            else:
                # Core appeared since the last sample; no delta available yet.
                pct = 0.0
            result.append({"id": cid, "usage_percent": round(pct, 1)})

        overall_pct = 0.0
        if prev_overall and cur_overall:
            db = cur_overall[0] - prev_overall[0]
            dt = cur_overall[1] - prev_overall[1]
            overall_pct = (db / dt * 100) if dt > 0 else 0.0

        # Replace the snapshot atomically under the lock; readers copy it.
        with cpu_lock:
            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}

        prev_map = cur_map
        prev_overall = cur_overall
|
|
|
|
|
|
def get_memory():
    """Return host memory stats (MB) parsed from /proc/meminfo.

    "Used" is computed as MemTotal - MemAvailable, which accounts for
    reclaimable caches. Falls back to MemFree when MemAvailable is missing
    (kernels older than 3.14).
    """
    info = {}
    with open(f"{PROC}/meminfo") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                # Skip blank/short lines defensively.
                continue
            key = parts[0].rstrip(":")
            info[key] = int(parts[1])  # values are reported in KiB

    total = info["MemTotal"]
    # MemAvailable is absent on very old kernels; MemFree is the best fallback.
    available = info.get("MemAvailable", info.get("MemFree", 0))
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    used = total - available

    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
|
|
|
|
|
|
def get_load():
    """Return the 1/5/15-minute load averages read from /proc/loadavg."""
    with open(f"{PROC}/loadavg") as f:
        one, five, fifteen = f.read().split()[:3]
    return {
        "load1": float(one),
        "load5": float(five),
        "load15": float(fifteen),
    }
|
|
|
|
|
|
def get_uptime():
    """Return system uptime as raw seconds plus a short string like '3d 4h 12m'."""
    with open(f"{PROC}/uptime") as f:
        seconds = float(f.read().split()[0])

    days, remainder = divmod(int(seconds), 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes = remainder // 60

    # Omit leading zero units; minutes are always shown.
    pieces = []
    if days:
        pieces.append(f"{days}d")
    if hours:
        pieces.append(f"{hours}h")
    pieces.append(f"{minutes}m")
    return {"seconds": round(seconds), "human": " ".join(pieces)}
|
|
|
|
|
|
# Maps virsh domstats field names to the keys used internally.
_DOMSTATS_FIELDS = {
    "cpu.time": "cpu_time",
    "balloon.available": "balloon_available",
    "balloon.unused": "balloon_unused",
    "balloon.rss": "balloon_rss",
}


def _parse_domstats_text(text):
    """Parse `virsh domstats` plain-text output into {name: {key: int}}.

    Only fields listed in _DOMSTATS_FIELDS are retained. Pure function,
    split out from parse_domstats() so the parsing is testable without virsh.
    """
    stats = {}
    current_name = None
    current = {}
    for line in text.split("\n"):
        line = line.strip()
        if line.startswith("Domain:"):
            # Flush the previous domain's accumulated fields.
            if current_name and current:
                stats[current_name] = current
            # Domain names are quoted: Domain: 'vm-name'
            current_name = line.split("'")[1] if "'" in line else None
            current = {}
        elif "=" in line and current_name:
            key, val = line.split("=", 1)
            out_key = _DOMSTATS_FIELDS.get(key.strip())
            if out_key:
                current[out_key] = int(val.strip())
    if current_name and current:
        stats[current_name] = current
    return stats


def parse_domstats():
    """Run virsh domstats once for all VMs. Returns {name: {cpu_time, balloon_available, balloon_unused, balloon_rss}}."""
    try:
        result = subprocess.run(
            ["sudo", "virsh", "domstats", "--cpu-total", "--balloon"],
            capture_output=True, text=True, timeout=10
        )
        if result.returncode != 0:
            return {}
    except Exception:
        # virsh missing, sudo failure, or timeout: treat as "no VM stats".
        return {}
    return _parse_domstats_text(result.stdout)
|
|
|
|
|
|
def get_vm_live_stats():
    """Get live VM stats (domstats) with CPU delta tracking.

    Returns {name: {"raw_cpu_pct", "memory_used_mb", "memory_total_mb"}},
    cached for VM_LIVE_TTL seconds. ``raw_cpu_pct`` is percent of a single
    host core; get_vms() normalizes it by the VM's vCPU count.
    """
    with _vm_live_lock:
        now = time.time()
        if now - _vm_live_cache["ts"] < VM_LIVE_TTL:
            return _vm_live_cache["data"]

    # Cache miss: sample outside the lock (concurrent callers may duplicate
    # work, but never block each other on the slow virsh call).
    raw = parse_domstats()
    now = time.time()

    with _prev_domstats_lock:
        prev_cpu = _prev_domstats["by_name"]
        prev_ts = _prev_domstats["ts"]
    dt = now - prev_ts if prev_ts > 0 else 0

    live = {}
    for name, s in raw.items():
        cpu_pct = 0.0
        if dt > 0 and name in prev_cpu:
            delta_ns = s.get("cpu_time", 0) - prev_cpu[name]
            if delta_ns > 0:
                # cpu.time is total ns across all vcpus, so divide by wall time only.
                # This gives % of one CPU core; get_vms() normalizes per-VM
                # using the vcpu count.
                cpu_pct = delta_ns / (dt * 1e9) * 100

        balloon_avail = s.get("balloon_available", 0)
        balloon_unused = s.get("balloon_unused", 0)
        balloon_rss = s.get("balloon_rss", 0)

        if balloon_avail > 0:
            # Guest-agent view: used = available - unused (KiB -> MB).
            mem_total = balloon_avail // 1024
            mem_used = (balloon_avail - balloon_unused) // 1024
        elif balloon_rss > 0:
            # No balloon stats from the guest; fall back to host-side RSS,
            # with total unknown.
            mem_total = 0
            mem_used = balloon_rss // 1024
        else:
            mem_total = 0
            mem_used = 0

        live[name] = {
            "raw_cpu_pct": round(cpu_pct, 2),
            "memory_used_mb": mem_used,
            "memory_total_mb": mem_total,
        }

    # Update the previous-sample bookkeeping under its lock so concurrent
    # readers never observe a half-updated (by_name, ts) pair.
    with _prev_domstats_lock:
        _prev_domstats["by_name"] = {n: s.get("cpu_time", 0) for n, s in raw.items()}
        _prev_domstats["ts"] = now

    with _vm_live_lock:
        _vm_live_cache["data"] = live
        _vm_live_cache["ts"] = now
        return live
|
|
|
|
|
|
def get_vm_base_info():
    """Get base VM info (dominfo). Cached for VM_BASE_TTL seconds.

    Returns a list of dicts with keys: name, state, vcpus, memory_mb,
    autostart. Returns [] on any virsh/sudo failure (best-effort).
    """
    with _vm_base_lock:
        now = time.time()
        if now - _vm_base_cache["ts"] < VM_BASE_TTL:
            return _vm_base_cache["data"]

    # Cache miss: enumerate domains, then query each one. Done outside the
    # lock so concurrent callers are not serialized on the slow virsh calls.
    try:
        result = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return []

        names = [n.strip() for n in result.stdout.strip().split("\n") if n.strip()]
        vms = []
        for name in names:
            info = subprocess.run(
                ["sudo", "virsh", "dominfo", name],
                capture_output=True, text=True, timeout=5
            )
            if info.returncode != 0:
                # Domain may have vanished between list and dominfo; skip it.
                continue
            vm = {"name": name, "state": "unknown", "vcpus": 0, "memory_mb": 0, "autostart": False}
            # dominfo output is "Key:   value" lines; pick the fields we need.
            for line in info.stdout.split("\n"):
                if ":" not in line:
                    continue
                key, val = line.split(":", 1)
                key = key.strip()
                val = val.strip()
                if key == "State":
                    vm["state"] = val
                elif key == "CPU(s)":
                    vm["vcpus"] = int(val)
                elif key == "Max memory":
                    # Reported as "<n> KiB"; convert to MB.
                    vm["memory_mb"] = int(val.split()[0]) // 1024
                elif key == "Autostart":
                    vm["autostart"] = val.lower() in ("enable", "enabled")
            vms.append(vm)

        with _vm_base_lock:
            _vm_base_cache["data"] = vms
            _vm_base_cache["ts"] = time.time()
            return vms
    except Exception:
        # Best-effort: any subprocess/parse failure yields an empty VM list.
        return []
|
|
|
|
|
|
def get_vms():
    """Return the VM list with live CPU % and memory usage merged in."""
    live = get_vm_live_stats()

    merged = []
    for info in get_vm_base_info():
        entry = info.copy()
        running = entry["state"] == "running"
        stats = live.get(entry["name"])
        if stats and running:
            # raw_cpu_pct is % of one core; normalize by vCPU count and
            # clamp to [0, 100] for display.
            pct = stats["raw_cpu_pct"] / (entry["vcpus"] or 1)
            entry["cpu_percent"] = round(max(0, min(100, pct)), 1)
            entry["memory_used_mb"] = stats["memory_used_mb"]
            entry["memory_total_mb"] = stats["memory_total_mb"] or entry["memory_mb"]
        else:
            entry["cpu_percent"] = 0.0
            entry["memory_used_mb"] = 0
            entry["memory_total_mb"] = entry["memory_mb"] if running else 0
        merged.append(entry)
    return merged
|
|
|
|
|
|
def get_local_stats():
    """Assemble the full local stats payload served by /api/stats."""
    # Copy the snapshot under the lock so the sampler can't swap it mid-read.
    with cpu_lock:
        snapshot = cpu_snapshot.copy()

    cores = snapshot["cores"]
    return {
        "cores": cores,
        "overall_cpu": snapshot["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(cores),
        "vms": get_vms(),
    }
|
|
|
|
|
|
def fetch_remote_stats(url, timeout=3):
    """Fetch /api/stats JSON from a remote agent; return None on any failure."""
    endpoint = url.rstrip("/") + "/api/stats"
    try:
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return json.loads(response.read().decode())
    except Exception:
        # Unreachable host, timeout, or bad JSON — caller treats None as offline.
        return None
|
|
|
|
|
|
@app.route("/")
def index():
    """Serve the single-page dashboard UI."""
    return render_template("index.html")
|
|
|
|
|
|
@app.route("/api/stats")
def stats():
    """Local agent stats endpoint; rejected when running as a dashboard."""
    if IS_DASHBOARD:
        # Dashboard instances have no local /proc mount to read from.
        return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
    return jsonify(get_local_stats())
|
|
|
|
|
|
@app.route("/api/servers")
def all_servers():
    """Dashboard endpoint: poll every configured agent and aggregate results."""
    aggregated = []
    for srv in servers:
        data = fetch_remote_stats(srv["url"])
        if not data:
            # Fetch failed (or returned an empty payload): mark offline.
            aggregated.append({"name": srv["name"], "status": "unreachable"})
            continue
        data["name"] = srv["name"]
        data["status"] = "online"
        # Older agents may omit num_cores; derive it from the core list.
        data.setdefault("num_cores", len(data.get("cores", [])))
        aggregated.append(data)
    return jsonify(servers=aggregated)
|
|
|
|
|
|
# Only start CPU sampler in agent mode (not dashboard-only)
if not IS_DASHBOARD:
    # Daemon thread: dies with the process, so no shutdown join is needed.
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()
|