Files
sysmon/app.py
kamaji e10a57bb6c Fix dashboard timeout by making disk stats non-blocking
virsh guestinfo calls (one per VM) took ~4s on cold cache, exceeding
the dashboard's 3s fetch timeout. Disk stats now refresh in a background
thread and return stale/empty data immediately. Also raised dashboard
fetch timeout to 8s for cold-cache dominfo calls.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 22:19:21 -06:00

426 lines
14 KiB
Python

import json
import os
import subprocess
import threading
import time
import urllib.request
from flask import Flask, jsonify, render_template
app = Flask(__name__)

# Root of the procfs to read stats from; default "/host/proc" suggests a
# bind-mounted host /proc in a container — confirm against deployment.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")

# Server configuration: "name:url,name:url"
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
if SYSMON_SERVERS:
    for entry in SYSMON_SERVERS.split(","):
        # Split on the FIRST colon only so URLs ("http://...") stay intact.
        name, url = entry.strip().split(":", 1)
        servers.append({"name": name.strip(), "url": url.strip()})

# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
IS_DASHBOARD = bool(servers)

# Shared state for per-core CPU usage
# (written once per second by the cpu_sampler thread; read under cpu_lock)
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()

# VM base info cache (dominfo — slow, 30s TTL)
_vm_base_cache = {"data": [], "ts": 0}
_vm_base_lock = threading.Lock()
VM_BASE_TTL = 30

# VM live stats cache (domstats — fast, 5s TTL)
_vm_live_cache = {"data": {}, "ts": 0}
_vm_live_lock = threading.Lock()
VM_LIVE_TTL = 5

# CPU delta tracking for VM CPU %
# (previous per-VM cpu.time counters plus the timestamp of that sample)
_prev_domstats = {"by_name": {}, "ts": 0}
_prev_domstats_lock = threading.Lock()

# VM disk stats cache (guestinfo — per-VM calls, 30s TTL)
_vm_disk_cache = {"data": {}, "ts": 0}
_vm_disk_lock = threading.Lock()
VM_DISK_TTL = 30
def parse_proc_stat(path=None):
    """Parse per-core and aggregate CPU jiffy counters from a /proc/stat file.

    Args:
        path: Optional stat file to read; defaults to f"{PROC}/stat".
            (Generalized from the hard-coded path; default behavior unchanged.)

    Returns:
        (cores, overall) where cores is a list of (core_id, busy, total)
        tuples for each "cpuN" line, and overall is a (busy, total) tuple
        from the aggregate "cpu" line (None if that line is absent).
    """
    cores = []
    overall = None
    with open(path if path is not None else f"{PROC}/stat") as f:
        for line in f:
            if line.startswith("cpu"):
                parts = line.split()
                name = parts[0]
                vals = list(map(int, parts[1:]))
                # idle time = idle + iowait (4th and 5th fields)
                idle = vals[3] + vals[4]
                # total over the first 8 fields (user..steal); guest fields
                # are excluded as they overlap with user time
                total = sum(vals[:8])
                busy = total - idle
                if name == "cpu":
                    overall = (busy, total)
                else:
                    # "cpuN" -> core id N
                    core_id = int(name[3:])
                    cores.append((core_id, busy, total))
    return cores, overall
def cpu_sampler():
    """Daemon loop: sample /proc/stat once per second and publish per-core
    and overall CPU usage percentages into the shared cpu_snapshot dict
    (guarded by cpu_lock). Never returns.
    """
    global cpu_snapshot
    last_cores, last_overall = parse_proc_stat()
    last_by_id = {cid: (b, t) for cid, b, t in last_cores}
    while True:
        time.sleep(1)
        now_cores, now_overall = parse_proc_stat()
        now_by_id = {cid: (b, t) for cid, b, t in now_cores}
        per_core = []
        for cid in sorted(now_by_id):
            usage = 0.0
            prev = last_by_id.get(cid)
            if prev is not None:
                busy_delta = now_by_id[cid][0] - prev[0]
                total_delta = now_by_id[cid][1] - prev[1]
                if total_delta > 0:
                    usage = busy_delta / total_delta * 100
            per_core.append({"id": cid, "usage_percent": round(usage, 1)})
        total_pct = 0.0
        if last_overall and now_overall:
            busy_delta = now_overall[0] - last_overall[0]
            total_delta = now_overall[1] - last_overall[1]
            if total_delta > 0:
                total_pct = busy_delta / total_delta * 100
        with cpu_lock:
            cpu_snapshot = {"cores": per_core, "overall": round(total_pct, 1)}
        last_by_id = now_by_id
        last_overall = now_overall
def get_memory(path=None):
    """Read memory usage from a meminfo-format file.

    Args:
        path: Optional meminfo file to read; defaults to f"{PROC}/meminfo".
            (Generalized from the hard-coded path; default behavior unchanged.)

    Returns:
        Dict with total/used/available/cached sizes in MB plus used percent.
        "used" is MemTotal - MemAvailable (the kernel's reclaim-aware
        estimate), not MemTotal - MemFree.

    Raises:
        KeyError: if MemTotal or MemAvailable is missing from the file.
    """
    info = {}
    with open(path if path is not None else f"{PROC}/meminfo") as f:
        for line in f:
            # Lines look like "MemTotal:   16384 kB" — values are KiB counts.
            parts = line.split()
            key = parts[0].rstrip(":")
            val_kb = int(parts[1])
            info[key] = val_kb
    total = info["MemTotal"]
    available = info["MemAvailable"]
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    used = total - available
    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }
def get_load(path=None):
    """Read the 1/5/15-minute load averages from a loadavg-format file.

    Args:
        path: Optional loadavg file to read; defaults to f"{PROC}/loadavg".
            (Generalized from the hard-coded path; default behavior unchanged.)

    Returns:
        {"load1": float, "load5": float, "load15": float}
    """
    with open(path if path is not None else f"{PROC}/loadavg") as f:
        parts = f.read().split()
    return {
        "load1": float(parts[0]),
        "load5": float(parts[1]),
        "load15": float(parts[2]),
    }
def get_uptime(path=None):
    """Read system uptime and format it as a human-readable string.

    Args:
        path: Optional uptime file to read; defaults to f"{PROC}/uptime".
            (Generalized from the hard-coded path; default behavior unchanged.)

    Returns:
        {"seconds": int, "human": str} where human is e.g. "3d 4h 12m";
        zero day/hour components are omitted but minutes always appear.
    """
    with open(path if path is not None else f"{PROC}/uptime") as f:
        # First field is uptime in seconds (float).
        secs = float(f.read().split()[0])
    days = int(secs // 86400)
    hours = int((secs % 86400) // 3600)
    mins = int((secs % 3600) // 60)
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}
def parse_domstats():
    """Collect CPU and balloon counters for every VM with one virsh call.

    Returns {vm_name: {cpu_time, balloon_available, balloon_unused,
    balloon_rss}} — each key present only if virsh reported the field.
    Returns {} if virsh fails, times out, or cannot be executed.
    """
    try:
        proc = subprocess.run(
            ["sudo", "virsh", "domstats", "--cpu-total", "--balloon"],
            capture_output=True, text=True, timeout=10
        )
    except Exception:
        return {}
    if proc.returncode != 0:
        return {}
    # Map virsh field names to our output keys; all values parse as ints.
    wanted = {
        "cpu.time": "cpu_time",
        "balloon.available": "balloon_available",
        "balloon.unused": "balloon_unused",
        "balloon.rss": "balloon_rss",
    }
    stats = {}
    active_name = None
    active = {}
    for raw_line in proc.stdout.split("\n"):
        raw_line = raw_line.strip()
        if raw_line.startswith("Domain:"):
            # A new domain header: flush the previous domain's fields first.
            if active_name and active:
                stats[active_name] = active
            active_name = raw_line.split("'")[1] if "'" in raw_line else None
            active = {}
        elif "=" in raw_line and active_name:
            field, value = raw_line.split("=", 1)
            field = field.strip()
            if field in wanted:
                active[wanted[field]] = int(value.strip())
    # Flush the last domain in the output.
    if active_name and active:
        stats[active_name] = active
    return stats
def get_vm_live_stats():
    """Get live VM stats (domstats) with CPU delta tracking. Cached for
    VM_LIVE_TTL seconds.

    Returns:
        {name: {"raw_cpu_pct": float, "memory_used_mb": int,
        "memory_total_mb": int}} where raw_cpu_pct is a percentage of ONE
        host core (get_vms() normalizes by vcpu count) and memory figures
        are 0 when the guest exposes no balloon stats.

    Fix: the previous-sample state (_prev_domstats) was mutated WITHOUT
    holding _prev_domstats_lock, so a concurrent caller could pair
    "by_name" from one sample with "ts" from another and compute a skewed
    CPU delta. Both the read and the write now happen under the lock.
    """
    with _vm_live_lock:
        now = time.time()
        if now - _vm_live_cache["ts"] < VM_LIVE_TTL:
            return _vm_live_cache["data"]
    raw = parse_domstats()
    now = time.time()
    # Snapshot the previous counters and timestamp atomically.
    with _prev_domstats_lock:
        prev_cpu = _prev_domstats["by_name"]
        prev_ts = _prev_domstats["ts"]
    dt = now - prev_ts if prev_ts > 0 else 0
    live = {}
    for name, s in raw.items():
        cpu_pct = 0.0
        if dt > 0 and name in prev_cpu:
            delta_ns = s.get("cpu_time", 0) - prev_cpu[name]
            if delta_ns > 0:
                # cpu.time is total ns across all vcpus, so divide by wall
                # time only. This gives % of one CPU core; get_vms()
                # normalizes per-VM using the vcpu count.
                cpu_pct = delta_ns / (dt * 1e9) * 100
        balloon_avail = s.get("balloon_available", 0)
        balloon_unused = s.get("balloon_unused", 0)
        if balloon_avail > 0 and balloon_unused >= 0:
            mem_total = balloon_avail // 1024  # KiB to MB
            mem_used = (balloon_avail - balloon_unused) // 1024
        else:
            # No guest-side stats (e.g. Windows without full guest agent).
            # Mark as unavailable — frontend will show allocated RAM only.
            mem_total = 0
            mem_used = 0
        live[name] = {
            "raw_cpu_pct": round(cpu_pct, 2),
            "memory_used_mb": mem_used,
            "memory_total_mb": mem_total,
        }
    # FIX: update previous-sample state under its lock so by_name and ts
    # always belong to the same sample.
    with _prev_domstats_lock:
        _prev_domstats["by_name"] = {n: s.get("cpu_time", 0) for n, s in raw.items()}
        _prev_domstats["ts"] = now
    with _vm_live_lock:
        _vm_live_cache["data"] = live
        _vm_live_cache["ts"] = now
    return live
def _fetch_disk_stats(running_names):
    """Background worker: fetch filesystem stats for all running VMs.

    Runs one `virsh guestinfo --filesystem` call per VM (slow, hence run
    off the request path) and swaps the combined result into
    _vm_disk_cache, clearing the "refreshing" flag that
    get_vm_disk_stats() set before spawning this thread.

    Args:
        running_names: names of VMs currently in the "running" state.
    """
    disks = {}
    for name in running_names:
        try:
            result = subprocess.run(
                ["sudo", "virsh", "guestinfo", name, "--filesystem"],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode != 0:
                continue
            # Parse "key : value" lines into a flat dict.
            fs = {}
            for line in result.stdout.split("\n"):
                if ":" not in line:
                    continue
                key, val = line.split(":", 1)
                fs[key.strip()] = val.strip()
            count = int(fs.get("fs.count", 0))
            vm_disks = []
            for i in range(count):
                mp = fs.get(f"fs.{i}.mountpoint", "")
                total_b = int(fs.get(f"fs.{i}.total-bytes", 0))
                used_b = int(fs.get(f"fs.{i}.used-bytes", 0))
                # Skip entries that report no capacity (pseudo-filesystems
                # or missing fields).
                if total_b > 0:
                    vm_disks.append({
                        "mountpoint": mp,
                        "total_gb": round(total_b / (1024**3), 1),
                        "used_gb": round(used_b / (1024**3), 1),
                    })
            if vm_disks:
                disks[name] = vm_disks
        except Exception:
            # Best-effort per VM: a broken/slow guest agent on one VM must
            # not prevent stats for the others.
            continue
    with _vm_disk_lock:
        _vm_disk_cache["data"] = disks
        _vm_disk_cache["ts"] = time.time()
        _vm_disk_cache["refreshing"] = False
def get_vm_disk_stats(running_names):
    """Return the most recent per-VM filesystem stats without blocking.

    Always serves whatever is cached (possibly stale or empty). When the
    cache has aged past VM_DISK_TTL and no refresh is already in flight,
    marks the cache as refreshing and spawns a daemon thread to repopulate
    it via _fetch_disk_stats().
    """
    with _vm_disk_lock:
        age = time.time() - _vm_disk_cache["ts"]
        in_flight = _vm_disk_cache.get("refreshing", False)
        if age >= VM_DISK_TTL and not in_flight:
            _vm_disk_cache["refreshing"] = True
            worker = threading.Thread(
                target=_fetch_disk_stats, args=(running_names,), daemon=True
            )
            worker.start()
        return _vm_disk_cache["data"]
def get_vm_base_info():
    """Get base VM info (dominfo). Cached for VM_BASE_TTL seconds.

    Returns a list of dicts: {"name", "state", "vcpus", "memory_mb",
    "autostart"}. Slow path: one `virsh dominfo` subprocess per defined
    VM, which is why the result is cached. On any virsh failure returns []
    WITHOUT touching the cache, so the next call retries.
    """
    with _vm_base_lock:
        now = time.time()
        if now - _vm_base_cache["ts"] < VM_BASE_TTL:
            return _vm_base_cache["data"]
    try:
        result = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return []
        names = [n.strip() for n in result.stdout.strip().split("\n") if n.strip()]
        vms = []
        for name in names:
            info = subprocess.run(
                ["sudo", "virsh", "dominfo", name],
                capture_output=True, text=True, timeout=5
            )
            if info.returncode != 0:
                # Skip VMs whose dominfo fails (e.g. deleted mid-listing).
                continue
            vm = {"name": name, "state": "unknown", "vcpus": 0, "memory_mb": 0, "autostart": False}
            for line in info.stdout.split("\n"):
                if ":" not in line:
                    continue
                key, val = line.split(":", 1)
                key = key.strip()
                val = val.strip()
                if key == "State":
                    vm["state"] = val
                elif key == "CPU(s)":
                    vm["vcpus"] = int(val)
                elif key == "Max memory":
                    # Take the leading number (KiB per virsh dominfo output
                    # — TODO confirm) and convert to MB.
                    vm["memory_mb"] = int(val.split()[0]) // 1024
                elif key == "Autostart":
                    vm["autostart"] = val.lower() in ("enable", "enabled")
            vms.append(vm)
        with _vm_base_lock:
            _vm_base_cache["data"] = vms
            _vm_base_cache["ts"] = time.time()
        return vms
    except Exception:
        # virsh missing, sudo failure, or timeout: report no VMs this cycle.
        return []
def get_vms():
    """Get VM list with live CPU %, memory usage, and disk usage merged in."""
    base = get_vm_base_info()
    live = get_vm_live_stats()
    running = [entry["name"] for entry in base if entry["state"] == "running"]
    disks = get_vm_disk_stats(running)
    merged = []
    for entry in base:
        vm = dict(entry)
        name = vm["name"]
        stats = live.get(name)
        if stats and vm["state"] == "running":
            # raw_cpu_pct is % of one core; divide by vcpu count for a
            # per-VM percentage and clamp into [0, 100].
            per_vm = stats["raw_cpu_pct"] / (vm["vcpus"] or 1)
            vm["cpu_percent"] = round(max(0, min(100, per_vm)), 1)
            vm["memory_used_mb"] = stats["memory_used_mb"]
            vm["memory_total_mb"] = stats["memory_total_mb"]
        else:
            vm["cpu_percent"] = 0.0
            vm["memory_used_mb"] = 0
            vm["memory_total_mb"] = vm["memory_mb"] if vm["state"] == "running" else 0
        vm["disks"] = disks.get(name, [])
        merged.append(vm)
    return merged
def get_local_stats():
    """Assemble the full local stats payload served by /api/stats."""
    with cpu_lock:
        current = dict(cpu_snapshot)
    payload = {
        "cores": current["cores"],
        "overall_cpu": current["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(current["cores"]),
        "vms": get_vms(),
    }
    return payload
def fetch_remote_stats(url, timeout=8):
    """Fetch /api/stats JSON from a remote sysmon agent.

    Returns the decoded payload, or None on ANY failure (bad URL,
    unreachable host, timeout, non-JSON body) — callers treat None as the
    server being offline.
    """
    endpoint = url.rstrip("/") + "/api/stats"
    try:
        request = urllib.request.Request(endpoint)
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        return None
@app.route("/")
def index():
    """Serve the single-page dashboard UI."""
    return render_template("index.html")
@app.route("/api/stats")
def stats():
    """Return this host's local stats as JSON (agent mode only).

    In dashboard-only mode there is no local /proc to read, so respond
    with HTTP 400 and point callers at /api/servers instead.
    """
    if IS_DASHBOARD:
        return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
    return jsonify(get_local_stats())
@app.route("/api/servers")
def all_servers():
    """Poll every configured remote agent and return their combined stats.

    Each entry is either the agent's stats payload tagged with its name and
    status "online", or a stub with status "unreachable" when the fetch
    failed.
    """
    statuses = []
    for server in servers:
        payload = fetch_remote_stats(server["url"])
        if not payload:
            statuses.append({"name": server["name"], "status": "unreachable"})
            continue
        payload["name"] = server["name"]
        payload["status"] = "online"
        # Older agents may omit num_cores; derive it from the core list.
        payload.setdefault("num_cores", len(payload.get("cores", [])))
        statuses.append(payload)
    return jsonify(servers=statuses)
# Only start CPU sampler in agent mode (not dashboard-only)
if not IS_DASHBOARD:
    # Daemon thread: dies with the process, no explicit shutdown needed.
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()