"""System monitor: a small Flask app that serves host and VM statistics.

Runs in one of two modes, selected by environment variables:

* **agent** (default): reads CPU/memory/load/uptime from ``/proc`` (or a
  bind-mounted copy at ``SYSMON_PROC``) and VM stats via ``sudo virsh``,
  and serves them at ``/api/stats``.
* **dashboard**: when ``SYSMON_SERVERS`` is set, performs no local
  ``/proc`` reading and instead aggregates remote agents at
  ``/api/servers``.
"""

import json
import os
import subprocess
import threading
import time
import urllib.request

from flask import Flask, jsonify, render_template

app = Flask(__name__)

# Root of the (possibly bind-mounted) procfs to read host stats from.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")

# Server configuration: "name:url,name:url"
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = []
if SYSMON_SERVERS:
    for entry in SYSMON_SERVERS.split(","):
        entry = entry.strip()
        # Split on the FIRST colon only: the URL part contains colons
        # itself ("http://host:port"). Skip malformed entries (e.g. a
        # trailing comma) instead of crashing at import time.
        if ":" not in entry:
            continue
        name, url = entry.split(":", 1)
        servers.append({"name": name.strip(), "url": url.strip()})

# Dashboard-only mode: no local /proc reading when SYSMON_SERVERS is set
IS_DASHBOARD = bool(servers)

# Shared state for per-core CPU usage, produced by the cpu_sampler thread.
cpu_snapshot = {"cores": [], "overall": 0.0}
cpu_lock = threading.Lock()

# VM base info cache (dominfo — slow, 30s TTL)
_vm_base_cache = {"data": [], "ts": 0}
_vm_base_lock = threading.Lock()
VM_BASE_TTL = 30

# VM live stats cache (domstats — fast, 5s TTL)
_vm_live_cache = {"data": {}, "ts": 0}
_vm_live_lock = threading.Lock()
VM_LIVE_TTL = 5

# CPU delta tracking for VM CPU % (cpu.time is a monotonically increasing
# nanosecond counter; we need two samples to compute a percentage).
_prev_domstats = {"by_name": {}, "ts": 0}
_prev_domstats_lock = threading.Lock()

# VM disk stats cache (guestinfo — per-VM calls, 30s TTL)
_vm_disk_cache = {"data": {}, "ts": 0}
_vm_disk_lock = threading.Lock()
VM_DISK_TTL = 30


def parse_proc_stat():
    """Read ``{PROC}/stat`` once.

    Returns:
        (cores, overall) where ``cores`` is a list of
        ``(core_id, busy_jiffies, total_jiffies)`` tuples and ``overall``
        is the same ``(busy, total)`` pair for the aggregate ``cpu`` line
        (``None`` if that line is absent).
    """
    cores = []
    overall = None
    with open(f"{PROC}/stat") as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            parts = line.split()
            name = parts[0]
            vals = list(map(int, parts[1:]))
            # idle = idle + iowait; total covers the first 8 fields
            # (user, nice, system, idle, iowait, irq, softirq, steal).
            idle = vals[3] + vals[4]
            total = sum(vals[:8])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                # "cpuN" -> core id N
                cores.append((int(name[3:]), busy, total))
    return cores, overall


def cpu_sampler():
    """Background thread: refresh ``cpu_snapshot`` once per second.

    Usage percentages are deltas between two consecutive /proc/stat
    samples; the first percentage is available ~1s after startup.
    """
    global cpu_snapshot
    prev_cores, prev_overall = parse_proc_stat()
    prev_map = {cid: (busy, total) for cid, busy, total in prev_cores}
    while True:
        time.sleep(1)
        cur_cores, cur_overall = parse_proc_stat()
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}
        result = []
        for cid in sorted(cur_map.keys()):
            if cid in prev_map:
                db = cur_map[cid][0] - prev_map[cid][0]
                dt = cur_map[cid][1] - prev_map[cid][1]
                pct = (db / dt * 100) if dt > 0 else 0.0
            else:
                # Core appeared since the last sample (hotplug): no delta yet.
                pct = 0.0
            result.append({"id": cid, "usage_percent": round(pct, 1)})
        overall_pct = 0.0
        if prev_overall and cur_overall:
            db = cur_overall[0] - prev_overall[0]
            dt = cur_overall[1] - prev_overall[1]
            overall_pct = (db / dt * 100) if dt > 0 else 0.0
        with cpu_lock:
            cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}
        prev_map = cur_map
        prev_overall = cur_overall


def get_memory():
    """Parse ``{PROC}/meminfo`` into MB-denominated usage figures.

    "Used" is computed as MemTotal - MemAvailable, which matches what
    tools like ``free`` report as actually-consumed memory.
    """
    info = {}
    with open(f"{PROC}/meminfo") as f:
        for line in f:
            parts = line.split()
            # Lines look like "MemTotal:  16384 kB"; values are in KiB.
            info[parts[0].rstrip(":")] = int(parts[1])
    total = info["MemTotal"]
    # MemAvailable only exists on kernels >= 3.14; fall back to MemFree
    # (a pessimistic but safe estimate) on older kernels.
    available = info.get("MemAvailable", info.get("MemFree", 0))
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    used = total - available
    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1),
    }


def get_load():
    """Return the 1/5/15-minute load averages from ``{PROC}/loadavg``."""
    with open(f"{PROC}/loadavg") as f:
        parts = f.read().split()
    return {
        "load1": float(parts[0]),
        "load5": float(parts[1]),
        "load15": float(parts[2]),
    }


def get_uptime():
    """Return uptime in seconds plus a human-readable "Xd Yh Zm" string."""
    with open(f"{PROC}/uptime") as f:
        secs = float(f.read().split()[0])
    days = int(secs // 86400)
    hours = int((secs % 86400) // 3600)
    mins = int((secs % 3600) // 60)
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}


def parse_domstats():
    """Run ``virsh domstats`` once for all VMs.

    Returns ``{name: {cpu_time, balloon_available, balloon_unused,
    balloon_rss}}`` (keys present only if virsh reported them), or ``{}``
    on any failure — VM stats are best-effort.
    """
    try:
        result = subprocess.run(
            ["sudo", "virsh", "domstats", "--cpu-total", "--balloon"],
            capture_output=True, text=True, timeout=10,
        )
        if result.returncode != 0:
            return {}
    except Exception:
        return {}
    stats = {}
    current_name = None
    current = {}
    for line in result.stdout.split("\n"):
        line = line.strip()
        if line.startswith("Domain:"):
            # New "Domain: 'name'" section; flush the previous one.
            if current_name and current:
                stats[current_name] = current
            current_name = line.split("'")[1] if "'" in line else None
            current = {}
        elif "=" in line and current_name:
            key, val = line.split("=", 1)
            key = key.strip()
            val = val.strip()
            if key == "cpu.time":
                current["cpu_time"] = int(val)
            elif key == "balloon.available":
                current["balloon_available"] = int(val)
            elif key == "balloon.unused":
                current["balloon_unused"] = int(val)
            elif key == "balloon.rss":
                current["balloon_rss"] = int(val)
    # Flush the final section.
    if current_name and current:
        stats[current_name] = current
    return stats


def get_vm_live_stats():
    """Get live VM stats (domstats) with CPU delta tracking.

    Cached for ``VM_LIVE_TTL`` seconds. Returns
    ``{name: {raw_cpu_pct, memory_used_mb, memory_total_mb}}`` where
    ``raw_cpu_pct`` is percent of ONE host core (normalized per-VM by
    vcpu count later, in ``get_vms``).
    """
    with _vm_live_lock:
        now = time.time()
        if now - _vm_live_cache["ts"] < VM_LIVE_TTL:
            return _vm_live_cache["data"]
    raw = parse_domstats()
    now = time.time()
    with _prev_domstats_lock:
        prev_cpu = _prev_domstats["by_name"]
        prev_ts = _prev_domstats["ts"]
        dt = now - prev_ts if prev_ts > 0 else 0
        live = {}
        for name, s in raw.items():
            cpu_pct = 0.0
            if dt > 0 and name in prev_cpu:
                delta_ns = s.get("cpu_time", 0) - prev_cpu[name]
                if delta_ns > 0:
                    # cpu.time is total across all vcpus, so divide by wall
                    # time only. This gives % of one CPU core; per-VM
                    # normalization by vcpu count happens in get_vms().
                    cpu_pct = delta_ns / (dt * 1e9) * 100
            balloon_avail = s.get("balloon_available", 0)
            balloon_unused = s.get("balloon_unused", 0)
            balloon_rss = s.get("balloon_rss", 0)
            if balloon_avail > 0 and balloon_unused >= 0:
                mem_total = balloon_avail // 1024  # KiB to MB
                mem_used = (balloon_avail - balloon_unused) // 1024
            else:
                # No guest-side stats (e.g. Windows without full guest
                # agent). Mark as unavailable — frontend will show
                # allocated RAM only.
                mem_total = 0
                mem_used = 0
            live[name] = {
                "raw_cpu_pct": round(cpu_pct, 2),
                "memory_used_mb": mem_used,
                "memory_total_mb": mem_total,
            }
        # Record this sample as the baseline for the next delta.
        _prev_domstats["by_name"] = {
            n: s.get("cpu_time", 0) for n, s in raw.items()
        }
        _prev_domstats["ts"] = now
    with _vm_live_lock:
        _vm_live_cache["data"] = live
        _vm_live_cache["ts"] = now
    return live


def _fetch_disk_stats(running_names):
    """Background worker: fetch filesystem stats for all running VMs.

    Requires the qemu guest agent inside each VM; VMs without it are
    silently skipped. Replaces the disk cache wholesale when done.
    """
    disks = {}
    for name in running_names:
        try:
            result = subprocess.run(
                ["sudo", "virsh", "guestinfo", name, "--filesystem"],
                capture_output=True, text=True, timeout=5,
            )
            if result.returncode != 0:
                continue
            # Output is "key : value" lines, e.g. "fs.0.mountpoint : /".
            fs = {}
            for line in result.stdout.split("\n"):
                if ":" not in line:
                    continue
                key, val = line.split(":", 1)
                fs[key.strip()] = val.strip()
            count = int(fs.get("fs.count", 0))
            vm_disks = []
            for i in range(count):
                mp = fs.get(f"fs.{i}.mountpoint", "")
                total_b = int(fs.get(f"fs.{i}.total-bytes", 0))
                used_b = int(fs.get(f"fs.{i}.used-bytes", 0))
                if total_b > 0:
                    vm_disks.append({
                        "mountpoint": mp,
                        "total_gb": round(total_b / (1024**3), 1),
                        "used_gb": round(used_b / (1024**3), 1),
                    })
            if vm_disks:
                disks[name] = vm_disks
        except Exception:
            # Best-effort: one broken VM must not abort the others.
            continue
    with _vm_disk_lock:
        _vm_disk_cache["data"] = disks
        _vm_disk_cache["ts"] = time.time()
        _vm_disk_cache["refreshing"] = False


def get_vm_disk_stats(running_names):
    """Get filesystem usage per VM.

    Returns cached data immediately; when stale, kicks off a single
    background refresh (guarded by the "refreshing" flag so concurrent
    requests don't spawn duplicate workers).
    """
    with _vm_disk_lock:
        now = time.time()
        stale = now - _vm_disk_cache["ts"] >= VM_DISK_TTL
        refreshing = _vm_disk_cache.get("refreshing", False)
        if stale and not refreshing:
            _vm_disk_cache["refreshing"] = True
            t = threading.Thread(
                target=_fetch_disk_stats, args=(running_names,), daemon=True
            )
            t.start()
        return _vm_disk_cache["data"]


def get_vm_base_info():
    """Get base VM info (``virsh dominfo`` per VM).

    Cached for ``VM_BASE_TTL`` seconds. Returns a list of dicts with
    name/state/vcpus/memory_mb/autostart. A failure listing domains
    returns ``[]`` (uncached, so the next call retries); a failure on a
    single VM skips only that VM.
    """
    with _vm_base_lock:
        now = time.time()
        if now - _vm_base_cache["ts"] < VM_BASE_TTL:
            return _vm_base_cache["data"]
    try:
        result = subprocess.run(
            ["sudo", "virsh", "list", "--all", "--name"],
            capture_output=True, text=True, timeout=5,
        )
        if result.returncode != 0:
            return []
    except Exception:
        return []
    names = [n.strip() for n in result.stdout.strip().split("\n") if n.strip()]
    vms = []
    for name in names:
        # Per-VM isolation: one malformed dominfo must not discard the list.
        try:
            info = subprocess.run(
                ["sudo", "virsh", "dominfo", name],
                capture_output=True, text=True, timeout=5,
            )
            if info.returncode != 0:
                continue
            vm = {
                "name": name, "state": "unknown", "vcpus": 0,
                "memory_mb": 0, "autostart": False,
            }
            for line in info.stdout.split("\n"):
                if ":" not in line:
                    continue
                key, val = line.split(":", 1)
                key = key.strip()
                val = val.strip()
                if key == "State":
                    vm["state"] = val
                elif key == "CPU(s)":
                    vm["vcpus"] = int(val)
                elif key == "Max memory":
                    # "Max memory: 4194304 KiB" -> MB
                    vm["memory_mb"] = int(val.split()[0]) // 1024
                elif key == "Autostart":
                    vm["autostart"] = val.lower() in ("enable", "enabled")
            vms.append(vm)
        except Exception:
            continue
    with _vm_base_lock:
        _vm_base_cache["data"] = vms
        _vm_base_cache["ts"] = time.time()
    return vms


def get_vms():
    """Get VM list with live CPU %, memory usage, and disk usage merged in."""
    base = get_vm_base_info()
    live = get_vm_live_stats()
    running_names = [vm["name"] for vm in base if vm["state"] == "running"]
    disks = get_vm_disk_stats(running_names)
    result = []
    for vm in base:
        vm = vm.copy()
        stats = live.get(vm["name"])
        if stats and vm["state"] == "running":
            # Normalize CPU: raw_cpu_pct is % of one core; divide by vcpus
            # for a per-VM percentage, then clamp to [0, 100].
            vcpus = vm["vcpus"] or 1
            cpu_pct = stats["raw_cpu_pct"] / vcpus
            vm["cpu_percent"] = round(max(0, min(100, cpu_pct)), 1)
            vm["memory_used_mb"] = stats["memory_used_mb"]
            vm["memory_total_mb"] = stats["memory_total_mb"]
        else:
            vm["cpu_percent"] = 0.0
            vm["memory_used_mb"] = 0
            # Stopped VMs report 0 total; running-without-stats show
            # their allocated RAM.
            vm["memory_total_mb"] = (
                vm["memory_mb"] if vm["state"] == "running" else 0
            )
        vm["disks"] = disks.get(vm["name"], [])
        result.append(vm)
    return result


def get_local_stats():
    """Assemble the full agent-mode stats payload for /api/stats."""
    with cpu_lock:
        snap = cpu_snapshot.copy()
    return {
        "cores": snap["cores"],
        "overall_cpu": snap["overall"],
        "memory": get_memory(),
        "load": get_load(),
        "uptime": get_uptime(),
        "num_cores": len(snap["cores"]),
        "vms": get_vms(),
    }


def fetch_remote_stats(url, timeout=8):
    """Fetch /api/stats from a remote agent; ``None`` if unreachable."""
    try:
        req = urllib.request.Request(url.rstrip("/") + "/api/stats")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception:
        # Best-effort: the dashboard marks this server "unreachable".
        return None


@app.route("/")
def index():
    return render_template("index.html")


@app.route("/api/stats")
def stats():
    if IS_DASHBOARD:
        return jsonify({"error": "dashboard-only mode, use /api/servers"}), 400
    return jsonify(get_local_stats())


@app.route("/api/servers")
def all_servers():
    """Poll every configured agent and aggregate their payloads."""
    results = []
    for srv in servers:
        data = fetch_remote_stats(srv["url"])
        if data:
            data["name"] = srv["name"]
            data["status"] = "online"
            data.setdefault("num_cores", len(data.get("cores", [])))
            results.append(data)
        else:
            results.append({"name": srv["name"], "status": "unreachable"})
    return jsonify(servers=results)


# Only start CPU sampler in agent mode (not dashboard-only)
if not IS_DASHBOARD:
    t = threading.Thread(target=cpu_sampler, daemon=True)
    t.start()