Add multi-server system monitor with agent/dashboard architecture

Single Flask codebase runs in agent mode (serves /api/stats from local
/proc) or dashboard mode (aggregates local + remote agents). Currently
monitors compute1 (64-core, podman container) and console (16-core,
bare systemd service).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-24 19:09:34 -06:00
commit 3a9ba28552
5 changed files with 569 additions and 0 deletions

11
Dockerfile Normal file
View File

@@ -0,0 +1,11 @@
# Image for the sysmon Flask app, served by gunicorn on port 8083.
FROM python:3.11-slim
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir flask gunicorn
WORKDIR /app
# Only the application module and its templates are copied into the image.
COPY app.py .
COPY templates/ templates/
# Same port gunicorn binds to in CMD below.
EXPOSE 8083
# One worker process with two threads, listening on all interfaces.
# NOTE(review): presumably a single worker so that only one copy of app.py's
# import-time background sampler thread runs -- confirm before raising -w.
CMD ["gunicorn", "-b", "0.0.0.0:8083", "-w", "1", "--threads", "2", "app:app"]

189
app.py Normal file
View File

@@ -0,0 +1,189 @@
import json
import os
import threading
import time
import urllib.request
from flask import Flask, jsonify, render_template
# WSGI application object; the routes below are registered on it.
app = Flask(__name__)
# Directory holding the proc filesystem to read (stat, meminfo, loadavg, uptime).
# Taken from the SYSMON_PROC environment variable, defaulting to /host/proc.
PROC = os.environ.get("SYSMON_PROC", "/host/proc")
def _parse_servers(spec):
    """Parse a ``"name:url,name:url"`` string into a list of server dicts.

    Args:
        spec: Comma-separated ``name:url`` entries. Only the first ``:`` in
            an entry separates the name from the url, so urls such as
            ``http://host:8083`` are kept intact. Whitespace around names and
            urls is stripped, and empty entries (for example from a trailing
            comma) are ignored.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts in input order; empty
        when *spec* is empty.

    Raises:
        ValueError: If a non-empty entry contains no ``:`` separator.
    """
    parsed = []
    for raw in spec.split(","):
        entry = raw.strip()
        if not entry:
            # Tolerate "a:local," and ",," instead of crashing at import time.
            continue
        name, sep, url = entry.partition(":")
        if not sep:
            raise ValueError(
                f"Invalid SYSMON_SERVERS entry {entry!r}: expected 'name:url'"
            )
        parsed.append({"name": name.strip(), "url": url.strip()})
    return parsed


# Server configuration: "name:url,name:url" where the url "local" means
# "read this host's own /proc" instead of querying a remote agent.
SYSMON_SERVERS = os.environ.get("SYSMON_SERVERS", "")
servers = _parse_servers(SYSMON_SERVERS)
# Shared state for per-core CPU usage. cpu_sampler() rebinds this name to a
# fresh dict on every sample; get_local_stats() copies it for each request.
# "cores" is a list of {"id", "usage_percent"} dicts, "overall" a percentage.
cpu_snapshot = {"cores": [], "overall": 0.0}
# Held around every read and write of cpu_snapshot.
cpu_lock = threading.Lock()
def parse_proc_stat(path=None):
    """Read cumulative CPU jiffy counters from a ``/proc/stat``-style file.

    Args:
        path: File to parse. When omitted, ``{PROC}/stat`` is read.

    Returns:
        A ``(cores, overall)`` tuple. ``cores`` is a list of
        ``(core_id, busy, total)`` tuples, one per ``cpuN`` line in file
        order. ``overall`` is the ``(busy, total)`` pair from the aggregate
        ``cpu`` line, or ``None`` if the file has no such line. ``total`` is
        the sum of the first eight counters and ``busy`` is ``total`` minus
        the idle and iowait counters.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a ``cpu`` line holds a non-integer counter.
    """
    if path is None:
        path = f"{PROC}/stat"
    cores = []
    overall = None
    with open(path) as f:
        for line in f:
            if not line.startswith("cpu"):
                continue
            name, *fields = line.split()
            vals = [int(v) for v in fields]
            # Column order: user nice system idle iowait irq softirq steal,
            # then the guest columns. Only the first eight are summed --
            # presumably because guest time is already counted in user/nice
            # (see proc(5)); confirm before changing.
            total = sum(vals[:8])
            # Slicing (rather than indexing) tolerates a missing iowait column.
            idle = sum(vals[3:5])
            busy = total - idle
            if name == "cpu":
                overall = (busy, total)
            else:
                cores.append((int(name[3:]), busy, total))
    return cores, overall
def _cpu_percent(prev, cur):
    """Return the busy share (0.0-100.0) between two ``(busy, total)`` samples."""
    d_busy = cur[0] - prev[0]
    d_total = cur[1] - prev[1]
    if d_total <= 0:
        return 0.0
    # Clamp: if a counter moves backwards between samples the raw ratio can
    # fall outside 0-100.
    return min(100.0, max(0.0, d_busy / d_total * 100))


def cpu_sampler():
    """Background thread body: sample CPU counters every 1s and publish usage.

    Each pass reads the counters with ``parse_proc_stat()``, compares them
    with the previous pass and, under ``cpu_lock``, rebinds the global
    ``cpu_snapshot`` to ``{"cores": [{"id", "usage_percent"}, ...],
    "overall": pct}`` with percentages rounded to one decimal. Cores are
    listed in ascending id order; a core with no previous sample reports 0.0.

    The first successful pass only records a baseline. A pass that fails to
    read or parse the counters is skipped and retried a second later, leaving
    the last published snapshot in place, so a transient error does not kill
    the thread. Never returns.
    """
    global cpu_snapshot
    prev_map = None
    prev_overall = None
    while True:
        try:
            cur_cores, cur_overall = parse_proc_stat()
        except (OSError, ValueError, IndexError):
            time.sleep(1)
            continue
        cur_map = {cid: (busy, total) for cid, busy, total in cur_cores}
        if prev_map is not None:
            result = []
            for cid in sorted(cur_map):
                if cid in prev_map:
                    pct = _cpu_percent(prev_map[cid], cur_map[cid])
                else:
                    pct = 0.0
                result.append({"id": cid, "usage_percent": round(pct, 1)})
            overall_pct = 0.0
            if prev_overall and cur_overall:
                overall_pct = _cpu_percent(prev_overall, cur_overall)
            with cpu_lock:
                cpu_snapshot = {"cores": result, "overall": round(overall_pct, 1)}
        prev_map = cur_map
        prev_overall = cur_overall
        time.sleep(1)
def get_memory(path=None):
    """Summarise a ``/proc/meminfo``-style file in megabytes.

    Args:
        path: File to parse. When omitted, ``{PROC}/meminfo`` is read.

    Returns:
        A dict with ``total_mb``, ``used_mb`` (total minus available),
        ``available_mb``, ``cached_mb`` (``Cached`` + ``Buffers``) and
        ``percent`` (used as a share of total, one decimal). Values are the
        file's numbers divided by 1024 and rounded.

    Raises:
        OSError: If the file cannot be opened or read.
        KeyError: If the file has no ``MemTotal`` line.
    """
    if path is None:
        path = f"{PROC}/meminfo"
    info = {}
    with open(path) as f:
        for line in f:
            key, _, rest = line.partition(":")
            fields = rest.split()
            if not fields:
                # Blank or value-less line: nothing to record.
                continue
            try:
                info[key.strip()] = int(fields[0])
            except ValueError:
                continue
    total = info["MemTotal"]
    cached = info.get("Cached", 0) + info.get("Buffers", 0)
    available = info.get("MemAvailable")
    if available is None:
        # Kernels without MemAvailable: approximate it the way older tools
        # did, as free memory plus buffers and page cache.
        available = info.get("MemFree", 0) + cached
    used = total - available
    return {
        "total_mb": round(total / 1024),
        "used_mb": round(used / 1024),
        "available_mb": round(available / 1024),
        "cached_mb": round(cached / 1024),
        "percent": round(used / total * 100, 1) if total else 0.0,
    }
def get_load(path=None):
    """Return the 1, 5 and 15 minute load averages.

    Args:
        path: File to parse. When omitted, ``{PROC}/loadavg`` is read.

    Returns:
        A dict with float values under ``load1``, ``load5`` and ``load15``,
        taken from the first three whitespace-separated fields of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If one of the first three fields is not a number.
        IndexError: If the file has fewer than three fields.
    """
    if path is None:
        path = f"{PROC}/loadavg"
    with open(path) as f:
        fields = f.read().split()
    return {
        "load1": float(fields[0]),
        "load5": float(fields[1]),
        "load15": float(fields[2]),
    }
def get_uptime(path=None):
    """Return the system uptime in seconds and as a short readable string.

    Args:
        path: File to parse. When omitted, ``{PROC}/uptime`` is read.

    Returns:
        ``{"seconds": int, "human": str}``. ``seconds`` is the first field of
        the file rounded to a whole number. ``human`` looks like
        ``"3d 4h 12m"``: days and hours are left out when zero, minutes are
        always present.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the first field is not a number.
    """
    if path is None:
        path = f"{PROC}/uptime"
    with open(path) as f:
        secs = float(f.read().split()[0])
    # Break whole seconds down into days / hours / minutes.
    days, rest = divmod(int(secs), 86400)
    hours, rest = divmod(rest, 3600)
    mins = rest // 60
    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    parts.append(f"{mins}m")
    return {"seconds": round(secs), "human": " ".join(parts)}
def get_local_stats():
    """Assemble the stats payload for this host.

    Copies the latest CPU snapshot while holding ``cpu_lock`` and combines it
    with fresh readings from ``get_memory()``, ``get_load()`` and
    ``get_uptime()``.

    Returns:
        A dict with keys ``cores``, ``overall_cpu``, ``memory``, ``load``,
        ``uptime`` and ``num_cores`` (the length of ``cores``).
    """
    with cpu_lock:
        snapshot = cpu_snapshot.copy()
    per_core = snapshot["cores"]
    stats = {"cores": per_core, "overall_cpu": snapshot["overall"]}
    stats["memory"] = get_memory()
    stats["load"] = get_load()
    stats["uptime"] = get_uptime()
    stats["num_cores"] = len(per_core)
    return stats
def fetch_remote_stats(url, timeout=2):
    """Fetch ``/api/stats`` from a remote agent.

    Args:
        url: Base url of the agent; trailing slashes are ignored.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded JSON object as a dict, or ``None`` when the request
        fails, the body is not valid JSON, or the JSON value is not an object.
    """
    endpoint = url.rstrip("/") + "/api/stats"
    try:
        with urllib.request.urlopen(endpoint, timeout=timeout) as resp:
            payload = json.loads(resp.read().decode())
    except Exception:
        # Deliberately broad: any failure to reach or decode an agent just
        # means "no data from this server".
        return None
    # Callers index the result by key, so anything but an object is unusable.
    return payload if isinstance(payload, dict) else None
@app.route("/")
def index():
    """Serve the dashboard page rendered from ``index.html``."""
    page = render_template("index.html")
    return page
@app.route("/api/stats")
def stats():
    """Return this host's stats (see ``get_local_stats``) as a JSON response."""
    payload = get_local_stats()
    return jsonify(payload)
@app.route("/api/servers")
def all_servers():
    """Return stats for every configured server as ``{"servers": [...]}``.

    Servers whose url is ``"local"`` are read from this host; all others are
    queried through ``fetch_remote_stats``. Each usable result gets ``name``
    and ``status: "online"`` added, plus ``num_cores`` when the agent did not
    send one. A server that yields no usable object is reported as
    ``{"name": ..., "status": "unreachable"}``. Order follows ``servers``.
    """
    results = []
    for srv in servers:
        name = srv["name"]
        if srv["url"] == "local":
            data = get_local_stats()
        else:
            data = fetch_remote_stats(srv["url"])
        # An agent may be down, or may answer with JSON that is not an
        # object; either way there is nothing to merge into, and indexing it
        # would turn one bad agent into a 500 for the whole dashboard.
        if not data or not isinstance(data, dict):
            results.append({"name": name, "status": "unreachable"})
            continue
        data["name"] = name
        data["status"] = "online"
        if "num_cores" not in data:
            cores = data.get("cores")
            data["num_cores"] = len(cores) if isinstance(cores, list) else 0
        results.append(data)
    return jsonify(servers=results)
# Start background sampler at import time, so it runs whenever this module
# is loaded (for example by gunicorn's "app:app").
# daemon=True: the thread does not keep the interpreter alive at shutdown.
t = threading.Thread(target=cpu_sampler, daemon=True)
t.start()

48
deploy-agent.sh Executable file
View File

@@ -0,0 +1,48 @@
#!/bin/bash
# Deploy the sysmon agent to a remote host as a systemd service.
#
# Usage: deploy-agent.sh [HOST]      (HOST defaults to "console")
#
# Copies app.py and the dashboard template from ~/sysmon into ~/sysmon-agent
# on HOST, installs flask and gunicorn with pip3, then writes, enables and
# restarts the sysmon-agent unit. Needs ssh access and sudo rights on HOST.
set -euo pipefail
HOST="${1:-console}"
echo "Deploying sysmon agent to $HOST..."
# Create directory and copy app
ssh "$HOST" 'mkdir -p ~/sysmon-agent/templates'
scp ~/sysmon/app.py "$HOST":~/sysmon-agent/
scp ~/sysmon/templates/index.html "$HOST":~/sysmon-agent/templates/
# Install dependencies and set up systemd service.
# The outer heredoc is quoted so nothing expands locally; the inner one is
# unquoted so the variables in the unit file expand on the remote host.
ssh "$HOST" bash <<'REMOTE'
set -euo pipefail
pip3 install --break-system-packages --quiet flask gunicorn 2>/dev/null || \
  pip3 install --quiet flask gunicorn
# Run the service as the account the files were copied into, from that
# account's home, rather than assuming a fixed user name.
AGENT_USER="$(id -un)"
AGENT_DIR="$HOME/sysmon-agent"
GUNICORN="$HOME/.local/bin/gunicorn"
if [ ! -x "$GUNICORN" ]; then
  # pip did not place gunicorn in the per-user bin dir; use the one on PATH.
  GUNICORN="$(command -v gunicorn)"
fi
# Create systemd service
sudo tee /etc/systemd/system/sysmon-agent.service > /dev/null <<EOF
[Unit]
Description=Sysmon Agent
After=network.target
[Service]
Type=simple
User=$AGENT_USER
WorkingDirectory=$AGENT_DIR
Environment=SYSMON_PROC=/proc
ExecStart=$GUNICORN -b 0.0.0.0:8083 -w 1 --threads 2 app:app
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable sysmon-agent
sudo systemctl restart sysmon-agent
echo "Agent status:"
sudo systemctl status sysmon-agent --no-pager -l
REMOTE
echo ""
echo "Verify: curl http://$HOST:8083/api/stats"

23
run.sh Executable file
View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Build the sysmon image and (re)start the dashboard container with podman.
#
# Environment:
#   SYSMON_SERVERS  "name:url,name:url" list handed to the app. Defaults to
#                   the local host plus the console agent.
set -euo pipefail
SERVERS="${SYSMON_SERVERS:-compute1:local,console:http://192.168.88.5:8083}"
cd ~/sysmon
echo "Building sysmon container..."
podman build -t sysmon .
echo "Stopping existing container (if any)..."
# Both commands fail when no container exists yet; that is fine.
podman stop sysmon 2>/dev/null || true
podman rm sysmon 2>/dev/null || true
echo "Starting sysmon dashboard on port 8083..."
# The host's /proc is mounted read-only at /host/proc, where the app looks
# for it by default.
podman run -d \
  --name sysmon \
  -p 8083:8083 \
  --security-opt label=disable \
  -v /proc:/host/proc:ro \
  -e SYSMON_SERVERS="$SERVERS" \
  --restart unless-stopped \
  sysmon
echo "Done. Dashboard at http://$(hostname):8083"

298
templates/index.html Normal file
View File

@@ -0,0 +1,298 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>System Monitor</title>
<style>
/* Dashboard stylesheet: dark background, monospace text, fixed pixel sizes.
   Colors reused throughout: #238636 green, #d29922 amber, #da3633 red,
   #58a6ff blue for headings, #484f58 / #8b949e for muted text. */
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
background: #0d1117; color: #c9d1d9;
font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', monospace;
font-size: 14px; padding: 20px;
}
h1 { font-size: 18px; color: #58a6ff; margin-bottom: 4px; }
.subtitle { color: #484f58; font-size: 12px; margin-bottom: 20px; }
.section { margin-bottom: 24px; }
.section-title {
font-size: 13px; color: #8b949e; text-transform: uppercase;
letter-spacing: 1px; margin-bottom: 10px; border-bottom: 1px solid #21262d;
padding-bottom: 6px;
}
/* Server sections: one bordered card per server */
.server-section {
margin-bottom: 32px; padding: 16px;
border: 1px solid #21262d; border-radius: 8px;
}
.server-header {
display: flex; align-items: center; gap: 10px; margin-bottom: 16px;
}
.server-name { font-size: 16px; color: #58a6ff; font-weight: bold; }
.server-badge {
font-size: 11px; padding: 2px 8px; border-radius: 10px; font-weight: bold;
}
/* Badge variants; the script sets one of these classes per server status */
.server-badge.online { background: #238636; color: #fff; }
.server-badge.unreachable { background: #da3633; color: #fff; }
.server-unreachable {
color: #484f58; font-style: italic; padding: 20px; text-align: center;
}
/* CPU grid: the column count is set inline by the script */
.cpu-grid { display: grid; gap: 4px; }
.core-cell {
background: #161b22; border: 1px solid #21262d; border-radius: 4px;
padding: 4px; text-align: center; position: relative; overflow: hidden;
min-height: 44px; display: flex; flex-direction: column; justify-content: center;
}
.core-id { font-size: 10px; color: #484f58; }
.core-pct { font-size: 13px; font-weight: bold; }
/* Thin usage bar pinned to the bottom edge of each core cell */
.core-bar {
position: absolute; bottom: 0; left: 0; right: 0; height: 3px;
background: #21262d; border-radius: 0 0 3px 3px;
}
.core-bar-fill {
height: 100%; border-radius: 0 0 3px 3px;
transition: width 0.3s ease, background-color 0.3s ease;
}
/* Bars */
.bar-container {
background: #161b22; border: 1px solid #21262d; border-radius: 6px;
height: 32px; position: relative; overflow: hidden;
}
.bar-fill {
height: 100%; transition: width 0.3s ease;
display: flex; align-items: center; padding-left: 10px;
font-size: 13px; font-weight: bold; white-space: nowrap;
}
/* Memory bar: horizontal segments whose widths are set inline by the script */
.mem-bar { display: flex; height: 100%; }
.mem-bar > div {
height: 100%; display: flex; align-items: center; justify-content: center;
font-size: 11px; font-weight: bold; white-space: nowrap; overflow: hidden;
transition: width 0.3s ease;
}
.mem-used { background: #da3633; }
.mem-cached { background: #d29922; }
.mem-free { background: #238636; }
.mem-details {
display: flex; gap: 20px; margin-top: 6px; font-size: 12px; color: #8b949e;
flex-wrap: wrap;
}
.mem-details span { display: flex; align-items: center; gap: 4px; }
.mem-dot { width: 8px; height: 8px; border-radius: 50%; display: inline-block; }
/* Load */
.load-row { display: flex; gap: 16px; flex-wrap: wrap; }
.load-item {
background: #161b22; border: 1px solid #21262d; border-radius: 6px;
padding: 12px 20px; text-align: center; flex: 1; min-width: 120px;
}
.load-label { font-size: 11px; color: #484f58; margin-bottom: 4px; }
.load-value { font-size: 24px; font-weight: bold; }
/* Uptime */
.uptime {
background: #161b22; border: 1px solid #21262d; border-radius: 6px;
padding: 10px 16px; display: inline-block; font-size: 15px;
}
/* Status indicator: fixed to the top-right corner of the viewport */
.status {
position: fixed; top: 12px; right: 20px; font-size: 11px; color: #484f58;
}
.status-dot {
display: inline-block; width: 6px; height: 6px; border-radius: 50%;
background: #238636; margin-right: 4px; vertical-align: middle;
}
/* Narrow viewports: tighter padding */
@media (max-width: 700px) {
body { padding: 12px; }
.server-section { padding: 10px; }
}
</style>
</head>
<body>
<h1>System Monitor</h1>
<div class="subtitle" id="subtitle">connecting...</div>
<div class="status"><span class="status-dot" id="status-dot"></span><span id="status-text">connecting...</span></div>
<div id="servers-container"></div>
<script>
// Element that every per-server section is appended to.
const container = document.getElementById('servers-container');
// Keyed by server name; buildServerSection() stores { built, coreCount } here.
const serverSections = {}; // track built sections by name
// Map a CPU usage percentage to a color: green below 30, amber below 60,
// red below 85, bright red for everything else.
function usageColor(pct) {
  const bands = [
    [30, '#238636'],
    [60, '#d29922'],
    [85, '#da3633'],
  ];
  for (const [limit, color] of bands) {
    if (pct < limit) return color;
  }
  return '#f85149';
}
// Map a load average to a color by its per-core ratio: green below 0.3,
// amber below 0.6, red below 0.9, bright red otherwise.
// numCores is 0 until the server has published its first CPU sample; a
// missing or non-positive count is treated as one core so the ratio stays
// finite instead of turning into NaN/Infinity (which always rendered red).
function loadColor(val, numCores) {
  const cores = numCores > 0 ? numCores : 1;
  const ratio = val / cores;
  if (ratio < 0.3) return '#238636';
  if (ratio < 0.6) return '#d29922';
  if (ratio < 0.9) return '#da3633';
  return '#f85149';
}
// Format a megabyte count for display: "<n> MB" below 1024, otherwise
// gigabytes with one decimal ("1.5 GB").
function formatMB(mb) {
  return mb >= 1024 ? (mb / 1024).toFixed(1) + ' GB' : mb + ' MB';
}
// Number of columns for the CPU grid: 2 for up to 4 cores, 4 for up to 16,
// 8 beyond that.
function gridCols(numCores) {
  if (numCores <= 4) return 2;
  return numCores <= 16 ? 4 : 8;
}
// Create the DOM skeleton for one server (header with badge, then CPU,
// memory, load and uptime sections), append it to the container and
// register it in serverSections with coreCount 0.
// Element ids are derived from the server name: srv-, badge-, content-,
// overall-, grid-, mem-, memd-, load- and up- followed by the name.
function buildServerSection(name) {
  // The name is spliced into markup, so escape it first. Escaped attribute
  // values parse back to the raw name, so getElementById('badge-' + name)
  // and friends keep working for names containing &, <, > or quotes.
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const safe = String(name).replace(/[&<>"']/g, function (ch) { return entities[ch]; });
  const section = document.createElement('div');
  section.className = 'server-section';
  section.id = 'srv-' + name;
  section.innerHTML =
    '<div class="server-header">' +
      '<div class="server-name">' + safe + '</div>' +
      '<div class="server-badge" id="badge-' + safe + '">...</div>' +
    '</div>' +
    '<div id="content-' + safe + '">' +
      '<div class="section">' +
        '<div class="section-title">CPU Cores \u2014 <span id="overall-' + safe + '">0</span>% overall</div>' +
        '<div class="cpu-grid" id="grid-' + safe + '"></div>' +
      '</div>' +
      '<div class="section">' +
        '<div class="section-title">Memory</div>' +
        '<div class="bar-container"><div class="mem-bar" id="mem-' + safe + '"></div></div>' +
        '<div class="mem-details" id="memd-' + safe + '"></div>' +
      '</div>' +
      '<div class="section">' +
        '<div class="section-title">Load Average</div>' +
        '<div class="load-row" id="load-' + safe + '"></div>' +
      '</div>' +
      '<div class="section">' +
        '<div class="section-title">Uptime</div>' +
        '<div class="uptime" id="up-' + safe + '">\u2014</div>' +
      '</div>' +
    '</div>';
  container.appendChild(section);
  serverSections[name] = { built: true, coreCount: 0 };
}
// Make sure the CPU grid of server `name` holds one cell per core,
// rebuilding it only when the core count or the set of core ids changed.
//   name     - server name, as passed to buildServerSection()
//   numCores - number of cores; also picks the grid column count
//   coreIds  - optional array of core ids. Cells are keyed by these ids
//              (cpct-<name>-<id>, cbar-<name>-<id>), so ids that are not
//              0..numCores-1 still get a cell. When omitted, 0..numCores-1
//              is used.
function ensureCores(name, numCores, coreIds) {
  const ids = Array.isArray(coreIds) && coreIds.length
    ? coreIds
    : Array.from({ length: numCores }, function (_, i) { return i; });
  const key = ids.join(',');
  const state = serverSections[name];
  if (state.coreCount === numCores && state.coreKey === key) return;
  const grid = document.getElementById('grid-' + name);
  grid.style.gridTemplateColumns = 'repeat(' + gridCols(numCores) + ', 1fr)';
  grid.innerHTML = '';
  for (const id of ids) {
    const cell = document.createElement('div');
    cell.className = 'core-cell';
    cell.innerHTML =
      '<div class="core-id"></div>' +
      '<div class="core-pct">\u2014</div>' +
      '<div class="core-bar"><div class="core-bar-fill"></div></div>';
    // Ids and label are assigned through the DOM rather than concatenated
    // into markup, so no escaping is needed.
    cell.querySelector('.core-id').textContent = id;
    cell.querySelector('.core-pct').id = 'cpct-' + name + '-' + id;
    cell.querySelector('.core-bar-fill').id = 'cbar-' + name + '-' + id;
    grid.appendChild(cell);
  }
  state.coreCount = numCores;
  state.coreKey = key;
}
// Render one entry of the /api/servers payload into its section, creating
// the section the first time a server name is seen.
// srv is either { name, status: 'unreachable' } or an online entry carrying
// cores, overall_cpu, memory, load, uptime and optionally num_cores.
function updateServer(srv) {
  const name = srv.name;
  if (!serverSections[name]) buildServerSection(name);
  let badge = document.getElementById('badge-' + name);
  const content = document.getElementById('content-' + name);
  if (srv.status === 'unreachable') {
    badge.textContent = 'unreachable';
    badge.className = 'server-badge unreachable';
    content.innerHTML = '<div class="server-unreachable">Server unreachable</div>';
    serverSections[name].coreCount = 0; // force rebuild on reconnect
    return;
  }
  // Restore content if it was showing unreachable. This happens before the
  // badge is touched, so the badge of the rebuilt section is the one updated.
  if (content.querySelector('.server-unreachable')) {
    const stale = document.getElementById('srv-' + name);
    const next = stale.nextSibling;
    stale.remove();
    delete serverSections[name];
    buildServerSection(name); // appends at the end of the container
    const fresh = document.getElementById('srv-' + name);
    fresh.parentNode.insertBefore(fresh, next); // move back to its old slot
    badge = document.getElementById('badge-' + name);
  }
  badge.textContent = 'online';
  badge.className = 'server-badge online';

  // Coerce payload values to finite numbers before they reach markup.
  const num = function (v) { const n = Number(v); return Number.isFinite(n) ? n : 0; };
  const cores = Array.isArray(srv.cores) ? srv.cores : [];
  const numCores = srv.num_cores || cores.length;
  ensureCores(name, numCores, cores.map(function (c) { return c.id; }));
  // CPU cores
  for (const core of cores) {
    const pctEl = document.getElementById('cpct-' + name + '-' + core.id);
    const barEl = document.getElementById('cbar-' + name + '-' + core.id);
    if (pctEl) {
      pctEl.textContent = core.usage_percent + '%';
      pctEl.style.color = usageColor(core.usage_percent);
    }
    if (barEl) {
      barEl.style.width = core.usage_percent + '%';
      barEl.style.backgroundColor = usageColor(core.usage_percent);
    }
  }
  // Overall CPU
  document.getElementById('overall-' + name).textContent = srv.overall_cpu;
  // Memory. used_mb is reported as total minus available, so it already
  // excludes reclaimable cache; subtracting cached_mb again produced
  // negative values. The bar is split into used | cached | free, where the
  // cached segment is capped at available and free is what remains of it.
  const mem = srv.memory;
  const totalMb = num(mem.total_mb);
  const availMb = Math.max(0, num(mem.available_mb));
  const usedMb = Math.max(0, num(mem.used_mb));
  const cachedMb = Math.max(0, num(mem.cached_mb));
  const cachedShown = Math.min(cachedMb, availMb);
  const freeMb = availMb - cachedShown;
  const share = function (mb) { return totalMb > 0 ? mb / totalMb * 100 : 0; };
  const usedPct = Math.min(100, share(usedMb));
  const cachedPct = Math.min(100 - usedPct, share(cachedShown));
  const freePct = Math.max(0, 100 - usedPct - cachedPct);
  document.getElementById('mem-' + name).innerHTML =
    '<div class="mem-used" style="width:' + usedPct + '%">' + (usedPct > 5 ? formatMB(usedMb) : '') + '</div>' +
    '<div class="mem-cached" style="width:' + cachedPct + '%">' + (cachedPct > 5 ? formatMB(cachedShown) : '') + '</div>' +
    '<div class="mem-free" style="width:' + freePct + '%">' + (freePct > 5 ? formatMB(freeMb) : '') + '</div>';
  document.getElementById('memd-' + name).innerHTML =
    '<span><span class="mem-dot" style="background:#da3633"></span>Used: ' + formatMB(usedMb) + '</span>' +
    '<span><span class="mem-dot" style="background:#d29922"></span>Cached: ' + formatMB(cachedMb) + '</span>' +
    '<span><span class="mem-dot" style="background:#238636"></span>Available: ' + formatMB(availMb) + '</span>' +
    '<span style="color:#484f58">Total: ' + formatMB(totalMb) + '</span>' +
    '<span style="color:#484f58">' + num(mem.percent) + '%</span>';
  // Load
  const load = srv.load;
  const loadItem = function (label, value) {
    const v = num(value);
    return '<div class="load-item"><div class="load-label">' + label + '</div>' +
      '<div class="load-value" style="color:' + loadColor(v, numCores) + '">' + v.toFixed(2) + '</div></div>';
  };
  document.getElementById('load-' + name).innerHTML =
    loadItem('1 min', load.load1) +
    loadItem('5 min', load.load5) +
    loadItem('15 min', load.load15);
  // Uptime
  document.getElementById('up-' + name).textContent = srv.uptime.human;
}
// Consecutive failed polls; shown in the status indicator, reset on success.
let fails = 0;
// Fetch /api/servers once, refresh every server section plus the subtitle
// and status indicator, then schedule the next poll one second after this
// one has finished. Scheduling from inside poll() rather than with
// setInterval means a slow response can never cause requests to overlap
// and pile up.
async function poll() {
  try {
    const r = await fetch('/api/servers');
    // fetch() only rejects on network failure; treat HTTP errors as failures too.
    if (!r.ok) throw new Error('HTTP ' + r.status);
    const data = await r.json();
    for (const srv of data.servers) updateServer(srv);
    const online = data.servers.filter(s => s.status === 'online').length;
    document.getElementById('subtitle').textContent = online + '/' + data.servers.length + ' servers online';
    fails = 0;
    document.getElementById('status-dot').style.background = '#238636';
    document.getElementById('status-text').textContent = 'live';
  } catch (e) {
    fails++;
    document.getElementById('status-dot').style.background = '#da3633';
    document.getElementById('status-text').textContent = 'error (' + fails + ')';
  } finally {
    setTimeout(poll, 1000);
  }
}
poll();
</script>
</body>
</html>