Refactor dashboard connection setup: update README and scripts to accept simplified dashboard address format. Enhance logging in agent for better debugging and information tracking during data collection processes.
This commit is contained in:
@@ -70,7 +70,7 @@ python3 dashboard.py
|
||||
Installs to `/opt/zpulse-agent`. Must run as root for SMART data & ZFS access.
|
||||
|
||||
```bash
|
||||
sudo ./agent/setup.sh ws://DASHBOARD_IP:8888/ws/agent
|
||||
sudo ./agent/setup.sh DASHBOARD_IP:8888
|
||||
```
|
||||
|
||||
This installs `smartmontools` and `zfsutils-linux`, creates a venv, and sets up a systemd service that auto-starts and reconnects.
|
||||
@@ -82,7 +82,7 @@ cd agent
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
sudo ./venv/bin/python agent.py ws://DASHBOARD_IP:8888/ws/agent
|
||||
sudo ./venv/bin/python agent.py DASHBOARD_IP:8888
|
||||
```
|
||||
|
||||
|
||||
@@ -90,7 +90,6 @@ sudo ./venv/bin/python agent.py ws://DASHBOARD_IP:8888/ws/agent
|
||||
|
||||
Open the dashboard in a browser, click Settings. Enter your Gotify server URL and app token, hit Test, then Save. Alert thresholds for temperature, space usage, SMART failures, and pool health are all configured from the same panel.
|
||||
|
||||
ws://10.0.0.34:8888/ws/agent
|
||||
## What It Monitors
|
||||
|
||||
- Fleet overview with all connected servers, health status, storage usage, alert counts
|
||||
|
||||
@@ -16,7 +16,6 @@ import time
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import apv
|
||||
@@ -99,6 +98,7 @@ def collect_dimm_info():
|
||||
|
||||
out, _, rc = run_cmd(['dmidecode', '-t', 'memory'])
|
||||
if rc != 0:
|
||||
logging.debug('dmidecode not available or failed (rc=%d)', rc)
|
||||
return []
|
||||
dimms = []
|
||||
for block in out.split('Memory Device')[1:]:
|
||||
@@ -126,12 +126,15 @@ def collect_dimm_info():
|
||||
'rank' : d.get('Rank', ''),
|
||||
'populated' : populated,
|
||||
})
|
||||
populated_count = sum(1 for d in dimms if d['populated'])
|
||||
logging.info('Collected %d DIMM slots (%d populated)', len(dimms), populated_count)
|
||||
return dimms
|
||||
|
||||
|
||||
def collect_system_info():
|
||||
'''Collect hostname, kernel, ZFS version, uptime, RAM, CPU, and DIMM info.'''
|
||||
|
||||
logging.debug('Collecting system info...')
|
||||
info = {
|
||||
'hostname' : capabilities['hostname'],
|
||||
'kernel' : '',
|
||||
@@ -178,6 +181,7 @@ def collect_system_info():
|
||||
except Exception:
|
||||
pass
|
||||
info['dimms'] = collect_dimm_info()
|
||||
logging.debug('System info: kernel=%s, zfs=%s, cpu=%s', info['kernel'], info['zfs_version'] or 'N/A', info['cpu_model'][:40] or 'unknown')
|
||||
return info
|
||||
|
||||
|
||||
@@ -217,9 +221,6 @@ def compute_health_score(disk: dict):
|
||||
return max(0, min(100, int(score)))
|
||||
|
||||
|
||||
# ── Fast Temperature ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# ── Disk & SMART Collection ─────────────────────────────────────────────────
|
||||
|
||||
def collect_smart(device: str):
|
||||
@@ -229,10 +230,12 @@ def collect_smart(device: str):
|
||||
:param device: Block device path (e.g. /dev/sda)
|
||||
'''
|
||||
|
||||
logging.debug('Running smartctl on %s', device)
|
||||
out, _, _ = run_cmd(['smartctl', '-j', '-a', device], timeout=30)
|
||||
try:
|
||||
data = json.loads(out)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
logging.debug('smartctl returned no valid JSON for %s', device)
|
||||
return {'smart_available': False}
|
||||
|
||||
info = {
|
||||
@@ -308,8 +311,10 @@ def collect_udev_info(device: str):
|
||||
def collect_disks():
|
||||
'''Enumerate physical disks via lsblk and collect SMART data in parallel.'''
|
||||
|
||||
logging.info('Collecting disk information...')
|
||||
out, _, rc = run_cmd(['lsblk', '-d', '-b', '-o', 'NAME,SIZE,MODEL,SERIAL,ROTA,TRAN,TYPE', '-J'])
|
||||
if rc != 0:
|
||||
logging.warning('lsblk failed (rc=%d)', rc)
|
||||
return []
|
||||
try:
|
||||
data = json.loads(out)
|
||||
@@ -333,7 +338,10 @@ def collect_disks():
|
||||
'pool' : pool_map.get(name, ''),
|
||||
})
|
||||
|
||||
logging.info('Found %d physical disk(s) via lsblk', len(devs))
|
||||
|
||||
if capabilities['smartctl'] and devs:
|
||||
logging.info('Querying SMART data for %d disk(s)...', len(devs))
|
||||
with ThreadPoolExecutor(max_workers=min(8, len(devs))) as executor:
|
||||
futures = {executor.submit(collect_smart, d['path']): d for d in devs}
|
||||
for future in as_completed(futures):
|
||||
@@ -357,6 +365,8 @@ def collect_disks():
|
||||
with lock:
|
||||
cache['pool_map'] = pool_map
|
||||
|
||||
smart_count = sum(1 for d in devs if d.get('health') is not None)
|
||||
logging.info('Disk collection complete: %d disk(s), %d with SMART data', len(devs), smart_count)
|
||||
return devs
|
||||
|
||||
|
||||
@@ -367,6 +377,7 @@ def collect_pool_mapping():
|
||||
|
||||
if not capabilities['zfs']:
|
||||
return {}
|
||||
logging.debug('Building pool-to-device mapping...')
|
||||
mapping = {}
|
||||
out, _, rc = run_cmd(['zpool', 'status', '-L'])
|
||||
if rc != 0:
|
||||
@@ -418,6 +429,7 @@ def collect_pools():
|
||||
|
||||
if not capabilities['zfs']:
|
||||
return []
|
||||
logging.info('Collecting ZFS pool data...')
|
||||
out, _, rc = run_cmd(['zpool', 'list', '-Hp', '-o', 'name,size,alloc,free,frag,cap,dedup,health,ashift'])
|
||||
if rc != 0:
|
||||
return []
|
||||
@@ -448,6 +460,7 @@ def collect_pools():
|
||||
pool['scan'], pool['vdevs'], pool['errors_summary'] = parse_pool_status(s_out)
|
||||
pool['scrub_age_days'] = parse_scrub_age(pool['scan'])
|
||||
pools.append(pool)
|
||||
logging.info('Collected %d ZFS pool(s)', len(pools))
|
||||
return pools
|
||||
|
||||
|
||||
@@ -500,6 +513,7 @@ def collect_datasets_and_snapshots():
|
||||
|
||||
if not capabilities['zfs']:
|
||||
return [], []
|
||||
logging.debug('Collecting ZFS datasets and snapshots...')
|
||||
out, _, rc = run_cmd(['zfs', 'list', '-t', 'all', '-Hp', '-o', 'name,used,avail,refer,mountpoint,compression,compressratio,recordsize,type,quota,reservation,creation', '-s', 'creation'])
|
||||
if rc != 0:
|
||||
return [], []
|
||||
@@ -536,6 +550,7 @@ def collect_datasets_and_snapshots():
|
||||
'quota' : int(p[9]) if len(p) > 9 and p[9] not in ('-', '0', 'none', '') else 0,
|
||||
'reservation' : int(p[10]) if len(p) > 10 and p[10] not in ('-', '0', 'none', '') else 0,
|
||||
})
|
||||
logging.info('Collected %d dataset(s), %d snapshot(s)', len(datasets), len(snapshots))
|
||||
return datasets, snapshots
|
||||
|
||||
|
||||
@@ -593,12 +608,12 @@ def collect_iostat():
|
||||
return rates
|
||||
|
||||
|
||||
|
||||
# ── Background Worker ────────────────────────────────────────────────────────
|
||||
|
||||
def background_worker():
|
||||
'''Collect all monitoring data on timed intervals and update the shared cache.'''
|
||||
|
||||
logging.info('Background worker started (IO every %ds, pools every %ds, SMART every %ds)', IO_INTERVAL, POOL_INTERVAL, SMART_INTERVAL)
|
||||
tick = 0
|
||||
collect_iostat()
|
||||
time.sleep(1)
|
||||
@@ -626,6 +641,7 @@ def background_worker():
|
||||
cache['disks'] = disks
|
||||
|
||||
if not init_done.is_set():
|
||||
logging.info('Initial data collection complete')
|
||||
init_done.set()
|
||||
|
||||
tick += 1
|
||||
@@ -643,6 +659,7 @@ async def ws_sender(ws):
|
||||
:param ws: Active WebSocket connection to the dashboard
|
||||
'''
|
||||
|
||||
logging.info('Sending initial data burst to dashboard...')
|
||||
with lock:
|
||||
io_msg = json.dumps({'type': 'io', 'ts': time.time(), 'rates': cache['io_rates'], 'pool_map': cache['pool_map']})
|
||||
pools_msg = json.dumps({'type': 'pools', 'ts': time.time(), 'pools': cache['pools']})
|
||||
@@ -657,6 +674,7 @@ async def ws_sender(ws):
|
||||
await ws.send(snaps_msg)
|
||||
await ws.send(disks_msg)
|
||||
await ws.send(io_msg)
|
||||
logging.info('Initial burst sent (system, pools, datasets, snapshots, disks, io)')
|
||||
|
||||
tick = 0
|
||||
while True:
|
||||
@@ -697,14 +715,17 @@ async def ws_receiver(ws):
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
cmd = data.get('type')
|
||||
logging.debug('Received command: %s', cmd)
|
||||
if cmd == 'smarttest':
|
||||
device = data.get('device', '')
|
||||
test_type = data.get('test_type', 'short')
|
||||
logging.info('SMART self-test requested: %s on %s', test_type, device)
|
||||
if test_type not in ('short', 'long', 'conveyance'):
|
||||
continue
|
||||
if not re.match(r'^/dev/(sd[a-z]+|nvme\d+n\d+|da\d+)$', device):
|
||||
continue
|
||||
out, err, rc = await asyncio.to_thread(run_cmd, ['smartctl', '-t', test_type, device])
|
||||
logging.info('SMART test %s on %s: %s', test_type, device, 'started' if rc == 0 else 'failed')
|
||||
await ws.send(json.dumps({
|
||||
'type': 'smarttest_result', 'device': device,
|
||||
'test_type': test_type, 'success': rc == 0,
|
||||
@@ -735,16 +756,31 @@ async def ws_main(dashboard_url: str):
|
||||
|
||||
|
||||
|
||||
def parse_dashboard_target(target: str) -> str:
    '''
    Convert a user-provided dashboard target into a full WebSocket URL.

    Accepts "ip:port", "ip port" (any amount of whitespace between the two),
    a bare host (port defaults to 8888), or a full ws:// / wss:// URL, which
    is returned unchanged apart from surrounding whitespace being stripped.

    :param target: Dashboard target string
    :returns: WebSocket URL of the form ws://HOST:PORT/ws/agent, or the given URL if it already has a ws/wss scheme
    :raises ValueError: If target is empty or contains only whitespace
    '''

    target = target.strip()

    # An empty address would otherwise silently become 'ws://:8888/ws/agent'.
    if not target:
        raise ValueError('Dashboard address must not be empty')

    # URL schemes are case-insensitive, so 'WS://...' is already a full URL
    # and must not be wrapped in a second 'ws://' prefix.
    if target.lower().startswith(('ws://', 'wss://')):
        return target

    # Join whitespace-separated parts with ':' so that repeated spaces or tabs
    # between host and port do not produce an empty segment ('host::port').
    target = ':'.join(target.split())

    # No port given: fall back to the default port 8888.
    if ':' not in target:
        target += ':8888'

    return f'ws://{target}/ws/agent'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('dashboard_url', help='Dashboard WebSocket URL (e.g. ws://10.0.0.50:8888/ws/agent)')
|
||||
parser.add_argument('dashboard', help='Dashboard address (e.g. 10.0.0.50:8888 or "10.0.0.50 8888")')
|
||||
parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup logging
|
||||
if args.debug:
|
||||
apv.setup_logging(level='DEBUG', log_to_disk=True, max_log_size=5*1024*1024, max_backups=5, compress_backups=True, log_file_name='havoc', show_details=True)
|
||||
apv.setup_logging(level='DEBUG', log_to_disk=True, max_log_size=5*1024*1024, max_backups=5, compress_backups=True, log_file_name='zpulse-agent', show_details=True)
|
||||
logging.debug('Debug logging enabled')
|
||||
else:
|
||||
apv.setup_logging(level='INFO')
|
||||
@@ -754,13 +790,15 @@ if __name__ == '__main__':
|
||||
if os.geteuid() != 0:
|
||||
raise RuntimeError('This program must be ran as root')
|
||||
|
||||
dashboard_url = parse_dashboard_target(args.dashboard)
|
||||
|
||||
logging.info('ZPulse Agent starting — host: %s', capabilities['hostname'])
|
||||
logging.info(' smartctl: %s', 'available' if capabilities['smartctl'] else 'NOT FOUND')
|
||||
logging.info(' zfs: %s', 'available' if capabilities['zfs'] else 'NOT FOUND')
|
||||
logging.info(' dashboard: %s', args.dashboard_url)
|
||||
logging.info(' dashboard: %s', dashboard_url)
|
||||
|
||||
worker = threading.Thread(target=background_worker, daemon=True)
|
||||
worker.start()
|
||||
init_done.wait(timeout=120)
|
||||
|
||||
asyncio.run(ws_main(args.dashboard_url))
|
||||
asyncio.run(ws_main(dashboard_url))
|
||||
@@ -10,14 +10,14 @@ INSTALL_DIR="/opt/zpulse-agent"
|
||||
SERVICE_NAME="zpulse-agent"
|
||||
|
||||
# Check if running as root & an argument is provided
|
||||
[ "$(id -u)" -ne 0 ] && { echo "Run as root: sudo $0 <dashboard_url>"; exit 1; }
|
||||
[ -z "$1" ] && { echo "Usage: sudo $0 ws://DASHBOARD_IP:8888/ws/agent"; exit 1; }
|
||||
[ "$(id -u)" -ne 0 ] && { echo "Run as root: sudo $0 <ip:port>"; exit 1; }
|
||||
[ -z "$1" ] && { echo "Usage: sudo $0 10.0.0.50:8888"; exit 1; }
|
||||
|
||||
# Set the dashboard URL
|
||||
DASHBOARD_URL="$1"
|
||||
# Set the dashboard address
|
||||
DASHBOARD_ADDR="$1"
|
||||
|
||||
# Install system packages
|
||||
apt-get update -qq && apt-get install -y dmidecodesmartmontools zfsutils-linux python3-pip python3-venv
|
||||
apt-get update -qq && apt-get install -y dmidecode smartmontools zfsutils-linux python3-pip python3-venv
|
||||
|
||||
# Copy agent files to install directory
|
||||
mkdir -p "$INSTALL_DIR"
|
||||
@@ -37,7 +37,7 @@ Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=$INSTALL_DIR/venv/bin/python $INSTALL_DIR/agent.py $DASHBOARD_URL
|
||||
ExecStart=$INSTALL_DIR/venv/bin/python $INSTALL_DIR/agent.py $DASHBOARD_ADDR
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
StandardOutput=journal
|
||||
|
||||
@@ -289,11 +289,6 @@ def get_server_list():
|
||||
pools = pools_msg.get('pools', []) if isinstance(pools_msg, dict) else []
|
||||
sys_msg = a.current.get('system', {})
|
||||
sys_info = sys_msg.get('info', {}) if isinstance(sys_msg, dict) else {}
|
||||
try:
|
||||
with open('/proc/uptime') as f:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
out.append({
|
||||
'hostname' : hn,
|
||||
'online' : a.online,
|
||||
|
||||
@@ -115,7 +115,7 @@ section{margin-bottom:1.5rem}
|
||||
.st .attr-warn{color:var(--yellow)}.st .attr-crit{color:var(--red);font-weight:600}.st .attr-note{color:var(--text2);font-style:italic;font-size:.62rem}
|
||||
.empty{text-align:center;padding:1.25rem;color:var(--text2);font-size:.82rem}
|
||||
.tc{color:var(--green)}.tw{color:var(--yellow)}.th{color:var(--red)}
|
||||
@media(max-width:1024px){.g5{grid-template-columns:repeat(3,1fr)}.g4{grid-template-columns:repeat(2,1fr)}.g2{grid-template-columns:1fr}.pool-stats{grid-template-columns:repeat(3,1fr)}.disk-stats{grid-template-columns:repeat(2,1fr)}}
|
||||
@media(max-width:1024px){.g5{grid-template-columns:repeat(3,1fr)}.g4{grid-template-columns:repeat(2,1fr)}.g2{grid-template-columns:1fr}.pool-stats{grid-template-columns:repeat(3,1fr)}}
|
||||
@media(max-width:640px){nav .nav-links{display:none}main{padding:.75rem}.g5,.g4,.g3{grid-template-columns:1fr 1fr}}
|
||||
</style>
|
||||
</head>
|
||||
@@ -210,7 +210,6 @@ const scoreLabel=s=>{if(typeof s!=='number')return'';return s>=90?'Excellent':s>
|
||||
const relTime=ts=>{const s=Math.floor(Date.now()/1000-ts);if(s<60)return'just now';if(s<3600)return Math.floor(s/60)+'m ago';if(s<86400)return Math.floor(s/3600)+'h ago';return Math.floor(s/86400)+'d ago'};
|
||||
const fmtUptime=sec=>{if(!sec)return'';const d=Math.floor(sec/86400),h=Math.floor((sec%86400)/3600);return d>0?d+'d '+h+'h':h+'h '+Math.floor((sec%3600)/60)+'m'};
|
||||
const SEAGATE=new Set([1,7,195]);
|
||||
const DISK_COLORS=['#3b82f6','#f97316','#22c55e','#a855f7','#06b6d4','#ef4444','#eab308','#ec4899','#14b8a6','#f43f5e','#8b5cf6','#84cc16'];
|
||||
|
||||
let ws=null, selectedServer=null, servers=[], settingsData={};
|
||||
const state={disks:[],pools:[],datasets:[],snapshots:[],ioRates:{},poolMap:{},systemInfo:{},alertsActive:[],alertLog:[]};
|
||||
@@ -302,9 +301,6 @@ function renderRAM(si){
|
||||
}).join('');
|
||||
}
|
||||
|
||||
function renderOverviewFromState(){}
|
||||
|
||||
|
||||
const _expanded={pools:new Set(),disks:new Set(),datasets:new Set()};
|
||||
function togglePool(el){const h=el,b=el.nextElementSibling;h.classList.toggle('open');b.classList.toggle('open');const name=h.querySelector('.pool-name')?.textContent;if(name){if(h.classList.contains('open'))_expanded.pools.add(name);else _expanded.pools.delete(name)}}
|
||||
function renderPools(pools){
|
||||
@@ -512,19 +508,19 @@ function handleMessage(msg){
|
||||
}
|
||||
break;
|
||||
case 'pools':
|
||||
if(msg.hostname===selectedServer){state.pools=msg.pools||[];renderPools(state.pools);renderOverviewFromState()}
|
||||
if(msg.hostname===selectedServer){state.pools=msg.pools||[];renderPools(state.pools)}
|
||||
break;
|
||||
case 'disks':
|
||||
if(msg.hostname===selectedServer){state.disks=msg.disks||[];renderDisks(state.disks);renderOverviewFromState()}
|
||||
if(msg.hostname===selectedServer){state.disks=msg.disks||[];renderDisks(state.disks)}
|
||||
break;
|
||||
case 'datasets':
|
||||
if(msg.hostname===selectedServer){state.datasets=msg.datasets||[];renderDatasets(state.datasets);renderOverviewFromState()}
|
||||
if(msg.hostname===selectedServer){state.datasets=msg.datasets||[];renderDatasets(state.datasets)}
|
||||
break;
|
||||
case 'snapshots':
|
||||
if(msg.hostname===selectedServer){state.snapshots=msg.snapshots||[];renderSnapshots(state.snapshots)}
|
||||
break;
|
||||
case 'system':
|
||||
if(msg.hostname===selectedServer){state.systemInfo=msg.info||{};renderOverviewFromState();renderRAM(state.systemInfo)}
|
||||
if(msg.hostname===selectedServer){state.systemInfo=msg.info||{};renderRAM(state.systemInfo)}
|
||||
break;
|
||||
case 'alerts':
|
||||
if(msg.hostname===selectedServer)renderAlerts({active:msg.active||[],log:msg.log||[]});
|
||||
@@ -547,7 +543,6 @@ function loadFullState(msg){
|
||||
if(c.snapshots&&c.snapshots.snapshots){state.snapshots=c.snapshots.snapshots;renderSnapshots(state.snapshots)}
|
||||
if(c.io&&c.io.rates){state.ioRates=c.io.rates;state.poolMap=c.io.pool_map||{};renderIOStats(c.io.rates,c.io.pool_map||{})}
|
||||
if(c.system&&c.system.info)state.systemInfo=c.system.info;
|
||||
renderOverviewFromState();
|
||||
renderRAM(state.systemInfo);
|
||||
if(h&&h.timestamps&&h.timestamps.length)loadHist(h);
|
||||
if(msg.alerts)renderAlerts(msg.alerts);
|
||||
|
||||
Reference in New Issue
Block a user