# Updated GPUMonitor with Prometheus export
from prometheus_client import Gauge, start_http_server
class GPUMonitor:
    """Samples per-GPU metrics and exports them as Prometheus gauges.

    On construction this starts a Prometheus scrape endpoint via
    ``start_http_server``; call :meth:`collect_metrics` periodically to
    refresh the exported values.
    """

    def __init__(self, gpu_ids, port=8000):
        """Set up gauges and start the metrics HTTP server.

        Args:
            gpu_ids: iterable of GPU identifiers/handles to monitor.
            port: TCP port for the Prometheus scrape endpoint
                (defaults to the original hard-coded 8000).
        """
        # Bug fix: the original accepted gpu_ids but never stored it, while
        # collect_metrics() iterates self.gpus — that raised AttributeError.
        self.gpus = list(gpu_ids)

        # Prometheus metrics, one time series per GPU via the gpu_id label.
        self.gpu_util = Gauge('gpu_utilization', 'GPU utilization %', ['gpu_id'])
        self.gpu_mem = Gauge('gpu_memory', 'GPU memory used MB', ['gpu_id'])
        self.gpu_temp = Gauge('gpu_temp', 'GPU temperature C', ['gpu_id'])

        # Start metrics server (module-level side effect of construction).
        start_http_server(port)

    def collect_metrics(self):
        """Sample every monitored GPU and push readings to the gauges.

        Returns:
            list of (gpu_index, util, mem, temp) samples collected this pass.
            (The original built ``metrics`` but never filled or returned it —
            dead code; it is now populated and returned. Callers that ignored
            the previous implicit ``None`` are unaffected.)
        """
        metrics = []
        for i, gpu in enumerate(self.gpus):
            # ... existing collection code ...
            # NOTE(review): util, mem and temp are assumed to be produced by
            # the elided collection code above (e.g. NVML queries on `gpu`)
            # — confirm against the full implementation.
            self.gpu_util.labels(gpu_id=i).set(util.gpu)
            # mem.used is in bytes; convert to MB to match the gauge's docs.
            self.gpu_mem.labels(gpu_id=i).set(mem.used / 1024**2)
            self.gpu_temp.labels(gpu_id=i).set(temp)
            metrics.append((i, util, mem, temp))
        return metrics
## Grafana — Visualization & Dashboards

1. **GPU Cluster Overview**
   - Heatmap of GPU utilization across nodes
   - Memory usage trends
   - Temperature distribution

2. **Job Performance**
   - Training loss/accuracy curves
   - Batch processing times
   - GPU memory vs. batch size

3. **Alert Dashboard**
   - Active incidents
   - Error rate over time
   - Recovery success rate