fixed amd gpu monitoring

This commit is contained in:
2025-09-25 17:55:09 +02:00
parent 2a2cd12f43
commit ee951835c3
2 changed files with 111 additions and 199 deletions

View File

@@ -1,7 +1,7 @@
import subprocess import subprocess
import re
import time import time
import logging import logging
import json
from typing import Dict, Any, Optional from typing import Dict, Any, Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -15,46 +15,24 @@ class GPUAMDMonitor:
self.gpu_available = self._check_gpu_availability() self.gpu_available = self._check_gpu_availability()
def _check_gpu_availability(self) -> bool: def _check_gpu_availability(self) -> bool:
"""Check if AMD GPU monitoring tools are available""" """Check if rocm-smi is available"""
try: try:
# Check for rocm-smi (AMD)
result = subprocess.run(['rocm-smi', '--help'], result = subprocess.run(['rocm-smi', '--help'],
capture_output=True, text=True, timeout=5) capture_output=True, text=True, timeout=5)
if result.returncode == 0: return result.returncode == 0
return True
except (subprocess.TimeoutExpired, FileNotFoundError): except (subprocess.TimeoutExpired, FileNotFoundError):
pass return False
try:
# Check for radeontop
result = subprocess.run(['radeontop', '--help'],
capture_output=True, text=True, timeout=5)
if result.returncode == 0:
return True
except (subprocess.TimeoutExpired, FileNotFoundError):
pass
# Check for GPU in /sys/class/drm
try:
import os
gpu_dirs = [d for d in os.listdir('/sys/class/drm') if d.startswith('card')]
return len(gpu_dirs) > 0
except:
pass
return False
def get_gpu_info(self) -> Dict[str, Any]: def get_gpu_info(self) -> Dict[str, Any]:
"""Get static GPU information""" """Get static GPU information"""
if not self.gpu_available: if not self.gpu_available:
return {'available': False, 'message': 'No GPU monitoring tools found'} return {'available': False, 'message': 'rocm-smi not available'}
if not self._cached_data.get('gpu_info'): if not self._cached_data.get('gpu_info'):
try: try:
gpu_info = self._get_rocm_info() gpu_info = self._get_rocm_info()
if not gpu_info: if not gpu_info:
gpu_info = self._get_sys_gpu_info() gpu_info = {'available': False, 'message': 'Could not get GPU info from rocm-smi'}
self._cached_data['gpu_info'] = gpu_info self._cached_data['gpu_info'] = gpu_info
except Exception as e: except Exception as e:
logger.error(f"Error getting GPU info: {e}") logger.error(f"Error getting GPU info: {e}")
@@ -74,9 +52,9 @@ class GPUAMDMonitor:
try: try:
stats = self._get_rocm_stats() stats = self._get_rocm_stats()
if not stats: if not stats:
stats = self._get_fallback_stats() stats = {'available': False, 'message': 'Could not get GPU stats from rocm-smi'}
else:
stats['timestamp'] = now stats['timestamp'] = now
self._cached_data['stats'] = stats self._cached_data['stats'] = stats
self.last_update = now self.last_update = now
@@ -87,196 +65,128 @@ class GPUAMDMonitor:
return self._cached_data.get('stats', {'available': False}) return self._cached_data.get('stats', {'available': False})
def _get_rocm_info(self) -> Optional[Dict[str, Any]]: def _get_rocm_info(self) -> Optional[Dict[str, Any]]:
"""Get GPU info using rocm-smi""" """Get GPU info using rocm-smi with JSON output"""
try: try:
result = subprocess.run(['rocm-smi', '--showid', '--showproductname'], result = subprocess.run(['rocm-smi', '--showid', '--showproductname', '--json'],
capture_output=True, text=True, timeout=10) capture_output=True, text=True, timeout=10)
if result.returncode == 0: if result.returncode != 0:
lines = result.stdout.strip().split('\n') return None
gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
for line in lines: data = json.loads(result.stdout)
if 'GPU[' in line and ':' in line: gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
# Parse GPU ID and name
parts = line.split(':')
if len(parts) >= 2:
gpu_id = parts[0].strip()
gpu_name = parts[1].strip()
gpu_info['cards'].append({
'id': gpu_id,
'name': gpu_name
})
return gpu_info if gpu_info['cards'] else None # Parse JSON structure
if isinstance(data, dict):
for gpu_id, gpu_data in data.items():
if gpu_id.startswith('card') and isinstance(gpu_data, dict):
# Get GPU name from various possible fields
gpu_name = (gpu_data.get('Card Series') or
gpu_data.get('Device Name') or
gpu_data.get('Card SKU') or
'Unknown GPU')
gpu_info['cards'].append({
'id': gpu_id,
'name': gpu_name
})
except Exception as e: return gpu_info if gpu_info['cards'] else None
logger.debug(f"rocm-smi not available: {e}")
except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
logger.debug(f"Error getting GPU info: {e}")
return None return None
def _get_rocm_stats(self) -> Optional[Dict[str, Any]]: def _get_rocm_stats(self) -> Optional[Dict[str, Any]]:
"""Get GPU stats using rocm-smi""" """Get GPU stats using rocm-smi with JSON output"""
try: try:
# Get temperature, usage, and memory info # Get temperature and usage
result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showmeminfo'], result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showpower', '--json'],
capture_output=True, text=True, timeout=10) capture_output=True, text=True, timeout=10)
if result.returncode == 0: if result.returncode != 0:
stats = {'available': True, 'cards': []} return None
lines = result.stdout.strip().split('\n') data = json.loads(result.stdout)
current_gpu = None
for line in lines: # Get memory info separately
line = line.strip() mem_result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--json'],
capture_output=True, text=True, timeout=10)
mem_data = {}
if mem_result.returncode == 0:
try:
mem_data = json.loads(mem_result.stdout)
except json.JSONDecodeError:
pass
# Parse GPU identifier
if line.startswith('GPU['):
gpu_match = re.search(r'GPU\[(\d+)\]', line)
if gpu_match:
current_gpu = {
'id': int(gpu_match.group(1)),
'temperature': None,
'usage': None,
'memory_used': None,
'memory_total': None
}
stats['cards'].append(current_gpu)
# Parse temperature
elif 'Temperature' in line and current_gpu is not None:
temp_match = re.search(r'(\d+\.\d+)°C', line)
if temp_match:
current_gpu['temperature'] = float(temp_match.group(1))
# Parse GPU usage
elif 'GPU use' in line and current_gpu is not None:
usage_match = re.search(r'(\d+)%', line)
if usage_match:
current_gpu['usage'] = int(usage_match.group(1))
# Parse memory info
elif 'Memory' in line and current_gpu is not None:
mem_match = re.search(r'(\d+)MB / (\d+)MB', line)
if mem_match:
current_gpu['memory_used'] = int(mem_match.group(1))
current_gpu['memory_total'] = int(mem_match.group(2))
return stats if stats['cards'] else None
except Exception as e:
logger.debug(f"rocm-smi stats not available: {e}")
return None
def _get_sys_gpu_info(self) -> Dict[str, Any]:
"""Get GPU info from /sys filesystem"""
try:
import os
gpu_info = {'available': True, 'driver': 'sysfs', 'cards': []}
drm_path = '/sys/class/drm'
if os.path.exists(drm_path):
card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')]
for card_dir in sorted(card_dirs):
card_path = os.path.join(drm_path, card_dir)
device_path = os.path.join(card_path, 'device')
gpu_name = "Unknown GPU"
# Try to get GPU name from various sources
for name_file in ['product_name', 'device/product_name']:
name_path = os.path.join(card_path, name_file)
if os.path.exists(name_path):
try:
with open(name_path, 'r') as f:
gpu_name = f.read().strip()
break
except:
pass
# Try vendor and device IDs
if gpu_name == "Unknown GPU":
try:
vendor_path = os.path.join(device_path, 'vendor')
device_id_path = os.path.join(device_path, 'device')
if os.path.exists(vendor_path) and os.path.exists(device_id_path):
with open(vendor_path, 'r') as f:
vendor = f.read().strip()
with open(device_id_path, 'r') as f:
device_id = f.read().strip()
if vendor == '0x1002': # AMD vendor ID
gpu_name = f"AMD GPU ({device_id})"
except:
pass
gpu_info['cards'].append({
'id': card_dir,
'name': gpu_name,
'path': card_path
})
return gpu_info if gpu_info['cards'] else {'available': False}
except Exception as e:
logger.error(f"Error getting sysfs GPU info: {e}")
return {'available': False, 'error': str(e)}
def _get_fallback_stats(self) -> Dict[str, Any]:
"""Get basic GPU stats from /sys filesystem"""
try:
import os
stats = {'available': True, 'cards': []} stats = {'available': True, 'cards': []}
# Try to read basic info from sysfs # Parse JSON structure
drm_path = '/sys/class/drm' if isinstance(data, dict):
if os.path.exists(drm_path): for gpu_id, gpu_data in data.items():
card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')] if gpu_id.startswith('card') and isinstance(gpu_data, dict):
gpu_stats = {
'id': gpu_id,
'temperature': None,
'usage': None,
'memory_used': None,
'memory_total': None,
'power': None
}
for card_dir in sorted(card_dirs): # Extract temperature (try different possible fields)
card_path = os.path.join(drm_path, card_dir) temp = gpu_data.get('Temperature (Sensor edge) (C)')
device_path = os.path.join(card_path, 'device') if temp is not None and temp != 'N/A':
try:
gpu_stats['temperature'] = float(temp)
except (ValueError, TypeError):
pass
gpu_stats = { # Extract GPU usage
'id': card_dir, usage = gpu_data.get('GPU use (%)')
'temperature': None, if usage is not None and usage != 'N/A':
'usage': None, try:
'memory_used': None, gpu_stats['usage'] = int(usage)
'memory_total': None except (ValueError, TypeError):
} pass
# Try to read temperature from hwmon # Extract power
hwmon_path = os.path.join(device_path, 'hwmon') power = gpu_data.get('Average Graphics Package Power (W)')
if os.path.exists(hwmon_path): if power is not None and power != 'N/A':
for hwmon_dir in os.listdir(hwmon_path): try:
temp_file = os.path.join(hwmon_path, hwmon_dir, 'temp1_input') gpu_stats['power'] = float(power)
if os.path.exists(temp_file): except (ValueError, TypeError):
pass
# Extract memory info from separate call
if gpu_id in mem_data:
mem_info = mem_data[gpu_id]
# Memory in bytes
mem_total_bytes = mem_info.get('VRAM Total Memory (B)')
mem_used_bytes = mem_info.get('VRAM Total Used Memory (B)')
if mem_total_bytes is not None:
try: try:
with open(temp_file, 'r') as f: # Convert to MB
temp_millicelsius = int(f.read().strip()) gpu_stats['memory_total'] = int(mem_total_bytes) // (1024 * 1024)
gpu_stats['temperature'] = temp_millicelsius / 1000.0 except (ValueError, TypeError):
break
except:
pass pass
# Try to read GPU usage (if available) if mem_used_bytes is not None:
gpu_busy_file = os.path.join(device_path, 'gpu_busy_percent') try:
if os.path.exists(gpu_busy_file): # Convert to MB
try: gpu_stats['memory_used'] = int(mem_used_bytes) // (1024 * 1024)
with open(gpu_busy_file, 'r') as f: except (ValueError, TypeError):
gpu_stats['usage'] = int(f.read().strip()) pass
except:
pass
stats['cards'].append(gpu_stats) stats['cards'].append(gpu_stats)
return stats if stats['cards'] else {'available': False} return stats if stats['cards'] else None
except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
logger.debug(f"Error getting GPU stats: {e}")
return None
except Exception as e:
logger.error(f"Error getting fallback GPU stats: {e}")
return {'available': False, 'error': str(e)}
def get_primary_gpu_stats(self) -> Dict[str, Any]: def get_primary_gpu_stats(self) -> Dict[str, Any]:
"""Get stats for the primary/first GPU""" """Get stats for the primary/first GPU"""
@@ -287,7 +197,8 @@ class GPUAMDMonitor:
'available': False, 'available': False,
'usage': 0, 'usage': 0,
'temperature': 0, 'temperature': 0,
'memory_percent': 0 'memory_percent': 0,
'power': 0
} }
primary_gpu = all_stats['cards'][0] primary_gpu = all_stats['cards'][0]
@@ -305,5 +216,6 @@ class GPUAMDMonitor:
'temperature': primary_gpu.get('temperature', 0) or 0, 'temperature': primary_gpu.get('temperature', 0) or 0,
'memory_percent': memory_percent, 'memory_percent': memory_percent,
'memory_used': primary_gpu.get('memory_used', 0) or 0, 'memory_used': primary_gpu.get('memory_used', 0) or 0,
'memory_total': primary_gpu.get('memory_total', 0) or 0 'memory_total': primary_gpu.get('memory_total', 0) or 0,
'power': primary_gpu.get('power', 0) or 0
} }

View File

@@ -6,8 +6,8 @@ from dataclasses import dataclass, field
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
from enum import Enum from enum import Enum
from nicegui import binding from nicegui import binding
from gpu_amd_monitor import GPUAMDMonitor from .gpu_amd_monitor import GPUAMDMonitor
from gpu_nvidia_monitor import GPUNVIDIAMonitor from .gpu_nvidia_monitor import GPUNVIDIAMonitor
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)