fixed amd gpu monitoring
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import subprocess
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -15,46 +15,24 @@ class GPUAMDMonitor:
|
||||
self.gpu_available = self._check_gpu_availability()
|
||||
|
||||
def _check_gpu_availability(self) -> bool:
|
||||
"""Check if AMD GPU monitoring tools are available"""
|
||||
"""Check if rocm-smi is available"""
|
||||
try:
|
||||
# Check for rocm-smi (AMD)
|
||||
result = subprocess.run(['rocm-smi', '--help'],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
if result.returncode == 0:
|
||||
return True
|
||||
return result.returncode == 0
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError):
|
||||
pass
|
||||
|
||||
try:
|
||||
# Check for radeontop
|
||||
result = subprocess.run(['radeontop', '--help'],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
if result.returncode == 0:
|
||||
return True
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError):
|
||||
pass
|
||||
|
||||
# Check for GPU in /sys/class/drm
|
||||
try:
|
||||
import os
|
||||
gpu_dirs = [d for d in os.listdir('/sys/class/drm') if d.startswith('card')]
|
||||
return len(gpu_dirs) > 0
|
||||
except:
|
||||
pass
|
||||
|
||||
return False
|
||||
return False
|
||||
|
||||
def get_gpu_info(self) -> Dict[str, Any]:
|
||||
"""Get static GPU information"""
|
||||
if not self.gpu_available:
|
||||
return {'available': False, 'message': 'No GPU monitoring tools found'}
|
||||
return {'available': False, 'message': 'rocm-smi not available'}
|
||||
|
||||
if not self._cached_data.get('gpu_info'):
|
||||
try:
|
||||
gpu_info = self._get_rocm_info()
|
||||
if not gpu_info:
|
||||
gpu_info = self._get_sys_gpu_info()
|
||||
|
||||
gpu_info = {'available': False, 'message': 'Could not get GPU info from rocm-smi'}
|
||||
self._cached_data['gpu_info'] = gpu_info
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting GPU info: {e}")
|
||||
@@ -74,9 +52,9 @@ class GPUAMDMonitor:
|
||||
try:
|
||||
stats = self._get_rocm_stats()
|
||||
if not stats:
|
||||
stats = self._get_fallback_stats()
|
||||
|
||||
stats['timestamp'] = now
|
||||
stats = {'available': False, 'message': 'Could not get GPU stats from rocm-smi'}
|
||||
else:
|
||||
stats['timestamp'] = now
|
||||
self._cached_data['stats'] = stats
|
||||
self.last_update = now
|
||||
|
||||
@@ -87,196 +65,128 @@ class GPUAMDMonitor:
|
||||
return self._cached_data.get('stats', {'available': False})
|
||||
|
||||
def _get_rocm_info(self) -> Optional[Dict[str, Any]]:
|
||||
"""Get GPU info using rocm-smi"""
|
||||
"""Get GPU info using rocm-smi with JSON output"""
|
||||
try:
|
||||
result = subprocess.run(['rocm-smi', '--showid', '--showproductname'],
|
||||
result = subprocess.run(['rocm-smi', '--showid', '--showproductname', '--json'],
|
||||
capture_output=True, text=True, timeout=10)
|
||||
|
||||
if result.returncode == 0:
|
||||
lines = result.stdout.strip().split('\n')
|
||||
gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
|
||||
if result.returncode != 0:
|
||||
return None
|
||||
|
||||
for line in lines:
|
||||
if 'GPU[' in line and ':' in line:
|
||||
# Parse GPU ID and name
|
||||
parts = line.split(':')
|
||||
if len(parts) >= 2:
|
||||
gpu_id = parts[0].strip()
|
||||
gpu_name = parts[1].strip()
|
||||
gpu_info['cards'].append({
|
||||
'id': gpu_id,
|
||||
'name': gpu_name
|
||||
})
|
||||
data = json.loads(result.stdout)
|
||||
gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
|
||||
|
||||
return gpu_info if gpu_info['cards'] else None
|
||||
# Parse JSON structure
|
||||
if isinstance(data, dict):
|
||||
for gpu_id, gpu_data in data.items():
|
||||
if gpu_id.startswith('card') and isinstance(gpu_data, dict):
|
||||
# Get GPU name from various possible fields
|
||||
gpu_name = (gpu_data.get('Card Series') or
|
||||
gpu_data.get('Device Name') or
|
||||
gpu_data.get('Card SKU') or
|
||||
'Unknown GPU')
|
||||
gpu_info['cards'].append({
|
||||
'id': gpu_id,
|
||||
'name': gpu_name
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"rocm-smi not available: {e}")
|
||||
return gpu_info if gpu_info['cards'] else None
|
||||
|
||||
except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
|
||||
logger.debug(f"Error getting GPU info: {e}")
|
||||
return None
|
||||
|
||||
def _get_rocm_stats(self) -> Optional[Dict[str, Any]]:
|
||||
"""Get GPU stats using rocm-smi"""
|
||||
"""Get GPU stats using rocm-smi with JSON output"""
|
||||
try:
|
||||
# Get temperature, usage, and memory info
|
||||
result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showmeminfo'],
|
||||
# Get temperature and usage
|
||||
result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showpower', '--json'],
|
||||
capture_output=True, text=True, timeout=10)
|
||||
|
||||
if result.returncode == 0:
|
||||
stats = {'available': True, 'cards': []}
|
||||
if result.returncode != 0:
|
||||
return None
|
||||
|
||||
lines = result.stdout.strip().split('\n')
|
||||
current_gpu = None
|
||||
data = json.loads(result.stdout)
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
# Get memory info separately
|
||||
mem_result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--json'],
|
||||
capture_output=True, text=True, timeout=10)
|
||||
mem_data = {}
|
||||
if mem_result.returncode == 0:
|
||||
try:
|
||||
mem_data = json.loads(mem_result.stdout)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Parse GPU identifier
|
||||
if line.startswith('GPU['):
|
||||
gpu_match = re.search(r'GPU\[(\d+)\]', line)
|
||||
if gpu_match:
|
||||
current_gpu = {
|
||||
'id': int(gpu_match.group(1)),
|
||||
'temperature': None,
|
||||
'usage': None,
|
||||
'memory_used': None,
|
||||
'memory_total': None
|
||||
}
|
||||
stats['cards'].append(current_gpu)
|
||||
|
||||
# Parse temperature
|
||||
elif 'Temperature' in line and current_gpu is not None:
|
||||
temp_match = re.search(r'(\d+\.\d+)°C', line)
|
||||
if temp_match:
|
||||
current_gpu['temperature'] = float(temp_match.group(1))
|
||||
|
||||
# Parse GPU usage
|
||||
elif 'GPU use' in line and current_gpu is not None:
|
||||
usage_match = re.search(r'(\d+)%', line)
|
||||
if usage_match:
|
||||
current_gpu['usage'] = int(usage_match.group(1))
|
||||
|
||||
# Parse memory info
|
||||
elif 'Memory' in line and current_gpu is not None:
|
||||
mem_match = re.search(r'(\d+)MB / (\d+)MB', line)
|
||||
if mem_match:
|
||||
current_gpu['memory_used'] = int(mem_match.group(1))
|
||||
current_gpu['memory_total'] = int(mem_match.group(2))
|
||||
|
||||
return stats if stats['cards'] else None
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"rocm-smi stats not available: {e}")
|
||||
return None
|
||||
|
||||
def _get_sys_gpu_info(self) -> Dict[str, Any]:
|
||||
"""Get GPU info from /sys filesystem"""
|
||||
try:
|
||||
import os
|
||||
gpu_info = {'available': True, 'driver': 'sysfs', 'cards': []}
|
||||
|
||||
drm_path = '/sys/class/drm'
|
||||
if os.path.exists(drm_path):
|
||||
card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')]
|
||||
|
||||
for card_dir in sorted(card_dirs):
|
||||
card_path = os.path.join(drm_path, card_dir)
|
||||
device_path = os.path.join(card_path, 'device')
|
||||
|
||||
gpu_name = "Unknown GPU"
|
||||
|
||||
# Try to get GPU name from various sources
|
||||
for name_file in ['product_name', 'device/product_name']:
|
||||
name_path = os.path.join(card_path, name_file)
|
||||
if os.path.exists(name_path):
|
||||
try:
|
||||
with open(name_path, 'r') as f:
|
||||
gpu_name = f.read().strip()
|
||||
break
|
||||
except:
|
||||
pass
|
||||
|
||||
# Try vendor and device IDs
|
||||
if gpu_name == "Unknown GPU":
|
||||
try:
|
||||
vendor_path = os.path.join(device_path, 'vendor')
|
||||
device_id_path = os.path.join(device_path, 'device')
|
||||
|
||||
if os.path.exists(vendor_path) and os.path.exists(device_id_path):
|
||||
with open(vendor_path, 'r') as f:
|
||||
vendor = f.read().strip()
|
||||
with open(device_id_path, 'r') as f:
|
||||
device_id = f.read().strip()
|
||||
|
||||
if vendor == '0x1002': # AMD vendor ID
|
||||
gpu_name = f"AMD GPU ({device_id})"
|
||||
except:
|
||||
pass
|
||||
|
||||
gpu_info['cards'].append({
|
||||
'id': card_dir,
|
||||
'name': gpu_name,
|
||||
'path': card_path
|
||||
})
|
||||
|
||||
return gpu_info if gpu_info['cards'] else {'available': False}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting sysfs GPU info: {e}")
|
||||
return {'available': False, 'error': str(e)}
|
||||
|
||||
def _get_fallback_stats(self) -> Dict[str, Any]:
|
||||
"""Get basic GPU stats from /sys filesystem"""
|
||||
try:
|
||||
import os
|
||||
stats = {'available': True, 'cards': []}
|
||||
|
||||
# Try to read basic info from sysfs
|
||||
drm_path = '/sys/class/drm'
|
||||
if os.path.exists(drm_path):
|
||||
card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')]
|
||||
# Parse JSON structure
|
||||
if isinstance(data, dict):
|
||||
for gpu_id, gpu_data in data.items():
|
||||
if gpu_id.startswith('card') and isinstance(gpu_data, dict):
|
||||
gpu_stats = {
|
||||
'id': gpu_id,
|
||||
'temperature': None,
|
||||
'usage': None,
|
||||
'memory_used': None,
|
||||
'memory_total': None,
|
||||
'power': None
|
||||
}
|
||||
|
||||
for card_dir in sorted(card_dirs):
|
||||
card_path = os.path.join(drm_path, card_dir)
|
||||
device_path = os.path.join(card_path, 'device')
|
||||
# Extract temperature (try different possible fields)
|
||||
temp = gpu_data.get('Temperature (Sensor edge) (C)')
|
||||
if temp is not None and temp != 'N/A':
|
||||
try:
|
||||
gpu_stats['temperature'] = float(temp)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
gpu_stats = {
|
||||
'id': card_dir,
|
||||
'temperature': None,
|
||||
'usage': None,
|
||||
'memory_used': None,
|
||||
'memory_total': None
|
||||
}
|
||||
# Extract GPU usage
|
||||
usage = gpu_data.get('GPU use (%)')
|
||||
if usage is not None and usage != 'N/A':
|
||||
try:
|
||||
gpu_stats['usage'] = int(usage)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Try to read temperature from hwmon
|
||||
hwmon_path = os.path.join(device_path, 'hwmon')
|
||||
if os.path.exists(hwmon_path):
|
||||
for hwmon_dir in os.listdir(hwmon_path):
|
||||
temp_file = os.path.join(hwmon_path, hwmon_dir, 'temp1_input')
|
||||
if os.path.exists(temp_file):
|
||||
# Extract power
|
||||
power = gpu_data.get('Average Graphics Package Power (W)')
|
||||
if power is not None and power != 'N/A':
|
||||
try:
|
||||
gpu_stats['power'] = float(power)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Extract memory info from separate call
|
||||
if gpu_id in mem_data:
|
||||
mem_info = mem_data[gpu_id]
|
||||
|
||||
# Memory in bytes
|
||||
mem_total_bytes = mem_info.get('VRAM Total Memory (B)')
|
||||
mem_used_bytes = mem_info.get('VRAM Total Used Memory (B)')
|
||||
|
||||
if mem_total_bytes is not None:
|
||||
try:
|
||||
with open(temp_file, 'r') as f:
|
||||
temp_millicelsius = int(f.read().strip())
|
||||
gpu_stats['temperature'] = temp_millicelsius / 1000.0
|
||||
break
|
||||
except:
|
||||
# Convert to MB
|
||||
gpu_stats['memory_total'] = int(mem_total_bytes) // (1024 * 1024)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Try to read GPU usage (if available)
|
||||
gpu_busy_file = os.path.join(device_path, 'gpu_busy_percent')
|
||||
if os.path.exists(gpu_busy_file):
|
||||
try:
|
||||
with open(gpu_busy_file, 'r') as f:
|
||||
gpu_stats['usage'] = int(f.read().strip())
|
||||
except:
|
||||
pass
|
||||
if mem_used_bytes is not None:
|
||||
try:
|
||||
# Convert to MB
|
||||
gpu_stats['memory_used'] = int(mem_used_bytes) // (1024 * 1024)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
stats['cards'].append(gpu_stats)
|
||||
stats['cards'].append(gpu_stats)
|
||||
|
||||
return stats if stats['cards'] else {'available': False}
|
||||
return stats if stats['cards'] else None
|
||||
|
||||
except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
|
||||
logger.debug(f"Error getting GPU stats: {e}")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting fallback GPU stats: {e}")
|
||||
return {'available': False, 'error': str(e)}
|
||||
|
||||
def get_primary_gpu_stats(self) -> Dict[str, Any]:
|
||||
"""Get stats for the primary/first GPU"""
|
||||
@@ -287,7 +197,8 @@ class GPUAMDMonitor:
|
||||
'available': False,
|
||||
'usage': 0,
|
||||
'temperature': 0,
|
||||
'memory_percent': 0
|
||||
'memory_percent': 0,
|
||||
'power': 0
|
||||
}
|
||||
|
||||
primary_gpu = all_stats['cards'][0]
|
||||
@@ -305,5 +216,6 @@ class GPUAMDMonitor:
|
||||
'temperature': primary_gpu.get('temperature', 0) or 0,
|
||||
'memory_percent': memory_percent,
|
||||
'memory_used': primary_gpu.get('memory_used', 0) or 0,
|
||||
'memory_total': primary_gpu.get('memory_total', 0) or 0
|
||||
'memory_total': primary_gpu.get('memory_total', 0) or 0,
|
||||
'power': primary_gpu.get('power', 0) or 0
|
||||
}
|
||||
|
||||
@@ -6,8 +6,8 @@ from dataclasses import dataclass, field
|
||||
from typing import Dict, Any, Optional, List
|
||||
from enum import Enum
|
||||
from nicegui import binding
|
||||
from gpu_amd_monitor import GPUAMDMonitor
|
||||
from gpu_nvidia_monitor import GPUNVIDIAMonitor
|
||||
from .gpu_amd_monitor import GPUAMDMonitor
|
||||
from .gpu_nvidia_monitor import GPUNVIDIAMonitor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user