diff --git a/src/utils/gpu_amd_monitor.py b/src/utils/gpu_amd_monitor.py
index 35184cc..b13b337 100644
--- a/src/utils/gpu_amd_monitor.py
+++ b/src/utils/gpu_amd_monitor.py
@@ -1,7 +1,7 @@
 import subprocess
-import re
 import time
 import logging
+import json
 from typing import Dict, Any, Optional
 
 logger = logging.getLogger(__name__)
@@ -15,46 +15,24 @@ class GPUAMDMonitor:
         self.gpu_available = self._check_gpu_availability()
 
     def _check_gpu_availability(self) -> bool:
-        """Check if AMD GPU monitoring tools are available"""
+        """Check if rocm-smi is available"""
         try:
-            # Check for rocm-smi (AMD)
             result = subprocess.run(['rocm-smi', '--help'],
                                   capture_output=True, text=True, timeout=5)
-            if result.returncode == 0:
-                return True
+            return result.returncode == 0
         except (subprocess.TimeoutExpired, FileNotFoundError):
-            pass
-
-        try:
-            # Check for radeontop
-            result = subprocess.run(['radeontop', '--help'],
-                                  capture_output=True, text=True, timeout=5)
-            if result.returncode == 0:
-                return True
-        except (subprocess.TimeoutExpired, FileNotFoundError):
-            pass
-
-        # Check for GPU in /sys/class/drm
-        try:
-            import os
-            gpu_dirs = [d for d in os.listdir('/sys/class/drm') if d.startswith('card')]
-            return len(gpu_dirs) > 0
-        except:
-            pass
-
-        return False
+            return False
 
     def get_gpu_info(self) -> Dict[str, Any]:
         """Get static GPU information"""
         if not self.gpu_available:
-            return {'available': False, 'message': 'No GPU monitoring tools found'}
+            return {'available': False, 'message': 'rocm-smi not available'}
 
         if not self._cached_data.get('gpu_info'):
             try:
                 gpu_info = self._get_rocm_info()
                 if not gpu_info:
-                    gpu_info = self._get_sys_gpu_info()
-
+                    gpu_info = {'available': False, 'message': 'Could not get GPU info from rocm-smi'}
                 self._cached_data['gpu_info'] = gpu_info
             except Exception as e:
                 logger.error(f"Error getting GPU info: {e}")
@@ -74,9 +52,9 @@ class GPUAMDMonitor:
         try:
             stats = self._get_rocm_stats()
             if not stats:
-                stats = self._get_fallback_stats()
-
-            stats['timestamp'] = now
+                stats = {'available': False, 'message': 'Could not get GPU stats from rocm-smi'}
+            else:
+                stats['timestamp'] = now
 
             self._cached_data['stats'] = stats
             self.last_update = now
@@ -87,196 +65,128 @@ class GPUAMDMonitor:
         return self._cached_data.get('stats', {'available': False})
 
     def _get_rocm_info(self) -> Optional[Dict[str, Any]]:
-        """Get GPU info using rocm-smi"""
+        """Get GPU info using rocm-smi with JSON output"""
         try:
-            result = subprocess.run(['rocm-smi', '--showid', '--showproductname'],
+            result = subprocess.run(['rocm-smi', '--showid', '--showproductname', '--json'],
                                   capture_output=True, text=True, timeout=10)
-            if result.returncode == 0:
-                lines = result.stdout.strip().split('\n')
-                gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
+            if result.returncode != 0:
+                return None
 
-                for line in lines:
-                    if 'GPU[' in line and ':' in line:
-                        # Parse GPU ID and name
-                        parts = line.split(':')
-                        if len(parts) >= 2:
-                            gpu_id = parts[0].strip()
-                            gpu_name = parts[1].strip()
-                            gpu_info['cards'].append({
-                                'id': gpu_id,
-                                'name': gpu_name
-                            })
+            data = json.loads(result.stdout)
+            gpu_info = {'available': True, 'driver': 'rocm-smi', 'cards': []}
 
-                return gpu_info if gpu_info['cards'] else None
+            # Parse JSON structure
+            if isinstance(data, dict):
+                for gpu_id, gpu_data in data.items():
+                    if gpu_id.startswith('card') and isinstance(gpu_data, dict):
+                        # Get GPU name from various possible fields
+                        gpu_name = (gpu_data.get('Card Series') or
+                                    gpu_data.get('Device Name') or
+                                    gpu_data.get('Card SKU') or
+                                    'Unknown GPU')
+                        gpu_info['cards'].append({
+                            'id': gpu_id,
+                            'name': gpu_name
+                        })
 
-        except Exception as e:
-            logger.debug(f"rocm-smi not available: {e}")
+            return gpu_info if gpu_info['cards'] else None
+
+        except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
+            logger.debug(f"Error getting GPU info: {e}")
             return None
 
     def _get_rocm_stats(self) -> Optional[Dict[str, Any]]:
-        """Get GPU stats using rocm-smi"""
+        """Get GPU stats using rocm-smi with JSON output"""
         try:
-            # Get temperature, usage, and memory info
-            result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showmeminfo'],
+            # Get temperature and usage
+            result = subprocess.run(['rocm-smi', '--showtemp', '--showuse', '--showpower', '--json'],
                                   capture_output=True, text=True, timeout=10)
-            if result.returncode == 0:
-                stats = {'available': True, 'cards': []}
+            if result.returncode != 0:
+                return None
 
-                lines = result.stdout.strip().split('\n')
-                current_gpu = None
+            data = json.loads(result.stdout)
 
-                for line in lines:
-                    line = line.strip()
+            # Get memory info separately
+            mem_result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--json'],
+                                      capture_output=True, text=True, timeout=10)
+            mem_data = {}
+            if mem_result.returncode == 0:
+                try:
+                    mem_data = json.loads(mem_result.stdout)
+                except json.JSONDecodeError:
+                    pass
 
-                    # Parse GPU identifier
-                    if line.startswith('GPU['):
-                        gpu_match = re.search(r'GPU\[(\d+)\]', line)
-                        if gpu_match:
-                            current_gpu = {
-                                'id': int(gpu_match.group(1)),
-                                'temperature': None,
-                                'usage': None,
-                                'memory_used': None,
-                                'memory_total': None
-                            }
-                            stats['cards'].append(current_gpu)
-
-                    # Parse temperature
-                    elif 'Temperature' in line and current_gpu is not None:
-                        temp_match = re.search(r'(\d+\.\d+)°C', line)
-                        if temp_match:
-                            current_gpu['temperature'] = float(temp_match.group(1))
-
-                    # Parse GPU usage
-                    elif 'GPU use' in line and current_gpu is not None:
-                        usage_match = re.search(r'(\d+)%', line)
-                        if usage_match:
-                            current_gpu['usage'] = int(usage_match.group(1))
-
-                    # Parse memory info
-                    elif 'Memory' in line and current_gpu is not None:
-                        mem_match = re.search(r'(\d+)MB / (\d+)MB', line)
-                        if mem_match:
-                            current_gpu['memory_used'] = int(mem_match.group(1))
-                            current_gpu['memory_total'] = int(mem_match.group(2))
-
-                return stats if stats['cards'] else None
-
-        except Exception as e:
-            logger.debug(f"rocm-smi stats not available: {e}")
-            return None
-
-    def _get_sys_gpu_info(self) -> Dict[str, Any]:
-        """Get GPU info from /sys filesystem"""
-        try:
-            import os
-            gpu_info = {'available': True, 'driver': 'sysfs', 'cards': []}
-
-            drm_path = '/sys/class/drm'
-            if os.path.exists(drm_path):
-                card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')]
-
-                for card_dir in sorted(card_dirs):
-                    card_path = os.path.join(drm_path, card_dir)
-                    device_path = os.path.join(card_path, 'device')
-
-                    gpu_name = "Unknown GPU"
-
-                    # Try to get GPU name from various sources
-                    for name_file in ['product_name', 'device/product_name']:
-                        name_path = os.path.join(card_path, name_file)
-                        if os.path.exists(name_path):
-                            try:
-                                with open(name_path, 'r') as f:
-                                    gpu_name = f.read().strip()
-                                    break
-                            except:
-                                pass
-
-                    # Try vendor and device IDs
-                    if gpu_name == "Unknown GPU":
-                        try:
-                            vendor_path = os.path.join(device_path, 'vendor')
-                            device_id_path = os.path.join(device_path, 'device')
-
-                            if os.path.exists(vendor_path) and os.path.exists(device_id_path):
-                                with open(vendor_path, 'r') as f:
-                                    vendor = f.read().strip()
-                                with open(device_id_path, 'r') as f:
-                                    device_id = f.read().strip()
-
-                            if vendor == '0x1002':  # AMD vendor ID
-                                gpu_name = f"AMD GPU ({device_id})"
-                        except:
-                            pass
-
-                    gpu_info['cards'].append({
-                        'id': card_dir,
-                        'name': gpu_name,
-                        'path': card_path
-                    })
-
-            return gpu_info if gpu_info['cards'] else {'available': False}
-
-        except Exception as e:
-            logger.error(f"Error getting sysfs GPU info: {e}")
-            return {'available': False, 'error': str(e)}
-
-    def _get_fallback_stats(self) -> Dict[str, Any]:
-        """Get basic GPU stats from /sys filesystem"""
-        try:
-            import os
             stats = {'available': True, 'cards': []}
 
-            # Try to read basic info from sysfs
-            drm_path = '/sys/class/drm'
-            if os.path.exists(drm_path):
-                card_dirs = [d for d in os.listdir(drm_path) if d.startswith('card') and not d.endswith('-')]
+            # Parse JSON structure
+            if isinstance(data, dict):
+                for gpu_id, gpu_data in data.items():
+                    if gpu_id.startswith('card') and isinstance(gpu_data, dict):
+                        gpu_stats = {
+                            'id': gpu_id,
+                            'temperature': None,
+                            'usage': None,
+                            'memory_used': None,
+                            'memory_total': None,
+                            'power': None
+                        }
 
-                for card_dir in sorted(card_dirs):
-                    card_path = os.path.join(drm_path, card_dir)
-                    device_path = os.path.join(card_path, 'device')
+                        # Extract temperature (try different possible fields)
+                        temp = gpu_data.get('Temperature (Sensor edge) (C)')
+                        if temp is not None and temp != 'N/A':
+                            try:
+                                gpu_stats['temperature'] = float(temp)
+                            except (ValueError, TypeError):
+                                pass
 
-                    gpu_stats = {
-                        'id': card_dir,
-                        'temperature': None,
-                        'usage': None,
-                        'memory_used': None,
-                        'memory_total': None
-                    }
+                        # Extract GPU usage
+                        usage = gpu_data.get('GPU use (%)')
+                        if usage is not None and usage != 'N/A':
+                            try:
+                                gpu_stats['usage'] = int(usage)
+                            except (ValueError, TypeError):
+                                pass
 
-                    # Try to read temperature from hwmon
-                    hwmon_path = os.path.join(device_path, 'hwmon')
-                    if os.path.exists(hwmon_path):
-                        for hwmon_dir in os.listdir(hwmon_path):
-                            temp_file = os.path.join(hwmon_path, hwmon_dir, 'temp1_input')
-                            if os.path.exists(temp_file):
+                        # Extract power
+                        power = gpu_data.get('Average Graphics Package Power (W)')
+                        if power is not None and power != 'N/A':
+                            try:
+                                gpu_stats['power'] = float(power)
+                            except (ValueError, TypeError):
+                                pass
+
+                        # Extract memory info from separate call
+                        if gpu_id in mem_data:
+                            mem_info = mem_data[gpu_id]
+
+                            # Memory in bytes
+                            mem_total_bytes = mem_info.get('VRAM Total Memory (B)')
+                            mem_used_bytes = mem_info.get('VRAM Total Used Memory (B)')
+
+                            if mem_total_bytes is not None:
                                 try:
-                                    with open(temp_file, 'r') as f:
-                                        temp_millicelsius = int(f.read().strip())
-                                        gpu_stats['temperature'] = temp_millicelsius / 1000.0
-                                        break
-                                except:
+                                    # Convert to MB
+                                    gpu_stats['memory_total'] = int(mem_total_bytes) // (1024 * 1024)
+                                except (ValueError, TypeError):
                                     pass
 
-                    # Try to read GPU usage (if available)
-                    gpu_busy_file = os.path.join(device_path, 'gpu_busy_percent')
-                    if os.path.exists(gpu_busy_file):
-                        try:
-                            with open(gpu_busy_file, 'r') as f:
-                                gpu_stats['usage'] = int(f.read().strip())
-                        except:
-                            pass
+                            if mem_used_bytes is not None:
+                                try:
+                                    # Convert to MB
+                                    gpu_stats['memory_used'] = int(mem_used_bytes) // (1024 * 1024)
+                                except (ValueError, TypeError):
+                                    pass
 
-                    stats['cards'].append(gpu_stats)
+                        stats['cards'].append(gpu_stats)
 
-            return stats if stats['cards'] else {'available': False}
+            return stats if stats['cards'] else None
+
+        except (subprocess.SubprocessError, json.JSONDecodeError, Exception) as e:
+            logger.debug(f"Error getting GPU stats: {e}")
+            return None
 
-        except Exception as e:
-            logger.error(f"Error getting fallback GPU stats: {e}")
-            return {'available': False, 'error': str(e)}
 
     def get_primary_gpu_stats(self) -> Dict[str, Any]:
         """Get stats for the primary/first GPU"""
@@ -287,7 +197,8 @@ class GPUAMDMonitor:
                 'available': False,
                 'usage': 0,
                 'temperature': 0,
-                'memory_percent': 0
+                'memory_percent': 0,
+                'power': 0
             }
 
         primary_gpu = all_stats['cards'][0]
@@ -305,5 +216,6 @@ class GPUAMDMonitor:
             'temperature': primary_gpu.get('temperature', 0) or 0,
             'memory_percent': memory_percent,
             'memory_used': primary_gpu.get('memory_used', 0) or 0,
-            'memory_total': primary_gpu.get('memory_total', 0) or 0
+            'memory_total': primary_gpu.get('memory_total', 0) or 0,
+            'power': primary_gpu.get('power', 0) or 0
         }
diff --git a/src/utils/gpu_monitor.py b/src/utils/gpu_monitor.py
index 4042494..35c997a 100644
--- a/src/utils/gpu_monitor.py
+++ b/src/utils/gpu_monitor.py
@@ -6,8 +6,8 @@
 from dataclasses import dataclass, field
 from typing import Dict, Any, Optional, List
 from enum import Enum
 from nicegui import binding
-from gpu_amd_monitor import GPUAMDMonitor
-from gpu_nvidia_monitor import GPUNVIDIAMonitor
+from .gpu_amd_monitor import GPUAMDMonitor
+from .gpu_nvidia_monitor import GPUNVIDIAMonitor
 
 logger = logging.getLogger(__name__)
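
Reviewer note: the new parsing code assumes `rocm-smi --json` prints a top-level JSON object keyed by `card0`, `card1`, … with field names such as 'Card Series', 'Temperature (Sensor edge) (C)', 'GPU use (%)', 'Average Graphics Package Power (W)', and 'VRAM Total Memory (B)'. Those key names vary across ROCm releases, so the snippet below is only a rough sketch of the shape `_get_rocm_stats()` expects; the sample payload and its values are invented for illustration, not real rocm-smi output.

# Hypothetical sanity check for the JSON shape the new parser assumes.
# The payload is made up; real key names depend on the installed ROCm version.
import json

sample = json.loads("""
{
  "card0": {
    "Card Series": "Radeon RX 7900 XTX",
    "Temperature (Sensor edge) (C)": "54.0",
    "GPU use (%)": "12",
    "Average Graphics Package Power (W)": "87.0"
  }
}
""")

for gpu_id, gpu_data in sample.items():
    if gpu_id.startswith('card') and isinstance(gpu_data, dict):
        # Mirrors the type coercion done in _get_rocm_stats()
        print(gpu_id,
              gpu_data.get('Card Series'),
              float(gpu_data.get('Temperature (Sensor edge) (C)')),
              int(gpu_data.get('GPU use (%)')))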