Utility functions for metrics logging and GPU memory monitoring.
This module provides functions for tracking GPU memory usage, measuring model layer sizes, and logging metrics during compression workflows. Both NVIDIA and AMD GPUs are supported, with per-device memory statistics reported alongside other performance metrics.
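Both GPU helpers documented below return a list of (memory_used_fraction, total_memory_GB) tuples, one entry per visible device. The formatting helper below is an illustrative sketch for consuming that shape; it is not part of the module itself.

from typing import List, Tuple

def format_gpu_usage(usage: List[Tuple[float, float]]) -> str:
    # each entry is (fraction_of_memory_used, total_memory_in_GB)
    return ", ".join(
        f"GPU {idx}: {frac * 100:.1f}% of {total_gb:.1f} GB"
        for idx, (frac, total_gb) in enumerate(usage)
    )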
Classes:
CompressionLogger
CompressionLogger(module: Module)
Log metrics related to a compression algorithm
Parameters:
- start_tick – time when the algorithm started
- losses – loss resulting from the algorithm
Source code in llmcompressor/utils/metric_logging.py
def __init__(self, module: torch.nn.Module):
    self.module = module
    self.start_tick = None
    self.loss = None
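A minimal construction sketch, based only on the constructor shown above; the Linear layer here is a placeholder for whichever module is being compressed.

import torch

from llmcompressor.utils.metric_logging import CompressionLogger

layer = torch.nn.Linear(1024, 1024)      # stand-in for the module being compressed
comp_logger = CompressionLogger(layer)   # start_tick and loss both begin as None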
get_GPU_usage_amd
get_GPU_usage_amd() -> List[Tuple[float, float]]
Get GPU usage for AMD GPUs using the amdsmi library.
Source code in llmcompressor/utils/metric_logging.py
def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    get gpu usage for AMD GPUs using amdsmi lib
    """
    usage = []
    try:
        import amdsmi

        try:
            amdsmi.amdsmi_init()
            devices = amdsmi.amdsmi_get_processor_handles()

            for device in devices:
                vram_memory_usage = amdsmi.amdsmi_get_gpu_memory_usage(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )
                vram_memory_total = amdsmi.amdsmi_get_gpu_memory_total(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )
                memory_percentage = vram_memory_usage / vram_memory_total
                usage.append(
                    (memory_percentage, vram_memory_total / (1e9)),
                )
            amdsmi.amdsmi_shut_down()
        except amdsmi.AmdSmiException as error:
            logger.warning(f"amdsmi library error:\n {error}")
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")

    return usage
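A usage sketch: on machines without AMD GPUs, or where amdsmi is not installed, the function returns an empty list and the loop below simply produces no output.

from llmcompressor.utils.metric_logging import get_GPU_usage_amd

for idx, (used_fraction, total_gb) in enumerate(get_GPU_usage_amd()):
    print(f"AMD GPU {idx}: {used_fraction * 100:.1f}% of {total_gb:.1f} GB VRAM in use")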
get_GPU_usage_nv
get_GPU_usage_nv() -> List[Tuple[float, float]]
Get GPU usage for Nvidia GPUs using the nvml library.
Source code in llmcompressor/utils/metric_logging.py
def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    get gpu usage for Nvidia GPUs using nvml lib
    """
    try:
        import pynvml
        from pynvml import NVMLError

        try:
            pynvml.nvmlInit()
        except NVMLError as _err:
            logger.warning(f"Pynml library error:\n {_err}")
            return []

        device_count = pynvml.nvmlDeviceGetCount()
        usage = []  # [(percentage, total_memory_gb)]

        # Iterate through all GPUs
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            memory_usage_percentage = mem_info.used / mem_info.total
            total_memory_gb = mem_info.total / (1e9)
            usage.append(
                (memory_usage_percentage, total_memory_gb),
            )
        pynvml.nvmlShutdown()
        return usage
    except ImportError:
        logger.warning("Failed to obtain GPU usage from pynvml")
        return []
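One possible way to combine the two helpers into a vendor-agnostic readout; the NVIDIA-first fallback ordering is an illustrative choice, not something the module prescribes.

from llmcompressor.utils.metric_logging import get_GPU_usage_amd, get_GPU_usage_nv

usage = get_GPU_usage_nv() or get_GPU_usage_amd()  # fall back to AMD if no NVIDIA data is available
for idx, (used_fraction, total_gb) in enumerate(usage):
    print(f"GPU {idx}: {used_fraction * 100:.1f}% of {total_gb:.1f} GB in use")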