Skip to content

llmcompressor.utils.metric_logging

Utility functions for metrics logging and GPU memory monitoring.

This module provides functions for tracking GPU memory usage, measuring model layer sizes, and comprehensive logging during compression workflows. Supports both NVIDIA and AMD GPU monitoring with detailed memory statistics and performance metrics.

Classes:

CompressionLogger

CompressionLogger(module: Module)

Log metrics related to compression algorithm

Parameters:

  • start_tick

    time when the algorithm started

  • loss

    loss as result of algorithm

Source code in llmcompressor/utils/metric_logging.py
def __init__(self, module: torch.nn.Module):
    """
    Create a logger bound to a single module.

    :param module: module stored on the instance as `self.module`
    """
    self.module = module
    # Both metrics start out as None, i.e. nothing has been recorded yet.
    # start_tick: time when the algorithm started
    self.start_tick = None
    # loss: loss produced by the algorithm
    self.loss = None

get_GPU_usage_amd

get_GPU_usage_amd() -> List[Tuple[float, float]]

get gpu usage for AMD GPUs using amdsmi lib

Source code in llmcompressor/utils/metric_logging.py
def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    Get GPU memory usage for AMD GPUs using the amdsmi library.

    This is a best-effort query: if amdsmi is not installed, or the library
    reports an error, a warning is logged and whatever was collected so far
    (possibly nothing) is returned instead of raising.

    :return: one `(used_fraction, total_memory)` tuple per device, where
        `used_fraction` is used VRAM divided by total VRAM and `total_memory`
        is the total VRAM divided by 1e9 (GB, assuming amdsmi reports bytes)
    """
    usage = []  # [(used_fraction, total_memory / 1e9)]

    # Keep the ImportError guard around the import only, so that it cannot
    # mask unrelated import failures raised deeper inside the queries.
    try:
        import amdsmi
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")
        return usage

    try:
        amdsmi.amdsmi_init()
        try:
            vram = amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
            for device in amdsmi.amdsmi_get_processor_handles():
                used = amdsmi.amdsmi_get_gpu_memory_usage(device, vram)
                total = amdsmi.amdsmi_get_gpu_memory_total(device, vram)
                # A device reporting zero total memory must not crash a
                # monitoring helper with ZeroDivisionError.
                used_fraction = used / total if total else 0.0
                usage.append((used_fraction, total / 1e9))
        finally:
            # Always release the library once init succeeded, even when a
            # per-device query raised part-way through the loop.
            amdsmi.amdsmi_shut_down()
    except amdsmi.AmdSmiException as error:
        logger.warning(f"amdsmi library error:\n {error}")

    return usage

get_GPU_usage_nv

get_GPU_usage_nv() -> List[Tuple[float, float]]

get gpu usage for Nvidia GPUs using nvml lib

Source code in llmcompressor/utils/metric_logging.py
def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    Get GPU memory usage for Nvidia GPUs using the nvml library (pynvml).

    This is a best-effort query: if pynvml is not installed, or NVML reports
    an error, a warning is logged and whatever was collected so far
    (possibly nothing) is returned instead of raising.

    :return: one `(used_fraction, total_memory_gb)` tuple per device, where
        `used_fraction` is `mem_info.used / mem_info.total` and
        `total_memory_gb` is `mem_info.total / 1e9`
    """
    # Keep the ImportError guard around the imports only, so that it cannot
    # mask unrelated import failures raised deeper inside the queries.
    try:
        import pynvml
        from pynvml import NVMLError
    except ImportError:
        logger.warning("Failed to obtain GPU usage from pynvml")
        return []

    usage = []  # [(used_fraction, total_memory_GB)]
    try:
        pynvml.nvmlInit()
        try:
            # Iterate through all GPUs
            for index in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                total = mem_info.total
                # A device reporting zero total memory must not crash a
                # monitoring helper with ZeroDivisionError.
                used_fraction = mem_info.used / total if total else 0.0
                usage.append((used_fraction, total / 1e9))
        finally:
            # Always release NVML once init succeeded, even when a
            # per-device query raised part-way through the loop.
            pynvml.nvmlShutdown()
    except NVMLError as _err:
        # Covers init failures as well as errors from the device queries.
        logger.warning(f"Pynml library error:\n {_err}")

    return usage