Skip to content

llmcompressor.modeling.granite4

Classes:

GraniteMoeHybridParallelExpertsLinear

GraniteMoeHybridParallelExpertsLinear(
    num_experts: int, input_size: int, output_size: int
)

Bases: Linear

Use a real Linear so that llmcompressor and vllm can handle it more easily. 1. Change .weight from 3D [num_experts, output_size, input_size] to 2D [num_experts * output_size, input_size] before calling llm-compressor. 2. Change it back to 3D before saving the checkpoint.

Methods:

  • forward

    Modified from original forward()

  • from_3d_expert

    Reshape weights of GraniteMoeHybridParallelExperts module into 2D and store

  • to_3d_expert

    Convert weights and quantization parameters from 2D to 3D shape.

Source code in llmcompressor/modeling/granite4.py
def __init__(self, num_experts: int, input_size: int, output_size: int) -> None:
    """Flattened-linear stand-in for a 3D expert-parallel weight tensor.

    Backed by a real ``nn.Linear`` so that llmcompressor and vllm can handle
    it more easily:
    1. ``.weight`` goes from 3D [num_experts, output_size, input_size] to 2D
       [num_experts * output_size, input_size] before llm-compressor runs.
    2. It is restored to 3D before the checkpoint is saved.
    """
    # All experts share one stacked [num_experts * output_size, input_size]
    # matrix; allocate on "meta" since the real weights are copied in later
    # (see from_3d_expert).
    super().__init__(
        input_size, output_size * num_experts, bias=False, device="meta"
    )
    self.num_experts = num_experts
    self.input_size = input_size
    self.output_size = output_size
    # Tracks the current layout of .weight (True = flattened 2D).
    self.is_2d: bool = True

forward

forward(inputs, expert_size)

Modified from original forward()

Source code in llmcompressor/modeling/granite4.py
def forward(self, inputs, expert_size):
    """Modified from original forward(): apply each expert's slice of the
    flattened 2D weight to that expert's contiguous chunk of ``inputs``.
    """
    # Rows of the flat weight are grouped per expert; recover the 3D view.
    per_expert_weight = self.weight.view(
        self.num_experts, self.output_size, self.input_size
    )
    # expert_size lists how many input rows belong to each expert, in order.
    chunks = inputs.split(expert_size, dim=0)
    outputs = [
        torch.nn.functional.linear(chunk, weight)
        for chunk, weight in zip(chunks, per_expert_weight)
    ]
    return torch.cat(outputs, dim=0)

from_3d_expert classmethod

from_3d_expert(original: GraniteMoeHybridParallelExperts)

Reshape weights of GraniteMoeHybridParallelExperts module into 2D and store them as weights of this "Linear" module.

Source code in llmcompressor/modeling/granite4.py
@classmethod
def from_3d_expert(cls, original: GraniteMoeHybridParallelExperts):
    """Reshape weights of GraniteMoeHybridParallelExperts module into 2D and
    store them as weights of this "Linear" module.
    """
    flat = cls(original.num_experts, original.input_size, original.output_size)
    # Collapse [num_experts, output_size, input_size] into 2D; clone so the
    # new parameter does not alias the source module's storage.
    flat_weight = original.weight.view(-1, original.input_size).clone()
    flat.weight = torch.nn.Parameter(flat_weight, requires_grad=False)
    # Move the (now superseded) source module off the accelerator.
    original.to("cpu")
    flat.is_2d = True
    return flat

to_3d_expert

to_3d_expert() -> None

Convert weights and quantization parameters from 2D to 3D shape.

Source code in llmcompressor/modeling/granite4.py
def to_3d_expert(self) -> None:
    """Convert weights and quantization parameters from 2D to 3D shape."""
    flat_rows = self.num_experts * self.output_size

    # Only valid on a quantized, still-flattened module: weight must be 2D
    # and a per-row weight_scale must already exist.
    assert (
        self.weight.shape == torch.Size((flat_rows, self.input_size))
        and hasattr(self, "weight_scale")
        and self.weight_scale.shape == torch.Size((flat_rows, 1))
    ), "Shape mismatch, please check."

    def unflatten(tensor, last_dim):
        # [num_experts * output_size, last_dim]
        #   -> [num_experts, output_size, last_dim], detached from autograd.
        return torch.nn.Parameter(
            tensor.view(self.num_experts, self.output_size, last_dim).clone(),
            requires_grad=False,
        )

    self.weight = unflatten(self.weight, self.input_size)
    self.weight_scale = unflatten(self.weight_scale, 1)
    # Zero points are only present for asymmetric quantization schemes.
    if hasattr(self, "weight_zero_point"):
        assert self.weight_zero_point.shape == torch.Size((flat_rows, 1))
        self.weight_zero_point = unflatten(self.weight_zero_point, 1)
    self.is_2d = False