infer_and_set_per_module_quantization_format(
model,
quantization_format: Optional[str] = None,
save_compressed: bool = False,
sparsity_structure: Optional[str] = None,
) -> Optional[List[str]]
Infers the quantization format for a model based on its state and provided compression arguments. Also updates the quantization_scheme.format value based on the inferred format. Returns the unique list of formats in the model, or None if the list is empty.
For a summary of the formats, see docs/guides/compression_formats.md.
Parameters:
- model – model to check for quantization; if the model is not quantized, no quantization format is returned
- quantization_format (Optional[str], default: None) – user provided quantization format, supersedes any inferred quantization format
- save_compressed (bool, default: False) – used to infer a quantization format if None is provided
Source code in llmcompressor/transformers/compression/quantization_format.py
def infer_and_set_per_module_quantization_format(
    model,
    quantization_format: Optional[str] = None,
    save_compressed: bool = False,
    sparsity_structure: Optional[str] = None,
) -> Optional[List[str]]:
    """
    Infers the quantization format for a model based on its state and provided
    compression arguments. Also updates the quantization_scheme.format value
    based on the inferred format. Returns the unique list of formats in the
    model or None if the list is empty.

    For a summary of the formats, see `docs/guides/compression_formats.md`.

    :param model: model to check for quantization, if the model is not quantized no
        quantization format is returned
    :param quantization_format: user provided quantization format, supersedes any
        inferred quantization format
    :param save_compressed: used to infer a quantization format if None is provided
    :param sparsity_structure: optional sparsity structure passed through to
        per-module format inference
    :return: unique list of compression formats appropriate for the model, or
        None if no format applies
    """
    if not save_compressed:
        # Not saving compressed weights, so no compression format is needed
        return None
    if quantization_format:
        # A user-provided format overrides any per-module inference
        return [quantization_format]

    unique_formats = []
    for submodule in model.modules():
        if is_module_quantized(submodule):
            weight_scheme = submodule.quantization_scheme.weights
            input_scheme = submodule.quantization_scheme.input_activations
            if weight_scheme is None:
                continue  # no weight quant - nothing to compress
            compression_format = _get_quant_compression_format(
                input_scheme, weight_scheme, sparsity_structure
            )

            # If a format is already set on the scheme, warn when it disagrees
            # with the inferred format; otherwise record the inferred format
            if submodule.quantization_scheme.format is not None:
                if submodule.quantization_scheme.format != compression_format.value:
                    logger.warning(
                        "The provided format for the module does not match the "
                        "inferred format. Compression may fail "
                    )
            else:
                submodule.quantization_scheme.format = compression_format.value
            if submodule.quantization_scheme.format not in unique_formats:
                unique_formats.append(submodule.quantization_scheme.format)
    if len(unique_formats) > 0:
        return unique_formats
    return None
|