LLM Quantization: Deep Dive
How quantization reduces LLM size and speeds inference. GPTQ, AWQ, GGUF, bitsandbytes NF4, and the math behind weight quantization without accuracy collapse.
Why Quantize?
A 70B parameter model at bfloat16 requires 140GB of GPU memory — more than any single consumer GPU. Quantization compresses weights to 4-bit integers, reducing this to ~35GB. With careful quantization, the quality loss is under 2-5% on most benchmarks.
The core tradeoff: precision vs memory and speed. Quantization replaces 16-bit floating point weights with 4-bit (or 8-bit) integers, introducing approximation error in every matrix multiplication.
The Math of Quantization
Symmetric (absmax) quantization:
q = round(w / s)
w_reconstructed = q × s
where s = max(|w|) / 127 (for int8)
s = max(|w|) / 7 (for int4)
import torch
import numpy as np
def symmetric_quantize(weights: torch.Tensor, bits: int = 8) -> tuple:
    """
    Symmetric (absmax) quantization: zero point is always 0.

    A single scale is computed for the whole tensor so that the largest
    absolute weight maps to the largest representable integer.

    Args:
        weights: Float tensor to quantize.
        bits: Signed integer width (2..32). The integer grid is
            [-(2**(bits-1) - 1), 2**(bits-1) - 1], e.g. [-127, 127] for 8
            bits and [-7, 7] for 4 bits.

    Returns:
        (quantized_weights, scale). The codes are stored as int8 when
        bits <= 8 (sub-byte codes are NOT packed), int16 when bits <= 16,
        and int32 otherwise. `scale` is a 0-dim tensor.

    Raises:
        ValueError: if `bits` is outside 2..32.
    """
    if not 2 <= bits <= 32:
        raise ValueError(f"bits must be between 2 and 32, got {bits}")

    max_int = 2 ** (bits - 1) - 1  # 127 for int8, 7 for int4

    # Compute scale from max absolute value
    scale = weights.abs().max() / max_int
    # An all-zero tensor yields scale == 0; dividing by it would produce NaN
    # codes. Any non-zero scale reconstructs zeros exactly, so fall back to 1.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)

    # Pick a storage dtype wide enough for the requested bit width so that
    # bits > 8 does not silently wrap around in int8.
    if bits <= 8:
        storage_dtype = torch.int8
    elif bits <= 16:
        storage_dtype = torch.int16
    else:
        storage_dtype = torch.int32

    # Quantize
    q = torch.clamp(torch.round(weights / scale), -max_int, max_int).to(storage_dtype)
    return q, scale
def symmetric_dequantize(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Map integer codes back to approximate float weights (code * scale)."""
    codes_as_float = q.to(torch.float32)
    return codes_as_float * scale
# Example: quantize and check reconstruction error
weights = torch.randn(1024, 1024) * 0.02  # Typical LLM weight magnitude

# int8 round trip: 255 levels spread over the full weight range.
q8, scale8 = symmetric_quantize(weights, bits=8)
w_reconstructed = symmetric_dequantize(q8, scale8)
mse = ((weights - w_reconstructed) ** 2).mean().item()
print(f"int8 MSE: {mse:.8f}")  # Very small

# int4 round trip on the SAME weights, with its own absmax scale. Do not
# pre-clamp the input to +/-7 * scale8: that clips almost every weight to a
# tiny range, so the MSE would measure clipping rather than 4-bit rounding.
q4, scale4 = symmetric_quantize(weights, bits=4)
w4_reconstructed = symmetric_dequantize(q4, scale4)
mse4 = ((weights - w4_reconstructed) ** 2).mean().item()
print(f"int4 MSE: {mse4:.8f}") # Larger — 4-bit is much coarser
The problem with simple quantization: outlier weights. LLM weight matrices contain a small number of values much larger than the rest. A single large outlier forces the scale factor high, so all the normal-sized values collapse onto just a few distinct levels: most weights are quantized coarsely just so that the outlier stays in range.
Grouped Quantization
Solution: use separate scale factors for groups of weights:
def grouped_quantize(
    weights: torch.Tensor,
    bits: int = 4,
    group_size: int = 128,
) -> tuple:
    """
    Quantize in groups of `group_size` elements, one scale per group.

    The tensor is flattened in row-major order and cut into consecutive runs
    of `group_size` elements. Each run gets its own absmax scale, so a single
    outlier only coarsens the group it lives in.

    Args:
        weights: Float tensor to quantize. `weights.numel()` must be
            divisible by `group_size`; the reshape below fails otherwise.
        bits: Signed integer width; codes lie in
            [-(2**(bits-1) - 1), 2**(bits-1) - 1].
        group_size: Number of consecutive elements sharing one scale.

    Returns:
        (quantized, scales, group_size): `quantized` is an int8 tensor with
        the same shape as `weights`, `scales` has shape (n_groups, 1), and
        `group_size` is echoed back so it can be passed to the dequantizer.
    """
    orig_shape = weights.shape

    # Reshape to (n_groups, group_size)
    weights_flat = weights.reshape(-1, group_size)

    max_int = 2 ** (bits - 1) - 1
    scales = weights_flat.abs().max(dim=1, keepdim=True).values / max_int
    # A group that is entirely zero has scale 0; dividing by it would produce
    # NaN codes. Any non-zero scale reconstructs zeros exactly, so use 1.
    scales = torch.where(scales == 0, torch.ones_like(scales), scales)

    quantized = torch.clamp(
        torch.round(weights_flat / scales), -max_int, max_int
    ).to(torch.int8)
    return quantized.reshape(orig_shape), scales, group_size
def grouped_dequantize(
    quantized: torch.Tensor,
    scales: torch.Tensor,
    group_size: int,
) -> torch.Tensor:
    """Rebuild float weights from group-wise codes and their per-group scales."""
    target_shape = quantized.shape
    # One row per group so the (n_groups, 1) scales broadcast across each row.
    per_group_codes = quantized.reshape(-1, group_size).float()
    return (per_group_codes * scales).reshape(target_shape)
# Grouped quantization (group_size=128) dramatically reduces MSE vs per-tensor
# Demo: round-trip a random 4096x4096 matrix through 4-bit grouped
# quantization and print the mean squared reconstruction error.
# 4096 * 4096 elements divide evenly into groups of 128.
weights = torch.randn(4096, 4096) * 0.02
q, scales, gs = grouped_quantize(weights, bits=4, group_size=128)
# `gs` is the group size echoed back by grouped_quantize.
w_rec = grouped_dequantize(q, scales, gs)
print(f"Grouped int4 MSE: {((weights - w_rec)**2).mean().item():.8f}")
# Much lower than per-tensor int4
GPTQ: Post-Training Quantization with Calibration Data
GPTQ (Frantar et al., 2022) uses a small calibration dataset to reduce quantization error layer by layer:
# Using AutoGPTQ for quantization
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# Prepare calibration data (small sample of the training distribution)
def get_calibration_data(n_samples: int = 128, seq_len: int = 2048) -> list:
    """
    Build a small calibration set from the WikiText-2 training split.

    Args:
        n_samples: Maximum number of calibration examples to return.
        seq_len: Token limit per example; longer texts are truncated.

    Returns:
        A list of dicts, each with "input_ids" and "attention_mask" tensors
        as produced by the module-level `tokenizer`. AutoGPTQ's `quantize`
        reads both keys from every example, so a bare list of `input_ids`
        tensors is not accepted.
    """
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

    calibration_data = []
    for example in dataset:
        # Stop scanning as soon as enough examples have been collected
        # instead of filtering the whole split up front.
        if len(calibration_data) >= n_samples:
            break
        text = example["text"]
        if len(text) <= 100:
            # Skip blank lines, headings and other very short rows.
            continue
        tokens = tokenizer(
            text,
            return_tensors="pt",
            max_length=seq_len,
            truncation=True,
        )
        calibration_data.append(
            {
                "input_ids": tokens["input_ids"],
                "attention_mask": tokens["attention_mask"],
            }
        )
    return calibration_data
# GPTQ driver: configure 4-bit group-wise quantization, load the model,
# quantize it against the calibration set, and write the result to disk.
quantize_config = BaseQuantizeConfig(
bits=4, # 4-bit quantization
group_size=128, # Group size for per-group scales
damp_percent=0.01, # Dampening for numerical stability
desc_act=False, # Whether to use activation ordering
sym=True, # Symmetric quantization
)
# Load the model with the quantization config attached.
model = AutoGPTQForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B",
quantize_config=quantize_config,
)
# Uses the defaults of get_calibration_data (128 samples, up to 2048 tokens each).
calibration_data = get_calibration_data()
model.quantize(calibration_data)
# Save quantized model
model.save_quantized("./llama3-8b-gptq-4bit", use_safetensors=True)
print("Quantization complete")
AWQ: Activation-Aware Weight Quantization
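AWQ (Lin et al., 2023) identifies which weight channels matter most by looking at activation magnitudes, then protects those salient weights by scaling them up before quantization (the inverse scale is folded into the activations). The salient weights lose less relative precision, while every weight is still stored in the same low-bit format: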
# Using AutoAWQ
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# AWQ driver: quantize an instruct model to 4 bits and save it to quant_path.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
quant_path = "./llama3-8b-awq-4bit"
quant_config = {
"zero_point": True, # Use asymmetric quantization
"q_group_size": 128,
"w_bit": 4,
"version": "GEMM", # Kernel: GEMM (fast) or GEMV (for small batches)
}
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# AWQ needs calibration data to find salient weights
# NOTE(review): no calibration dataset is passed here, so quantize()
# presumably falls back to the library's default — confirm in the AutoAWQ docs.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
# Load for inference
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
AWQ vs GPTQ:
- AWQ is generally faster at inference (optimized GEMM kernels)
- GPTQ typically has slightly better accuracy at the same bit-width
- AWQ quantization is faster to run (hours vs days for large models)
- Both achieve similar perplexity degradation: usually under 0.3-0.5 perplexity increase for 4-bit
NF4: NormalFloat4 for QLoRA
NF4 (NormalFloat 4-bit) is designed specifically for neural network weights that follow a normal distribution:
# NF4 quantization levels are not uniformly spaced
# They are chosen to be optimal for N(0, 1) distributions
# The table holds 16 values in ascending order from -1.0 to 1.0, so a 4-bit
# index (0-15) selects one level. Index 7 is exactly 0.0; there are seven
# negative and eight positive levels, so the grid is not mirror-symmetric.
NF4_LEVELS = [
-1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
-0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224,
0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0,
]
def quantize_nf4(weights: torch.Tensor) -> tuple:
    """
    NF4 quantization: maps weights to nearest of 16 NF4 levels.
    Optimal for normally-distributed weights.

    One absmax scale is used for the whole tensor.

    Args:
        weights: Float tensor to quantize.

    Returns:
        (quantized_indices, scale, nf4_levels): `quantized_indices` is a
        uint8 tensor shaped like `weights` holding level indices 0-15 (one
        index per byte, not packed), `scale` is the 0-dim absmax scale, and
        `nf4_levels` is the level table as a float32 tensor on the same
        device as `weights`.
    """
    # Build the level table on the weights' device so the subtraction below
    # also works for GPU tensors.
    nf4_levels = torch.tensor(NF4_LEVELS, dtype=torch.float32, device=weights.device)

    # Normalize weights to [-1, 1]
    scale = weights.abs().max()
    # An all-zero tensor has absmax 0; dividing by it would yield NaN.
    # Any non-zero scale maps zeros onto the 0.0 level, so fall back to 1.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    normalized = weights / scale

    # Find nearest NF4 level for each weight.
    # `distances` has one extra trailing axis of size 16 (one entry per
    # level), so it temporarily uses 16x the element count of `weights`.
    distances = (normalized.unsqueeze(-1) - nf4_levels).abs()
    quantized_indices = distances.argmin(dim=-1).to(torch.uint8)  # 0-15 (4-bit)
    return quantized_indices, scale, nf4_levels
def dequantize_nf4(indices: torch.Tensor, scale: float, nf4_levels: torch.Tensor) -> torch.Tensor:
    """Look up each stored index in the level table and rescale the result."""
    # Indexing needs integer (long) indices; the stored codes are narrower.
    level_values = nf4_levels[indices.to(torch.int64)]
    return level_values * scale
# In practice, bitsandbytes handles this with CUDA kernels
# model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, bnb_4bit_quant_type="nf4")
GGUF: CPU-Friendly Quantization
GGUF (GPT-Generated Unified Format) enables running quantized models on CPU (with partial GPU offloading via llama.cpp):
# Install llama.cpp
pip install llama-cpp-python
# Download a GGUF model from HuggingFace
# e.g., bartowski/Meta-Llama-3-8B-Instruct-GGUF
from llama_cpp import Llama
# Load Q4_K_M (a good balance of quality and size)
# The .gguf file must already exist at this local path.
llm = Llama(
model_path="./Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
n_ctx=4096, # Context window
n_gpu_layers=20, # Offload some layers to GPU
n_threads=8, # CPU threads for non-GPU layers
verbose=False,
)
# Send a single-turn chat request with one user message.
# max_tokens=256 caps the reply length; temperature=0 presumably selects
# deterministic (greedy) decoding — confirm in the llama-cpp-python docs.
response = llm.create_chat_completion(
messages=[
{"role": "user", "content": "What is warfarin used for?"}
],
max_tokens=256,
temperature=0,
)
print(response["choices"][0]["message"]["content"])
GGUF quantization variants (for Q4):
| Format | Description | Perplexity vs F16 | Size (7B) |
|---|---|---|---|
| Q4_0 | Basic 4-bit | +0.2 | 3.8 GB |
| Q4_K_S | 4-bit, small K-quant | +0.1 | 4.4 GB |
| Q4_K_M | 4-bit, medium K-quant | +0.05 | 4.8 GB |
| Q5_K_M | 5-bit, medium K-quant | +0.02 | 5.7 GB |
| Q8_0 | 8-bit | ~=F16 | 7.7 GB |
K_M (K-quant Medium) is the recommended default — it uses higher precision for important layers and lower precision for less sensitive layers.
Evaluating Quantization Quality
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
def measure_perplexity(
    model,
    tokenizer,
    text: str,
    device: str = "cuda",
    stride: int = 512,
) -> float:
    """
    Compute sliding-window perplexity of a causal language model on `text`.

    The text is tokenized once and scored in overlapping windows of at most
    `model.config.max_position_embeddings` tokens. Each window advances by
    `stride` tokens; tokens already scored by an earlier window are masked
    with -100 so they only serve as context.

    Assumes a Hugging Face style causal LM: `model(input_ids, labels=...)`
    shifts the labels by one position internally and returns `.loss` as the
    mean negative log-likelihood over the non-ignored label positions.

    Args:
        model: Causal LM exposing `config.max_position_embeddings` and the
            call convention described above.
        tokenizer: Callable returning an object with `.input_ids` and `.to()`.
        text: Evaluation text.
        device: Device the token ids are moved to.
        stride: Number of new tokens per window. Capped at the window size,
            because a larger stride would leave tokens unscored.

    Returns:
        exp(mean negative log-likelihood per scored token).

    Raises:
        ValueError: if `stride` is not positive, or if `text` tokenizes to
            fewer than two tokens (no token has context to be scored).
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    encodings = tokenizer(text, return_tensors="pt").to(device)
    all_ids = encodings.input_ids
    seq_len = all_ids.size(1)
    max_length = model.config.max_position_embeddings
    stride = min(stride, max_length)

    nll_sum = 0.0
    n_scored = 0
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Only the tokens past the previous window's end are new targets.
        trg_len = end_loc - prev_end_loc
        input_ids = all_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        # Because of the internal label shift, position 0 of a window is
        # never predicted. Count the positions that really enter the loss so
        # the mean loss is weighted by the right number of tokens.
        n_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
        if n_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            nll_sum += outputs.loss.item() * n_loss_tokens
            n_scored += n_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        raise ValueError("text must tokenize to at least two tokens to compute perplexity")

    return torch.exp(torch.tensor(nll_sum / n_scored)).item()
def benchmark_quantization(models: dict, test_text: str, tokenizer) -> dict:
    """
    Compare perplexity and weight memory across quantization levels.

    Args:
        models: Mapping of display name -> model to evaluate.
        test_text: Text passed to `measure_perplexity` for every model.
        tokenizer: Tokenizer shared by all models.

    Returns:
        Mapping of name -> {"perplexity": float, "memory_gb": float}.
        One summary line per model is also printed.
    """
    results = {}
    for name, model in models.items():
        ppl = measure_perplexity(model, tokenizer, test_text)
        # Count buffers as well as parameters: quantized layers commonly keep
        # their packed weights, scales and zero points as registered buffers,
        # which `parameters()` alone would leave out of the total.
        tensors = list(model.parameters()) + list(model.buffers())
        memory_gb = sum(t.numel() * t.element_size() for t in tensors) / 1e9
        results[name] = {
            "perplexity": ppl,
            "memory_gb": memory_gb,
        }
        print(f"{name}: perplexity={ppl:.2f}, memory={memory_gb:.1f}GB")
    return results
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.