LLM Quantization: Deep Dive

Why Quantize?

A 70B parameter model at bfloat16 requires 140GB of GPU memory for the weights alone (2 bytes per parameter) — more than any single consumer GPU. Quantization compresses weights to 4-bit integers, reducing this to ~35GB. With careful quantization, the quality loss is typically no more than 2–5% on most benchmarks.

The core tradeoff: precision vs memory and speed. Quantization replaces 16-bit floating point weights with 4-bit (or 8-bit) integers, introducing approximation error in every matrix multiplication.

The Math of Quantization

Symmetric (absmax) quantization:

q = round(w / s)
w_reconstructed = q × s

where s = max(|w|) / 127   (for int8)
      s = max(|w|) / 7     (for int4)

Python

import torch
import numpy as np

def symmetric_quantize(weights: torch.Tensor, bits: int = 8) -> tuple:
    """
    Symmetric (absmax) quantization: zero point is always 0.

    Args:
        weights: Float tensor to quantize (one scale for the whole tensor).
        bits: Target bit-width, 2..8. Codes are stored one per int8 element
            regardless of `bits` (no bit-packing is performed).

    Returns:
        (quantized_weights, scale): an int8 tensor of codes in
        [-max_int, max_int] and the 0-dim scale tensor, such that
        `quantized_weights * scale` approximates `weights`.

    Raises:
        ValueError: if `bits` is outside 2..8 (bits=1 leaves no positive
            level; bits>8 would overflow the int8 storage).
    """
    if not 2 <= bits <= 8:
        raise ValueError(f"bits must be between 2 and 8 for int8 storage, got {bits}")

    max_int = 2 ** (bits - 1) - 1  # 127 for int8, 7 for int4

    # Compute scale from max absolute value
    scale = weights.abs().max() / max_int
    # An all-zero tensor gives scale == 0 and 0/0 = NaN below; any non-zero
    # scale represents such a tensor exactly, so fall back to 1.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)

    # Quantize
    q = torch.clamp(torch.round(weights / scale), -max_int, max_int).to(torch.int8)

    return q, scale


def symmetric_dequantize(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Map integer codes back to floats by multiplying with the scale."""
    as_float = q.to(torch.float32)
    return scale * as_float


# Example: quantize and check reconstruction error
weights = torch.randn(1024, 1024) * 0.02  # Typical LLM weight magnitude

q8, scale8 = symmetric_quantize(weights, bits=8)
w_reconstructed = symmetric_dequantize(q8, scale8)

mse = ((weights - w_reconstructed) ** 2).mean().item()
print(f"int8 MSE: {mse:.8f}")  # Very small

# Quantize the *original* weights to 4 bits. Clamping them to +/-7 int8 steps
# first would clip most values, and the reported error would then measure
# clipping rather than 4-bit rounding.
q4, scale4 = symmetric_quantize(weights, bits=4)
w4_reconstructed = symmetric_dequantize(q4, scale4)
mse4 = ((weights - w4_reconstructed) ** 2).mean().item()
print(f"int4 MSE: {mse4:.8f}")  # Larger — 4-bit is much coarser

The problem with simple quantization: outlier weights. LLM weight matrices contain a small number of values much larger than the rest. A single large outlier forces the scale factor high, so all the normal values quantize to just a few distinct levels: the range is stretched to cover the outlier, and most weights pay for it with coarser resolution.

Grouped Quantization

Solution: use separate scale factors for groups of weights:

Python

def grouped_quantize(
    weights: torch.Tensor,
    bits: int = 4,
    group_size: int = 128,
) -> tuple:
    """
    Quantize in groups of `group_size` consecutive elements.

    Each group gets its own absmax scale, so an outlier only coarsens the
    resolution of its own group instead of the whole tensor.

    Args:
        weights: Float tensor; its element count must be divisible by
            `group_size`.
        bits: Target bit-width (codes are stored one per int8 element).
        group_size: Number of consecutive elements sharing one scale.

    Returns:
        (quantized, scales, group_size): int8 codes in the original shape,
        per-group scales of shape (n_groups, 1), and the group size needed
        to dequantize.

    Raises:
        RuntimeError: from `reshape` if the element count is not a multiple
            of `group_size`.
    """
    orig_shape = weights.shape
    # Reshape to (n_groups, group_size)
    weights_flat = weights.reshape(-1, group_size)

    max_int = 2 ** (bits - 1) - 1
    scales = weights_flat.abs().max(dim=1, keepdim=True).values / max_int
    # An all-zero group has scale 0 and would produce 0/0 = NaN; any non-zero
    # scale reconstructs such a group exactly, so use 1 for those groups.
    scales = torch.where(scales == 0, torch.ones_like(scales), scales)

    quantized = torch.clamp(
        torch.round(weights_flat / scales), -max_int, max_int
    ).to(torch.int8)

    return quantized.reshape(orig_shape), scales, group_size


def grouped_dequantize(
    quantized: torch.Tensor,
    scales: torch.Tensor,
    group_size: int,
) -> torch.Tensor:
    """Undo grouped quantization: scale each group and restore the shape."""
    target_shape = quantized.shape
    # One row per group so the (n_groups, 1) scales broadcast across it
    per_group = quantized.reshape(-1, group_size).to(torch.float32)
    return (per_group * scales).reshape(target_shape)


# Per-group scales (group_size=128) give a much lower MSE than one
# per-tensor scale at the same 4-bit width
weights = torch.randn(4096, 4096) * 0.02
q, scales, gs = grouped_quantize(weights, bits=4, group_size=128)
w_rec = grouped_dequantize(q, scales, gs)
squared_error = (weights - w_rec) ** 2
print(f"Grouped int4 MSE: {squared_error.mean().item():.8f}")

GPTQ: Post-Training Quantization with Calibration Data

GPTQ (Frantar et al., 2022) uses a small calibration dataset to reduce quantization error layer by layer:

Python

# Using AutoGPTQ for quantization
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset

# Tokenizer of the model being quantized; get_calibration_data() below uses it
# to tokenize the calibration texts.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Prepare calibration data (small sample of the training distribution)
def get_calibration_data(n_samples: int = 128, seq_len: int = 2048) -> list:
    """Build GPTQ calibration examples from the WikiText-2 training split.

    Args:
        n_samples: Maximum number of texts to tokenize.
        seq_len: Texts are truncated to at most this many tokens.

    Returns:
        A list of dicts with "input_ids" and "attention_mask" tensors, one
        per text. AutoGPTQ's `quantize()` looks both keys up on every
        example, so bare `input_ids` tensors are not accepted.
    """
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    # Skip headings and blank lines: keep only texts longer than 100 characters
    texts = [example["text"] for example in dataset if len(example["text"]) > 100]

    calibration_data = []
    for text in texts[:n_samples]:
        tokens = tokenizer(
            text,
            return_tensors="pt",
            max_length=seq_len,
            truncation=True,
        )
        calibration_data.append(
            {
                "input_ids": tokens["input_ids"],
                "attention_mask": tokens["attention_mask"],
            }
        )

    return calibration_data

# 4-bit symmetric GPTQ with one scale per group of 128 weights
quantize_config = BaseQuantizeConfig(
    sym=True,              # Symmetric quantization
    desc_act=False,        # Whether to use activation ordering
    damp_percent=0.01,     # Dampening for numerical stability
    group_size=128,        # Group size for per-group scales
    bits=4,                # 4-bit quantization
)

# Load the full-precision model with the quantization settings attached
model = AutoGPTQForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", quantize_config=quantize_config
)

# Run the quantization pass over the calibration examples
calibration_data = get_calibration_data()
model.quantize(calibration_data)

# Write the quantized weights to disk as safetensors
model.save_quantized("./llama3-8b-gptq-4bit", use_safetensors=True)
print("Quantization complete")

AWQ: Activation-Aware Weight Quantization

AWQ (Lin et al., 2023) identifies which weights are most important by looking at activation magnitudes, then protects those weights with higher precision or smaller quantization steps:

Python

# Using AutoAWQ
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
quant_path = "./llama3-8b-awq-4bit"

quant_config = {
    "zero_point": True,      # Use asymmetric quantization
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",       # Kernel: GEMM (fast) or GEMV (for small batches)
}

model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# AWQ needs calibration data to find salient weights. None is passed here, so
# quantize() presumably falls back to the library's default calibration set —
# confirm against the installed AutoAWQ version.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
# Save the tokenizer next to the weights so quant_path is a self-contained
# checkpoint that can be loaded without the original model repo.
tokenizer.save_pretrained(quant_path)

# Load for inference
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)

AWQ vs GPTQ:

AWQ is generally faster at inference (optimized GEMM kernels)
GPTQ typically has slightly better accuracy at the same bit-width
AWQ quantization is faster to run (hours vs days for large models)
Both achieve similar perplexity degradation: typically an increase of no more than 0.3–0.5 perplexity for 4-bit

NF4: NormalFloat4 for QLoRA

NF4 (NormalFloat 4-bit) is designed specifically for neural network weights that follow a normal distribution:

Python

# NF4 quantization levels are not uniformly spaced
# They are chosen to be optimal for N(0, 1) distributions
NF4_LEVELS = [
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224,
    0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0,
]

def quantize_nf4(weights: torch.Tensor) -> tuple:
    """
    NF4 quantization: maps weights to nearest of 16 NF4 levels.
    Optimal for normally-distributed weights.

    A single absmax scale is used for the whole tensor.

    Args:
        weights: Float tensor to quantize.

    Returns:
        (quantized_indices, scale, nf4_levels): uint8 indices (0-15) with the
        same shape as `weights`, the 0-dim scale tensor, and the level table
        as a float32 tensor on the same device as `weights`.
    """
    # Build the table on the weights' device; a CPU table cannot be
    # subtracted from CUDA weights.
    nf4_levels = torch.tensor(NF4_LEVELS, dtype=torch.float32, device=weights.device)

    # Normalize weights to [-1, 1]
    scale = weights.abs().max()
    # An all-zero tensor would give 0/0 = NaN; with scale 1 every weight maps
    # to the 0.0 level and reconstructs exactly.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    normalized = weights / scale

    # Find nearest NF4 level for each weight
    # Distance from each weight to each level (adds a trailing axis of size 16)
    distances = (normalized.unsqueeze(-1) - nf4_levels).abs()
    quantized_indices = distances.argmin(dim=-1).to(torch.uint8)  # 0-15 (4-bit)

    return quantized_indices, scale, nf4_levels


def dequantize_nf4(indices: torch.Tensor, scale: float, nf4_levels: torch.Tensor) -> torch.Tensor:
    """Look up each index in the NF4 level table and rescale the result."""
    lookup = indices.to(torch.int64)  # integer-array indexing needs int64
    normalized = nf4_levels[lookup]
    return normalized * scale

# In practice, bitsandbytes handles this with CUDA kernels
# model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, bnb_4bit_quant_type="nf4")

GGUF: CPU-Friendly Quantization

GGUF (GPT-Generated Unified Format) enables running quantized models on CPU (with partial GPU offloading via llama.cpp):

Bash

# Install llama-cpp-python, the Python bindings for llama.cpp
pip install llama-cpp-python

# Download a GGUF model from HuggingFace
# e.g., bartowski/Meta-Llama-3-8B-Instruct-GGUF

Python

from llama_cpp import Llama

# Q4_K_M is a good balance of quality and size
llm = Llama(
    model_path="./Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
    verbose=False,
    n_threads=8,             # CPU threads for non-GPU layers
    n_gpu_layers=20,         # Offload some layers to GPU
    n_ctx=4096,              # Context window
)

# Single-turn chat request, decoded greedily (temperature 0)
chat_messages = [{"role": "user", "content": "What is warfarin used for?"}]
response = llm.create_chat_completion(
    messages=chat_messages,
    temperature=0,
    max_tokens=256,
)

first_choice = response["choices"][0]
print(first_choice["message"]["content"])

GGUF quantization variants (for Q4):

| Format | Description | Perplexity vs F16 | Size (7B) |
|---|---|---|---|
| Q4_0 | Basic 4-bit | +0.2 | 3.8 GB |
| Q4_K_S | 4-bit, small K-quant | +0.1 | 4.4 GB |
| Q4_K_M | 4-bit, medium K-quant | +0.05 | 4.8 GB |
| Q5_K_M | 5-bit, medium K-quant | +0.02 | 5.7 GB |
| Q8_0 | 8-bit | ~=F16 | 7.7 GB |

K_M (K-quant Medium) is the recommended default — it uses higher precision for important layers and lower precision for less sensitive layers.

Evaluating Quantization Quality

Python

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

def measure_perplexity(model, tokenizer, text: str, device: str = "cuda") -> float:
    """Compute sliding-window perplexity of a causal LM on `text`.

    The token sequence is scored in overlapping windows of at most
    `model.config.max_position_embeddings` tokens, advancing 512 tokens at a
    time. In each window only the tokens not scored by an earlier window
    contribute to the loss; the preceding tokens serve as context.

    Args:
        model: Callable as `model(input_ids, labels=...)` returning an object
            whose `.loss` is the mean negative log-likelihood over the
            non-ignored labels (assumed to apply the usual causal label shift).
        tokenizer: Callable as `tokenizer(text, return_tensors="pt")`,
            returning an object with `.input_ids` and `.to(device)`.
        text: Evaluation text.
        device: Device the token ids are moved to.

    Returns:
        exp(total negative log-likelihood / number of scored tokens).

    Raises:
        ValueError: if `text` tokenizes to fewer than 2 tokens, since no
            next-token prediction can be scored.
    """
    encodings = tokenizer(text, return_tensors="pt").to(device)
    all_ids = encodings.input_ids
    seq_len = all_ids.size(1)
    if seq_len < 2:
        raise ValueError(
            f"need at least 2 tokens to compute perplexity, got {seq_len}"
        )

    max_length = model.config.max_position_embeddings
    # Never advance by more than one window, or the tokens between two
    # consecutive windows would never be scored.
    stride = min(512, max_length)

    nlls = []
    n_scored_total = 0
    prev_end_loc = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not yet scored

        input_ids = all_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # -100 marks context-only positions that the loss should ignore
        target_ids[:, :-trg_len] = -100

        # The first position of a window has no preceding token to predict
        # it from, so the loss averages over labels from index 1 onward.
        n_scored = int((target_ids[:, 1:] != -100).sum())
        if n_scored > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # Turn the window's mean loss back into a summed NLL
            nlls.append(outputs.loss * n_scored)
            n_scored_total += n_scored

        prev_end_loc = end_loc

        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).sum() / n_scored_total)
    return ppl.item()


def benchmark_quantization(
    models: dict, test_text: str, tokenizer, device: str = "cuda"
) -> dict:
    """Compare perplexity and weight memory across quantization levels.

    Args:
        models: Mapping of display name -> model to evaluate.
        test_text: Text on which perplexity is measured.
        tokenizer: Tokenizer shared by all models.
        device: Device passed through to `measure_perplexity`.

    Returns:
        Mapping of name -> {"perplexity": float, "memory_gb": float}. A
        summary line is also printed for every model.
    """
    results = {}
    for name, model in models.items():
        ppl = measure_perplexity(model, tokenizer, test_text, device=device)

        # Count buffers as well as parameters: quantized layers may register
        # their packed weights and scales as buffers, which parameters()
        # alone does not report.
        n_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
        n_bytes += sum(b.numel() * b.element_size() for b in model.buffers())
        memory_gb = n_bytes / 1e9

        results[name] = {
            "perplexity": ppl,
            "memory_gb": memory_gb,
        }
        print(f"{name}: perplexity={ppl:.2f}, memory={memory_gb:.1f}GB")

    return results

LLM Quantization: Deep Dive

Why Quantize?

The Math of Quantization

Grouped Quantization

GPTQ: Post-Training Quantization with Calibration Data

AWQ: Activation-Aware Weight Quantization

NF4: NormalFloat4 for QLoRA

GGUF: CPU-Friendly Quantization

Evaluating Quantization Quality

Enjoyed this article?

Leave a comment