Fine-Tuning LLMs · Lesson 3 of 16
Full Fine-Tuning vs Parameter-Efficient Fine-Tuning
Full Fine-Tuning vs PEFT
When you decide to fine-tune a model, your first technical decision is: update all parameters or just a small subset? This choice shapes your hardware requirements, training time, and the risk of destroying the model's general capabilities.
Full Fine-Tuning
Full fine-tuning updates every parameter in the model. For an 8 billion parameter model like Llama 3 8B, that means updating approximately 8 billion floating-point numbers on every gradient step.
What It Requires
# Full fine-tuning memory estimation
def estimate_full_finetune_memory(
    num_parameters_billions: float,
    precision: str = "bf16",
    optimizer: str = "adamw"
) -> dict:
    """
    Estimate GPU memory required for full fine-tuning.

    The per-parameter cost is assembled from four parts:
    - Model weights: 2 bytes (bf16 / fp16) or 4 bytes (fp32)
    - Gradients: same width as the weights
    - Activations: variable; estimated at the same width as the weights
    - Optimizer states: 8 bytes for AdamW (fp32 m and v), 0 for SGD (no m/v)

    That gives ~14 bytes per parameter for AdamW + bf16 and ~20 for
    AdamW + fp32.

    Args:
        num_parameters_billions: Model size in billions of parameters.
        precision: "bf16", "fp16" or "fp32" (case-insensitive).
        optimizer: "adamw" or "sgd" (case-insensitive).

    Returns:
        A dict with the keys "model", "precision", "optimizer",
        "estimated_gpu_memory_gb" (GiB, rounded to one decimal) and
        "required_hardware".

    Raises:
        ValueError: If num_parameters_billions is negative.
    """
    import warnings  # local import keeps this snippet self-contained

    if num_parameters_billions < 0:
        raise ValueError("num_parameters_billions must be non-negative")

    weight_bytes = {"bf16": 2, "fp16": 2, "fp32": 4}
    optimizer_state_bytes = {"adamw": 8, "sgd": 0}

    # Normalise spelling so "AdamW" / "BF16" hit the table instead of the fallback.
    precision_key = precision.strip().lower()
    optimizer_key = optimizer.strip().lower()

    if precision_key in weight_bytes and optimizer_key in optimizer_state_bytes:
        width = weight_bytes[precision_key]
        # weights + gradients + activations share one width; optimizer state is extra.
        # For bf16 + SGD this is 2 + 2 + 2 = 6, matching the itemised breakdown.
        bpp = 3 * width + optimizer_state_bytes[optimizer_key]
    else:
        # Keep the historical best-effort default, but make it visible to the caller.
        warnings.warn(
            f"Unknown precision/optimizer combination {precision!r}/{optimizer!r}; "
            "falling back to 14 bytes per parameter (bf16 + AdamW)",
            stacklevel=2,
        )
        bpp = 14

    params = num_parameters_billions * 1e9
    total_gb = params * bpp / (1024 ** 3)
    return {
        "model": f"{num_parameters_billions}B parameters",
        "precision": precision,
        "optimizer": optimizer,
        "estimated_gpu_memory_gb": round(total_gb, 1),
        "required_hardware": _suggest_hardware(total_gb)
    }
def _suggest_hardware(gb: float) -> str:
    """Map an estimated memory requirement in GB to a hardware suggestion.

    Args:
        gb: Estimated GPU memory needed, in GB.

    Returns:
        A human-readable description of the smallest listed setup whose
        total memory is at least ``gb``.
    """
    import math  # local import keeps this snippet self-contained

    if gb <= 24:
        return "RTX 4090 (24 GB)"
    elif gb <= 48:
        return "2x RTX 4090 or A6000 (48 GB)"
    elif gb <= 80:
        return "A100 80GB (single GPU)"
    elif gb <= 160:
        return "2x A100 80GB"
    else:
        # Round UP: flooring would suggest e.g. 2 GPUs (160 GB) for a 170 GB job.
        return f"{math.ceil(gb / 80)} x A100 80GB (multi-node)"
# Print the estimate and suggested hardware for a handful of typical model sizes
for result in map(estimate_full_finetune_memory, (1.0, 3.0, 7.0, 13.0, 70.0)):
    print(f" {result['model']}: {result['estimated_gpu_memory_gb']} GB → {result['required_hardware']}")
# Output:
# 1.0B parameters: 13.0 GB → RTX 4090 (24 GB)
# 3.0B parameters: 39.1 GB → 2x RTX 4090 or A6000 (48 GB)
# 7.0B parameters: 91.3 GB → 2x A100 80GB
# 13.0B parameters: 169.5 GB → 3 x A100 80GB (multi-node)
# 70.0B parameters: 912.7 GB → 12 x A100 80GB (multi-node)
Full Fine-Tuning Code
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import torch
model_name = "meta-llama/Llama-3.2-3B-Instruct"
# Load ALL parameters — nothing is frozen
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Count trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable: {trainable_params:,} / {total_params:,} ({trainable_params/total_params:.1%})")
# Trainable: 3,212,749,824 / 3,212,749,824 (100.0%)
training_args = TrainingArguments(
output_dir="./full-finetune-output",
num_train_epochs=3,
per_device_train_batch_size=2,
gradient_accumulation_steps=8, # effective batch = 16
learning_rate=2e-5,
warmup_ratio=0.03,
lr_scheduler_type="cosine",
bf16=True,
gradient_checkpointing=True, # trade compute for memory
save_strategy="epoch",
evaluation_strategy="epoch",
logging_steps=10,
report_to="none",
)The Catastrophic Forgetting Risk
Full fine-tuning touches every parameter. If your training data is narrow (e.g., only drug Q&A), the model will shift all its weights toward that distribution — and may forget general English reasoning, coding, math, and world knowledge.
This is the primary reason PEFT exists.
PEFT: Parameter-Efficient Fine-Tuning
PEFT is a family of methods that achieve strong task adaptation by updating only a tiny fraction of parameters while keeping the original model weights frozen or nearly frozen.
# The key insight: you don't need to update everything
# Most of the model's "knowledge" is already correct.
# You need to redirect its behaviour, not rebuild it.
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
import torch
# Load the base model in bfloat16; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.2-3B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto"
)
# LoRA settings: adapters are attached only to the modules named in target_modules.
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=16, # rank of the low-rank update
lora_alpha=32, # scaling
target_modules=["q_proj", "v_proj"],
lora_dropout=0.1,
bias="none",
)
# Wrap the base model with the LoRA configuration, then report how many
# parameters are trainable versus the total.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
# trainable params: 6,815,744 || all params: 3,219,565,568 || trainable%: 0.2117%
# Only 0.2% of parameters are updated — the rest stay frozen
PEFT Method 1: LoRA (Low-Rank Adaptation)
LoRA inserts small trainable rank-decomposition matrices alongside the original weight matrices. The base weights are frozen; only the small matrices are updated.
Mathematical form: Instead of updating W directly, you learn delta W = B × A where B is shape (d × r) and A is shape (r × k), with r much smaller than d or k.
import torch
import torch.nn as nn
import math
class LoRALinear(nn.Module):
    """Linear layer with a frozen weight plus a trainable low-rank update.

    The output is ``x @ W.T + (alpha / rank) * (x @ A.T) @ B.T`` where ``W``
    is frozen and only ``A`` (rank x in_features) and ``B``
    (out_features x rank) receive gradients.
    """

    def __init__(self, in_features: int, out_features: int, rank: int = 16, alpha: float = 32):
        super().__init__()
        self.in_features, self.out_features, self.rank = in_features, out_features, rank
        self.scaling = alpha / rank

        # Stand-in for the pre-trained weight; excluded from gradient updates.
        frozen_weight = torch.randn(out_features, in_features)
        self.weight = nn.Parameter(frozen_weight, requires_grad=False)

        # A: random values scaled by 1/sqrt(rank).
        a_init = torch.randn(rank, in_features) / math.sqrt(rank)
        self.lora_A = nn.Parameter(a_init)

        # B: all zeros, so the low-rank term contributes nothing at construction
        # and the layer initially behaves like the frozen weight alone.
        b_init = torch.zeros(out_features, rank)
        self.lora_B = nn.Parameter(b_init)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen path through the original weight.
        frozen_path = x @ self.weight.T
        # Low-rank path: project down with A, back up with B, then scale.
        projected_down = x @ self.lora_A.T
        low_rank_update = (projected_down @ self.lora_B.T) * self.scaling
        return frozen_path + low_rank_update
# Compare the size of a full weight matrix with its rank-16 LoRA factors
d_model = d_head = 4096
rank = 16
original_params = d_model * d_head  # full matrix: 16,777,216
lora_params = rank * (d_model + d_head)  # A and B together: 131,072
compression = original_params / lora_params
print(
    f"Original W: {original_params:,} params",
    f"LoRA (r={rank}): {lora_params:,} params",
    f"Compression ratio: {compression:.0f}x",
    sep="\n",
)
# Compression ratio: 128x
PEFT Method 2: QLoRA
QLoRA combines 4-bit quantization of the base model with LoRA adapters in bfloat16. The base model is loaded in 4-bit (saving memory), and only the LoRA matrices are trained in higher precision.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
# Step 1: Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # NormalFloat4 — optimal for normally distributed weights
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # quantize the quantization constants too
)

# Step 2: Load base model in 4-bit
# NOTE: trust_remote_code is deliberately left at its default. Llama checkpoints
# use an architecture built into transformers, so enabling it would only add the
# risk of executing arbitrary code shipped with a model repository.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",  # 70B model!
    quantization_config=bnb_config,
    device_map="auto",
)

# Step 3: Prepare for k-bit training
model = prepare_model_for_kbit_training(model)

# Step 4: Add LoRA adapters (these remain in bfloat16)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# 70B model on a single A100 80GB — not feasible with full fine-tuning or bf16 LoRA
PEFT Method 3: Adapter Layers
Adapters are small bottleneck modules inserted between transformer sub-layers. The original transformer layers stay frozen; only the adapter parameters are trained.
import torch
import torch.nn as nn
class HoulsbyAdapter(nn.Module):
    """
    Bottleneck adapter in the Houlsby style.

    Computes ``x + up_project(GELU(down_project(x)))``. Intended to be placed
    after the attention block and after the FFN block of a transformer layer.
    """

    def __init__(self, hidden_size: int, bottleneck_size: int = 64):
        super().__init__()
        self.down_project = nn.Linear(hidden_size, bottleneck_size)
        self.activation = nn.GELU()
        self.up_project = nn.Linear(bottleneck_size, hidden_size)

        # Tiny weights and zero biases keep the bottleneck branch close to zero,
        # so the module initially passes its input through almost unchanged.
        for projection in (self.down_project, self.up_project):
            nn.init.normal_(projection.weight, std=1e-3)
        for projection in (self.down_project, self.up_project):
            nn.init.zeros_(projection.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        squeezed = self.down_project(x)
        correction = self.up_project(self.activation(squeezed))
        # Residual connection around the bottleneck branch.
        return x + correction
# Parameter count for adapter vs LoRA
hidden_size = 4096
bottleneck = 64
# down-projection: hidden_size x bottleneck weights + bottleneck biases
# up-projection:   bottleneck x hidden_size weights + hidden_size biases
# (the up-projection bias has hidden_size entries, not bottleneck entries)
adapter_params = (hidden_size * bottleneck + bottleneck) + (bottleneck * hidden_size + hidden_size)
print(f"Adapter parameters per layer: {adapter_params:,}")
# Adapter parameters per layer: 528,448
# LoRA for same layers (q + v projections, rank 16)
lora_params_per_layer = 2 * (16 * 4096 + 4096 * 16)
print(f"LoRA parameters per layer: {lora_params_per_layer:,}")
# LoRA parameters per layer: 262,144
PEFT Method 4: Prefix Tuning
Prefix tuning prepends trainable "soft prompt" vectors to the keys and values in each attention layer. The model learns to attend to these virtual tokens.
import torch
import torch.nn as nn
class PrefixTuning(nn.Module):
    """
    Holds one trainable key prefix and one trainable value prefix per layer.

    This module owns only the prefix parameters; the base model it is used
    with is expected to stay frozen.
    """

    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        head_dim: int,
        prefix_length: int = 20,
    ):
        super().__init__()
        self.prefix_length = prefix_length
        self.num_layers = num_layers

        # One (prefix_length, num_heads * head_dim) block for keys and one for
        # values (the axis of size 2) in every layer, drawn small (x 0.01).
        hidden_dim = num_heads * head_dim
        initial_prefix = torch.randn(num_layers, 2, prefix_length, hidden_dim) * 0.01
        self.prefix_embeddings = nn.Parameter(initial_prefix)

    def get_prefix(self, layer_idx: int, batch_size: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the (key, value) prefixes of one layer, broadcast over the batch."""
        # Index 0 along the first axis is the key prefix, index 1 the value prefix.
        key_prefix, value_prefix = self.prefix_embeddings[layer_idx].unbind(dim=0)
        batched_keys = key_prefix.unsqueeze(0).expand(batch_size, -1, -1)
        batched_values = value_prefix.unsqueeze(0).expand(batch_size, -1, -1)
        return batched_keys, batched_values
# Total trainable values held by prefix tuning (much smaller than adapters)
num_layers = 32
num_heads = 32
head_dim = 128
prefix_len = 20
# One key prefix and one value prefix (the factor 2) per layer.
prefix_params = 2 * num_layers * prefix_len * num_heads * head_dim
print(f"Prefix tuning parameters: {prefix_params:,}")
# 32 × 2 × 20 × 4096 = 5,242,880
Comparison Table
| Method | Trainable % | Memory Overhead | Inference Latency | Forgetting Risk | Checkpoint Size |
|---|---|---|---|---|---|
| Full Fine-Tuning | 100% | Very high (14 bytes/param) | None | High | Full model copy |
| LoRA (r=16) | ~0.2% | Low (adapter only) | None (merged at inference) | Very low | Under 50 MB |
| QLoRA (r=16) | ~0.2% | Very low (4-bit base) | Dequantization overhead | Very low | Under 50 MB |
| Adapters | ~1-3% | Low-moderate | Small (extra layers) | Low | 50-200 MB |
| Prefix Tuning | ~0.1% | Very low | Small (longer KV) | Very low | Under 10 MB |
Choosing the Right Method
def recommend_peft_method(
    model_size_billions: float,
    available_gpu_gb: float,
    task_complexity: str,  # "simple" | "moderate" | "complex"
    multiple_tasks: bool
) -> dict:
    """Recommend the best PEFT approach for a given scenario.

    Decision order:
    1. Adapter Layers when several tasks are served and the bf16 base fits.
    2. LoRA when the bf16 base fits.
    3. QLoRA when only the 4-bit base fits (regardless of model size).
    4. "none" when not even the 4-bit base fits.

    Args:
        model_size_billions: Model size in billions of parameters.
        available_gpu_gb: GPU memory available, in GB.
        task_complexity: "simple" (rank 8), "moderate" (rank 16); any other
            value is treated as "complex" (rank 32).
        multiple_tasks: Whether several task-specific adapters will be
            swapped at inference time.

    Returns:
        A dict with "method" and "reason", plus "rank" (LoRA / QLoRA) or
        "bottleneck" (Adapter Layers).
    """
    # Memory rule of thumb: 4-bit quantized model ≈ 0.6 GB per billion params
    qlora_memory = model_size_billions * 0.6 + 2  # +2 GB for adapters and activations
    # LoRA with bf16 base ≈ 2 GB per billion params
    lora_memory = model_size_billions * 2.0 + 2

    bf16_base_fits = lora_memory <= available_gpu_gb

    # Adapters sit on top of the same bf16 base as LoRA, so they are only
    # viable when that base fits; they take priority for multi-task serving.
    if multiple_tasks and bf16_base_fits:
        return {
            "method": "Adapter Layers",
            "bottleneck": 64,
            "reason": "Adapters can be hot-swapped for different tasks at inference time",
        }

    if bf16_base_fits:
        rank = {"simple": 8, "moderate": 16}.get(task_complexity, 32)
        return {
            "method": "LoRA",
            "rank": rank,
            "reason": "Base model fits in memory; fastest training",
        }

    # Reached only when bf16 LoRA does not fit. No size threshold here: a small
    # model on a small GPU can need 4-bit quantization just as much as a 70B one.
    if qlora_memory <= available_gpu_gb:
        return {
            "method": "QLoRA",
            "rank": 16,
            "reason": "Model too large for bf16 LoRA; 4-bit quantization enables training",
        }

    return {"method": "none", "reason": "Model too large even for QLoRA on this hardware"}
# Examples: each tuple is (model_size_billions, available_gpu_gb, task_complexity, multiple_tasks)
for scenario in (
    (8, 24, "moderate", False),   # {"method": "LoRA", "rank": 16, "reason": "Base model fits..."}
    (70, 80, "complex", False),   # {"method": "QLoRA", "rank": 16, "reason": "Model too large for bf16..."}
    (7, 24, "simple", True),
):
    print(recommend_peft_method(*scenario))
# {"method": "Adapter Layers", ...}
Summary
PEFT is the default choice for almost all fine-tuning tasks today. The efficiency gains are dramatic:
- LoRA trains 0.2% of parameters and achieves performance within 1-3% of full fine-tuning on most benchmarks
- QLoRA lets you fine-tune a 70B model on a single A100 80GB GPU — hardware that could not even load the same model in bf16
- Checkpoint sizes drop from gigabytes to megabytes, enabling multiple task-specific adapters per base model
Full fine-tuning is reserved for scenarios where you need maximum performance, have abundant compute, and your domain is so different from the base model's training distribution that small adapter updates are insufficient.