Open Source LLMs: LLaMA, Mistral, and the Ecosystem
The open source LLM landscape: LLaMA-3, Mistral, Phi, Falcon, and Gemma. How to choose, download, run, and fine-tune open source models for production use.
The Open Source LLM Landscape
Since Meta released LLaMA in 2023, the open source LLM ecosystem has grown rapidly. Open source models offer:
- Full control: Run on your own infrastructure, no API dependency
- Privacy: Data stays on your servers (critical for healthcare, legal, finance)
- Cost: No per-token API costs at scale
- Customization: Fine-tune on domain-specific data without API restrictions
The tradeoff: inference infrastructure, model management, and update cycles require engineering investment.
Key Model Families
# Catalogue of the major open-source model families, keyed by family name.
# Every entry carries the same six fields:
#   publisher - organisation that releases the family
#   sizes     - released parameter counts
#   license   - license name as a human-readable string
#   strengths - short list of headline strengths
#   context   - context-window description
#   hf_id     - Hugging Face Hub repository id of an instruction-tuned checkpoint
OPEN_SOURCE_MODELS = {
    "LLaMA-3": {
        "publisher": "Meta",
        "sizes": ["8B", "70B", "405B"],
        "license": "LLaMA-3 Community License (restricted commercial use)",
        "strengths": ["General purpose", "Strong reasoning", "Good instruction following"],
        "context": "8192 tokens (base), 128K (LLaMA-3.1)",
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
    },
    "Mistral": {
        "publisher": "Mistral AI",
        "sizes": ["7B", "8x7B (MoE)", "8x22B (MoE)"],
        "license": "Apache 2.0 (7B), various (larger)",
        "strengths": ["Efficient", "Sliding window attention", "Strong coding"],
        "context": "32K tokens",
        "hf_id": "mistralai/Mistral-7B-Instruct-v0.3",
    },
    "Phi-3": {
        "publisher": "Microsoft",
        "sizes": ["3.8B", "7B", "14B"],
        "license": "MIT",
        "strengths": ["Small size, strong performance", "Textbook-quality training data"],
        "context": "128K tokens",
        "hf_id": "microsoft/Phi-3-medium-128k-instruct",
    },
    "Gemma-2": {
        "publisher": "Google",
        "sizes": ["2B", "9B", "27B"],
        "license": "Gemma Terms of Use",
        "strengths": ["Small and efficient", "Strong at instruction following"],
        "context": "8192 tokens",
        "hf_id": "google/gemma-2-9b-it",
    },
    "Falcon": {
        "publisher": "Technology Innovation Institute",
        "sizes": ["7B", "40B", "180B"],
        "license": "Apache 2.0",
        "strengths": ["True open source (Apache 2.0)", "Multilingual"],
        "context": "2048 tokens",
        "hf_id": "tiiuae/falcon-7b-instruct",
    },
    "Qwen-2": {
        "publisher": "Alibaba",
        "sizes": ["0.5B", "1.5B", "7B", "72B"],
        "license": "Apache 2.0 (small models)",
        "strengths": ["Multilingual (Chinese/English)", "Strong math/coding"],
        "context": "128K tokens",
        "hf_id": "Qwen/Qwen2-7B-Instruct",
    },
}

# Running with HuggingFace Transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
def load_open_source_model(
    model_id: str,
    quantization: str = "bfloat16",  # "bfloat16", "int8", "int4"
    device: str = "auto",
) -> tuple:
    """Load an open-source causal language model and its tokenizer for inference.

    Args:
        model_id: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        quantization: Weight format to load. ``"bfloat16"`` loads the weights
            with ``torch_dtype=torch.bfloat16``; ``"int8"`` and ``"int4"`` load
            through a ``BitsAndBytesConfig`` (8-bit, or 4-bit NF4 with bfloat16
            compute).
        device: Forwarded to ``from_pretrained`` as ``device_map``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``quantization`` is not one of the supported modes.
    """
    # Validate before touching the tokenizer or model so a typo fails
    # immediately instead of surfacing as an unbound ``model`` at return time.
    supported_modes = ("bfloat16", "int8", "int4")
    if quantization not in supported_modes:
        raise ValueError(
            f"Unsupported quantization {quantization!r}; "
            f"expected one of {supported_modes}"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Arguments common to every mode; the mode-specific one is added below.
    load_kwargs = {"device_map": device}
    if quantization == "bfloat16":
        load_kwargs["torch_dtype"] = torch.bfloat16
    else:
        # Imported lazily so the plain bfloat16 path does not require it.
        from transformers import BitsAndBytesConfig

        if quantization == "int8":
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        else:  # "int4"
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    return model, tokenizer
def generate_response(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.0,
    system_message: str = None,
) -> str:
    """Run one chat turn through a loaded model and return the reply text.

    The prompt (preceded by ``system_message`` when one is given) is rendered
    with the tokenizer's chat template, tokenized, and moved to
    ``model.device``. Sampling is enabled only when ``temperature`` is above
    zero; otherwise ``do_sample`` is False and no temperature is passed.
    Only the tokens produced after the prompt are decoded.
    """
    user_turn = {"role": "user", "content": prompt}
    if system_message:
        conversation = [{"role": "system", "content": system_message}, user_turn]
    else:
        conversation = [user_turn]

    rendered_prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered_prompt, return_tensors="pt").to(model.device)

    sampling = temperature > 0
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature if sampling else None,
        "do_sample": sampling,
        "pad_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # The output sequence starts with the prompt; slice it off before decoding.
    prompt_length = encoded["input_ids"].shape[1]
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
# Usage: load the 8B instruct checkpoint in bfloat16, ask one question with a
# system persona, and print the reply.
model, tokenizer = load_open_source_model(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    quantization="bfloat16",
)
response = generate_response(
    model=model,
    tokenizer=tokenizer,
    prompt="What is the mechanism of action of warfarin?",
    max_new_tokens=256,
    system_message="You are a clinical pharmacist.",
)
print(response)

# Ollama: Easy Local Deployment
Ollama provides a simple way to run open-source models locally:
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Download LLaMA 3.1 8B
ollama pull llama3.1:8b
# Interactive chat
ollama run llama3.1:8b "What is the pharmacokinetics of warfarin?"
# Run as API server (OpenAI-compatible)
ollama serve

from openai import OpenAI
# Ollama exposes an OpenAI-compatible API, so the OpenAI client is simply
# pointed at the local server.
ollama_client = OpenAI(
    api_key="ollama",  # Placeholder: the value doesn't matter, it just has to be set
    base_url="http://localhost:11434/v1",
)
def query_local_model(
    prompt: str,
    model: str = "llama3.1:8b",
    system: str = None,
) -> str:
    """Send one chat request to the local Ollama server and return the reply.

    ``prompt`` is sent as the user message, preceded by ``system`` as a system
    message when one is given. The request uses ``temperature=0`` and the
    content of the first returned choice is returned.
    """
    conversation = [{"role": "user", "content": prompt}]
    if system:
        # The system message goes ahead of the user turn.
        conversation.insert(0, {"role": "system", "content": system})

    completion = ollama_client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Available models via Ollama: model tag -> one-line description.
POPULAR_OLLAMA_MODELS = {
    # General purpose
    "llama3.1:8b": "LLaMA 3.1 8B — good general purpose, 8GB VRAM",
    "llama3.1:70b": "LLaMA 3.1 70B — high quality, 48GB VRAM",
    "mistral:7b": "Mistral 7B — fast and capable",
    "phi3:medium": "Phi-3 Medium — efficient, good reasoning",
    "gemma2:9b": "Gemma-2 9B — Google's open model",
    "qwen2:7b": "Qwen-2 7B — strong multilingual",
    # Specialised
    "codellama:13b": "Code LLaMA 13B — optimized for code",
    "meditron:7b": "Meditron 7B — fine-tuned for medicine",
}

# Model Selection Guide
def select_model(requirements: dict) -> str:
    """Pick a model identifier from a dict of deployment requirements.

    Recognised keys (all optional): ``use_case`` (default ``"general"``),
    ``vram_gb`` (default 16), ``privacy_critical``, ``latency_sensitive`` and
    ``quality_critical`` (each default False).

    Rules are applied in order: privacy-critical workloads always get a local
    model sized to the available VRAM; otherwise a domain-specific model is
    returned for the medical, code, and multilingual use cases; otherwise the
    choice falls to a quality / latency / cost tradeoff.
    """
    use_case = requirements.get("use_case", "general")
    vram_gb = requirements.get("vram_gb", 16)
    needs_privacy = requirements.get("privacy_critical", False)
    needs_speed = requirements.get("latency_sensitive", False)
    needs_quality = requirements.get("quality_critical", False)

    # Can't use an API? Then the model has to run locally.
    if needs_privacy:
        if needs_quality and vram_gb >= 80:
            return "meta-llama/Meta-Llama-3-70B-Instruct"
        if vram_gb >= 16:
            return "meta-llama/Meta-Llama-3-8B-Instruct"
        if vram_gb >= 8:
            return "microsoft/Phi-3-mini-128k-instruct"
        return "google/gemma-2-2b-it"  # Runs on CPU or low VRAM

    # Domain-specific picks, checked in this order.
    domain_models = (
        ("medical", "epfl-llm/meditron-7b"),  # Medical fine-tune of LLaMA
        ("code", "codellama/CodeLlama-34b-Instruct-hf"),
        ("multilingual", "Qwen/Qwen2-7B-Instruct"),
    )
    for domain, model_id in domain_models:
        if use_case == domain:
            return model_id

    # General quality/cost tradeoffs
    if needs_quality:
        return "gpt-4o"  # API for highest quality
    if needs_speed:
        return "gpt-4o-mini"  # API for fast responses
    return "meta-llama/Meta-Llama-3-8B-Instruct"  # Local for cost


# The HuggingFace Hub
from huggingface_hub import HfApi, snapshot_download
import os
api = HfApi()


def search_clinical_models(query: str = "clinical") -> list:
    """Search the HuggingFace Hub for clinical/medical text-generation models.

    Asks the Hub for at most 20 models matching ``query`` with the
    ``"text-generation"`` filter, sorted by downloads (``direction=-1``), and
    returns one dict per model with its ``model_id``, ``downloads`` and
    ``likes``.
    """
    matches = api.list_models(
        search=query,
        filter="text-generation",
        sort="downloads",
        direction=-1,
        limit=20,
    )

    summaries = []
    for info in matches:
        summaries.append(
            {
                "model_id": info.modelId,
                "downloads": info.downloads,
                "likes": info.likes,
            }
        )
    return summaries
def download_model_locally(model_id: str, local_dir: str) -> str:
    """Download a model repository from HuggingFace into ``local_dir``.

    Creates ``local_dir`` if needed, downloads a snapshot of ``model_id`` while
    skipping ``*.bin`` and ``*.pth`` files, prints where it was stored, and
    returns the path reported by ``snapshot_download``.
    """
    os.makedirs(local_dir, exist_ok=True)

    # Prefer safetensors: leave the *.bin / *.pth weight files out.
    skipped_weight_formats = ["*.bin", "*.pth"]

    # Use local_files_only=True to prevent downloads in production
    path = snapshot_download(
        repo_id=model_id,
        local_dir=local_dir,
        ignore_patterns=skipped_weight_formats,
    )

    print(f"Downloaded {model_id} to {path}")
    return path


# Fine-Tuning Open Source Models
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
def fine_tune_on_domain_data(
    base_model_id: str,
    training_examples: list[dict],
    output_dir: str,
) -> None:
    """Fine-tune an open source model on domain-specific instruction data.

    Trains LoRA adapters on top of ``base_model_id`` with TRL's ``SFTTrainer``,
    saves the adapters to ``output_dir``, then merges them into a freshly
    loaded copy of the base model and saves that (plus the tokenizer) to
    ``f"{output_dir}-merged"``.

    Args:
        base_model_id: Model identifier passed to ``from_pretrained``.
        training_examples: Non-empty list of
            ``{"instruction": str, "response": str}`` dicts. Each one is
            rendered as a user/assistant exchange with the tokenizer's chat
            template.
        output_dir: Directory for checkpoints and the trained adapters.

    Raises:
        ValueError: If ``training_examples`` is empty.
    """
    # Fail fast: there is no point loading the base model when there is
    # nothing to train on.
    if not training_examples:
        raise ValueError("training_examples must contain at least one example")

    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    # Add LoRA adapters (parameter-efficient fine-tuning)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # NOTE(review): these projection names match LLaMA/Mistral-style
        # attention layers; confirm them for the chosen base architecture.
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        # Declare the task so PEFT wraps the model as a causal LM.
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Format training data as chat
    def format_example(example):
        messages = [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["response"]},
        ]
        # No generation prompt: the assistant turn is already in the text.
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )

    dataset = Dataset.from_list(
        [{"text": format_example(ex)} for ex in training_examples]
    )

    config = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=10,
        save_steps=100,
    )
    # NOTE(review): newer TRL releases appear to rename ``tokenizer`` to
    # ``processing_class`` - confirm against the installed TRL version.
    trainer = SFTTrainer(
        model=model,
        args=config,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_model(output_dir)

    # Merge LoRA weights back into base model for deployment
    from peft import PeftModel

    merged_dir = f"{output_dir}-merged"
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id, torch_dtype=torch.bfloat16
    )
    merged_model = PeftModel.from_pretrained(base_model, output_dir)
    merged_model = merged_model.merge_and_unload()
    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)
    print(f"Merged model saved to {output_dir}-merged")


# Open vs Closed: Decision Framework
| Factor | Open Source | Closed (API) |
|---|---|---|
| Data privacy | Full control | Data sent to provider |
| Customization | Fine-tune freely | Prompt engineering only |
| Cost (low volume) | Higher (infra) | Lower (pay-per-use) |
| Cost (high volume) | Lower | Higher |
| Maintenance | Your responsibility | Provider's responsibility |
| Frontier quality | Usually behind | Leading edge |
| Deployment speed | Days to weeks | Hours |
| Compliance | Depends on infra | Depends on provider |
Rule of thumb for healthcare/clinical AI:
- PHI (Protected Health Information) involved → self-hosted open source model required (or a Business Associate Agreement, BAA, with the API provider)
- High volume production workloads → open source or negotiated API contracts
- Rapid prototyping → API first, migrate to open source if volume/privacy warrants
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.