Open Source LLMs: LLaMA, Mistral, and the Ecosystem

The Open Source LLM Landscape

Since Meta released LLaMA in 2023, the open source LLM ecosystem has grown rapidly. Open source models offer:

Full control: Run on your own infrastructure, no API dependency
Privacy: Data stays on your servers (critical for healthcare, legal, finance)
Cost: No per-token API costs at scale
Customization: Fine-tune on domain-specific data without API restrictions

The tradeoff: inference infrastructure, model management, and update cycles require engineering investment.

Key Model Families

Python

# Quick-reference catalogue of open-weight model families, keyed by family name.
# Each entry records the publisher, the released parameter sizes, a license
# summary, headline strengths, the context window, and one representative
# Hugging Face repo id ("hf_id").
# NOTE(review): sizes, licenses, and context lengths are short summaries and
# can differ between checkpoints of the same family — verify against the model
# card before relying on them (e.g. confirm the license of the largest Falcon
# checkpoint and which LLaMA release ships the 405B size).
OPEN_SOURCE_MODELS = {
    "LLaMA-3": {
        "publisher": "Meta",
        "sizes": ["8B", "70B", "405B"],
        "license": "LLaMA-3 Community License (restricted commercial use)",
        "strengths": ["General purpose", "Strong reasoning", "Good instruction following"],
        "context": "8192 tokens (base), 128K (LLaMA-3.1)",
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
    },
    "Mistral": {
        "publisher": "Mistral AI",
        "sizes": ["7B", "8x7B (MoE)", "8x22B (MoE)"],
        "license": "Apache 2.0 (7B), various (larger)",
        "strengths": ["Efficient", "Sliding window attention", "Strong coding"],
        "context": "32K tokens",
        "hf_id": "mistralai/Mistral-7B-Instruct-v0.3",
    },
    "Phi-3": {
        "publisher": "Microsoft",
        "sizes": ["3.8B", "7B", "14B"],
        "license": "MIT",
        "strengths": ["Small size, strong performance", "Textbook-quality training data"],
        "context": "128K tokens",
        "hf_id": "microsoft/Phi-3-medium-128k-instruct",
    },
    "Gemma-2": {
        "publisher": "Google",
        "sizes": ["2B", "9B", "27B"],
        "license": "Gemma Terms of Use",
        "strengths": ["Small and efficient", "Strong at instruction following"],
        "context": "8192 tokens",
        "hf_id": "google/gemma-2-9b-it",
    },
    "Falcon": {
        "publisher": "Technology Innovation Institute",
        "sizes": ["7B", "40B", "180B"],
        "license": "Apache 2.0",
        "strengths": ["True open source (Apache 2.0)", "Multilingual"],
        "context": "2048 tokens",
        "hf_id": "tiiuae/falcon-7b-instruct",
    },
    "Qwen-2": {
        "publisher": "Alibaba",
        "sizes": ["0.5B", "1.5B", "7B", "72B"],
        "license": "Apache 2.0 (small models)",
        "strengths": ["Multilingual (Chinese/English)", "Strong math/coding"],
        "context": "128K tokens",
        "hf_id": "Qwen/Qwen2-7B-Instruct",
    },
}

Running with HuggingFace Transformers

Python

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def load_open_source_model(
    model_id: str,
    quantization: str = "bfloat16",  # "bfloat16", "int8", "int4"
    device: str = "auto",
) -> tuple:
    """Load an open-source causal LM and its tokenizer for inference.

    Args:
        model_id: Model identifier forwarded to ``from_pretrained`` for both
            the tokenizer and the model.
        quantization: Weight format to load. ``"bfloat16"`` loads the weights
            with ``torch_dtype=torch.bfloat16``; ``"int8"`` and ``"int4"``
            load through a ``BitsAndBytesConfig`` (int4 uses the NF4 quant
            type with bfloat16 compute).
        device: Value forwarded as ``device_map``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``quantization`` is not one of the supported values.
    """
    # Validate before any loading work: an unknown value used to fall through
    # every branch and surface as an UnboundLocalError on `model`, after the
    # tokenizer had already been fetched.
    supported = ("bfloat16", "int8", "int4")
    if quantization not in supported:
        raise ValueError(
            f"Unsupported quantization {quantization!r}; "
            f"expected one of {', '.join(supported)}"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Only the dtype / quantization arguments differ between modes, so build
    # those separately and issue a single from_pretrained call.
    if quantization == "bfloat16":
        load_kwargs = {"torch_dtype": torch.bfloat16}
    else:
        # Imported lazily so the bfloat16 path does not touch the
        # quantization config class at all.
        from transformers import BitsAndBytesConfig

        if quantization == "int8":
            bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        else:  # "int4"
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        load_kwargs = {"quantization_config": bnb_config}

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device,
        **load_kwargs,
    )

    return model, tokenizer


def generate_response(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.0,
    system_message: str = None,
) -> str:
    """Generate a single chat reply from a loaded causal LM.

    Args:
        model: Loaded model; must expose ``generate`` and ``device``.
        tokenizer: Tokenizer for ``model``; must provide a chat template.
        prompt: The user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` disable sampling
            (``do_sample=False``).
        system_message: Optional system prompt; omitted from the conversation
            when ``None`` or empty.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens stripped.
    """
    # Build the conversation in the role/content format the chat template expects.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": prompt})

    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered template already contains the special tokens the model
    # needs, so tokenize WITHOUT adding them again; otherwise tokenizers that
    # prepend BOS by default produce a duplicated BOS token.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # NOTE(review): None is passed when not sampling, presumably to
            # override any temperature in the model's generation config.
            temperature=temperature if temperature > 0 else None,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens: everything past the prompt length.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Example: load LLaMA-3 8B Instruct in bfloat16 and ask it one question.
model, tokenizer = load_open_source_model(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    quantization="bfloat16",
)

# Every argument is passed by keyword so the call reads without the signature.
response = generate_response(
    model=model,
    tokenizer=tokenizer,
    prompt="What is the mechanism of action of warfarin?",
    max_new_tokens=256,
    system_message="You are a clinical pharmacist.",
)
print(response)

Ollama: Easy Local Deployment

Ollama provides a simple way to run open-source models locally:

Bash

# Install Ollama using the install script from ollama.com
curl -fsSL https://ollama.com/install.sh | sh

# Download the LLaMA 3.1 8B model
ollama pull llama3.1:8b

# Send a single prompt to the model
# NOTE(review): omitting the quoted prompt should start an interactive chat
# session instead — confirm with `ollama run --help`
ollama run llama3.1:8b "What is the pharmacokinetics of warfarin?"

# Run as API server (OpenAI-compatible)
ollama serve

Python

from openai import OpenAI

# Ollama exposes an OpenAI-compatible API, so the OpenAI client is pointed at
# the local server through base_url.
ollama_client = OpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",  # Doesn't matter, just needs a value
)

def query_local_model(
    prompt: str,
    model: str = "llama3.1:8b",
    system: str = None,
) -> str:
    """Send one chat turn to the local Ollama server and return the reply text.

    Args:
        prompt: The user message.
        model: Ollama model tag to query.
        system: Optional system prompt; left out when ``None`` or empty.

    Returns:
        The message content of the first returned choice.
    """
    user_turn = {"role": "user", "content": prompt}
    system_turns = [{"role": "system", "content": system}] if system else []

    completion = ollama_client.chat.completions.create(
        model=model,
        messages=[*system_turns, user_turn],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# Available models via Ollama: maps an Ollama model tag to a one-line
# description of the model.
# NOTE(review): the VRAM figures in the descriptions are rough guidance and
# presumably depend on the quantization of the build being pulled — confirm
# for your hardware.
POPULAR_OLLAMA_MODELS = {
    "llama3.1:8b": "LLaMA 3.1 8B — good general purpose, 8GB VRAM",
    "llama3.1:70b": "LLaMA 3.1 70B — high quality, 48GB VRAM",
    "mistral:7b": "Mistral 7B — fast and capable",
    "phi3:medium": "Phi-3 Medium — efficient, good reasoning",
    "gemma2:9b": "Gemma-2 9B — Google's open model",
    "qwen2:7b": "Qwen-2 7B — strong multilingual",
    "codellama:13b": "Code LLaMA 13B — optimized for code",
    "meditron:7b": "Meditron 7B — fine-tuned for medicine",
}

Model Selection Guide

Python

def select_model(requirements: dict) -> str:
    """Pick a model identifier from a dict of deployment requirements.

    Recognised keys (all optional): ``use_case`` (default ``"general"``),
    ``vram_gb`` (default 16), and the flags ``privacy_critical``,
    ``latency_sensitive`` and ``quality_critical`` (each default ``False``).

    Rules are applied in priority order:
      1. Privacy-critical workloads get a local model sized to the VRAM budget.
      2. Otherwise a recognised domain use case maps to a specialist model.
      3. Otherwise quality wins over latency, which wins over cost.
    """
    needs_privacy = requirements.get("privacy_critical", False)
    needs_quality = requirements.get("quality_critical", False)
    needs_speed = requirements.get("latency_sensitive", False)
    task = requirements.get("use_case", "general")
    memory_gb = requirements.get("vram_gb", 16)

    # 1. No API allowed: choose the largest local model the hardware can hold.
    if needs_privacy:
        if needs_quality and memory_gb >= 80:
            return "meta-llama/Meta-Llama-3-70B-Instruct"
        if memory_gb >= 16:
            return "meta-llama/Meta-Llama-3-8B-Instruct"
        if memory_gb >= 8:
            return "microsoft/Phi-3-mini-128k-instruct"
        return "google/gemma-2-2b-it"  # Runs on CPU or low VRAM

    # 2. Domain specialists, checked in the listed order.
    specialists = (
        ("medical", "epfl-llm/meditron-7b"),  # Medical fine-tune of LLaMA
        ("code", "codellama/CodeLlama-34b-Instruct-hf"),
        ("multilingual", "Qwen/Qwen2-7B-Instruct"),
    )
    for domain, specialist in specialists:
        if task == domain:
            return specialist

    # 3. General trade-off: API for quality or speed, local model for cost.
    if needs_quality:
        return "gpt-4o"
    if needs_speed:
        return "gpt-4o-mini"
    return "meta-llama/Meta-Llama-3-8B-Instruct"

The HuggingFace Hub

Python

from huggingface_hub import HfApi, snapshot_download
import os

# Module-level Hugging Face Hub client, used by search_clinical_models.
api = HfApi()

def search_clinical_models(query: str = "clinical") -> list:
    """Search the HuggingFace Hub for text-generation models matching ``query``.

    Requests at most 20 results, sorted by downloads with ``direction=-1``
    (NOTE(review): presumably descending — confirm against the installed
    huggingface_hub version).

    Returns:
        A list of dicts with the keys ``model_id``, ``downloads`` and
        ``likes``, one per result and in the order the Hub returned them.
    """
    hits = api.list_models(
        search=query,
        filter="text-generation",
        sort="downloads",
        direction=-1,
        limit=20,
    )

    summaries = []
    for hit in hits:
        summary = {
            "model_id": hit.modelId,
            "downloads": hit.downloads,
            "likes": hit.likes,
        }
        summaries.append(summary)
    return summaries


def download_model_locally(model_id: str, local_dir: str) -> str:
    """Fetch a model snapshot from HuggingFace into ``local_dir``.

    The target directory is created if missing. Files matching ``*.bin`` and
    ``*.pth`` are skipped so that safetensors weights are preferred.

    Returns:
        The path returned by ``snapshot_download``.
    """
    os.makedirs(local_dir, exist_ok=True)

    skipped_formats = ["*.bin", "*.pth"]  # Prefer safetensors
    # Use local_files_only=True to prevent downloads in production
    snapshot_path = snapshot_download(
        repo_id=model_id,
        local_dir=local_dir,
        ignore_patterns=skipped_formats,
    )

    print(f"Downloaded {model_id} to {snapshot_path}")
    return snapshot_path

Fine-Tuning Open Source Models

Python

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import Dataset

def fine_tune_on_domain_data(
    base_model_id: str,
    training_examples: list[dict],
    output_dir: str,
) -> None:
    """
    Fine-tune an open source model on domain-specific instruction data.

    Trains LoRA adapters on top of ``base_model_id`` with TRL's ``SFTTrainer``,
    saves the adapter to ``output_dir``, then merges the adapter into a fresh
    copy of the base model and saves the result (plus the tokenizer) to
    ``f"{output_dir}-merged"``.

    Args:
        base_model_id: Model identifier forwarded to ``from_pretrained``.
        training_examples: list of {"instruction": str, "response": str}
        output_dir: Directory for training checkpoints and the saved adapter.

    Raises:
        ValueError: If ``training_examples`` is empty.
    """
    # Fail fast: an empty training set is only detected after the (expensive)
    # base-model load otherwise, and produces a far less obvious error.
    if not training_examples:
        raise ValueError("training_examples must contain at least one example")

    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    # Add LoRA adapters (parameter-efficient fine-tuning) on the attention
    # projections. task_type tells PEFT this is a causal LM so the model is
    # wrapped in the causal-LM-specific PEFT class.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Format training data as chat
    def format_example(example):
        messages = [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["response"]},
        ]
        # No generation prompt: the assistant turn is already in the messages.
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )

    # One pre-rendered string per example, stored under the "text" column.
    dataset = Dataset.from_list([
        {"text": format_example(ex)} for ex in training_examples
    ])

    config = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=10,
        save_steps=100,
    )

    # NOTE(review): newer TRL releases appear to rename `tokenizer` to
    # `processing_class` — confirm against the pinned TRL version.
    trainer = SFTTrainer(
        model=model,
        args=config,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    trainer.save_model(output_dir)

    # Merge LoRA weights back into base model for deployment
    from peft import PeftModel

    merged_dir = f"{output_dir}-merged"
    base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)
    merged_model = PeftModel.from_pretrained(base_model, output_dir)
    merged_model = merged_model.merge_and_unload()
    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)
    print(f"Merged model saved to {output_dir}-merged")

Open vs Closed: Decision Framework

| Factor | Open Source | Closed (API) |
|---|---|---|
| Data privacy | Full control | Data sent to provider |
| Customization | Fine-tune freely | Prompt engineering only |
| Cost (low volume) | Higher (infra) | Lower (pay-per-use) |
| Cost (high volume) | Lower | Higher |
| Maintenance | Your responsibility | Provider's responsibility |
| Frontier quality | Usually behind | Leading edge |
| Deployment speed | Days to weeks | Hours |
| Compliance | Depends on infra | Depends on provider |

Rule of thumb for healthcare/clinical AI:

PHI (Protected Health Information) involved → self-hosted open source, or an API provider that will sign a BAA (Business Associate Agreement)
High volume production workloads → open source or negotiated API contracts
Rapid prototyping → API first, migrate to open source if volume/privacy warrants

Open Source LLMs: LLaMA, Mistral, and the Ecosystem

The Open Source LLM Landscape

Key Model Families

Running with HuggingFace Transformers

Ollama: Easy Local Deployment

Model Selection Guide

The HuggingFace Hub

Fine-Tuning Open Source Models

Open vs Closed: Decision Framework

Enjoyed this article?

Leave a comment