Output Classifiers: Detecting Harmful Content — AI Safety & Guardrails | Learnixo

What Is an Output Safety Classifier?

An output safety classifier is a model or function that takes an LLM-generated response and returns a safety verdict (safe/unsafe) before the response reaches the user.

It is the last programmatic line of defense before the response leaves your system.

Approach 1: Rule-Based Classifier (Fastest)

Rules run in microseconds with zero cost. Use them as the first filter:

Python

# classifiers/rule_based.py
import re
from dataclasses import dataclass

@dataclass
class ClassifierResult:
    """Verdict produced by a safety classifier for one piece of text."""
    # True when the text is considered safe to return to the user.
    is_safe: bool
    category: str  # "safe" | "harmful_medical" | "pii" | "legal_advice" | ...
    # Human-readable explanation of how the verdict was reached.
    reason: str
    # Confidence in the verdict; the classifiers in this file produce values in 0.0-1.0.
    confidence: float

# Medical safety rules: (regex, sub-category) pairs. The regexes are written in
# lowercase because classify_rule_based() searches them against lowercased text.
# NOTE: these are plain phrase matches and do not look at negation, so a
# sentence such as "do not double the dose" also matches.
HARMFUL_MEDICAL_PATTERNS = [
    (r"(safe|okay|fine|ok) to (take|use|combine|mix).*together", "unsafe_combination_approval"),
    (r"(double|triple|increase) (the |your )?(dose|dosage)", "dangerous_dosage"),
    (r"no (need|reason) to (see|consult|visit) (a |your )?(doctor|pharmacist|physician)", "discouraging_professional_care"),
    (r"self[- ]medicate", "self_medication_advice"),
]

# PII patterns: (regex, label) or (regex, label, re flags). These are searched
# against the original-case text, so a case-insensitive pattern carries its
# own flag as the optional third element.
PII_PATTERNS = [
    (r"\b\d{3}[-.]?\d{2}[-.]?\d{4}\b", "ssn"),
    (r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", "email", re.IGNORECASE),
    # 16-digit card numbers, either unbroken or written in groups of four
    # separated by a space or hyphen ("4111 1111 1111 1111").
    (r"\b(?:\d{4}[- ]?){3}\d{4}\b", "credit_card"),
]

def classify_rule_based(text: str) -> ClassifierResult:
    """Screen ``text`` against the medical and PII regex rules.

    Medical patterns are tried first, against a lowercased copy of the text;
    PII patterns are then tried against the text as given, with each pattern's
    optional flags. The first rule that matches decides the verdict. When no
    rule matches, the text is reported safe with a modest confidence.
    """
    lowered = text.lower()

    # First matching medical rule, if any.
    medical_hit = next(
        (
            (regex, label)
            for regex, label in HARMFUL_MEDICAL_PATTERNS
            if re.search(regex, lowered)
        ),
        None,
    )
    if medical_hit is not None:
        regex, label = medical_hit
        return ClassifierResult(
            is_safe=False,
            category=f"harmful_medical:{label}",
            reason=f"Matched pattern: {regex}",
            confidence=0.95,
        )

    for entry in PII_PATTERNS:
        # Entries are (regex, label) with an optional third element of re flags.
        regex, label, *extra = entry
        flags = extra[0] if extra else 0
        if not re.search(regex, text, flags):
            continue
        return ClassifierResult(
            is_safe=False,
            category=f"pii:{label}",
            reason=f"PII pattern detected: {label}",
            confidence=0.99,
        )

    # Nothing matched. Rule-based has limited recall, hence the low confidence.
    return ClassifierResult(
        is_safe=True,
        category="safe",
        reason="No harmful patterns detected",
        confidence=0.7,
    )

Approach 2: LLM-as-Judge Classifier

More nuanced than rules, but costs tokens:

Python

# classifiers/llm_judge.py
from openai import AsyncAzureOpenAI
from pydantic import BaseModel

class SafetyJudgment(BaseModel):
    """Structured verdict parsed from the LLM judge's JSON reply."""
    # Overall verdict: True when the judge reports the response as safe.
    is_safe: bool
    # Issue labels reported by the judge; the judge prompt asks for an empty list when safe.
    categories: list[str]
    # Judge's self-reported confidence; the judge prompt asks for a value in 0.0-1.0.
    confidence: float
    # Explanation from the judge; the judge prompt asks for one sentence.
    reason: str

# System prompt for the LLM judge. It lists the issue categories to check and
# asks for a JSON object whose keys match the fields of SafetyJudgment.
JUDGE_SYSTEM_PROMPT = """You are a pharmaceutical content safety classifier.

Evaluate the following chatbot response for safety issues. Check for:
1. harmful_medical: Unsafe medical advice, dangerous drug combinations, incorrect dosage
2. discouraging_care: Discourages consulting healthcare professionals
3. misinformation: Factually incorrect medical claims
4. pii: Contains personal health information that shouldn't be shared
5. jailbreak_success: The model appears to have been jailbroken

Respond with JSON: {
    "is_safe": true/false,
    "categories": ["list", "of", "issues"],
    "confidence": 0.0-1.0,
    "reason": "one sentence explanation"
}

If the response is safe, categories should be an empty list."""

async def classify_with_llm(
    response_text: str,
    original_query: str,
    client: AsyncAzureOpenAI,
    *,
    model: str = "gpt-4o-mini",
) -> SafetyJudgment:
    """Ask an LLM judge whether a chatbot response is safe.

    Args:
        response_text: The generated response to evaluate.
        original_query: The user query that led to the response; sent to the
            judge alongside the response.
        client: Async Azure OpenAI client used to call the judge.
        model: Model name passed to the chat-completions call. Defaults to
            "gpt-4o-mini", a cheaper model for classification.

    Returns:
        The judge's verdict. If the judge returns no choices, empty content, or
        content that does not validate as a SafetyJudgment, an unsafe verdict
        tagged "classifier_error" is returned so that callers fail closed
        instead of crashing or letting the response through.

    Raises:
        Errors raised by the client call itself (transport/API failures) are
        not caught here and propagate to the caller.
    """
    # Local import: only needed to recognise a malformed judge reply.
    from pydantic import ValidationError

    result = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Query: {original_query}\n\nChatbot response:\n{response_text}",
            },
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    # The message content is optional in the API response, so guard against a
    # missing choice or an empty body before parsing.
    choices = result.choices
    raw = choices[0].message.content if choices else None
    if not raw:
        return SafetyJudgment(
            is_safe=False,
            categories=["classifier_error"],
            confidence=0.0,
            reason="Safety judge returned no content.",
        )

    try:
        return SafetyJudgment.model_validate_json(raw)
    except ValidationError:
        # Malformed or incomplete JSON from the judge: treat as unsafe.
        return SafetyJudgment(
            is_safe=False,
            categories=["classifier_error"],
            confidence=0.0,
            reason="Safety judge returned an invalid verdict.",
        )

Approach 3: Fine-Tuned Classifier

For high-volume production, train a dedicated classifier. It is much faster and cheaper than an LLM-as-judge:

Python

# classifiers/finetuned.py
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

class PharmacySafetyClassifier:
    """
    Fine-tuned sentence transformer with safety head.
    Runs on CPU, under 50ms per classification.

    ``threshold`` is applied to P(unsafe): a text is blocked when its unsafe
    probability exceeds the threshold. A higher threshold therefore blocks
    less (fewer false positives, more unsafe text slipping through) and a
    lower threshold blocks more. At the default of 0.5 this is the same as
    requiring P(safe) >= 0.5.
    """

    def __init__(self, model_path: str, threshold: float = 0.5):
        """Load the sentence encoder and the classifier head from ``model_path``."""
        self.encoder = SentenceTransformer(model_path)
        self.threshold = threshold
        # In production: load trained classifier head from model_path
        self.classifier_head = self._load_head(model_path)

    def _load_head(self, path):
        """Load the classifier head stored as ``classifier_head.pkl`` under ``path``."""
        # Placeholder — in practice load a trained sklearn or torch classifier
        import joblib
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point model_path at artifacts you produced or trust.
        return joblib.load(f"{path}/classifier_head.pkl")

    def _build_result(self, probs, reason_label: str) -> ClassifierResult:
        """Turn one row of predict_proba output into a ClassifierResult.

        ``probs[0]`` is P(safe) and ``probs[1]`` is P(unsafe). Values are cast
        to built-in ``bool``/``float`` so the result does not carry NumPy
        scalar types.
        """
        p_safe = float(probs[0])
        # Equivalent to P(unsafe) <= threshold for a two-class head, written
        # on P(safe) so the default threshold of 0.5 compares exactly.
        is_safe = p_safe >= 1.0 - self.threshold
        return ClassifierResult(
            is_safe=is_safe,
            category="safe" if is_safe else "unsafe",
            reason=f"{reason_label}: {p_safe:.3f}",
            confidence=float(max(probs)),
        )

    def classify(self, text: str) -> ClassifierResult:
        """Classify a single text and return its verdict."""
        # Ask the encoder for a NumPy array directly; calling .numpy() on a
        # tensor fails when the encoder runs on a non-CPU device.
        embedding = self.encoder.encode(text, convert_to_numpy=True)
        score = self.classifier_head.predict_proba([embedding])[0]
        return self._build_result(score, "Classifier score")

    def classify_batch(self, texts: list[str]) -> list[ClassifierResult]:
        """Batch classification — much faster for multiple texts."""
        if not texts:
            # Nothing to encode; avoid handing an empty array to the head.
            return []
        embeddings = self.encoder.encode(texts, convert_to_numpy=True)
        scores = self.classifier_head.predict_proba(embeddings)
        return [self._build_result(s, "Score") for s in scores]

Threshold Tuning: Precision vs Recall

The threshold determines how aggressively the classifier blocks responses:

| Threshold | Precision | Recall | Effect |
|---|---|---|---|
| 0.9 (high) | High | Low | Few false positives, many unsafe responses slip through |
| 0.5 (medium) | Balanced | Balanced | Good starting point |
| 0.3 (low) | Low | High | Blocks most unsafe responses, many false positives |

For medical applications: start with threshold = 0.5. Measure:

- False positive rate (safe responses blocked): run 200 known-safe responses through the classifier.
- False negative rate (unsafe responses passed): run 200 known-unsafe responses through the classifier.

Tune the threshold to achieve under 5% false negatives for critical safety categories.

Combining All Three Approaches

Python

# classifiers/combined.py
from typing import Optional

class CombinedSafetyClassifier:
    """Layered safety check: regex rules, optional fine-tuned model, then LLM judge.

    Cheap layers run first and short-circuit when they are decisive; only
    responses that no cheaper layer decided are sent to the LLM judge.
    """

    def __init__(
        self,
        rule_classifier,
        llm_client,
        finetuned_classifier: Optional[PharmacySafetyClassifier] = None,
        ft_confidence_cutoff: float = 0.8,
    ):
        """Store the three layers.

        Args:
            rule_classifier: Callable taking the response text and returning a
                ClassifierResult. If it is not callable, the module-level
                classify_rule_based is used instead.
            llm_client: Client handed to classify_with_llm for the judge call.
            finetuned_classifier: Optional classifier exposing ``classify(text)``.
            ft_confidence_cutoff: A fine-tuned verdict is accepted without
                consulting the LLM judge only when its confidence is above
                this value.
        """
        self.rules = rule_classifier
        self.llm = llm_client
        self.finetuned = finetuned_classifier
        self.ft_confidence_cutoff = ft_confidence_cutoff

    async def classify(
        self,
        response_text: str,
        query: str,
    ) -> ClassifierResult:
        """Classify ``response_text`` (generated for ``query``) through the layers."""
        # Layer 1: Rules (no API cost). Use the injected rule classifier when
        # one was supplied; fall back to the module-level function otherwise.
        run_rules = self.rules if callable(self.rules) else classify_rule_based
        rule_result = run_rules(response_text)
        if not rule_result.is_safe:
            return rule_result

        # Layer 2: Fine-tuned classifier if available (no API cost). A
        # confident verdict in either direction is final; only uncertain
        # results continue to the LLM judge.
        if self.finetuned is not None:
            ft_result = self.finetuned.classify(response_text)
            if ft_result.confidence > self.ft_confidence_cutoff:
                return ft_result

        # Layer 3: LLM judge for uncertain cases (token cost)
        llm_result = await classify_with_llm(response_text, query, self.llm)
        # An unsafe verdict with no categories must not be labelled "safe".
        fallback_category = "safe" if llm_result.is_safe else "unsafe"
        return ClassifierResult(
            is_safe=llm_result.is_safe,
            category=",".join(llm_result.categories) or fallback_category,
            reason=llm_result.reason,
            confidence=llm_result.confidence,
        )

Performance Comparison

| Classifier | Latency | Cost/1k | Precision | Recall |
|---|---|---|---|---|
| Rules | under 1ms | $0 | High | Low |
| Fine-tuned | under 50ms | $0 | Good | Good |
| LLM judge | 200-500ms | ~$0.002 | Best | Best |
| Combined | under 50ms (mostly) | ~$0.0002 avg | Best | Best |

The combined classifier routes ~80% of traffic through rules + fine-tuned (fast, free), and only sends ~20% (ambiguous cases) to the LLM judge. This gives best-of-both performance and cost.

Integrating in the API

Python

@router.post("/api/chat")
async def chat(request: ChatRequest, classifier=Depends(get_classifier)):
    """Generate an answer and release it only if the classifier reports it safe.

    A response the classifier reports as unsafe is logged and replaced by
    SAFE_FALLBACK; the returned ``blocked`` flag tells the client which case
    applied.
    """
    answer = await generate_response(request)
    verdict = await classifier.classify(answer, request.message)

    # Happy path first: a safe answer goes out unchanged.
    if verdict.is_safe:
        return {"answer": answer, "blocked": False}

    log.warning("response_blocked", category=verdict.category, confidence=verdict.confidence)
    return {"answer": SAFE_FALLBACK, "blocked": True}

# Generic reply returned by chat() in place of a response that was blocked.
SAFE_FALLBACK = (
    "I'm not able to provide that information directly. "
    "Please consult your pharmacist or physician for guidance on this question."
)