Learnixo

LangChain Mastery · Lesson 30 of 33

LangSmith: Tracing and Debugging Chains

What is LangSmith?

LangSmith is LangChain's observability platform. It captures every LLM call, chain step, and tool invocation — with inputs, outputs, token counts, and latency — so you can debug failures, compare prompt versions, and run automated evaluations.

Without LangSmith: You see the final output. When it's wrong, you don't know which step failed.

With LangSmith: You see every intermediate step, the exact prompt sent to the LLM, and which retrieval results came back.


Setup

Python
import os

# Set these environment variables before importing LangChain
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# setdefault keeps a real key that is already exported in the shell; a plain
# assignment would overwrite it with this placeholder. Never commit a real key.
os.environ.setdefault("LANGCHAIN_API_KEY", "your-langsmith-api-key")
os.environ["LANGCHAIN_PROJECT"] = "clinical-pharmacist-bot"   # Groups runs in the UI

# That's it — LangChain now automatically traces all runs to LangSmith
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# GPT-4o chat model at temperature 0.
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Two-message template: a fixed system persona plus the caller's question.
_messages = [
    ("system", "You are a clinical pharmacist."),
    ("human", "{question}"),
]
prompt = ChatPromptTemplate.from_messages(_messages)

# Pipe the prompt into the model, then parse the reply into a plain string.
chain = prompt | model | StrOutputParser()

# With tracing enabled, this call is recorded and visible in the LangSmith UI.
answer = chain.invoke({"question": "What is warfarin?"})

Every invocation now appears in https://smith.langchain.com under your project, showing:

  • Full input and output at each step
  • Token counts and cost estimates
  • Latency per step
  • Any errors with full stack traces

Adding Custom Metadata

Tag runs to filter and analyze them later:

Python
from langchain_core.runnables import RunnableConfig

# Per-invocation tracing options: a readable run name, tags, and metadata.
_dosing_config = RunnableConfig(
    run_name="warfarin-afib-query",     # Descriptive name in LangSmith
    tags=["clinical", "dosing", "afib"],
    metadata={
        "user_id": "pharmacist_123",
        "session_id": "sess_abc",
        "query_type": "dosing",
        "environment": "production",
    },
)

# The config applies to this single invocation only.
result = chain.invoke(
    {"question": "What is the warfarin dose for AFib?"},
    config=_dosing_config,
)

# In LangSmith, runs can then be filtered by tag or by metadata field,
# e.g., show all runs where metadata.query_type == "dosing"

Manual Tracing with @traceable

Trace arbitrary Python functions — not just LangChain chains:

Python
from langsmith import traceable

@traceable(name="drug_database_lookup", run_type="tool")
def lookup_drug(drug_name: str) -> dict:
    """Return the drug database record for ``drug_name``.

    The name is lower-cased before the lookup, so matching is
    case-insensitive.

    Raises:
        ValueError: If no record exists for the given name.
    """
    records = {
        "warfarin": {"class": "anticoagulant", "dose": "2-10mg/day"},
        "metformin": {"class": "antidiabetic", "dose": "500-2550mg/day"},
    }
    entry = records.get(drug_name.lower())
    if entry:
        return entry
    raise ValueError(f"Drug '{drug_name}' not found")


@traceable(name="clinical_pharmacist_answer", run_type="chain")
def answer_clinical_question(question: str, user_id: str) -> str:
    """Full pipeline: drug lookup followed by an LLM answer.

    Args:
        question: Free-text clinical question.
        user_id: Identifier of the asking user; attached to the chain run as
            metadata so runs can be filtered per user.

    Returns:
        The chain's answer as a string.
    """
    # Extract drug name (simplified): take the first entry of a known list
    # that occurs in the question. Keep this list in sync with the names
    # available in lookup_drug's database.
    known_drugs = ("warfarin", "metformin")
    lowered = question.lower()
    drug = next((name for name in known_drugs if name in lowered), None)

    llm_question = question
    if drug is not None:
        # This nested call is also traced as a child span.
        drug_info = lookup_drug(drug)
        llm_question = f"{question}\n\nDrug info: {drug_info}"

    return chain.invoke(
        {"question": llm_question},
        config={"metadata": {"user_id": user_id}},
    )


# Run the traced pipeline once for a sample question.
_sample_question = "What is the mechanism of warfarin?"
result = answer_clinical_question(_sample_question, user_id="pharmacist_123")

LangSmith shows answer_clinical_question as a parent span with drug_database_lookup and the LLM call nested under it.


Creating a Dataset

Datasets in LangSmith let you run the same questions repeatedly against different prompt versions:

Python
from langsmith import Client

client = Client()

# Examples: input + expected output
examples = [
    {
        "input": {"question": "What does warfarin inhibit?"},
        "output": {"answer": "Warfarin inhibits vitamin K epoxide reductase (VKORC1)"},
    },
    {
        "input": {"question": "What is the normal INR range for warfarin therapy?"},
        "output": {"answer": "2.0 to 3.0 for most indications"},
    },
    {
        "input": {"question": "What is the mechanism of metformin?"},
        "output": {"answer": "Metformin activates AMPK and reduces hepatic glucose output"},
    },
]

dataset_name = "clinical-pharmacist-qa"
if client.has_dataset(dataset_name=dataset_name):
    # Re-running this snippet: reuse the existing dataset instead of failing
    # on the duplicate name, and do not add the same examples a second time.
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    # Create a test dataset and populate it
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Q&A pairs for testing the clinical pharmacist chatbot",
    )
    client.create_examples(
        inputs=[e["input"] for e in examples],
        outputs=[e["output"] for e in examples],
        dataset_id=dataset.id,
    )

Running Evaluations

Evaluate your chain against a dataset automatically:

Python
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Target function: what we're evaluating
def run_chain(inputs: dict) -> dict:
    """Invoke the baseline chain on ``inputs`` and wrap the reply under "answer"."""
    return {"answer": chain.invoke(inputs)}


# Evaluators: grade each output
qa_evaluator = LangChainStringEvaluator(
    "qa",   # Checks if answer matches expected output
    config={"llm": ChatOpenAI(model="gpt-4o-mini")},
)

# LLM-as-judge for correctness.
# Correctness is judged against the dataset's reference answer, so this needs
# the reference-aware "labeled_criteria" evaluator; the reference-free
# "criteria" evaluator rejects the "correctness" criterion.
correctness_evaluator = LangChainStringEvaluator(
    "labeled_criteria",
    config={
        "criteria": "correctness",
        "llm": ChatOpenAI(model="gpt-4o-mini"),
    },
)

# Score the baseline chain on every example in the dataset.
_baseline_evaluators = [qa_evaluator, correctness_evaluator]
_baseline_metadata = {"model": "gpt-4o", "temperature": 0}

results = evaluate(
    run_chain,
    data="clinical-pharmacist-qa",
    evaluators=_baseline_evaluators,
    metadata=_baseline_metadata,
    experiment_prefix="gpt-4o-baseline",   # Name this experiment in LangSmith
)

print(f"Results: {results}")
# Per-example scores and aggregate metrics are shown in LangSmith

Comparing Prompt Versions

Python
# Experiment 1: score the current prompt
results_v1 = evaluate(
    run_chain,
    experiment_prefix="prompt-v1-basic",
    data="clinical-pharmacist-qa",
    evaluators=[correctness_evaluator],
)

# Revised system prompt: stricter instructions plus one worked example
_system_v2 = (
    "You are a clinical pharmacist. Answer precisely and cite the mechanism when relevant.\n\n"
    "Example:\nQ: What does warfarin inhibit?\nA: Warfarin inhibits vitamin K epoxide reductase (VKORC1), "
    "blocking the recycling of vitamin K and reducing synthesis of clotting factors II, VII, IX, and X."
)
prompt_v2 = ChatPromptTemplate.from_messages([
    ("system", _system_v2),
    ("human", "{question}"),
])

# Same model and parser as before; only the prompt differs.
chain_v2 = prompt_v2 | model | StrOutputParser()

def run_chain_v2(inputs: dict) -> dict:
    """Invoke the v2 chain on ``inputs`` and wrap the reply under "answer"."""
    answer = chain_v2.invoke(inputs)
    return {"answer": answer}

# Experiment 2: score the new prompt on the same dataset
results_v2 = evaluate(
    run_chain_v2,
    experiment_prefix="prompt-v2-few-shot",
    data="clinical-pharmacist-qa",
    evaluators=[correctness_evaluator],
)

# LangSmith UI: compare experiments side by side to see which prompt scores higher

Debugging a Specific Run

Python
from langsmith import Client

client = Client()

# List recent failed top-level runs from your project
runs = list(client.list_runs(
    project_name="clinical-pharmacist-bot",
    is_root=True,          # Only top-level runs (replaces deprecated execution_order=1)
    error=True,            # Only failed runs
    limit=10,
))

for run in runs:
    print(f"Run: {run.name}")
    print(f"  Error: {run.error}")
    print(f"  Input: {run.inputs}")
    print(f"  Latency: {run.end_time - run.start_time if run.end_time else 'N/A'}")


# Get all child steps of a specific run (the full trace)
parent_run_id = "run-id-from-langsmith-ui"
child_runs = list(client.list_runs(
    project_name="clinical-pharmacist-bot",
    parent_run_id=parent_run_id,
))

for child in child_runs:
    # Same guard as for parent runs: end_time may be missing.
    latency = child.end_time - child.start_time if child.end_time else "N/A"
    print(f"  Step: {child.name} | Latency: {latency}")


# Add human feedback to a run
if runs:  # The query above may return no runs; runs[0] would raise IndexError.
    client.create_feedback(
        run_id=runs[0].id,
        key="correctness",
        score=0,               # 0 = wrong, 1 = correct
        comment="Answer missed the CYP2C9 interaction detail",
    )

LangSmith Feature Summary

| Feature | What it gives you |
|---|---|
| Auto-tracing | Every chain step logged with inputs, outputs, tokens, latency |
| @traceable | Trace non-LangChain Python functions as child spans |
| RunnableConfig metadata | Filter and segment runs in the UI |
| Datasets | Reproducible test sets for regression testing |
| Evaluations | Automated scoring with LLM-as-judge or custom evaluators |
| Experiment comparison | Side-by-side prompt A/B testing with score diffs |
| Feedback API | Collect and store human ratings on specific runs |
| Playground | Re-run any traced prompt interactively and edit it |