LangChain Mastery · Lesson 30 of 33
LangSmith: Tracing and Debugging Chains
What is LangSmith?
LangSmith is LangChain's observability platform. It captures every LLM call, chain step, and tool invocation — with inputs, outputs, token counts, and latency — so you can debug failures, compare prompt versions, and run automated evaluations.
Without LangSmith: You see the final output. When it's wrong, you don't know which step failed.
With LangSmith: You see every intermediate step, the exact prompt sent to the LLM, and which retrieval results came back.
Setup
import os
# Set these environment variables before importing LangChain
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Keep a key that is already exported in the environment; the placeholder is
# only a fallback, so a real secret is never overwritten by (or committed in)
# this file.
os.environ.setdefault("LANGCHAIN_API_KEY", "your-langsmith-api-key")
os.environ["LANGCHAIN_PROJECT"] = "clinical-pharmacist-bot"  # Groups runs in the UI
# That's it — LangChain now automatically traces all runs to LangSmith
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Chat model shared by the chains in this lesson (gpt-4o, temperature 0).
model = ChatOpenAI(model="gpt-4o", temperature=0)
# Two-message prompt: a fixed system role plus the caller's {question}.
prompt = ChatPromptTemplate.from_messages([
("system", "You are a clinical pharmacist."),
("human", "{question}"),
])
# Compose prompt -> model -> string parser into one runnable.
chain = prompt | model | StrOutputParser()
# This run is automatically traced — visible in LangSmith UI
answer = chain.invoke({"question": "What is warfarin?"})
Every invocation now appears in https://smith.langchain.com under your project, showing:
- Full input and output at each step
- Token counts and cost estimates
- Latency per step
- Any errors with full stack traces
Adding Custom Metadata
Tag runs to filter and analyze them later:
from langchain_core.runnables import RunnableConfig
# Attach metadata to a specific invocation
# run_name, tags and metadata travel in `config`, separate from the chain's
# own input dict.
result = chain.invoke(
{"question": "What is the warfarin dose for AFib?"},
config=RunnableConfig(
run_name="warfarin-afib-query", # Descriptive name in LangSmith
# Tags and metadata are what the LangSmith filters operate on.
tags=["clinical", "dosing", "afib"],
metadata={
"user_id": "pharmacist_123",
"session_id": "sess_abc",
"query_type": "dosing",
"environment": "production",
},
),
)
# Filter runs in LangSmith by tag or metadata field
# e.g., show all runs where metadata.query_type == "dosing"
Manual Tracing with @traceable
Trace arbitrary Python functions — not just LangChain chains:
from langsmith import traceable
@traceable(name="drug_database_lookup", run_type="tool")
def lookup_drug(drug_name: str) -> dict:
    """Return the record for ``drug_name`` from the in-memory drug table.

    The name is matched case-insensitively.

    Raises:
        ValueError: If the drug is not present in the table.
    """
    catalog = {
        "warfarin": {"class": "anticoagulant", "dose": "2-10mg/day"},
        "metformin": {"class": "antidiabetic", "dose": "500-2550mg/day"},
    }
    key = drug_name.lower()
    if key not in catalog:
        raise ValueError(f"Drug '{drug_name}' not found")
    return catalog[key]
@traceable(name="clinical_pharmacist_answer", run_type="chain")
def answer_clinical_question(question: str, user_id: str) -> str:
    """Full pipeline: drug lookup + LLM answer.

    Args:
        question: Free-text clinical question.
        user_id: Identifier of the caller; attached to the chain run as
            metadata so runs can be filtered per user.

    Returns:
        The chain's string answer.
    """
    # Extract drug name (simplified): the first known drug mentioned in the
    # question. Previously the lookup was hard-coded to "warfarin", so any
    # other question was answered with warfarin data.
    known_drugs = ("warfarin", "metformin")
    lowered = question.lower()
    drug = next((name for name in known_drugs if name in lowered), None)

    prompt_text = question
    if drug is not None:
        drug_info = lookup_drug(drug)  # This nested call is also traced as a child span
        prompt_text = f"{question}\n\nDrug info: {drug_info}"

    return chain.invoke(
        {"question": prompt_text},
        config={"metadata": {"user_id": user_id}},
    )
result = answer_clinical_question(
"What is the mechanism of warfarin?",
user_id="pharmacist_123",
)
LangSmith shows answer_clinical_question as a parent span with drug_database_lookup and the LLM call nested under it.
Creating a Dataset
Datasets in LangSmith let you run the same questions repeatedly against different prompt versions:
from langsmith import Client
# Client() takes no arguments here — presumably it reads the LangSmith
# credentials from the environment; verify against your setup.
client = Client()
# Create a test dataset
# NOTE(review): re-running this with the same dataset_name may fail if the
# dataset already exists — confirm and guard if the script is run repeatedly.
dataset = client.create_dataset(
dataset_name="clinical-pharmacist-qa",
description="Q&A pairs for testing the clinical pharmacist chatbot",
)
# Add examples: input + expected output
# Each "input" uses the same {"question": ...} shape the chain is invoked
# with; each "output" holds the reference answer under "answer".
examples = [
{
"input": {"question": "What does warfarin inhibit?"},
"output": {"answer": "Warfarin inhibits vitamin K epoxide reductase (VKORC1)"},
},
{
"input": {"question": "What is the normal INR range for warfarin therapy?"},
"output": {"answer": "2.0 to 3.0 for most indications"},
},
{
"input": {"question": "What is the mechanism of metformin?"},
"output": {"answer": "Metformin activates AMPK and reduces hepatic glucose output"},
},
]
client.create_examples(
inputs=[e["input"] for e in examples],
outputs=[e["output"] for e in examples],
dataset_id=dataset.id,
)
Running Evaluations
Evaluate your chain against a dataset automatically:
from langsmith.evaluation import evaluate, LangChainStringEvaluator
# Target function: what we're evaluating
def run_chain(inputs: dict) -> dict:
    """Run the baseline chain on one example's inputs and wrap the answer."""
    return {"answer": chain.invoke(inputs)}
# Evaluators: grade each output
# Both evaluators below are configured with gpt-4o-mini as the grading model.
qa_evaluator = LangChainStringEvaluator(
"qa", # Checks if answer matches expected output
config={"llm": ChatOpenAI(model="gpt-4o-mini")},
)
# LLM-as-judge for correctness
correctness_evaluator = LangChainStringEvaluator(
"criteria",
config={
"criteria": "correctness",
"llm": ChatOpenAI(model="gpt-4o-mini"),
},
)
# Run evaluation against the dataset
# `data` is the dataset name; `run_chain` is the target function being graded.
results = evaluate(
run_chain,
data="clinical-pharmacist-qa",
evaluators=[qa_evaluator, correctness_evaluator],
experiment_prefix="gpt-4o-baseline", # Name this experiment in LangSmith
# Records the settings of the chain under test alongside the experiment.
metadata={"model": "gpt-4o", "temperature": 0},
)
print(f"Results: {results}")
# LangSmith shows per-example scores and aggregate metrics
Comparing Prompt Versions
# Experiment 1: Current prompt
# Baseline: the original chain, scored with the correctness evaluator only.
results_v1 = evaluate(
run_chain,
data="clinical-pharmacist-qa",
evaluators=[correctness_evaluator],
experiment_prefix="prompt-v1-basic",
)
# Update the prompt (add few-shot examples)
# NOTE(review): the worked example below uses the question "What does warfarin
# inhibit?", which is also an example in the evaluation dataset. That overlap
# can inflate the v2 score on that item; consider an example that is not in
# the dataset.
prompt_v2 = ChatPromptTemplate.from_messages([
("system",
"You are a clinical pharmacist. Answer precisely and cite the mechanism when relevant.\n\n"
"Example:\nQ: What does warfarin inhibit?\nA: Warfarin inhibits vitamin K epoxide reductase (VKORC1), "
"blocking the recycling of vitamin K and reducing synthesis of clotting factors II, VII, IX, and X."),
("human", "{question}"),
])
# Same model and parser as the baseline chain; only the prompt differs.
chain_v2 = prompt_v2 | model | StrOutputParser()
def run_chain_v2(inputs: dict) -> dict:
    """Run the v2 (few-shot prompt) chain on one example's inputs and wrap the answer."""
    answer = chain_v2.invoke(inputs)
    return {"answer": answer}
# Experiment 2: New prompt
# Uses the dataset name "clinical-pharmacist-qa" and the correctness
# evaluator; the target function is run_chain_v2.
results_v2 = evaluate(
run_chain_v2,
data="clinical-pharmacist-qa",
evaluators=[correctness_evaluator],
experiment_prefix="prompt-v2-few-shot",
)
# LangSmith UI: compare experiments side by side — see which prompt scores higher
Debugging a Specific Run
from langsmith import Client
client = Client()
# List recent failed top-level runs from your project
runs = list(client.list_runs(
    project_name="clinical-pharmacist-bot",
    is_root=True,  # Only top-level runs (not child steps)
    error=True,  # Only failed runs
    limit=10,
))
# NOTE: `runs` is empty when nothing has failed; any later code that indexes
# runs[0] needs at least one failed run.
if not runs:
    print("No failed runs found")
for run in runs:
    # end_time may be unset, so only compute the latency when it is present.
    latency = run.end_time - run.start_time if run.end_time else "N/A"
    print(f"Run: {run.name}")
    print(f" Error: {run.error}")
    print(f" Input: {run.inputs}")
    print(f" Latency: {latency}")
# Get all child steps of a specific run (the full trace)
parent_run_id = "run-id-from-langsmith-ui"  # Placeholder: paste a real run ID here
child_runs = list(client.list_runs(
    project_name="clinical-pharmacist-bot",
    parent_run_id=parent_run_id,
))
for child in child_runs:
    # Guard against a missing end_time, as the top-level run loop does;
    # subtracting from None would raise TypeError.
    latency = child.end_time - child.start_time if child.end_time else "N/A"
    print(f" Step: {child.name} | Latency: {latency}")
# Add human feedback to a run
client.create_feedback(
run_id=runs[0].id,
key="correctness",
score=0, # 0 = wrong, 1 = correct
comment="Answer missed the CYP2C9 interaction detail",
)
LangSmith Feature Summary
| Feature | What it gives you |
|---|---|
| Auto-tracing | Every chain step logged with inputs, outputs, tokens, latency |
| @traceable | Trace non-LangChain Python functions as child spans |
| RunnableConfig metadata | Filter and segment runs in the UI |
| Datasets | Reproducible test sets for regression testing |
| Evaluations | Automated scoring with LLM-as-judge or custom evaluators |
| Experiment comparison | Side-by-side prompt A/B testing with score diffs |
| Feedback API | Collect and store human ratings on specific runs |
| Playground | Re-run any traced prompt interactively and edit it |