ConversationSummaryMemory: Compressed History

The Problem with Buffer Memory

ConversationBufferMemory stores everything verbatim. After 20+ turns, the history can consume thousands of tokens, leaving little room for the actual response. Worse, most LLMs pay less attention to information in the middle of very long contexts.

ConversationSummaryMemory solves this by using an LLM to compress older conversation turns into a summary.

ConversationSummaryMemory

Python

from langchain.memory import ConversationSummaryMemory
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage

# Use a cheap, fast model for summarization
summarizer_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

summary_memory = ConversationSummaryMemory(
    llm=summarizer_model,
    memory_key="chat_history",
    return_messages=True,    # Return as Message objects
)

# Add conversation turns
summary_memory.save_context(
    {"input": "What is warfarin?"},
    {"output": "Warfarin is an anticoagulant that inhibits VKORC1, preventing vitamin K recycling and thus reducing synthesis of clotting factors II, VII, IX, and X."},
)
summary_memory.save_context(
    {"input": "Why is it used in AFib?"},
    {"output": "In atrial fibrillation, disorganized atrial contractions allow blood to pool and clot in the left atrial appendage. These clots can embolize causing stroke. Warfarin anticoagulation reduces stroke risk by 60-70%."},
)
summary_memory.save_context(
    {"input": "What is the typical dose?"},
    {"output": "Warfarin dosing is highly individualized: typically started at 2-5mg daily, adjusted based on INR (target 2.0-3.0 for most indications). Genetic factors (CYP2C9, VKORC1) affect dose requirements significantly."},
)

# The memory now exposes a SUMMARY, not verbatim messages.
# ConversationSummaryMemory keeps its running summary in `.buffer`;
# `moving_summary_buffer` exists only on ConversationSummaryBufferMemory
# and would raise AttributeError here.
print(summary_memory.buffer)
# "The user asked about warfarin — its mechanism (VKORC1 inhibition),
#  use in AFib (stroke prevention), and dosing (2-5mg, INR-guided)."

# History is a SystemMessage with the summary
loaded = summary_memory.load_memory_variables({})
print(loaded["chat_history"])
# [SystemMessage(content="The user asked about warfarin...")]

# Next question gets context from the summary
next_question_memory = loaded["chat_history"]
# The model sees the compressed summary, not all 3 full exchanges

How Summarization Works Internally

Python

# Simplified form of the prompt LangChain uses by default when it folds
# new dialogue into the running summary (shown for illustration only).
SUMMARY_PROMPT = (
    "Progressively summarize the conversation.\n"
    "Current summary: {summary}\n"
    "New lines of conversation: {new_lines}\n"
    "New summary:"
)

# What save_context() does on a summary memory:
#   1. records the new exchange
#   2. calls the LLM with current_summary + new_exchange -> new_summary
#   3. keeps the new summary as the history handed to the model, so the
#      older exchanges are no longer replayed verbatim

# Custom summarization prompt
from langchain_core.prompts import PromptTemplate

# Domain-specific instructions; the template must keep the same two
# placeholders ({summary}, {new_lines}) as the default prompt.
CLINICAL_SUMMARY_TEMPLATE = (
    "Summarize this clinical pharmacology conversation.\n"
    "Focus on: drugs mentioned, indications discussed, key facts established, patient characteristics.\n"
    "\n"
    "Current summary: {summary}\n"
    "\n"
    "New conversation:\n"
    "{new_lines}\n"
    "\n"
    "Updated clinical summary:"
)

clinical_summary_prompt = PromptTemplate(
    template=CLINICAL_SUMMARY_TEMPLATE,
    input_variables=["summary", "new_lines"],
)

# Same memory configuration as before, with the clinical prompt swapped in
clinical_summary_memory = ConversationSummaryMemory(
    llm=summarizer_model,
    prompt=clinical_summary_prompt,
    memory_key="chat_history",
    return_messages=True,
)

ConversationSummaryBufferMemory (Recommended)

Hybrid approach: keep the most recent turns verbatim, and fold the oldest turns into a running summary once the verbatim buffer exceeds a token limit.

Python

from langchain.memory import ConversationSummaryBufferMemory

hybrid_memory = ConversationSummaryBufferMemory(
    llm=summarizer_model,
    max_token_limit=1500,    # When buffer exceeds 1500 tokens, summarize oldest turns
    memory_key="chat_history",
    return_messages=True,
)

# Add many turns — watch what happens
# (the loop index was never used, so iterate over the pairs directly)
for q, a in [
    ("What is warfarin?", "Warfarin is an anticoagulant..."),
    ("Why is it used for AFib?", "AFib causes blood pooling leading to clot formation..."),
    ("What is the dose?", "Typically 2-5mg daily, INR-guided..."),
    ("What are the interactions?", "Major: aspirin (bleeding risk), amiodarone (increased warfarin effect)..."),
    ("What monitoring is needed?", "INR monitoring every 1-4 weeks, CBC annually..."),
    ("Is it safe in pregnancy?", "Warfarin is contraindicated in pregnancy (Category X)..."),
]:
    hybrid_memory.save_context({"input": q}, {"output": a})

# Memory structure once the verbatim buffer has grown past max_token_limit:
# [SystemMessage("Summary: User learned warfarin mechanism, AFib indication, dosing..."),
#  HumanMessage("Is it safe in pregnancy?"),
#  AIMessage("Warfarin is contraindicated...")]
# Recent turn is verbatim; early turns are summarized.
# NOTE: the abbreviated answers above total far fewer than 1500 tokens, so
# with this exact data every turn is still verbatim and no summary message
# appears yet. Use full-length answers (or a lower max_token_limit) to see
# the structure shown above.

loaded = hybrid_memory.load_memory_variables({})
for msg in loaded["chat_history"]:
    print(f"[{msg.type}] {msg.content[:100]}")

Integration with LCEL Chains

Python

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Main answering model
model = ChatOpenAI(model="gpt-4o", temperature=0)

# System instructions tell the model that the history it receives may be
# partly summarized rather than fully verbatim.
SYSTEM_INSTRUCTIONS = (
    "You are a clinical pharmacist. "
    "The conversation history below may contain a summary of earlier exchanges."
)

# Prompt layout: system text, then the memory's messages, then the new question
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_INSTRUCTIONS),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

# prompt -> chat model -> plain string output
chain = prompt | model | StrOutputParser()

class SummaryConversationBot:
    """Clinical bot using summary buffer memory.

    Each call to ``chat`` loads the stored history, runs the chain with it,
    and writes the new question/answer pair back into memory.
    """

    def __init__(self):
        summarizer = ChatOpenAI(model="gpt-4o-mini")
        self.memory = ConversationSummaryBufferMemory(
            llm=summarizer,
            max_token_limit=2000,
            memory_key="chat_history",
            return_messages=True,
        )
        # Reuse the module-level prompt | model | parser pipeline
        self.chain = chain

    def chat(self, question: str) -> str:
        """Answer ``question`` using stored history, then record the turn.

        Args:
            question: The user's new message.

        Returns:
            The chain's response for this turn.
        """
        # History may be a summary message followed by recent verbatim turns
        payload = {
            "question": question,
            "chat_history": self.memory.load_memory_variables({})["chat_history"],
        }
        answer = self.chain.invoke(payload)

        # Persist the turn; the memory decides whether older turns get summarized
        self.memory.save_context({"input": question}, {"output": answer})
        return answer

    def get_memory_state(self) -> dict:
        """Describe the current history: size and any summary message.

        Returns:
            A dict with ``n_messages`` (history length), ``has_summary``
            (whether any message has type ``"system"``), and
            ``current_summary`` (content of the first such message, or None).
        """
        messages = self.memory.load_memory_variables({})["chat_history"]
        # A message of type "system" is treated as the summary
        summaries = [m.content for m in messages if m.type == "system"]
        return {
            "n_messages": len(messages),
            "has_summary": bool(summaries),
            "current_summary": summaries[0] if summaries else None,
        }


bot = SummaryConversationBot()

# A ten-question consultation, run turn by turn to watch the memory state
questions = [
    "What is warfarin?",
    "Why is it used for AFib?",
    "What's the typical starting dose?",
    "What are the major drug interactions?",
    "What monitoring is required?",
    "Is it safe in elderly patients?",
    "Can it be used in pregnancy?",
    "What happens in a warfarin overdose?",
    "How do you reverse anticoagulation?",
    "Is there a better anticoagulant for my patient?",
]

for q in questions:
    response = bot.chat(q)
    state = bot.get_memory_state()
    # One call emits the question line, the memory line, and a blank separator
    print(
        f"Q: {q[:50]}",
        f"Memory: {state['n_messages']} messages, summary: {state['has_summary']}",
        "",
        sep="\n",
    )

Token Counting and Memory Management

Python

import tiktoken

def estimate_memory_tokens(memory: ConversationSummaryBufferMemory, model: str = "gpt-4o") -> int:
    """Estimate tokens used by current memory state.

    Args:
        memory: Memory whose ``chat_history`` messages are measured.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        Encoded length of every message's content plus a flat 4-token
        allowance per message. This is an approximation, not a billing count.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name; fall back to a
        # general-purpose encoding rather than failing the estimate.
        encoding = tiktoken.get_encoding("o200k_base")

    history = memory.load_memory_variables({}).get("chat_history", [])

    total = 0
    for msg in history:
        # Message content is usually a string; stringify anything else
        # (e.g. structured content) so encode() does not raise.
        text = msg.content if isinstance(msg.content, str) else str(msg.content)
        total += len(encoding.encode(text))
        total += 4   # Approximate per-message framing overhead
    return total


# Monitor memory growth
def chat_with_monitoring(bot: SummaryConversationBot, question: str) -> dict:
    """Run one chat turn and report how the memory's token estimate changed.

    Args:
        bot: Bot whose ``memory`` is measured before and after the turn.
        question: The message passed to ``bot.chat``.

    Returns:
        A dict with the ``response`` and the ``tokens_before``,
        ``tokens_after`` and ``token_delta`` estimates.
    """
    before = estimate_memory_tokens(bot.memory)
    reply = bot.chat(question)
    after = estimate_memory_tokens(bot.memory)

    report = {"response": reply}
    report["tokens_before"] = before
    report["tokens_after"] = after
    # The delta can be negative when the turn shrinks the stored history
    report["token_delta"] = after - before
    return report

When Summary Memory Isn't Enough

Summary memory degrades when:

Precise numbers matter: "The patient's INR was 3.4" may be approximated in summary to "INR was elevated." Use verbatim buffer or external structured storage for precise values.
Temporal ordering matters: Summaries may lose sequence information.
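Summarization cost is high: with ConversationSummaryMemory, each save triggers an LLM call, so 1000 turns means 1000 extra API calls. (ConversationSummaryBufferMemory only calls the LLM when the buffer exceeds its token limit.)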

Mitigation for precise clinical data:

Python

# Store clinical facts externally, use summary memory for conversational context
clinical_facts = {}  # {"current_inr": "3.4", "current_dose": "5mg", ...}

def extract_and_store_clinical_facts(exchange: dict, storage: dict) -> None:
    """Extract precise clinical values and store separately from memory.

    Args:
        exchange: The conversation turn to mine; it is stringified and sent
            to the extraction prompt.
        storage: Dict updated in place with the extracted key/value facts.

    The model's reply is only merged when it parses to a dict whose
    ``"facts"`` entry is itself a dict. Any other shape leaves ``storage``
    untouched instead of raising or merging garbage.
    """
    from langchain_core.output_parsers import JsonOutputParser

    fact_extractor = (
        ChatPromptTemplate.from_template(
            "Extract clinical facts (drugs, doses, lab values, dates) from: {text}. "
            "Return JSON: {{facts: {{key: value}}}}"
        )
        | ChatOpenAI(model="gpt-4o-mini")
        | JsonOutputParser()
    )
    parsed = fact_extractor.invoke({"text": str(exchange)})

    # LLM output is not guaranteed to follow the requested schema
    facts = parsed.get("facts", {}) if isinstance(parsed, dict) else {}
    if isinstance(facts, dict):
        storage.update(facts)

ConversationSummaryMemory: Compressed History

The Problem with Buffer Memory