LangChain Mastery · Lesson 18 of 33

VectorStoreRetrieverMemory: Semantic History

Why Vector Memory?

Buffer and summary memory retrieve history by recency — the last N turns. But what if the user asks about something they mentioned 50 turns ago? Vector memory retrieves by relevance, not recency:

Buffer memory: "Tell me the last 10 messages"
Vector memory: "Find the 3 past exchanges most similar to this current question"

This is ideal for long clinical consultations where a patient might return to a drug they mentioned early in the session.

VectorStoreRetrieverMemory Setup

Python

from langchain.memory import VectorStoreRetrieverMemory
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser

# Set up the vector store that backs the conversation memory.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(
    collection_name="clinical_conversation_memory",
    embedding_function=embedding,
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},   # Retrieve 3 most relevant past exchanges
)

# Create vector memory.
# NOTE: VectorStoreRetrieverMemory has no `return_messages` option; the switch
# that controls the output shape is `return_docs`. With return_docs=False the
# retrieved exchanges are returned as a single formatted string.
vector_memory = VectorStoreRetrieverMemory(
    retriever=retriever,
    memory_key="chat_history",   # Key under which retrieved history is returned
    input_key="input",           # Key of the user text used as the search query
    return_docs=False,           # Return as formatted string
)

# Seed the memory with an earlier consultation, one (question, answer) pair
# per turn. Every pair is embedded and stored as its own entry.
exchanges = [
    ("What is warfarin?", "Warfarin is an anticoagulant that inhibits VKORC1..."),
    ("What is the dose for AFib?", "For AFib, warfarin is typically started at 2-5mg..."),
    ("What are the interactions with aspirin?", "Warfarin + aspirin: major interaction, increased bleeding risk..."),
    ("What is metformin?", "Metformin is a biguanide antidiabetic that activates AMPK..."),
    ("How does metformin work?", "Metformin reduces hepatic glucose output and improves insulin sensitivity..."),
    ("What is lisinopril?", "Lisinopril is an ACE inhibitor used for hypertension and heart failure..."),
    ("Is lisinopril safe in renal failure?", "Lisinopril should be used with caution in renal impairment; reduce dose and monitor potassium..."),
]

for q, a in exchanges:
    vector_memory.save_context({"input": q}, {"output": a})

# Ask the memory a semantic question. A query about warfarin bleeding should
# surface the warfarin-aspirin exchange rather than anything about metformin.
bleeding_query = {"input": "Is there a bleeding risk with warfarin?"}
loaded = vector_memory.load_memory_variables(bleeding_query)
print(loaded["chat_history"])
# Expected: the warfarin-aspirin interaction exchange (most relevant)
# Not expected: the metformin exchanges (irrelevant)

Building a Long-Term Memory Chatbot

Python

from langchain.chains import ConversationChain

# Deterministic chat model (temperature 0) for the examples in this section.
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Prompt with two template variables: {chat_history} receives the retrieved
# past exchanges as one string, {input} receives the current question.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a clinical pharmacist. Relevant past conversation context "
            "is provided below — reference it naturally if pertinent to the "
            "current question.",
        ),
        (
            "human",
            "Relevant past exchanges:\n{chat_history}\n\n"
            "---\n\n"
            "Current question: {input}",
        ),
    ]
)

class SemanticMemoryChatbot:
    """Clinical chatbot with semantic long-term memory.

    Every turn is saved into a Chroma-backed ``VectorStoreRetrieverMemory``.
    For each new question the 3 most similar stored exchanges are loaded and
    passed to the chat model together with the question.
    """

    def __init__(self):
        self.embedding = OpenAIEmbeddings(model="text-embedding-3-small")
        self._init_memory()
        self.model = ChatOpenAI(model="gpt-4o", temperature=0)
        self.parser = StrOutputParser()

    def _init_memory(self) -> None:
        """(Re)create the vector store and the memory wrapper around it."""
        self.vectorstore = Chroma(
            collection_name="clinical_memory",
            embedding_function=self.embedding,
        )
        self.memory = VectorStoreRetrieverMemory(
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory_key="chat_history",
            input_key="input",
        )

    @staticmethod
    def _build_messages(history: str, question: str) -> list[tuple[str, str]]:
        """Return the (role, content) messages for one turn.

        The messages are assembled as plain strings and are NOT run through a
        prompt template, so curly braces inside ``history`` or ``question``
        (e.g. JSON in an earlier answer) are passed through literally instead
        of being parsed as template variables.
        """
        return [
            ("system", "You are a clinical pharmacist with perfect recall of past consultations."),
            ("human",
             f"Relevant past discussion:\n{history}\n\n"
             f"Current question: {question}"),
        ]

    def chat(self, question: str) -> dict:
        """Answer ``question`` using semantically relevant past exchanges.

        Args:
            question: The user's current question.

        Returns:
            A dict with ``"answer"`` (the model's reply text) and
            ``"relevant_history_used"`` (the first 200 characters of the
            retrieved history, for inspection).
        """
        # Retrieve relevant past context
        relevant_history = self.memory.load_memory_variables({"input": question})
        history_text = relevant_history["chat_history"]

        # Generate response; the chat model accepts (role, content) tuples
        response = self.model.invoke(self._build_messages(history_text, question))
        answer = response.content

        # Store this turn in vector memory so later questions can retrieve it
        self.memory.save_context(
            inputs={"input": question},
            outputs={"output": answer},
        )

        return {
            "answer": answer,
            "relevant_history_used": history_text[:200],
        }

    def clear_memory(self) -> None:
        """Clear all conversation memory and leave the bot ready for reuse."""
        self.vectorstore.delete_collection()
        # The old store object refers to the deleted collection; rebuild the
        # store and retriever so the next chat() call works.
        self._init_memory()


bot = SemanticMemoryChatbot()

# Simulate a long session: the questions are asked in order, one turn each.
session_questions = (
    "What is warfarin and why is it used?",
    "What is the dose for AFib?",
    "What about metformin for diabetes?",
    "My patient is on both drugs — any concerns?",   # Retrieves warfarin + metformin info
    "Going back to the anticoagulant — what monitoring is needed?",  # Retrieves warfarin info
)
r1, r2, r3, r4, r5 = [bot.chat(asked) for asked in session_questions]

# Shows both warfarin and metformin past exchanges
print(r4["relevant_history_used"])

Hybrid: Vector + Recency Memory

Combine vector retrieval (relevance) with buffer (recency) for best coverage:

Python

class HybridMemoryBot:
    """Uses both semantic retrieval and recent buffer.

    Semantic retrieval finds relevant exchanges from anywhere in the history;
    the recent buffer keeps the last ``max_recent_turns`` turns verbatim.
    """

    def __init__(self):
        self.embedding = OpenAIEmbeddings(model="text-embedding-3-small")
        self.vectorstore = Chroma(
            collection_name="semantic_history",
            embedding_function=self.embedding,
        )
        self.vector_memory = VectorStoreRetrieverMemory(
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 2}),
            memory_key="semantic_context",
        )
        # Built once here instead of on every chat() call.
        self.model = ChatOpenAI(model="gpt-4o", temperature=0)
        self.recent_history = []   # Last 4 turns verbatim, as (question, answer)
        self.max_recent_turns = 4

    def chat(self, question: str) -> str:
        """Answer ``question`` using semantic context plus the recent turns.

        Args:
            question: The user's current question.

        Returns:
            The model's reply as a string.
        """
        # Get semantic context (relevant past, anywhere in history)
        semantic = self.vector_memory.load_memory_variables({"input": question})

        # Combine: recent history + semantically relevant history
        messages = self._build_messages(semantic["semantic_context"], question)
        response = (self.model | StrOutputParser()).invoke(messages)

        # Update both memory systems
        self.vector_memory.save_context(
            {"input": question}, {"output": response}
        )
        self._add_to_recent(question, response)

        return response

    def _build_messages(self, semantic_context: str, question: str) -> list[tuple[str, str]]:
        """Return the (role, content) messages for one turn.

        The text is assembled as plain strings rather than as a prompt
        template, so curly braces in retrieved context, recent answers or the
        question are kept literally and never parsed as template variables.
        """
        return [
            ("system", "You are a clinical pharmacist."),
            ("human",
             f"Semantically relevant past context:\n{semantic_context}\n\n"
             f"Recent conversation:\n{self._format_recent()}\n\n"
             f"Question: {question}"),
        ]

    def _format_recent(self) -> str:
        """Render the recent buffer as text, or a placeholder when empty."""
        lines = [f"User: {q}\nBot: {a}" for q, a in self.recent_history]
        return "\n\n".join(lines) if lines else "(No recent history)"

    def _add_to_recent(self, q: str, a: str) -> None:
        """Append one turn and drop the oldest turn once over the limit."""
        self.recent_history.append((q, a))
        if len(self.recent_history) > self.max_recent_turns:
            self.recent_history.pop(0)

Persistent Vector Memory with Chroma

Python

import os

# Directory handed to Chroma as persist_directory so memory survives restarts.
MEMORY_DIR = "./conversation_memory_db"

# Vector store configured with the on-disk location above.
persistent_vectorstore = Chroma(
    persist_directory=MEMORY_DIR,
    collection_name="clinical_memory",
    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
)

# Memory wrapper over the persistent store; it retrieves the 3 most similar
# stored exchanges and loads from disk on the next startup.
persistent_memory = VectorStoreRetrieverMemory(
    memory_key="chat_history",
    retriever=persistent_vectorstore.as_retriever(search_kwargs={"k": 3}),
)

# Across multiple sessions, past conversations are remembered
# Session 1: user asks about warfarin
# Session 2 (next day): "What was that anticoagulant we discussed?" → retrieves warfarin info

Vector Memory vs Other Types

| Situation | Best Memory Type |
|---|---|
| Short conversation (under 20 turns) | ConversationBufferMemory |
| Long conversation, recent context most important | ConversationSummaryBufferMemory |
| Very long session, topic may revisit old content | VectorStoreRetrieverMemory |
| Domain with named entities (drugs, patients) | ConversationEntityMemory |
| Multi-session recall (memory across days) | Persistent VectorStoreRetrieverMemory |
| Need both recency and relevance | Hybrid (vector + buffer) |

ConversationSummaryMemory: Compressed History

Next Lesson

Interview: Choose the Right Memory for a Use Case