Agent Memory Types
Understand the four memory types available to agents — in-context, episodic, semantic, and procedural — and learn when to use each one.
Agent Memory Types
Human intelligence depends on multiple kinds of memory: short-term working memory for immediate tasks, episodic memory for recalling specific events, semantic memory for general knowledge, and procedural memory for learned skills. Agentic AI systems need analogous memory structures — without them, an agent is stateless, amnesiac, and incapable of improving over time.
This lesson breaks down the four main memory types for AI agents, explains what each is good for, and gives you concrete Python implementations.
The Four Memory Types
| Memory Type | What It Stores | Where It Lives | Retrieval |
|---|---|---|---|
| In-Context | Current conversation | LLM message history | Always available |
| Episodic | Past conversations/events | External DB | Similarity search |
| Semantic | Facts and knowledge | Vector store | Similarity search |
| Procedural | Learned behaviors | Model weights | Implicit in generation |
Each type operates at a different timescale and serves a different purpose. Production agents typically combine two or more.
In-Context Memory
In-context memory is the simplest and most immediate form of agent memory. It is just the messages list passed to the LLM on each API call. Everything in this list is "remembered" for the duration of the conversation.
Advantages:
- Zero latency — already in the prompt
- Perfect recall — nothing is lost or approximated
- No infrastructure required
Limitations:
- Limited by the context window size (e.g., 128K tokens for GPT-4o)
- Grows with every turn — eventually you must truncate or summarize
- Lost when the conversation ends — not persistent across sessions
class InContextMemory:
    """
    Simple in-context memory backed by a message list.

    Keeps at most ``max_messages`` entries by dropping the oldest
    non-system messages. The cap counts messages, not tokens. System
    messages are never evicted, so the list may stay above the cap when it
    holds more system messages than ``max_messages`` allows.
    """

    def __init__(self, max_messages: int = 50, system_prompt: str = ""):
        """
        Create the store.

        Args:
            max_messages: Upper bound on stored messages, system ones included.
            system_prompt: Optional first system message; skipped when empty.
        """
        self.messages = []
        self.max_messages = max_messages
        if system_prompt:
            self.messages.append({"role": "system", "content": system_prompt})

    def add(self, role: str, content: str) -> None:
        """Append a message, then evict the oldest ones if over the limit."""
        self.messages.append({"role": role, "content": content})
        self._evict_if_needed()

    def _evict_if_needed(self) -> None:
        """Drop the oldest non-system messages until the cap is met."""
        excess = len(self.messages) - self.max_messages
        if excess <= 0:
            return
        # One pass: skip the first `excess` non-system messages, keep the rest.
        kept = []
        for msg in self.messages:
            if excess > 0 and msg["role"] != "system":
                excess -= 1
                continue
            kept.append(msg)
        # Slice-assign so outside references to the list object stay valid.
        self.messages[:] = kept

    def get_messages(self) -> list:
        """
        Return a snapshot of the history.

        Each message dict is copied, so editing the returned messages
        cannot alter the stored history.
        """
        return [dict(msg) for msg in self.messages]

    def clear(self, keep_system: bool = True) -> None:
        """Clear conversation history, optionally keeping system messages."""
        if keep_system:
            self.messages = [m for m in self.messages if m["role"] == "system"]
        else:
            self.messages = []

    @property
    def turn_count(self) -> int:
        """Number of stored user and assistant messages."""
        return sum(1 for m in self.messages if m["role"] in ("user", "assistant"))


## Episodic Memory
Episodic memory stores records of past conversations or events and retrieves them by semantic similarity when relevant. When a user starts a new session, the agent searches episodic memory to find past interactions with the same user or on the same topic, and loads that context into the current prompt.
Use case: A customer support agent that remembers that a specific customer called three days ago about a billing issue and already received a credit.
import json
import time
import uuid
import openai
from dataclasses import dataclass, field, asdict
from typing import List, Optional
# Module-level OpenAI client; EpisodicMemory uses it for its embedding and
# summarization requests.
# NOTE(review): presumably reads credentials from the environment when
# constructed — confirm against the openai SDK documentation.
client = openai.OpenAI()
@dataclass
class Episode:
    """Record of one past conversation or event kept in episodic memory."""

    # Identity of the record and of the user it belongs to.
    episode_id: str
    user_id: str
    # When the episode was recorded.
    timestamp: float
    # Short synopsis plus the full text it was derived from.
    summary: str
    full_content: str
    # Vector used for similarity lookup; empty until one is supplied.
    embedding: List[float] = field(default_factory=list)
    # Free-form extra data attached by the caller.
    metadata: dict = field(default_factory=dict)
class EpisodicMemory:
    """
    Episodic memory store backed by in-memory vector search.
    In production, replace with Pinecone, Weaviate, or pgvector.

    Episodes are kept in a plain list and ranked by cosine similarity
    between the query embedding and each episode's summary embedding.
    """

    def __init__(self):
        self.episodes: List[Episode] = []

    @staticmethod
    def _cosine_similarity(a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 if either is all zeros)."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x ** 2 for x in a) ** 0.5
        norm_b = sum(x ** 2 for x in b) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    def embed(self, text: str) -> List[float]:
        """Get an embedding vector for a piece of text."""
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return response.data[0].embedding

    def store(
        self,
        user_id: str,
        conversation: List[dict],
        metadata: Optional[dict] = None,
    ) -> str:
        """
        Store a conversation as an episode.

        The non-system messages are flattened to text, summarized by the
        chat model, and the summary is embedded for later retrieval.

        Args:
            user_id: Owner of the episode.
            conversation: Message dicts with ``role`` and ``content`` keys.
            metadata: Optional extra data saved with the episode.

        Returns:
            The generated episode id.
        """
        # Create a text representation of the conversation
        full_content = "\n".join(
            f"{m['role'].upper()}: {m['content']}"
            for m in conversation
            if m["role"] != "system"
        )

        # Summarize for retrieval
        summary_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"Summarize this conversation in 2-3 sentences, "
                        f"focusing on what was discussed and what was resolved:\n\n{full_content}"
                    ),
                }
            ],
            max_tokens=150,
        )
        # The model may return no text; fall back to the head of the
        # transcript so the episode can still be embedded and retrieved.
        fallback_chars = 500
        summary = (
            summary_response.choices[0].message.content
            or full_content[:fallback_chars]
        )

        # Generate embedding for the summary
        embedding = self.embed(summary)

        episode = Episode(
            episode_id=str(uuid.uuid4()),
            user_id=user_id,
            timestamp=time.time(),
            summary=summary,
            full_content=full_content,
            embedding=embedding,
            metadata=metadata or {},
        )
        self.episodes.append(episode)
        return episode.episode_id

    def retrieve(
        self,
        query: str,
        user_id: Optional[str] = None,
        top_k: int = 3,
    ) -> List[Episode]:
        """
        Retrieve the most relevant past episodes for a given query.

        Args:
            query: Text to match against stored episode summaries.
            user_id: Restrict results to this user. ``None`` disables the
                filter; any other value, including an empty string, is
                matched exactly so one user never sees another's episodes.
            top_k: Maximum number of episodes to return; values <= 0 yield [].

        Returns:
            Up to ``top_k`` episodes, most similar first.
        """
        if top_k <= 0:
            return []

        if user_id is None:
            candidates = self.episodes
        else:
            candidates = [e for e in self.episodes if e.user_id == user_id]

        # Nothing to rank: skip the embedding request entirely.
        if not candidates:
            return []

        query_embedding = self.embed(query)
        scored = [
            (self._cosine_similarity(query_embedding, e.embedding), e)
            for e in candidates
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [e for _, e in scored[:top_k]]

    def format_for_context(self, episodes: List[Episode]) -> str:
        """
        Format retrieved episodes for inclusion in the system prompt.

        Returns an empty string for no episodes; otherwise a header line
        followed by one ``[YYYY-MM-DD] summary`` line per episode, dated in
        local time.
        """
        if not episodes:
            return ""
        parts = ["=== Relevant past interactions ==="]
        for ep in episodes:
            ts = time.strftime("%Y-%m-%d", time.localtime(ep.timestamp))
            parts.append(f"[{ts}] {ep.summary}")
        return "\n".join(parts)


## Semantic Memory
Semantic memory stores general facts and knowledge about the world — not tied to specific conversations. Think of it as the agent's knowledge base: product documentation, company policies, medical guidelines, or any domain-specific information that the agent needs to answer questions.
Unlike episodic memory (which stores "what happened"), semantic memory stores "what is true."
@dataclass
class Fact:
    """One piece of knowledge held in semantic memory."""

    # Identifier and the text of the fact itself.
    fact_id: str
    content: str
    # Provenance label and grouping label for the fact.
    source: str
    category: str
    # Vector used for similarity lookup; empty until one is supplied.
    embedding: List[float] = field(default_factory=list)
    # Defaults to the time the instance is created.
    created_at: float = field(default_factory=time.time)
class SemanticMemory:
    """
    Semantic memory store for factual knowledge.
    Facts are retrieved by semantic similarity to a query.

    Facts live in a plain list and are ranked by cosine similarity between
    the query embedding and each fact's embedding.
    """

    def __init__(self):
        self.facts: List[Fact] = []
        self._client = openai.OpenAI()

    @staticmethod
    def _cosine_similarity(a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 if either is all zeros)."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x ** 2 for x in a) ** 0.5
        norm_b = sum(x ** 2 for x in b) ** 0.5
        return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

    def _embed(self, text: str) -> List[float]:
        """Get an embedding vector for a piece of text."""
        response = self._client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return response.data[0].embedding

    def store_fact(self, content: str, source: str, category: str) -> str:
        """Embed and store a single fact; return its generated id."""
        embedding = self._embed(content)
        fact = Fact(
            fact_id=str(uuid.uuid4()),
            content=content,
            source=source,
            category=category,
            embedding=embedding,
        )
        self.facts.append(fact)
        return fact.fact_id

    def store_document(self, text: str, source: str, category: str, chunk_size: int = 500) -> List[str]:
        """
        Break a document into chunks and store each as a fact.

        Args:
            text: Document text; it is split on whitespace.
            source: Source label recorded on every chunk.
            category: Category label recorded on every chunk.
            chunk_size: Approximate chunk length in characters.

        Returns:
            List of fact IDs, one per chunk (empty for empty text).
        """
        # Word-aligned chunking: words accumulate until the running character
        # count reaches chunk_size, so a chunk may overshoot by up to one
        # word. Use a proper chunker in production.
        chunks = []
        current_chunk = []
        current_length = 0
        for word in text.split():
            current_chunk.append(word)
            current_length += len(word) + 1  # +1 for the joining space
            if current_length >= chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Each chunk is embedded by its own store_fact call.
        return [self.store_fact(chunk, source, category) for chunk in chunks]

    def query(self, query: str, top_k: int = 5, category: Optional[str] = None) -> List[Fact]:
        """
        Retrieve the most relevant facts for a query.

        Args:
            query: Text to match against stored facts.
            top_k: Maximum number of facts to return; values <= 0 yield [].
            category: When given (non-empty), only facts in this category
                are considered.

        Returns:
            Up to ``top_k`` facts, most similar first.
        """
        if top_k <= 0:
            return []

        if category:
            candidates = [f for f in self.facts if f.category == category]
        else:
            candidates = self.facts

        # Nothing to rank: skip the embedding request entirely.
        if not candidates:
            return []

        query_emb = self._embed(query)
        scored = [
            (self._cosine_similarity(query_emb, f.embedding), f)
            for f in candidates
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [f for _, f in scored[:top_k]]

    def format_for_context(self, facts: List[Fact]) -> str:
        """
        Format retrieved facts for inclusion in the prompt.

        Returns an empty string for no facts; otherwise a header line
        followed by one ``[source] content`` line per fact.
        """
        if not facts:
            return ""
        parts = ["=== Relevant knowledge ==="]
        for fact in facts:
            parts.append(f"[{fact.source}] {fact.content}")
        return "\n".join(parts)


## Procedural Memory
Procedural memory is the most advanced and least directly controllable memory type. It refers to behaviors encoded in the model's weights through fine-tuning, RLHF, or constitutional AI training. You cannot easily "write" to procedural memory at runtime.
In practice, you can influence procedural behavior through:
- Few-shot prompting — Include examples of the behavior you want in the system prompt. This is "temporary procedural memory" stored in the context window.
- Fine-tuning — Train the model on examples of correct behavior. Changes persist across all invocations. High cost, but powerful for domain adaptation.
- Prompt templates — Store recurring prompt patterns (instructions, formats, constraints) and load them dynamically. This externalizes "how to behave" from the model itself.
class ProceduralMemory:
    """
    A named library of prompt templates.

    Each template captures 'how to behave' for one kind of task and is
    filled in with ``str.format`` when requested.
    """

    def __init__(self):
        self.procedures: dict = {}

    def register_procedure(self, name: str, template: str, description: str) -> None:
        """Add a template under ``name``, replacing any existing entry."""
        entry = {"template": template, "description": description}
        self.procedures[name] = entry

    def get_procedure(self, name: str, **kwargs) -> Optional[str]:
        """
        Look up a template and fill it with ``kwargs``.

        Returns None when no procedure is registered under ``name``. When
        the template needs a variable that was not supplied, a
        "Template error: ..." message string is returned instead of raising.
        """
        entry = self.procedures.get(name)
        if entry:
            try:
                rendered = entry["template"].format(**kwargs)
            except KeyError as missing:
                return f"Template error: missing variable {missing}"
            return rendered
        return None

    def list_procedures(self) -> List[dict]:
        """Return the name and description of every registered procedure."""
        catalog = []
        for proc_name, entry in self.procedures.items():
            catalog.append({"name": proc_name, "description": entry["description"]})
        return catalog
# Example: register common agent procedures
pm = ProceduralMemory()

# Structured analysis prompt; placeholders: {topic} and {content}.
pm.register_procedure(
    name="structured_analysis",
    template=(
        "Analyze the following {topic}. Structure your response as:\n"
        "1. Key findings (bullet points)\n"
        "2. Main risks or concerns\n"
        "3. Recommended actions\n"
        "4. Confidence level (High/Medium/Low) and why\n\n"
        "Topic: {content}"
    ),
    description="Structured analysis with findings, risks, actions, and confidence",
)

# SQL generation prompt; placeholders: {database_type}, {schema}, {requirements}.
pm.register_procedure(
    name="sql_generation",
    template=(
        "Generate a SQL query for a {database_type} database.\n"
        "Schema: {schema}\n"
        "Requirements: {requirements}\n"
        "Rules: Use parameterized queries. Never use SELECT *. "
        "Include appropriate WHERE clauses to limit result sets.\n"
        "Return only the SQL query, no explanation."
    ),
    description="Generate safe, parameterized SQL queries",
)


## Combining Memory Types in an Agent
A production agent can combine all four. The example below wires them together in a single class:
class MemoryAwareAgent:
    """
    Agent that combines all four memory types.

    In-context memory holds the live conversation, episodic and semantic
    memory are searched at session start, and a procedural template
    library is created for callers to use (no method of this class reads
    it).
    """

    def __init__(self, user_id: str, system_prompt: str):
        """
        Args:
            user_id: Owner used to filter and label episodic memories.
            system_prompt: Base system message for every session.
        """
        self.user_id = user_id
        # Kept so end_session can restore the context to exactly this prompt.
        self._system_prompt = system_prompt
        self.in_context = InContextMemory(max_messages=40, system_prompt=system_prompt)
        self.episodic = EpisodicMemory()
        self.semantic = SemanticMemory()
        self.procedural = ProceduralMemory()
        self._client = openai.OpenAI()

    def start_session(self, user_query: str) -> None:
        """
        Before responding, load relevant memories into context.
        Called at the start of each new session.

        Retrieved episodes and facts are appended as extra system messages.
        """
        # Load relevant episodic memories
        past_episodes = self.episodic.retrieve(user_query, user_id=self.user_id, top_k=2)
        if past_episodes:
            episode_context = self.episodic.format_for_context(past_episodes)
            self.in_context.add("system", episode_context)

        # Load relevant semantic knowledge
        relevant_facts = self.semantic.query(user_query, top_k=3)
        if relevant_facts:
            fact_context = self.semantic.format_for_context(relevant_facts)
            self.in_context.add("system", fact_context)

    def chat(self, user_message: str) -> str:
        """Process a single user message and return the agent's response."""
        self.in_context.add("user", user_message)
        response = self._client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.in_context.get_messages(),
        )
        # The model may return no text; normalize to "" so the history and
        # the return value are always strings.
        reply = response.choices[0].message.content or ""
        self.in_context.add("assistant", reply)
        return reply

    def end_session(self) -> None:
        """
        At session end, store the conversation in episodic memory and
        reset the context to the base system prompt.
        """
        conversation = [
            m for m in self.in_context.get_messages()
            if m["role"] in ("user", "assistant")
        ]
        if conversation:
            self.episodic.store(
                user_id=self.user_id,
                conversation=conversation,
                metadata={"session_end": time.time()},
            )

        # Drop everything, including the memory context start_session added
        # as system messages; otherwise it would pile up session after
        # session, since system messages are never evicted.
        self.in_context.clear(keep_system=False)
        if self._system_prompt:
            self.in_context.add("system", self._system_prompt)


## Summary
- In-context memory is immediate and perfect but limited by context window size
- Episodic memory stores past conversations retrieved by semantic similarity — enables personalization and continuity across sessions
- Semantic memory stores domain knowledge as searchable facts — the agent's knowledge base
- Procedural memory encodes behaviors in model weights or prompt templates — the "how to act" layer
- Most production agents combine in-context + at least one external memory type
- Episodic and semantic memory both use embedding-based retrieval — vector databases are the infrastructure layer
Next: Managing Context Window in Agents — how to keep agents effective as conversations grow.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.