How to Implement Memory in a Python AI App
Before You Start
You need Python 3.10 or later, a PostgreSQL instance with the pgvector extension, and an OpenAI API key. The code uses OpenAI for both the LLM and embeddings, but you can substitute Anthropic, Voyage, or any other provider by changing the API calls. The memory layer itself is provider-agnostic.
Step-by-Step Implementation
Set up a virtual environment and install the required packages. You need the OpenAI SDK for embeddings and chat, psycopg2 for PostgreSQL access, and pgvector's Python bindings for the vector type.
```bash
python -m venv memory-env
source memory-env/bin/activate
pip install openai psycopg2-binary pgvector
```

Make sure your PostgreSQL instance has the pgvector extension installed. Run `CREATE EXTENSION IF NOT EXISTS vector;` in your database if you have not already.
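If you want to confirm the extension is actually available before going further, here is a quick check from Python, assuming the same connection string you will use below:

```python
import psycopg2

# Prints the installed pgvector version, or a warning if it is missing
conn = psycopg2.connect("postgresql://localhost/myapp")  # adjust to your database
with conn.cursor() as cur:
    cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'vector'")
    row = cur.fetchone()
    print(f"pgvector {row[0]} installed" if row else "pgvector NOT installed")
conn.close()
```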
Build a MemoryManager class that encapsulates all memory operations: connecting to the database, creating the schema, embedding text, storing memories, and searching by similarity.
```python
import os

from openai import OpenAI
import psycopg2
from pgvector.psycopg2 import register_vector


class MemoryManager:
    def __init__(self, db_url, user_id):
        self.client = OpenAI()
        self.user_id = user_id
        self.conn = psycopg2.connect(db_url)
        register_vector(self.conn)  # teach psycopg2 about the vector type
        self._ensure_schema()

    def _ensure_schema(self):
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS memories (
                    id SERIAL PRIMARY KEY,
                    user_id VARCHAR(64) NOT NULL,
                    content TEXT NOT NULL,
                    embedding vector(1536),
                    memory_type VARCHAR(32) DEFAULT 'observation',
                    confidence FLOAT DEFAULT 1.0,
                    access_count INTEGER DEFAULT 0,
                    created_at TIMESTAMP DEFAULT NOW(),
                    last_accessed TIMESTAMP DEFAULT NOW()
                )
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_memories_user
                ON memories (user_id)
            """)
        self.conn.commit()

    def _embed(self, text):
        # text-embedding-3-small produces 1536-dimensional vectors,
        # matching the vector(1536) column above
        response = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding

    def store(self, content, memory_type="observation"):
        embedding = self._embed(content)
        with self.conn.cursor() as cur:
            # Check for near-duplicates: a cosine distance under 0.05
            # means this memory is essentially already stored
            cur.execute("""
                SELECT id FROM memories
                WHERE user_id = %s
                  AND embedding <=> %s::vector < 0.05
                LIMIT 1
            """, (self.user_id, embedding))
            existing = cur.fetchone()
            if existing:
                # Reinforce the existing memory instead of duplicating it
                cur.execute("""
                    UPDATE memories
                    SET access_count = access_count + 1,
                        last_accessed = NOW()
                    WHERE id = %s
                """, (existing[0],))
            else:
                cur.execute("""
                    INSERT INTO memories
                        (user_id, content, embedding, memory_type)
                    VALUES (%s, %s, %s::vector, %s)
                """, (self.user_id, content, embedding, memory_type))
        self.conn.commit()

    def search(self, query, limit=5, min_confidence=0.0):
        query_embedding = self._embed(query)
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, content, memory_type, confidence,
                       access_count, created_at,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM memories
                WHERE user_id = %s AND confidence >= %s
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, self.user_id,
                  min_confidence, query_embedding, limit))
            results = []
            ids_accessed = []
            for row in cur.fetchall():
                ids_accessed.append(row[0])
                results.append({
                    "content": row[1],
                    "type": row[2],
                    "confidence": row[3],
                    "access_count": row[4],
                    "created_at": row[5].isoformat(),
                    "similarity": round(row[6], 4)
                })
            # Update access tracking for everything we just retrieved
            if ids_accessed:
                cur.execute("""
                    UPDATE memories
                    SET access_count = access_count + 1,
                        last_accessed = NOW()
                    WHERE id = ANY(%s)
                """, (ids_accessed,))
        self.conn.commit()
        return results
```
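Before layering extraction and chat on top, you can sanity-check storage and retrieval in isolation. A minimal sketch, reusing the imports above and assuming DATABASE_URL and OPENAI_API_KEY are set (the sample memories are illustrative):

```python
memory = MemoryManager(os.environ["DATABASE_URL"], "user_001")

# Store a couple of sample memories
memory.store("User prefers TypeScript over JavaScript",
             memory_type="preference")
memory.store("User is building an internal analytics dashboard",
             memory_type="fact")

# The query shares no keywords with the stored text, but the
# embeddings should still place them close together
for m in memory.search("which languages does the user like?", limit=3):
    print(m["similarity"], m["type"], m["content"])
```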
Implement a method that uses the LLM to identify memories worth storing from a conversation. The extraction prompt should focus on lasting information that would be useful in future sessions.

```python
class MemoryManager:
    # ... methods from above ...

    def extract_and_store(self, conversation_text):
        extraction_prompt = """Analyze this conversation and extract
        information worth remembering for future interactions.

        For each item, output it as: TYPE: content
        Where TYPE is one of: fact, preference, decision, outcome

        Rules:
        - Only extract information with lasting value
        - Skip greetings, clarifications, and session-specific context
        - Be specific and self-contained in each extracted item
        - Output NONE if nothing is worth storing"""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": conversation_text}
            ],
            temperature=0.1  # low temperature keeps extraction consistent
        )
        text = response.choices[0].message.content.strip()
        if text == "NONE":
            return 0
        count = 0
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue
            if ":" in line:
                # Parse "TYPE: content" lines into typed memories;
                # lines with an unrecognized type are dropped
                parts = line.split(":", 1)
                mtype = parts[0].strip().lower()
                content = parts[1].strip()
                if mtype in ("fact", "preference",
                             "decision", "outcome"):
                    self.store(content, memory_type=mtype)
                    count += 1
            else:
                # Untyped lines are stored as generic observations
                self.store(line, memory_type="observation")
                count += 1
        return count
```
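Called on a raw transcript, the method returns how many memories it stored. A quick illustration, continuing the snippet above (the transcript and the resulting count are made up for the example):

```python
transcript = (
    "User: We decided to migrate the API from REST to gRPC next quarter.\n"
    "Assistant: Noted. Want me to draft a migration checklist?\n"
    "User: Yes, and remember that I prefer checklists in Markdown."
)

stored = memory.extract_and_store(transcript)
print(f"Stored {stored} memories")  # e.g. 2: one decision, one preference
```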
Create a MemoryChat class that retrieves relevant memories before each model call and injects them into the system message. The model reads the memory context and incorporates it naturally into its responses.

```python
class MemoryChat:
    def __init__(self, memory_manager, base_prompt):
        self.memory = memory_manager
        self.base_prompt = base_prompt
        self.client = OpenAI()
        self.history = []

    def _build_system_message(self, user_message):
        # Retrieve memories relevant to the current message and
        # append them to the base system prompt
        memories = self.memory.search(user_message, limit=5)
        if not memories:
            return self.base_prompt
        context = "\n\nContext from previous interactions:\n"
        for m in memories:
            created = m["created_at"][:10]  # keep just the date
            context += f"- [{m['type']}] ({created}) {m['content']}\n"
        return self.base_prompt + context

    def chat(self, user_message):
        system_msg = self._build_system_message(user_message)
        self.history.append({
            "role": "user",
            "content": user_message
        })
        messages = [{"role": "system", "content": system_msg}]
        messages.extend(self.history[-20:])  # last 20 messages (10 turns)
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )
        assistant_msg = response.choices[0].message.content
        self.history.append({
            "role": "assistant",
            "content": assistant_msg
        })
        return assistant_msg
```
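To make the injection concrete, here is roughly what `_build_system_message` returns when two memories match, assuming a `MemoryChat` instance named `chat` with the base prompt from the next step (the memories and dates shown are illustrative):

```python
print(chat._build_system_message("What stack am I using?"))
# You are a helpful assistant. Use context from previous
# interactions when relevant.
#
# Context from previous interactions:
# - [fact] (2025-01-15) User is building a SaaS app with Next.js and Supabase
# - [decision] (2025-01-15) User chose Stripe for payments
```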
Wire everything together into a main loop. The loop handles user input, generates responses with memory context, and extracts new memories at the end of the session.

```python
def main():
    db_url = os.environ.get(
        "DATABASE_URL",
        "postgresql://localhost/myapp"
    )
    user_id = "user_001"
    memory = MemoryManager(db_url, user_id)
    chat = MemoryChat(
        memory,
        "You are a helpful assistant. Use context from "
        "previous interactions when relevant."
    )
    print("Chat with memory (type 'quit' to exit)")
    conversation_log = []
    while True:
        user_input = input("\nYou: ").strip()
        if user_input.lower() == "quit":
            break
        response = chat.chat(user_input)
        print(f"\nAssistant: {response}")
        conversation_log.append(f"User: {user_input}")
        conversation_log.append(f"Assistant: {response}")
    # Extract and store memories from this session
    if conversation_log:
        full_text = "\n".join(conversation_log)
        count = memory.extract_and_store(full_text)
        print(f"\nExtracted {count} memories from this session.")


if __name__ == "__main__":
    main()
```
Run the application, have a conversation with some memorable facts, exit, then restart. In the new session, ask about the information from the previous session. The model should reference stored memories in its response.

```text
# Session 1:
# You: I'm building a SaaS app with Next.js and Supabase
# You: We chose Stripe for payments and Resend for email
# You: quit
# -> Extracted 3 memories from this session.

# Session 2:
# You: What payment provider am I using?
# Assistant: Based on our previous conversations, you are
# using Stripe for payments in your SaaS application...
```

Production Improvements
The implementation above is a complete working system, but production deployments benefit from several enhancements. Connection pooling (using psycopg2.pool or asyncpg) prevents database connection exhaustion under load. Async embedding calls (using the async OpenAI client) reduce latency by overlapping embedding generation with other work. Batch extraction (processing the entire session at once rather than per-message) reduces LLM API costs significantly.
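As a sketch of the pooling change, here is one way to borrow connections from psycopg2's built-in pool; the pool sizes are placeholders to tune for your workload:

```python
import os

from psycopg2.pool import SimpleConnectionPool
from pgvector.psycopg2 import register_vector

# One pool per process; min/max sizes here are illustrative
pool = SimpleConnectionPool(minconn=2, maxconn=10,
                            dsn=os.environ["DATABASE_URL"])

def with_connection(fn):
    # Borrow a connection, run the operation, always return it to the pool
    conn = pool.getconn()
    try:
        register_vector(conn)
        return fn(conn)
    finally:
        pool.putconn(conn)
```

Note that `register_vector` is per-connection, so it runs on every borrow here; registering once when the pool first creates a connection would avoid the repeated catalog lookup.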
For retrieval quality, consider adding cognitive scoring on top of vector similarity. Adaptive Recall implements ACT-R scoring that factors in recency, access frequency, entity graph connections, and confidence weighting. This produces rankings that are measurably better than cosine similarity alone, especially as the memory store grows beyond a few hundred entries and simple similarity starts returning too many loosely related results.
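Adaptive Recall's scoring is its own implementation, but the general shape of blending similarity with recency, frequency, and confidence can be sketched in a few lines. Everything below is illustrative: the weights, the 30-day decay constant, and the function itself are placeholders, not the product's algorithm:

```python
import math
import time

def activation_score(similarity, last_accessed_ts, access_count, confidence):
    # Recency: exponential decay since last access (30-day decay constant)
    age_days = (time.time() - last_accessed_ts) / 86400
    recency = math.exp(-age_days / 30.0)
    # Frequency: diminishing returns on repeated access
    frequency = math.log1p(access_count)
    # Blend with vector similarity; weights are arbitrary placeholders
    return 0.6 * similarity + 0.2 * recency + 0.1 * frequency + 0.1 * confidence
```

You would use something like this as a re-rank step: fetch a larger candidate set from search (say limit=50), score each result, and keep the top five. The memories table already tracks last_accessed and access_count, so re-ranking only requires adding last_accessed to the SELECT in search.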
Get production-ready memory for your Python app without building the infrastructure. Adaptive Recall provides the full memory stack through a simple API.
Get Started Free