What you’ll build: A document Q&A system that ingests PDFs, processes them, and answers questions using retrieval-augmented generation (RAG).
Time: 15 minutes | Difficulty: Beginner
Overview
Upload documents, ask questions, and get accurate answers with sources. Perfect for:
- 📄 Legal document analysis
- 📚 Research paper queries
- 📋 Policy/handbook searches
- 🔍 Technical documentation
Quick Start
- Node.js
- Python
- CLI
Copy
import { use } from '@memvid/sdk';
import * as fs from 'fs';
import * as path from 'path';
// Open a new memory file that will hold the documents
const mem = await use('basic', 'documents.mv2', { mode: 'create' });

// Pick out the PDF files in the docs folder, then add them one at a time
const docsDir = './docs';
const pdfNames = fs.readdirSync(docsDir).filter((name) => name.endsWith('.pdf'));
for (const filename of pdfNames) {
  const document = {
    title: filename,
    label: 'document',
    file: path.join(docsDir, filename),
  };
  await mem.put(document);
}

// Query the store and print the answer with the title of its first source
const answer = await mem.ask('What is the refund policy?', { returnSources: true });
const firstSourceTitle = answer.sources?.[0]?.title ?? 'n/a';
console.log(`Answer: ${answer.answer}`);
console.log(`Source: ${firstSourceTitle}`);
Copy
from memvid_sdk import use
import os
# Open a new memory file that will hold the documents
mem = use('basic', 'documents.mv2', mode='create')

# Collect the PDF file names first, then add each one to the store
pdf_names = [name for name in os.listdir('./docs') if name.endswith('.pdf')]
for filename in pdf_names:
    document = {
        "title": filename,
        "label": "document",
        "file": f"./docs/{filename}",
    }
    mem.put(document)

# Query the store and print the answer with the title of its first source.
# An absent or empty source list falls back to 'n/a'.
answer = mem.ask("What is the refund policy?")
sources = answer.get('sources') or [{}]
print(f"Answer: {answer.get('answer')}")
print(f"Source: {sources[0].get('title', 'n/a')}")
Copy
# Memory file shared by every command below
store="documents.mv2"

# Create the store, then ingest the docs folder into it
memvid create "$store"
memvid put "$store" --input ./docs/

# Query the ingested documents
memvid ask "$store" --question "What is the refund policy?"
Full Implementation
Step 1: Document Processor Class
Copy
from memvid_sdk import use
from pathlib import Path
from typing import List, Optional
import hashlib
class DocumentQA:
    """Document Q&A system with Memvid.

    Wraps a Memvid memory object and offers helpers to ingest single files
    or whole folders, ask questions, run plain searches and report store
    statistics. Ingestion outcomes are counted in ``self.stats``.
    """

    # File extensions (lower-case, with the leading dot) accepted for ingestion.
    SUPPORTED_FORMATS = {'.pdf', '.docx', '.txt', '.md', '.html'}

    # Read size used while hashing, so large files are never loaded whole.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, memory_path: str = "documents.mv2"):
        """Open *memory_path* via ``use('basic', ..., mode='auto')``.

        Also resets the ``ingested`` / ``failed`` counters to zero.
        """
        self.mem = use('basic', memory_path, mode='auto')
        self.stats = {"ingested": 0, "failed": 0}

    @staticmethod
    def _field(obj, name: str, default=None):
        """Read *name* from *obj*, whether it is a dict or an object.

        Other examples in this guide treat SDK results as dicts
        (``answer.get('answer')``) while this class used attribute access,
        so both shapes are accepted here.
        """
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    def _content_hash(self, path: Path) -> str:
        """Return the first 8 hex characters of the file's MD5 digest."""
        digest = hashlib.md5()
        with path.open("rb") as handle:
            for chunk in iter(lambda: handle.read(self._HASH_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()[:8]

    def ingest_file(self, filepath: str, metadata: Optional[dict] = None) -> bool:
        """Ingest a single file.

        Args:
            filepath: Path of the file to ingest.
            metadata: Extra metadata merged over the generated ``path``,
                ``size`` and ``hash`` entries (caller values win on clashes).

        Returns:
            True when the file was stored; False when its extension is not
            in ``SUPPORTED_FORMATS`` or when ingestion raised. Only raised
            errors increment ``stats["failed"]``.
        """
        path = Path(filepath)
        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            print(f"⚠️ Unsupported format: {path.suffix}")
            return False
        try:
            # Short content fingerprint (not a guaranteed-unique ID).
            content_hash = self._content_hash(path)
            self.mem.put({
                "title": path.name,
                "label": "document",
                "file": str(path.absolute()),
                "metadata": {
                    "path": str(path),
                    "size": path.stat().st_size,
                    "hash": content_hash,
                    **(metadata or {}),
                },
            })
        except Exception as e:
            # Best-effort ingestion: record the failure and keep going so a
            # single bad file does not abort a folder run.
            self.stats["failed"] += 1
            print(f"❌ Failed: {path.name} - {e}")
            return False
        self.stats["ingested"] += 1
        print(f"✅ Ingested: {path.name}")
        return True

    def ingest_folder(self, folder_path: str, recursive: bool = True) -> dict:
        """Ingest all supported documents from a folder.

        Args:
            folder_path: Folder to scan.
            recursive: When True, sub-folders are scanned as well.

        Returns:
            ``self.stats``, the cumulative ingested/failed counters.
        """
        folder = Path(folder_path)
        pattern = "**/*" if recursive else "*"
        # Sorted so the ingestion order is the same on every run.
        files = sorted(
            f for f in folder.glob(pattern)
            if f.is_file() and f.suffix.lower() in self.SUPPORTED_FORMATS
        )
        print(f"📂 Found {len(files)} documents to ingest...")
        for filepath in files:
            self.ingest_file(str(filepath))
        return self.stats

    def ask(self, question: str, k: int = 5) -> dict:
        """Ask a question about the documents.

        Args:
            question: The question text.
            k: Passed through to the memory's ``ask`` call.

        Returns:
            A dict with ``answer``, ``sources`` (a list of title / snippet /
            score dicts) and ``confidence`` (None when not provided).
        """
        result = self.mem.ask(question, k=k)
        # The answer text is looked up as "text" first, then as "answer".
        answer = self._field(result, "text")
        if answer is None:
            answer = self._field(result, "answer")
        return {
            "answer": answer,
            "sources": [
                {
                    "title": self._field(s, "title"),
                    "snippet": self._field(s, "snippet"),
                    "score": self._field(s, "score"),
                }
                # A missing or empty source list yields no sources.
                for s in (self._field(result, "sources") or [])
            ],
            "confidence": self._field(result, "confidence"),
        }

    def search(self, query: str, k: int = 10) -> List[dict]:
        """Search documents without generating an answer.

        Args:
            query: The search text.
            k: Passed through to the memory's ``find`` call.

        Returns:
            One dict per hit with ``title``, ``snippet``, ``score`` and
            ``metadata``.
        """
        results = self.mem.find(query, k=k)
        return [
            {
                "title": self._field(hit, "title"),
                "snippet": self._field(hit, "snippet"),
                "score": self._field(hit, "score"),
                "metadata": self._field(hit, "metadata"),
            }
            for hit in (self._field(results, "hits") or [])
        ]

    def get_stats(self) -> dict:
        """Get document store statistics.

        Returns:
            A dict with ``total_documents`` (the store's ``frame_count``),
            ``size_bytes`` and ``size_mb`` (rounded to two decimals); absent
            values default to 0.
        """
        stats = self.mem.stats()
        size_bytes = self._field(stats, "size_bytes", 0)
        return {
            "total_documents": self._field(stats, "frame_count", 0),
            "size_bytes": size_bytes,
            "size_mb": round(size_bytes / 1024 / 1024, 2),
        }
Step 2: Interactive CLI
Copy
def main():
    """Command-line entry point for the document Q&A tool.

    Sub-commands are ``ingest <path>``, ``ask <question>``,
    ``search <query>`` and ``stats``. The global ``--memory`` option selects
    the memory file (default ``documents.mv2``). With no sub-command the
    help text is printed and the memory file is left untouched.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Document Q&A System")
    parser.add_argument("--memory", default="documents.mv2", help="Memory file path")
    subparsers = parser.add_subparsers(dest="command")

    # Ingest command
    ingest_parser = subparsers.add_parser("ingest", help="Ingest documents")
    ingest_parser.add_argument("path", help="File or folder path")
    # Folders are scanned non-recursively unless -r/--recursive is given.
    ingest_parser.add_argument("--recursive", "-r", action="store_true")

    # Ask command
    ask_parser = subparsers.add_parser("ask", help="Ask a question")
    ask_parser.add_argument("question", help="Question to ask")

    # Search command
    search_parser = subparsers.add_parser("search", help="Search documents")
    search_parser.add_argument("query", help="Search query")

    # Stats command
    subparsers.add_parser("stats", help="Show statistics")

    args = parser.parse_args()

    # Print help before the store is opened, so running with no sub-command
    # never touches the memory file.
    if args.command is None:
        parser.print_help()
        return

    ingest_path = None
    if args.command == "ingest":
        ingest_path = Path(args.path)
        # Reject a bad path up front instead of reporting "0 documents".
        if not ingest_path.exists():
            parser.error(f"path not found: {ingest_path}")

    qa = DocumentQA(args.memory)

    if args.command == "ingest":
        if ingest_path.is_file():
            qa.ingest_file(str(ingest_path))
        else:
            qa.ingest_folder(str(ingest_path), recursive=args.recursive)
        print(f"\n📊 Ingested: {qa.stats['ingested']}, Failed: {qa.stats['failed']}")
    elif args.command == "ask":
        result = qa.ask(args.question)
        print(f"\n💡 Answer: {result['answer']}")
        print("\n📚 Sources:")
        # Only the first three sources are listed.
        for s in result['sources'][:3]:
            print(f" - {s['title']} (score: {s['score']:.2f})")
    elif args.command == "search":
        results = qa.search(args.query)
        print(f"\n🔍 Found {len(results)} results:")
        # Only the first five hits are shown, each snippet cut to 150 chars.
        for r in results[:5]:
            print(f"\n 📄 {r['title']} (score: {r['score']:.2f})")
            # A hit without a snippet prints as an empty preview.
            print(f" {(r['snippet'] or '')[:150]}...")
    elif args.command == "stats":
        stats = qa.get_stats()
        print("\n📊 Document Store Statistics:")
        print(f" Documents: {stats['total_documents']}")
        print(f" Size: {stats['size_mb']} MB")


if __name__ == "__main__":
    main()
Web API
Deploy as a REST API:
Copy
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
import tempfile
import shutil
# FastAPI application object; the route decorators below register on it.
app = FastAPI(title="Document Q&A API")
# Single DocumentQA instance, created at import time and used by every endpoint.
qa = DocumentQA("documents.mv2")
class QuestionRequest(BaseModel):
    """Request body for the ``/ask`` endpoint."""

    # The question text to answer.
    question: str
    # Forwarded as ``k`` to ``DocumentQA.ask``; defaults to 5.
    k: int = 5
class SearchRequest(BaseModel):
    """Request body for the ``/search`` endpoint."""

    # The search text.
    query: str
    # Forwarded as ``k`` to ``DocumentQA.search``; defaults to 10.
    k: int = 10
@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Upload and ingest a document.

    The upload is written into a private temporary directory under its own
    base name, ingested through the shared ``DocumentQA`` instance, and the
    directory is removed afterwards whether or not ingestion succeeded.

    Raises:
        HTTPException: 400 when no usable filename was sent or when
            ingestion fails (including unsupported formats).
    """
    from pathlib import Path

    # Keep only the final path component: the filename is client-controlled
    # and must not be able to point outside the temporary directory.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if not safe_name:
        raise HTTPException(400, "Missing filename")

    # The directory and its contents are deleted when the block exits.
    # NOTE(review): assumes ingest_file() has fully read the file by the time
    # it returns — confirm against the SDK's put() behaviour.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / safe_name
        with tmp_path.open("wb") as tmp:
            shutil.copyfileobj(file.file, tmp)
        success = qa.ingest_file(
            str(tmp_path), metadata={"original_name": file.filename}
        )

    if not success:
        raise HTTPException(400, "Failed to ingest document")
    return {"status": "success", "filename": file.filename}
@app.post("/ask")
async def ask_question(request: QuestionRequest):
    """Forward the question and ``k`` to ``qa.ask`` and return its result."""
    answer = qa.ask(request.question, k=request.k)
    return answer
@app.post("/search")
async def search_documents(request: SearchRequest):
    """Forward the query and ``k`` to ``qa.search`` and return the hits."""
    hits = qa.search(request.query, k=request.k)
    return hits
@app.get("/stats")
async def get_stats():
    """Return the statistics dict produced by ``qa.get_stats()``."""
    store_stats = qa.get_stats()
    return store_stats
Usage Examples
Legal Document Analysis
Copy
# Separate store for the legal documents
qa = DocumentQA("legal-docs.mv2")

# Load every supported document found under ./contracts/
qa.ingest_folder("./contracts/")

# Question answering over the ingested contracts
termination_question = "What are the termination clauses in the vendor contracts?"
result = qa.ask(termination_question)
print(result["answer"])

# Plain search for specific terms, printing each hit's title and snippet
matches = qa.search("indemnification liability")
for match in matches:
    print(f"{match['title']}: {match['snippet']}")
Research Paper Analysis
Copy
# Separate store for the research papers, filled from ./papers/ and its sub-folders
qa = DocumentQA("research-papers.mv2")
qa.ingest_folder("./papers/", recursive=True)

# One question that spans all ingested papers
synthesis_question = (
    "What are the main approaches to transformer optimization "
    "mentioned across these papers?"
)
result = qa.ask(synthesis_question)
print("Summary:", result["answer"])

# List the title of every source behind the answer
print("\nKey papers:")
for paper in result["sources"]:
    print(f" - {paper['title']}")