Skip to main content
What you’ll build: A document Q&A system that ingests PDFs, processes them, and answers questions using RAG. Time: 15 minutes | Difficulty: Beginner

Overview

Upload documents, ask questions, get accurate answers with sources. Perfect for:
  • 📄 Legal document analysis
  • 📚 Research paper queries
  • 📋 Policy/handbook searches
  • 🔍 Technical documentation

Quick Start

import { use } from '@memvid/sdk';
import * as fs from 'fs';
import * as path from 'path';

// Create the document store (mode: 'create').
const mem = await use('basic', 'documents.mv2', { mode: 'create' });

// Pick out the PDFs in the docs folder, then add them one at a time.
const docsDir = './docs';
const pdfNames = fs.readdirSync(docsDir).filter((name) => name.endsWith('.pdf'));
for (const filename of pdfNames) {
  const file = path.join(docsDir, filename);
  await mem.put({ title: filename, label: 'document', file });
}

// Ask a question and request the supporting sources with the answer.
const answer = await mem.ask('What is the refund policy?', { returnSources: true });
console.log(`Answer: ${answer.answer}`);
// Fall back to 'n/a' when no source (or no title) came back.
const topSourceTitle = answer.sources?.[0]?.title ?? 'n/a';
console.log(`Source: ${topSourceTitle}`);

Full Implementation

Step 1: Document Processor Class

from memvid_sdk import use
from pathlib import Path
from typing import List, Optional
import hashlib

class DocumentQA:
    """Document Q&A system with Memvid.

    Wraps the memory object returned by ``use('basic', ...)`` and keeps
    running ingest counters in ``self.stats`` (keys ``"ingested"`` and
    ``"failed"``).
    """

    SUPPORTED_FORMATS = {'.pdf', '.docx', '.txt', '.md', '.html'}

    # Read size used while hashing, so a large document is never loaded
    # into memory in one piece just to compute its fingerprint.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, memory_path: str = "documents.mv2"):
        """Open the memory at ``memory_path`` (SDK ``mode='auto'``) and zero the counters."""
        self.mem = use('basic', memory_path, mode='auto')
        self.stats = {"ingested": 0, "failed": 0}

    @classmethod
    def _content_hash(cls, path: Path) -> str:
        """Return the first 8 hex digits of the MD5 of the file at ``path``."""
        digest = hashlib.md5()
        with path.open("rb") as fh:
            for chunk in iter(lambda: fh.read(cls._HASH_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()[:8]

    def ingest_file(self, filepath: str, metadata: Optional[dict] = None) -> bool:
        """Ingest a single file.

        Args:
            filepath: Path of the file to add.
            metadata: Extra metadata merged over the generated ``path``,
                ``size`` and ``hash`` entries (caller-supplied keys win).

        Returns:
            True when the file was stored. False when the extension is not
            in ``SUPPORTED_FORMATS`` (counters untouched) or when reading or
            storing the file raised (``stats["failed"]`` is incremented).
        """
        path = Path(filepath)

        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            print(f"⚠️ Unsupported format: {path.suffix}")
            return False

        try:
            # Short content fingerprint, streamed from disk in chunks.
            content_hash = self._content_hash(path)

            self.mem.put({
                "title": path.name,
                "label": "document",
                "file": str(path.absolute()),
                "metadata": {
                    "path": str(path),
                    "size": path.stat().st_size,
                    "hash": content_hash,
                    **(metadata or {})
                }
            })

            self.stats["ingested"] += 1
            print(f"✅ Ingested: {path.name}")
            return True

        except Exception as e:
            # Best-effort ingest: record and report the failure so that a
            # folder run can continue with the remaining files.
            self.stats["failed"] += 1
            print(f"❌ Failed: {path.name} - {e}")
            return False

    def ingest_folder(self, folder_path: str, recursive: bool = True) -> dict:
        """Ingest all documents from a folder.

        Args:
            folder_path: Directory to scan.
            recursive: Also scan sub-directories when True.

        Returns:
            A snapshot (copy) of the cumulative ``stats`` counters.
        """
        folder = Path(folder_path)

        pattern = "**/*" if recursive else "*"
        # Sorted so the ingest order does not depend on directory listing order.
        files = sorted(
            f for f in folder.glob(pattern)
            if f.is_file() and f.suffix.lower() in self.SUPPORTED_FORMATS
        )

        print(f"📂 Found {len(files)} documents to ingest...")

        for filepath in files:
            self.ingest_file(str(filepath))

        # Return a copy so callers cannot mutate the internal counters.
        return dict(self.stats)

    def ask(self, question: str, k: int = 5) -> dict:
        """Ask a question about the documents.

        Args:
            question: Question text forwarded to ``mem.ask``.
            k: Forwarded to ``mem.ask`` as ``k``.

        Returns:
            Dict with ``"answer"``, ``"sources"`` (a list of dicts with
            ``title``, ``snippet`` and ``score``) and ``"confidence"``
            (None when the result has no ``confidence`` attribute).
        """
        result = self.mem.ask(question, k=k)

        return {
            "answer": result.text,
            "sources": [
                {
                    "title": s.title,
                    "snippet": s.snippet,
                    "score": s.score
                }
                for s in result.sources
            ],
            "confidence": getattr(result, 'confidence', None)
        }

    def search(self, query: str, k: int = 10) -> List[dict]:
        """Search documents without generating an answer.

        Args:
            query: Search text forwarded to ``mem.find``.
            k: Forwarded to ``mem.find`` as ``k``.

        Returns:
            One dict per hit with ``title``, ``snippet``, ``score`` and
            ``metadata``.
        """
        results = self.mem.find(query, k=k)

        return [
            {
                "title": hit.title,
                "snippet": hit.snippet,
                "score": hit.score,
                "metadata": hit.metadata
            }
            for hit in results.hits
        ]

    def get_stats(self) -> dict:
        """Get document store statistics.

        Returns:
            Dict with ``total_documents`` (the store's ``frame_count``),
            ``size_bytes`` and ``size_mb`` (rounded to two decimals).
            Missing values are reported as 0.
        """
        stats = self.mem.stats()
        size_bytes = stats.get("size_bytes", 0)
        return {
            "total_documents": stats.get("frame_count", 0),
            "size_bytes": size_bytes,
            "size_mb": round(size_bytes / 1024 / 1024, 2)
        }

Step 2: Interactive CLI

def main():
    """Command-line entry point for the Document Q&A system.

    Sub-commands: ``ingest <path> [-r]``, ``ask <question>``,
    ``search <query>`` and ``stats``. With no sub-command the usage text is
    printed. The memory file is only opened once the arguments have been
    validated, so printing help or rejecting a bad path never touches it.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Document Q&A System")
    parser.add_argument("--memory", default="documents.mv2", help="Memory file path")

    subparsers = parser.add_subparsers(dest="command")

    # Ingest command
    ingest_parser = subparsers.add_parser("ingest", help="Ingest documents")
    ingest_parser.add_argument("path", help="File or folder path")
    ingest_parser.add_argument("--recursive", "-r", action="store_true")

    # Ask command
    ask_parser = subparsers.add_parser("ask", help="Ask a question")
    ask_parser.add_argument("question", help="Question to ask")

    # Search command
    search_parser = subparsers.add_parser("search", help="Search documents")
    search_parser.add_argument("query", help="Search query")

    # Stats command
    subparsers.add_parser("stats", help="Show statistics")

    args = parser.parse_args()

    # No sub-command given: show usage without opening the memory file.
    if args.command is None:
        parser.print_help()
        return

    # Reject a missing ingest path up front instead of silently reporting
    # zero documents; parser.error() prints the message and exits with 2.
    ingest_path = None
    if args.command == "ingest":
        ingest_path = Path(args.path)
        if not ingest_path.exists():
            parser.error(f"path not found: {ingest_path}")

    qa = DocumentQA(args.memory)

    if args.command == "ingest":
        if ingest_path.is_file():
            qa.ingest_file(str(ingest_path))
        else:
            qa.ingest_folder(str(ingest_path), recursive=args.recursive)
        print(f"\n📊 Ingested: {qa.stats['ingested']}, Failed: {qa.stats['failed']}")

    elif args.command == "ask":
        result = qa.ask(args.question)
        print(f"\n💡 Answer: {result['answer']}")
        print("\n📚 Sources:")
        # Only the first three sources are listed.
        for s in result['sources'][:3]:
            print(f"  - {s['title']} (score: {s['score']:.2f})")

    elif args.command == "search":
        results = qa.search(args.query)
        print(f"\n🔍 Found {len(results)} results:")
        # Show the first five hits, each snippet cut to 150 characters.
        for r in results[:5]:
            print(f"\n  📄 {r['title']} (score: {r['score']:.2f})")
            print(f"     {r['snippet'][:150]}...")

    elif args.command == "stats":
        stats = qa.get_stats()
        print("\n📊 Document Store Statistics:")
        print(f"  Documents: {stats['total_documents']}")
        print(f"  Size: {stats['size_mb']} MB")


if __name__ == "__main__":
    main()

Web API

Deploy as a REST API:
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
import tempfile
import shutil

# FastAPI application object; the route decorators in this file register on it.
app = FastAPI(title="Document Q&A API")
# Single DocumentQA instance, created at import time and used by every endpoint.
qa = DocumentQA("documents.mv2")


class QuestionRequest(BaseModel):
    # Request body of the POST /ask endpoint.
    question: str  # question text handed to qa.ask
    k: int = 5  # forwarded to qa.ask as k; defaults to 5


class SearchRequest(BaseModel):
    # Request body of the POST /search endpoint.
    query: str  # search text handed to qa.search
    k: int = 10  # forwarded to qa.search as k; defaults to 10


@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Upload and ingest a document."""
    import os

    # The client-supplied filename is untrusted (it may be missing or contain
    # path separators), so only its extension is used for the temp file. The
    # extension is still needed because ingest_file filters on the suffix.
    original_name = file.filename or ""
    suffix = os.path.splitext(os.path.basename(original_name))[1]

    tmp_path = None
    try:
        # Save to temp file
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        # Ingest
        success = qa.ingest_file(tmp_path, metadata={"original_name": file.filename})
    finally:
        # delete=False leaves the file on disk, so remove it here whether or
        # not ingestion worked.
        # NOTE(review): assumes the store has finished reading the file by the
        # time ingest_file returns - confirm against the SDK.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup; the ingest result is what matters

    if not success:
        raise HTTPException(400, "Failed to ingest document")

    return {"status": "success", "filename": file.filename}


@app.post("/ask")
async def ask_question(request: QuestionRequest):
    """Ask a question about the documents."""
    # Unpack the request body and delegate to the module-level DocumentQA;
    # the dict it returns is sent back as the response.
    question, top_k = request.question, request.k
    answer = qa.ask(question, k=top_k)
    return answer


@app.post("/search")
async def search_documents(request: SearchRequest):
    """Search documents."""
    # Unpack the request body and delegate to the module-level DocumentQA;
    # the list of hit dicts it returns is sent back as the response.
    query, top_k = request.query, request.k
    hits = qa.search(query, k=top_k)
    return hits


@app.get("/stats")
async def get_stats():
    """Get document store statistics."""
    # Delegate to the module-level DocumentQA and return its summary dict.
    store_stats = qa.get_stats()
    return store_stats

Usage Examples

Legal Document Analysis

# Keep the contracts in their own memory file.
qa = DocumentQA("legal-docs.mv2")

# Load every supported document found under ./contracts/
qa.ingest_folder("./contracts/")

# Question answering: print only the generated answer text
result = qa.ask("What are the termination clauses in the vendor contracts?")
answer_text = result["answer"]
print(answer_text)

# Plain search (no generated answer): one "title: snippet" line per hit
matches = qa.search("indemnification liability")
for hit in matches:
    title, snippet = hit["title"], hit["snippet"]
    print(f"{title}: {snippet}")

Research Paper Analysis

# Separate memory file for the papers; sub-folders are scanned as well.
qa = DocumentQA("research-papers.mv2")
qa.ingest_folder("./papers/", recursive=True)

# One question whose answer is drawn from the ingested papers
question = (
    "What are the main approaches to transformer optimization "
    "mentioned across these papers?"
)
result = qa.ask(question)

print("Summary:", result["answer"])
print("\nKey papers:")
# List the title of every source returned with the answer.
for title in (src["title"] for src in result["sources"]):
    print(f"  - {title}")

Next Steps