What you’ll build: An AI-powered research assistant that ingests papers, identifies themes, and answers complex questions across your research corpus.
Time: 30 minutes | Difficulty: Intermediate

Overview

Build a research assistant that:
  • 📄 Ingests PDFs from arXiv, local files, and URLs
  • 🔬 Extracts key findings and methodology
  • 🔗 Identifies connections between papers
  • ❓ Answers questions citing specific papers
  • 📊 Generates literature reviews

Quick Start

from memvid_sdk import use
import requests
import tempfile
import xml.etree.ElementTree as ET

class ResearchAssistant:
    def __init__(self, memory_path: str = "research.mv2"):
        self.mem = use('llamaindex', memory_path, mode='auto')

    def add_paper(self, title: str, pdf_path: str, metadata: dict = None):
        """Add a research paper to the corpus."""
        self.mem.put({
            "title": title,
            "label": "paper",
            "file": pdf_path,
            "metadata": metadata or {}
        })
        print(f"✅ Added: {title}")

    def add_arxiv(self, arxiv_id: str):
        """Add a paper from arXiv."""
        # Download the PDF
        url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
            f.write(response.content)
            pdf_path = f.name

        # Get metadata from the arXiv Atom API and parse out the title
        # (authors, abstract, etc. parse the same way)
        meta_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
        meta_response = requests.get(meta_url, timeout=30)
        title = f"arXiv:{arxiv_id}"  # fallback title if parsing fails
        if meta_response.ok:
            ns = {"atom": "http://www.w3.org/2005/Atom"}
            entry = ET.fromstring(meta_response.text).find("atom:entry", ns)
            if entry is not None:
                raw = entry.findtext("atom:title", default=title, namespaces=ns)
                title = " ".join(raw.split())  # collapse wrapped lines in long titles

        self.add_paper(
            title=title,
            pdf_path=pdf_path,
            metadata={"arxiv_id": arxiv_id, "source": "arxiv"}
        )

    def search(self, query: str, k: int = 10):
        """Search across all papers."""
        return self.mem.find(query, k=k)

    def ask(self, question: str):
        """Ask a question about the research corpus."""
        return self.mem.ask(question, k=5)

    def find_related(self, paper_title: str, k: int = 5):
        """Find papers related to a specific paper."""
        return self.mem.find(f"related to {paper_title}", k=k)


# Usage
assistant = ResearchAssistant()

# Add papers
assistant.add_arxiv("2301.07041")  # LLaMA paper
assistant.add_arxiv("2302.13971")  # LLaMA 2 paper
assistant.add_arxiv("2303.08774")  # GPT-4 Technical Report

# Ask questions
answer = assistant.ask(
    "What are the key differences between LLaMA and GPT-4 architectures?"
)
print(answer["answer"])

# Find related work
related = assistant.find_related("LLaMA", k=5)
for hit in related.get("hits", []):
    print(f"- {hit['title']} (score: {hit['score']:.2f})")

Advanced Features

Literature Review Generator

def generate_literature_review(self, topic: str, max_papers: int = 20) -> str:
    """Generate a literature review on a topic."""

    # Find relevant papers
    papers = self.search(topic, k=max_papers)

    # Build context from papers
    context = "\n\n".join([
        f"Paper: {p.title}\nKey points: {p.snippet}"
        for p in papers.hits
    ])

    # Generate review using LLM
    prompt = f"""Based on these research papers, write a comprehensive literature review on "{topic}":

{context}

Structure the review with:
1. Introduction and background
2. Key themes and findings
3. Methodological approaches
4. Gaps and future directions
5. Conclusion

Literature Review:"""

    # Use the ask() method with context
    review = self.mem.ask(prompt, k=max_papers)
    return review.get("answer", "")
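Assuming the method is attached to ResearchAssistant like the other snippets on this page, usage looks like this (the topic string is just an example):

assistant = ResearchAssistant("research.mv2")
review = assistant.generate_literature_review("efficient attention mechanisms")
print(review)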

Theme Extraction

def extract_themes(self, k: int = 100) -> list:
    """Extract main research themes from the corpus."""

    # Get sample of papers
    timeline = self.mem.timeline(limit=k)

    # Cluster by topic (simplified)
    themes = {}
    for entry in timeline.entries:
        # Extract keywords from the entry text
        # (extract_keywords is not part of the SDK; a sketch follows below)
        keywords = extract_keywords(entry.text)
        for kw in keywords:
            themes[kw] = themes.get(kw, 0) + 1

    # Sort by frequency
    sorted_themes = sorted(themes.items(), key=lambda x: -x[1])
    return sorted_themes[:20]
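The extract_keywords helper used above is not provided by the SDK. Here is a minimal, purely illustrative sketch based on word frequency with a stop-word filter; a real deployment would likely swap in a proper keyphrase extractor such as KeyBERT or RAKE:

import re
from collections import Counter

STOPWORDS = {
    "the", "a", "an", "and", "or", "of", "in", "on", "for", "with",
    "to", "we", "our", "is", "are", "this", "that", "by", "as", "at",
}

def extract_keywords(text: str, top_n: int = 5) -> list:
    """Naive keyword extraction: most frequent non-stopword tokens."""
    tokens = re.findall(r"[a-z][a-z\-]{2,}", text.lower())
    counts = Counter(t for t in tokens if t not in STOPWORDS)
    return [word for word, _ in counts.most_common(top_n)]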

Citation Network

def build_citation_network(self) -> dict:
    """Build a citation network from the corpus."""
    network = {"nodes": [], "edges": []}

    timeline = self.mem.timeline(limit=1000)

    for entry in timeline.entries:
        # Add node
        network["nodes"].append({
            "id": entry.frame_id,
            "title": entry.title
        })

        # Find papers it might cite (approximated by similarity;
        # find() returns a dict, as in the Quick Start)
        related = self.find_related(entry.title, k=5)
        for r in related.get("hits", []):
            if r["frame_id"] != entry.frame_id:
                network["edges"].append({
                    "source": entry.frame_id,
                    "target": r["frame_id"],
                    "weight": r["score"]
                })

    return network
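The returned dict is plain JSON-serializable data. One option for analyzing or visualizing it — an assumption on our part, not part of the SDK — is to load it into networkx and export GEXF for a tool like Gephi:

import networkx as nx

def export_network(network: dict, path: str = "citations.gexf") -> None:
    """Convert the node/edge dict into a directed networkx graph and save it."""
    g = nx.DiGraph()
    for node in network["nodes"]:
        g.add_node(node["id"], title=node["title"])
    for edge in network["edges"]:
        g.add_edge(edge["source"], edge["target"], weight=edge["weight"])
    nx.write_gexf(g, path)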

Web Interface

import streamlit as st

st.title("🔬 Research Assistant")

assistant = ResearchAssistant("research.mv2")

# Sidebar for adding papers
with st.sidebar:
    st.header("Add Papers")

    arxiv_id = st.text_input("arXiv ID")
    if st.button("Add from arXiv"):
        assistant.add_arxiv(arxiv_id)
        st.success(f"Added {arxiv_id}")

    uploaded = st.file_uploader("Upload PDF", type="pdf")
    if uploaded:
        # Save and add
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
            f.write(uploaded.read())
            assistant.add_paper(uploaded.name, f.name)
        st.success(f"Added {uploaded.name}")

# Main area
tab1, tab2, tab3 = st.tabs(["Search", "Ask", "Literature Review"])

with tab1:
    query = st.text_input("Search papers")
    if query:
        results = assistant.search(query)
        for hit in results.get("hits", []):
            st.markdown(f"**{hit['title']}** (score: {hit['score']:.2f})")
            st.write(hit.get("snippet", ""))
            st.divider()

with tab2:
    question = st.text_area("Ask a research question")
    if st.button("Ask"):
        answer = assistant.ask(question)
        st.markdown("### Answer")
        st.write(answer.get("answer", ""))
        st.markdown("### Sources")
        for s in answer.get("sources", []):
            st.write(f"- {s.get('title')}")

with tab3:
    topic = st.text_input("Topic for literature review")
    if st.button("Generate Review"):
        with st.spinner("Generating..."):
            review = assistant.generate_literature_review(topic)
        st.markdown(review)
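Streamlit reruns the whole script on every interaction, so the app above reopens the memory file each time. A common fix — sketched here, assuming ResearchAssistant is safe to reuse across reruns — is to cache the instance with st.cache_resource:

import streamlit as st

@st.cache_resource
def get_assistant(memory_path: str = "research.mv2") -> ResearchAssistant:
    """Construct the assistant once and reuse it across Streamlit reruns."""
    return ResearchAssistant(memory_path)

assistant = get_assistant()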

Batch Import from arXiv

import arxiv

def batch_import_arxiv(query: str, max_results: int = 50):
    """Import papers from arXiv search."""
    assistant = ResearchAssistant()

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    count = 0
    # Note: with arxiv>=2.0, Search.results() is deprecated in favor of
    # arxiv.Client().results(search); see the sketch after this snippet
    for paper in search.results():
        print(f"Downloading: {paper.title}")

        # Download PDF
        pdf_path = paper.download_pdf()

        # Add to corpus
        assistant.add_paper(
            title=paper.title,
            pdf_path=pdf_path,
            metadata={
                "arxiv_id": paper.entry_id,
                "authors": [a.name for a in paper.authors],
                "abstract": paper.summary,
                "published": paper.published.isoformat(),
                "categories": paper.categories
            }
        )

    print(f"✅ Imported {max_results} papers")


# Import transformer papers
batch_import_arxiv("transformer attention mechanism", max_results=100)
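arXiv rate-limits aggressive clients. With version 2.x of the arxiv package you can route the same search through a Client configured to page and back off politely — a sketch of one way to do it, not something the SDK requires:

import arxiv

def polite_results(query: str, max_results: int = 50):
    """Yield search results via a rate-limited arxiv.Client."""
    client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results)
    yield from client.results(search)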

Next Steps