What you’ll build: An AI-powered research assistant that ingests papers, identifies themes, and answers complex questions across your research corpus.

Time: 30 minutes | Difficulty: Intermediate
Overview
Build a research assistant that:

- 📄 Ingests PDFs from arXiv, local files, and URLs
- 🔬 Extracts key findings and methodology
- 🔗 Identifies connections between papers
- ❓ Answers questions citing specific papers
- 📊 Generates literature reviews
Quick Start
from memvid_sdk import use
import requests
import tempfile
from xml.etree import ElementTree
from pathlib import Path
class ResearchAssistant:
def __init__(self, memory_path: str = "research.mv2"):
self.mem = use('llamaindex', memory_path, mode='auto')
def add_paper(self, title: str, pdf_path: str, metadata: dict = None):
"""Add a research paper to the corpus."""
self.mem.put({
"title": title,
"label": "paper",
"file": pdf_path,
"metadata": metadata or {}
})
print(f"✅ Added: {title}")
    def add_arxiv(self, arxiv_id: str):
        """Add a paper from arXiv."""
        # Download the PDF to a temporary file
        url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        response = requests.get(url)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
            f.write(response.content)
            pdf_path = f.name
        # Fetch the title from the arXiv Atom feed; fall back to the bare ID
        meta_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
        title = f"arXiv:{arxiv_id}"
        try:
            root = ElementTree.fromstring(requests.get(meta_url).text)
            entry = root.find("{http://www.w3.org/2005/Atom}entry")
            title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
        except (ElementTree.ParseError, AttributeError):
            pass  # keep the fallback title
        self.add_paper(
            title=title,
            pdf_path=pdf_path,
            metadata={"arxiv_id": arxiv_id, "source": "arxiv"}
        )
def search(self, query: str, k: int = 10):
"""Search across all papers."""
return self.mem.find(query, k=k)
def ask(self, question: str):
"""Ask a question about the research corpus."""
return self.mem.ask(question, k=5)
def find_related(self, paper_title: str, k: int = 5):
"""Find papers related to a specific paper."""
return self.mem.find(f"related to {paper_title}", k=k)
# Usage
assistant = ResearchAssistant()
# Add papers
assistant.add_arxiv("2301.07041") # LLaMA paper
assistant.add_arxiv("2302.13971") # LLaMA 2 paper
assistant.add_arxiv("2303.08774") # GPT-4 Technical Report
# Ask questions
answer = assistant.ask(
"What are the key differences between LLaMA and GPT-4 architectures?"
)
print(answer["answer"])
# Find related work
related = assistant.find_related("LLaMA", k=5)
for hit in related.get("hits", []):
print(f"- {hit['title']} (score: {hit['score']:.2f})")
Advanced Features
Literature Review Generator
# Add this method to ResearchAssistant
def generate_literature_review(self, topic: str, max_papers: int = 20) -> str:
    """Generate a literature review on a topic."""
    # Find relevant papers
    papers = self.search(topic, k=max_papers)
    # Build context from the hits (same dict shape as in Quick Start)
    context = "\n\n".join(
        f"Paper: {p['title']}\nKey points: {p.get('snippet', '')}"
        for p in papers.get("hits", [])
    )
    # Generate the review via ask(), which runs the LLM over retrieved context
    prompt = f"""Based on these research papers, write a comprehensive literature review on "{topic}":

{context}

Structure the review with:
1. Introduction and background
2. Key themes and findings
3. Methodological approaches
4. Gaps and future directions
5. Conclusion

Literature Review:"""
    review = self.mem.ask(prompt, k=max_papers)
    return review.get("answer", "")
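Once the method is attached to ResearchAssistant, persisting a review is straightforward (a usage sketch; the topic and filename are arbitrary):

from pathlib import Path

assistant = ResearchAssistant()
review = assistant.generate_literature_review("efficient attention mechanisms")
Path("literature_review.md").write_text(review)
print(review[:500])  # preview the opening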
Theme Extraction
# extract_keywords is a minimal frequency-based helper;
# swap in a real keyword extractor if you have one.
import re
from collections import Counter

STOPWORDS = {"with", "from", "this", "that", "which", "their", "these", "using", "based"}

def extract_keywords(text: str, top_n: int = 5) -> list:
    """Naive keywords: the most frequent non-stopword words of 4+ letters."""
    words = re.findall(r"[a-z]{4,}", text.lower())
    counts = Counter(w for w in words if w not in STOPWORDS)
    return [w for w, _ in counts.most_common(top_n)]

# Add this method to ResearchAssistant
def extract_themes(self, k: int = 100) -> list:
    """Extract main research themes from the corpus."""
    # Sample recent entries from the memory timeline
    timeline = self.mem.timeline(limit=k)
    # Count keyword frequency across entries (simplified topic clustering)
    themes = {}
    for entry in timeline.entries:
        for kw in extract_keywords(entry.text):
            themes[kw] = themes.get(kw, 0) + 1
    # Most frequent keywords first
    return sorted(themes.items(), key=lambda x: -x[1])[:20]
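Usage is a plain call; each pair is a keyword and how many sampled entries mention it:

themes = assistant.extract_themes(k=200)
for keyword, count in themes[:10]:
    print(f"{keyword}: {count} mentions")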
Citation Network
# Add this method to ResearchAssistant
def build_citation_network(self) -> dict:
    """Build an approximate citation network from the corpus."""
    network = {"nodes": [], "edges": []}
    timeline = self.mem.timeline(limit=1000)
    for entry in timeline.entries:
        # One node per paper
        network["nodes"].append({
            "id": entry.frame_id,
            "title": entry.title
        })
        # Approximate citations by similarity: link each paper to its neighbors
        related = self.find_related(entry.title, k=5)
        for r in related.get("hits", []):
            if r["frame_id"] != entry.frame_id:
                network["edges"].append({
                    "source": entry.frame_id,
                    "target": r["frame_id"],
                    "weight": r["score"]
                })
    return network
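The node-link dict serializes directly to JSON for use in graph visualization tools (a sketch; the output filename is arbitrary):

import json

network = assistant.build_citation_network()
with open("citation_network.json", "w") as f:
    json.dump(network, f, indent=2)
print(f"{len(network['nodes'])} nodes, {len(network['edges'])} edges")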
Web Interface
import tempfile

import streamlit as st

# ResearchAssistant is the class from Quick Start; import it from wherever you saved it.

st.title("🔬 Research Assistant")

@st.cache_resource  # reuse one assistant across Streamlit reruns
def get_assistant():
    return ResearchAssistant("research.mv2")

assistant = get_assistant()
# Sidebar for adding papers
with st.sidebar:
st.header("Add Papers")
arxiv_id = st.text_input("arXiv ID")
if st.button("Add from arXiv"):
assistant.add_arxiv(arxiv_id)
st.success(f"Added {arxiv_id}")
uploaded = st.file_uploader("Upload PDF", type="pdf")
if uploaded:
# Save and add
        # Save to a temp file, then ingest after the handle is closed
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
            f.write(uploaded.read())
            pdf_path = f.name
        assistant.add_paper(uploaded.name, pdf_path)
        st.success(f"Added {uploaded.name}")
# Main area
tab1, tab2, tab3 = st.tabs(["Search", "Ask", "Literature Review"])
with tab1:
query = st.text_input("Search papers")
if query:
results = assistant.search(query)
for hit in results.get("hits", []):
st.markdown(f"**{hit['title']}** (score: {hit['score']:.2f})")
st.write(hit.get("snippet", ""))
st.divider()
with tab2:
question = st.text_area("Ask a research question")
if st.button("Ask"):
answer = assistant.ask(question)
st.markdown("### Answer")
st.write(answer.get("answer", ""))
st.markdown("### Sources")
for s in answer.get("sources", []):
st.write(f"- {s.get('title')}")
with tab3:
topic = st.text_input("Topic for literature review")
if st.button("Generate Review"):
with st.spinner("Generating..."):
review = assistant.generate_literature_review(topic)
st.markdown(review)
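To try it, save the interface as a standalone script (for example research_app.py, name arbitrary) alongside the ResearchAssistant class and launch it with streamlit run research_app.py.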
Batch Import from arXiv
import arxiv
def batch_import_arxiv(query: str, max_results: int = 50):
    """Import papers from an arXiv search."""
    assistant = ResearchAssistant()
    client = arxiv.Client()  # handles paging and polite rate limiting
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    imported = 0
    for paper in client.results(search):
        print(f"Downloading: {paper.title}")
        try:
            pdf_path = paper.download_pdf()
        except Exception as e:
            print(f"⚠️ Skipping {paper.title}: {e}")
            continue
        assistant.add_paper(
            title=paper.title,
            pdf_path=pdf_path,
            metadata={
                "arxiv_id": paper.entry_id,
                "authors": [a.name for a in paper.authors],
                "abstract": paper.summary,
                "published": paper.published.isoformat(),
                "categories": paper.categories
            }
        )
        imported += 1
    print(f"✅ Imported {imported} papers")
# Import transformer papers
batch_import_arxiv("transformer attention mechanism", max_results=100)