How to Design a Swiss Army Knife Research Agent with Tool-Using AI, Web Search, PDF Analysis, Vision, and Automated Reporting


In this tutorial, we build a “Swiss Army Knife” research agent that goes far beyond simple chat interactions and proactively solves multi-step research problems end to end. We combine a tool-use agent architecture with live web search, local PDF ingestion, vision-based chart analysis, and automated report generation to demonstrate how modern agents can reason, verify, and produce structured output. By pairing smolagents with OpenAI models and practical data-extraction utilities, we show how a single agent can trace sources, investigate claims, and synthesize findings into professional-grade Markdown and DOCX reports.

%pip -q install -U smolagents openai trafilatura duckduckgo-search pypdf pymupdf python-docx pillow tqdm


import os, re, json, base64, getpass
from typing import List, Dict, Any
import requests
import trafilatura
from duckduckgo_search import DDGS
from pypdf import PdfReader
import fitz
from docx import Document
from docx.shared import Pt
from datetime import datetime


from openai import OpenAI
from smolagents import CodeAgent, OpenAIModel, tool


if not os.environ.get("OPENAI_API_KEY"):
   os.environ("OPENAI_API_KEY") = getpass.getpass("Paste your OpenAI API key (hidden): ").strip()
print("OPENAI_API_KEY set:", "YES" if os.environ.get("OPENAI_API_KEY") else "NO")


if not os.environ.get("SERPER_API_KEY"):
   serper = getpass.getpass("Optional: Paste SERPER_API_KEY for Google results (press Enter to skip): ").strip()
   if serper:
       os.environ("SERPER_API_KEY") = serper
print("SERPER_API_KEY set:", "YES" if os.environ.get("SERPER_API_KEY") else "NO")


client = OpenAI()


def _now():
   return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%SZ")


def _safe_filename(s: str) -> str:
    s = re.sub(r"[^a-zA-Z0-9._-]+", "_", s).strip("_")
    return s[:180] if s else "file"

We set up the full execution environment and securely load all required credentials without hardcoding secrets. We import the dependencies needed for web search, document parsing, vision analysis, and agent orchestration, and we introduce shared utilities that standardize timestamps and file naming across the entire workflow.
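
As a quick sanity check of these shared utilities, we can run them on a couple of example inputs (the strings below are our own, purely illustrative):

print(_now())                                   # e.g., 2026-02-14 09:30:00Z
print(_safe_filename("Q3 Report: AI/Agents!"))  # -> Q3_Report_AI_Agents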

try:
   from google.colab import files
   os.makedirs("/content/pdfs", exist_ok=True)
   uploaded = files.upload()
   for name, data in uploaded.items():
       if name.lower().endswith(".pdf"):
           with open(f"/content/pdfs/{name}", "wb") as f:
               f.write(data)
   print("PDFs in /content/pdfs:", os.listdir("/content/pdfs"))
except Exception as e:
   print("Upload skipped:", str(e))


def web_search(query: str, k: int = 6) -> List[Dict[str, str]]:
    serper_key = os.environ.get("SERPER_API_KEY", "").strip()
    if serper_key:
        resp = requests.post(
            "https://google.serper.dev/search",
            headers={"X-API-KEY": serper_key, "Content-Type": "application/json"},
            json={"q": query, "num": k},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
        out = []
        for item in (data.get("organic") or [])[:k]:
            out.append({
                "title": item.get("title", ""),
                "url": item.get("link", ""),
                "snippet": item.get("snippet", ""),
            })
        return out

    out = []
    with DDGS() as ddgs:
        for r in ddgs.text(query, max_results=k):
            out.append({
                "title": r.get("title", ""),
                "url": r.get("href", ""),
                "snippet": r.get("body", ""),
            })
    return out


def fetch_url_text(url: str) -> Dict[str, Any]:
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return {"url": url, "ok": False, "error": "fetch_failed", "text": ""}
        text = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
        if not text:
            return {"url": url, "ok": False, "error": "extract_failed", "text": ""}
        title_guess = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")[:120]
        return {"url": url, "ok": True, "title_guess": title_guess, "text": text}
    except Exception as e:
        return {"url": url, "ok": False, "error": str(e), "text": ""}

We enable local PDF ingestion and set up a flexible web search pipeline that works with or without a paid search API. The search helper prefers Serper's Google results when a key is present and falls back to DuckDuckGo otherwise, so the research flow keeps working without credentials. We also implement robust URL fetching and text extraction to prepare clean source content for downstream reasoning.
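
As a standalone check before agent orchestration, assuming network access and using an illustrative query, the two helpers chain naturally:

hits = web_search("design patterns for tool-using agents", k=3)
for h in hits:
    print(h["title"], "->", h["url"])

if hits:
    page = fetch_url_text(hits[0]["url"])
    print("fetched:", page["ok"], "| title guess:", page.get("title_guess", ""))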

def read_pdf_text(pdf_path: str, max_pages: int = 30) -> Dict[str, Any]:
    reader = PdfReader(pdf_path)
    pages = min(len(reader.pages), max_pages)
    chunks = []
    for i in range(pages):
        try:
            chunks.append(reader.pages[i].extract_text() or "")
        except Exception:
            chunks.append("")
    return {"pdf_path": pdf_path, "pages_read": pages, "text": "\n\n".join(chunks).strip()}


def extract_pdf_images(pdf_path: str, out_dir: str = "/content/extracted_images", max_pages: int = 10) -> List[str]:
    os.makedirs(out_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    saved = []
    pages = min(len(doc), max_pages)
    base = _safe_filename(os.path.basename(pdf_path).rsplit(".", 1)[0])

    for p in range(pages):
        page = doc[p]
        img_list = page.get_images(full=True)
        for img_i, img in enumerate(img_list):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            # Convert CMYK or other multi-channel images to RGB before saving as PNG
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            img_path = os.path.join(out_dir, f"{base}_p{p+1}_img{img_i+1}.png")
            pix.save(img_path)
            saved.append(img_path)

    doc.close()
    return saved


def vision_analyze_image(image_path: str, question: str, model: str = "gpt-4.1-mini") -> Dict[str, Any]:
    # The Responses API expects images as a base64 data URL (or a file ID), not raw bytes
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")

    resp = client.responses.create(
        model=model,
        input=[{
            "role": "user",
            "content": [
                {"type": "input_text", "text": f"Answer concisely and accurately.\n\nQuestion: {question}"},
                {"type": "input_image", "image_url": f"data:image/png;base64,{b64}"},
            ],
        }],
    )
    return {"image_path": image_path, "answer": resp.output_text}

We deepen document understanding by extracting structured text and embedded figures from PDFs. We integrate a vision-enabled model to interpret charts and figures instead of treating them as opaque images, so numerical trends and visual insights can be turned into clear, text-based evidence.
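
As a minimal end-to-end check of this layer, assuming at least one PDF was uploaded to /content/pdfs (the question is illustrative), we can run the helpers directly:

import glob

pdfs = sorted(glob.glob("/content/pdfs/*.pdf"))
if pdfs:
    info = read_pdf_text(pdfs[0], max_pages=5)
    print(info["pages_read"], "pages,", len(info["text"]), "characters")
    figures = extract_pdf_images(pdfs[0], max_pages=5)
    if figures:
        print(vision_analyze_image(figures[0], "What trend does this figure show?")["answer"])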

def write_markdown(path: str, content: str) -> str:
    # Guard against bare filenames, where dirname() returns an empty string
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return path


def write_docx_from_markdown(docx_path: str, md: str, title: str = "Research Report") -> str:
    os.makedirs(os.path.dirname(docx_path) or ".", exist_ok=True)
    doc = Document()
    t = doc.add_paragraph()
    run = t.add_run(title)
    run.bold = True
    run.font.size = Pt(18)
    meta = doc.add_paragraph()
    meta.add_run(f"Generated: {_now()}").italic = True
    doc.add_paragraph("")
    for line in md.splitlines():
        line = line.rstrip()
        if not line:
            doc.add_paragraph("")
            continue
        if line.startswith("# "):
            doc.add_heading(line[2:].strip(), level=1)
        elif line.startswith("## "):
            doc.add_heading(line[3:].strip(), level=2)
        elif line.startswith("### "):
            doc.add_heading(line[4:].strip(), level=3)
        elif re.match(r"^\s*[-*]\s+", line):
            p = doc.add_paragraph(style="List Bullet")
            p.add_run(re.sub(r"^\s*[-*]\s+", "", line).strip())
        else:
            doc.add_paragraph(line)
    doc.save(docx_path)
    return docx_path


@tool
def t_web_search(query: str, k: int = 6) -> str:
    """Search the web and return results as a JSON string.

    Args:
        query: The search query.
        k: Maximum number of results to return.
    """
    return json.dumps(web_search(query, k), ensure_ascii=False)


@tool
def t_fetch_url_text(url: str) -> str:
    """Download a URL and extract its main text as a JSON string.

    Args:
        url: The URL to fetch.
    """
    return json.dumps(fetch_url_text(url), ensure_ascii=False)


@tool
def t_list_pdfs() -> str:
    """List the paths of all uploaded PDFs as a JSON array."""
    pdf_dir = "/content/pdfs"
    if not os.path.isdir(pdf_dir):
        return json.dumps([])
    paths = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]
    return json.dumps(sorted(paths), ensure_ascii=False)


@tool
def t_read_pdf_text(pdf_path: str, max_pages: int = 30) -> str:
    """Extract text from a PDF and return it as a JSON string.

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Maximum number of pages to read.
    """
    return json.dumps(read_pdf_text(pdf_path, max_pages=max_pages), ensure_ascii=False)


@tool
def t_extract_pdf_images(pdf_path: str, max_pages: int = 10) -> str:
    """Extract embedded images from a PDF and return their paths as a JSON array.

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Maximum number of pages to scan for images.
    """
    imgs = extract_pdf_images(pdf_path, max_pages=max_pages)
    return json.dumps(imgs, ensure_ascii=False)


@tool
def t_vision_analyze_image(image_path: str, question: str) -> str:
    """Analyze an image with a vision model and return the answer as a JSON string.

    Args:
        image_path: Path to the image file.
        question: The question to ask about the image.
    """
    return json.dumps(vision_analyze_image(image_path, question), ensure_ascii=False)


@tool
def t_write_markdown(path: str, content: str) -> str:
    """Write Markdown content to a file and return the path.

    Args:
        path: Destination file path.
        content: Markdown content to write.
    """
    return write_markdown(path, content)


@tool
def t_write_docx_from_markdown(docx_path: str, md_path: str, title: str = "Research Report") -> str:
    """Convert a Markdown file into a DOCX report and return the DOCX path.

    Args:
        docx_path: Destination DOCX file path.
        md_path: Path to the source Markdown file.
        title: Title placed at the top of the document.
    """
    with open(md_path, "r", encoding="utf-8") as f:
        md = f.read()
    return write_docx_from_markdown(docx_path, md, title=title)

We implement the full output layer by generating Markdown reports and converting them into polished DOCX documents. We expose all core capabilities as explicit tools the agent can reason about and apply step by step, so every step from raw data to final report remains traceable and observable. Note that smolagents requires each tool to carry type hints and a docstring describing its arguments; the decorator uses them to build the tool schema the agent sees.
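
Before launching the agent, it can help to smoke-test a tool or two directly; smolagents tool objects remain callable like plain functions (the query and path below are illustrative):

print(t_web_search("smolagents CodeAgent patterns", k=2)[:300])
print(t_write_markdown("/content/report/smoke_test.md", "# Smoke Test\n\nTools are wired correctly.\n"))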

model = OpenAIModel(model_id="gpt-5")


agent = CodeAgent(
    tools=[
        t_web_search,
        t_fetch_url_text,
        t_list_pdfs,
        t_read_pdf_text,
        t_extract_pdf_images,
        t_vision_analyze_image,
        t_write_markdown,
        t_write_docx_from_markdown,
    ],
    model=model,
    add_base_tools=False,
    additional_authorized_imports=["json", "re", "os", "math", "datetime", "time", "textwrap"],
)


SYSTEM_INSTRUCTIONS = """
You are a Swiss Army Knife Research Agent.
"""


def run_research(topic: str):
   os.makedirs("/content/report", exist_ok=True)
   prompt = f"""{SYSTEM_INSTRUCTIONS.strip()}


Research question:
{topic}


Steps:
1) List available PDFs (if any) and decide which are relevant.
2) Do web search for the topic.
3) Fetch and extract the text of the best sources.
4) If PDFs exist, extract text and images.
5) Visually analyze figures.
6) Write a Markdown report to /content/report/report.md, then convert it to /content/report/report.docx.
"""
   return agent.run(prompt)


topic = "Build a research brief on the most reliable design patterns for tool-using agents (2024-2026), focusing on evaluation, citations, and failure modes."
out = run_research(topic)
print(out[:1500] if isinstance(out, str) else out)


try:
   from google.colab import files
   files.download("/content/report/report.md")
   files.download("/content/report/report.docx")
except Exception as e:
   print("Download skipped:", str(e))

We assemble the entire research agent and define a structured execution plan for multi-step reasoning. We guide the agent to search, analyze, synthesize, and write using a coherent prompt. We demonstrate how the agent produces a finished research artifact that can be instantly reviewed, shared, and reused.
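
Outside Colab, the download cell above is skipped, so a small hedge is to persist the agent's raw answer next to the generated files and list the artifacts (the filename is our choice):

if isinstance(out, str):
    write_markdown("/content/report/agent_raw_output.md", out)
print("Report artifacts:", os.listdir("/content/report"))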

In conclusion, we demonstrated how a well-designed tool-using agent can function as a reliable research assistant rather than a conversational toy. We showed how clear tools, disciplined prompts, and step-by-step execution allow the agent to search the web, analyze documents and visuals, and produce traceable, citation-aware reports. The approach offers a practical blueprint for building trustworthy research agents, one that emphasizes evaluation, evidence, and failure awareness: capabilities that are increasingly essential for real-world AI systems.

