# FunguyBot/plugins/arxiv.py

"""
arXiv Paper Search Plugin for Funguy Bot

Searches academic papers in physics, mathematics, computer science, and more.
Uses arXiv API - completely free, no API key required.

Commands:
  !arxiv <query>              - Search for papers (shows abstract)
  !arxiv list <query>         - List papers without abstracts
  !arxiv category <category>  - Browse recent papers by category
  !arxiv recent [category]    - Recent papers (last 7 days)
  !arxiv random               - Random paper
  !arxiv <id>                 - Get paper by arXiv ID
"""

import html
import logging
import random
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from typing import Optional, Dict, List

import aiohttp

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Result count used when a command does not specify one.
DEFAULT_RESULTS = 3
# Ceiling applied to any user-supplied result count.
MAX_RESULTS = 10

# Short category names accepted by the commands, mapped to the arXiv
# category identifiers used in "cat:" queries. Insertion order is the order
# in which the names are listed back to the user.
CATEGORIES = dict(
    ai="cs.AI",
    ml="cs.LG",
    security="cs.CR",
    crypto="cs.CR",
    cv="cs.CV",
    nlp="cs.CL",
    math="math",
    physics="physics",
    quantum="quant-ph",
    bio="q-bio",
    economics="econ",
    software="cs.SE",
)


# ---------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------

def _format_collapsible(title: str, content: str, expanded: bool = False) -> str:
    """Format content in a collapsible details/summary block."""
    open_attr = ' open' if expanded else ''
    return f"<details{open_attr}>\n<summary>📚 {title}</summary>\n\n{content}\n\n</details>"


def _oxford_comma(items):
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    return f"{', '.join(items[:-1])}, and {items[-1]}"


def _format_paper(paper: Dict, index: int, include_abstract: bool = True) -> str:
    """Render one parsed paper as an HTML ``<li>`` element.

    Every value taken from the paper record is HTML-escaped before it is
    interpolated. The record comes from an external API, and titles and
    abstracts can contain ``<``, ``>`` and ``&`` (for example in
    inequalities), which would otherwise corrupt the generated markup.

    Args:
        paper: Record with the keys ``title``, ``authors``, ``published``,
            ``categories``, ``id``, ``pdf_url`` and ``summary``.
        index: Number shown in front of the title.
        include_abstract: When true, append the abstract (cut to 500
            characters) unless it is the ``"No abstract"`` placeholder.

    Returns:
        The ``<li>...</li>`` markup for the paper.
    """
    # html.escape also escapes quotes by default, which keeps the PDF URL
    # safe inside the single-quoted href attribute below.
    esc = html.escape

    authors = paper['authors']
    categories = paper['categories']
    pdf_url = esc(paper['pdf_url'])

    parts = [f"<li>\n<strong>{index}. {esc(paper['title'])}</strong><br/>\n"]

    # Authors: the first three by name, any remainder as a count.
    shown_authors = [esc(name) for name in authors[:3]]
    parts.append(f"👥 <strong>Authors:</strong> {_oxford_comma(shown_authors)}")
    if len(authors) > 3:
        parts.append(f" and {len(authors) - 3} others")
    parts.append("<br/>\n")

    # Metadata: publication date and at most three categories.
    parts.append(f"📅 <strong>Published:</strong> {esc(paper['published'])}<br/>\n")
    shown_categories = ', '.join(esc(term) for term in categories[:3])
    parts.append(f"🏷️ <strong>Categories:</strong> {shown_categories}")
    if len(categories) > 3:
        parts.append(f" +{len(categories) - 3}")
    parts.append("<br/>\n")

    # Links
    parts.append(f"🔗 <strong>arXiv ID:</strong> {esc(paper['id'])}<br/>\n")
    parts.append(f"📄 <strong>PDF:</strong> <a href='{pdf_url}'>{pdf_url}</a><br/>\n")

    # Abstract: truncate the raw text first so an escaped entity is never
    # cut in half, then escape.
    if include_abstract and paper['summary'] != "No abstract":
        abstract = paper['summary']
        if len(abstract) > 500:
            abstract = abstract[:497] + "..."
        parts.append(f"📝 <strong>Abstract:</strong><br/>{esc(abstract)}\n")

    parts.append("</li>")
    return "".join(parts)


async def _search_arxiv(query: str, max_results: int = DEFAULT_RESULTS, id_list: Optional[List[str]] = None) -> Optional[List[Dict]]:
    """Query the arXiv API and return the parsed result entries.

    Args:
        query: Value for the API's ``search_query`` parameter. Ignored when
            *id_list* is non-empty.
        max_results: Maximum number of entries to request.
        id_list: arXiv identifiers to look up directly. When given, the
            lookup is done by identifier instead of by search query.

    Returns:
        A list of paper dictionaries (possibly empty), or ``None`` when the
        request fails, the server answers with a non-200 status, or the
        response cannot be parsed.
    """
    base_url = "http://export.arxiv.org/api/query"

    if id_list:
        # Use the API's dedicated comma-separated ``id_list`` parameter.
        # Building "id:A+OR+id:B" as a search query does not work here:
        # the HTTP client percent-encodes the "+" signs, so the server
        # receives them literally rather than as separators.
        params = {"id_list": ",".join(id_list), "max_results": max_results}
    else:
        params = {
            "search_query": query,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }

    # Broad catch on purpose: this is the network/parse boundary and callers
    # treat ``None`` as "lookup failed" instead of handling exceptions.
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url, params=params) as response:
                if response.status != 200:
                    logging.error(f"arXiv API returned HTTP {response.status}")
                    return None
                text = await response.text()
                return _parse_arxiv_response(text)
    except Exception as e:
        logging.error(f"Error searching arXiv: {e}")
        return None


async def _get_category_papers(category: str, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]:
    """Search arXiv for papers in *category*, requesting at most *limit* results.

    The category is turned into a ``cat:<category>`` search query; the return
    value is whatever ``_search_arxiv`` produces for that query.
    """
    category_query = f"cat:{category}"
    papers = await _search_arxiv(category_query, limit)
    return papers


async def _get_recent_papers(category: Optional[str] = None, days: int = 7, limit: Optional[int] = None) -> Optional[List[Dict]]:
    """Search for papers submitted within the last *days* days.

    Args:
        category: arXiv category identifier (e.g. ``"cs.AI"``) to restrict
            the search to, or ``None`` for all categories.
        days: Size of the look-back window, in days.
        limit: Maximum number of results to request; ``None`` means
            ``DEFAULT_RESULTS``.

    Returns:
        Whatever ``_search_arxiv`` returns for the constructed query.
    """
    # Take a single UTC timestamp so both ends of the range come from the
    # same instant. The arXiv API documents date filters as GMT.
    now = datetime.now(timezone.utc)
    start_day = (now - timedelta(days=days)).strftime("%Y%m%d")
    end_day = now.strftime("%Y%m%d")

    date_range = f"submittedDate:[{start_day}000000 TO {end_day}235959]"
    query = f"cat:{category} AND {date_range}" if category else date_range

    max_results = DEFAULT_RESULTS if limit is None else limit
    return await _search_arxiv(query, max_results)


async def _get_random_paper() -> Optional[Dict]:
    """Pick one paper at random from the results of a randomly chosen search.

    A search term is drawn from a fixed list, up to ``MAX_RESULTS`` matches
    are requested, and one of them is returned. Returns ``None`` when the
    search yields nothing.
    """
    seed_topics = (
        "machine learning",
        "quantum",
        "neural network",
        "optimization",
        "algorithm",
        "security",
    )
    topic = random.choice(seed_topics)
    candidates = await _search_arxiv(topic, max_results=MAX_RESULTS)
    if not candidates:
        return None
    return random.choice(candidates)


def _parse_arxiv_response(xml_text: str) -> List[Dict]:
    namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(xml_text)
    entries = root.findall('atom:entry', namespaces)

    papers = []
    for entry in entries:
        title = entry.find('atom:title', namespaces)
        title_text = ' '.join(title.text.strip().split()) if title is not None else "No title"

        summary = entry.find('atom:summary', namespaces)
        summary_text = ' '.join(summary.text.strip().split()) if summary is not None else "No abstract"

        authors = []
        for author in entry.findall('atom:author', namespaces):
            name = author.find('atom:name', namespaces)
            if name is not None and name.text:
                authors.append(name.text)

        id_elem = entry.find('atom:id', namespaces)
        paper_id = id_elem.text.split('/')[-1] if id_elem is not None else "Unknown"

        pdf_link = None
        for link in entry.findall('atom:link', namespaces):
            if link.get('title') == 'pdf':
                pdf_link = link.get('href')
                break

        categories = []
        for category in entry.findall('atom:category', namespaces):
            term = category.get('term')
            if term:
                categories.append(term)

        published = entry.find('atom:published', namespaces)
        pub_date = published.text.split('T')[0] if published is not None else "Unknown"

        papers.append({
            'id': paper_id,
            'title': title_text,
            'summary': summary_text,
            'authors': authors,
            'pdf_url': pdf_link or f"http://arxiv.org/pdf/{paper_id}.pdf",
            'arxiv_url': f"http://arxiv.org/abs/{paper_id}",
            'categories': categories,
            'published': pub_date
        })

    return papers


# ---------------------------------------------------------------------------
# Command Handler
# ---------------------------------------------------------------------------

# Matches arXiv identifiers: new style such as "2101.00101" and old style
# such as "hep-th/9901001" or "math.GT/0309136", each with an optional
# version suffix ("v2").
_ARXIV_ID_RE = re.compile(
    r"(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[a-z]{2})?/\d{7})(?:v\d+)?",
    re.IGNORECASE,
)


async def handle_command(room, message, bot, prefix, config):
    """Handle the ``arxiv`` command and reply in the room it came from.

    Sub-commands, selected by the first argument:
      * (no arguments)       - send the help text
      * ``list <query>``     - search, results shown without abstracts
      * ``category <name>``  - papers in one of the ``CATEGORIES``
      * ``recent [name]``    - papers from the last 7 days
      * ``random``           - one randomly chosen paper
      * ``<arXiv id> ...``   - look up one or more papers by identifier
      * anything else        - keyword search

    A purely numeric first or last argument is taken as the number of
    results to show and is clamped to ``1..MAX_RESULTS``.

    Args:
        room: Room the message was received in; replies go to ``room.room_id``.
        message: The incoming message event.
        bot: Bot instance; replies are sent through ``bot.api``.
        prefix: Command prefix used for matching.
        config: Not used by this plugin.
    """
    import simplematrixbotlib as botlib

    match = botlib.MessageMatch(room, message, bot, prefix)

    if not (match.is_not_from_this_bot() and match.prefix() and match.command("arxiv")):
        return

    args = match.args()
    room_id = room.room_id
    # Built from CATEGORIES so the listings cannot drift from what is accepted.
    category_names = ', '.join(CATEGORIES)

    if not args:
        help_content = (
            "<strong>Commands:</strong><br/><br/>"
            "• <code>!arxiv &lt;query&gt;</code> - Search papers<br/>"
            "• <code>!arxiv list &lt;query&gt;</code> - List without abstracts<br/>"
            "• <code>!arxiv category &lt;cat&gt;</code> - Browse category<br/>"
            "• <code>!arxiv recent [cat]</code> - Recent papers<br/>"
            "• <code>!arxiv random</code> - Random paper<br/>"
            "• <code>!arxiv &lt;id&gt;</code> - Get by ID<br/><br/>"
            f"<strong>Categories:</strong> {category_names}"
        )
        response = _format_collapsible("arXiv Help", help_content, expanded=True)
        await bot.api.send_markdown_message(room_id, response)
        return

    # Optional result count as the first or last argument. isdecimal() is
    # used rather than isdigit() because the latter also accepts characters
    # such as superscript digits that int() cannot convert.
    limit = DEFAULT_RESULTS
    if args[0].isdecimal():
        limit, args = int(args[0]), args[1:]
    elif args[-1].isdecimal():
        limit, args = int(args[-1]), args[:-1]
    limit = max(1, min(limit, MAX_RESULTS))

    if not args:
        # Only a number was given; there is nothing to search for.
        await bot.api.send_text_message(room_id, "Usage: !arxiv <query>")
        return

    cmd = args[0].lower()
    include_abstract = True

    if cmd == "random":
        await bot.api.send_text_message(room_id, "🎲 Fetching random paper...")
        paper = await _get_random_paper()
        if paper:
            content = f"<ul>\n{_format_paper(paper, 1, True)}\n</ul>"
            response = _format_collapsible("Random Paper", content, True)
            await bot.api.send_markdown_message(room_id, response)
        else:
            await bot.api.send_text_message(room_id, "❌ Failed to fetch random paper.")
        return

    if cmd == "list":
        if len(args) < 2:
            await bot.api.send_text_message(room_id, "Usage: !arxiv list <query>")
            return
        include_abstract = False
        query = " ".join(args[1:])
        await bot.api.send_text_message(room_id, f"🔍 Searching: *{query[:50]}*...")
        papers = await _search_arxiv(query, limit)
        # The query is user input that ends up inside HTML, so escape it.
        title = f"Search: '{html.escape(query[:50])}'"

    elif cmd == "category":
        if len(args) < 2:
            await bot.api.send_text_message(
                room_id, f"Usage: !arxiv category <category>. Available: {category_names}"
            )
            return
        cat_key = args[1].lower()
        if cat_key not in CATEGORIES:
            await bot.api.send_text_message(room_id, f"Unknown category. Available: {category_names}")
            return
        await bot.api.send_text_message(room_id, f"📚 Fetching {cat_key.upper()} papers...")
        papers = await _get_category_papers(CATEGORIES[cat_key], limit)
        title = f"Recent Papers in {cat_key.upper()}"

    elif cmd == "recent":
        category = None
        if len(args) >= 2 and args[1].lower() in CATEGORIES:
            category = CATEGORIES[args[1].lower()]
            await bot.api.send_text_message(room_id, f"📚 Fetching recent {args[1].upper()} papers...")
            title = f"Recent Papers in {args[1].upper()} (7 Days)"
        else:
            await bot.api.send_text_message(room_id, "📚 Fetching recent papers...")
            title = "Recent Papers (Last 7 Days)"
        # The second positional parameter of _get_recent_papers is the
        # look-back window in days, not a result count, so the requested
        # count is applied by trimming the returned list instead.
        papers = await _get_recent_papers(category)
        if papers:
            papers = papers[:limit]

    elif _ARXIV_ID_RE.fullmatch(args[0]):
        # Use the arguments as typed (not lower-cased): old-style identifiers
        # can contain upper-case subject classes such as "math.GT".
        paper_ids = [arg for arg in args if _ARXIV_ID_RE.fullmatch(arg)]
        await bot.api.send_text_message(room_id, "📚 Fetching paper(s)...")
        papers = await _search_arxiv("", max_results=len(paper_ids), id_list=paper_ids)
        title = "Paper Details"

    else:
        query = " ".join(args)
        await bot.api.send_text_message(room_id, f"🔍 Searching: *{query[:50]}*...")
        papers = await _search_arxiv(query, limit)
        title = f"Search: '{html.escape(query[:50])}'"

    if not papers:
        await bot.api.send_text_message(room_id, "❌ No papers found.")
        return

    items = "\n".join(
        _format_paper(paper, i, include_abstract) for i, paper in enumerate(papers, 1)
    )
    noun = "paper" if len(papers) == 1 else "papers"
    content = f"<ul>\n{items}\n</ul>\n\n<em>Found {len(papers)} {noun}</em>"

    response = _format_collapsible(title, content, False)
    await bot.api.send_markdown_message(room_id, response)
    logging.info("Sent arXiv search results")


# ---------------------------------------------------------------------------
# Plugin Setup
# ---------------------------------------------------------------------------

def setup(bot):
    logging.info("arXiv plugin loaded")


# ---------------------------------------------------------------------------
# Plugin Metadata
# ---------------------------------------------------------------------------

__version__ = "1.0.0"
__author__ = "Funguy Bot"
__description__ = "arXiv academic paper search"
__help__ = """
<details>
<summary><strong>!arxiv</strong> – Search academic papers on arXiv</summary>
<ul>
<li><code>!arxiv &lt;query&gt;</code> – Search papers (shows abstracts)</li>
<li><code>!arxiv list &lt;query&gt;</code> – List without abstracts</li>
<li><code>!arxiv category &lt;category&gt;</code> – Browse recent papers by category</li>
<li><code>!arxiv recent [category]</code> – Most recent papers (7 days)</li>
<li><code>!arxiv random</code> – Random paper</li>
<li><code>!arxiv &lt;id&gt;</code> – Get paper by arXiv ID (e.g., 2101.00101)</li>
</ul>
<p><strong>Categories:</strong> ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, software</p>
</details>
"""