# FunguyBot/plugins/arxiv.py

"""
arXiv Paper Search Plugin for Funguy Bot

Searches academic papers in physics, mathematics, computer science, and more.
Uses arXiv API - completely free, no API key required.

Commands:
  !arxiv <query>              - Search for papers (shows abstract)
  !arxiv list <query>         - List papers without abstracts
  !arxiv category <category>  - Browse recent papers by category
  !arxiv recent [category]    - Recent papers (last 7 days)
  !arxiv random               - Random paper
  !arxiv <id>                 - Get paper by arXiv ID
"""

import asyncio
import html
import logging
import random
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from typing import Optional, Dict, List, Tuple

import aiohttp

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Number of papers returned when the user does not give an explicit limit.
DEFAULT_RESULTS = 3
# Hard cap on the number of papers per command; user-supplied limits are
# clamped to this value in handle_command.
MAX_RESULTS = 10

# REQUIRED by arXiv API terms – identify your bot.
# Use a descriptive string with contact info. A Firefox User-Agent is
# also accepted, but the bot-specific one is recommended.
# Example Firefox UA: "Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0"
# NOTE: the mailto address below is a placeholder – replace it with a real
# contact address before deploying.
USER_AGENT = "FunguyBot/1.0 (mailto:your-email@example.com)"

# Minimum delay, in seconds, between successive API calls (arXiv asks for
# ≥3 seconds); enforced by the throttle at the top of _search_arxiv.
MIN_REQUEST_INTERVAL = 5

# Maps the short category keywords users type to the arXiv category codes
# used in "cat:" queries. "security" and "crypto" are aliases for the same
# arXiv category (cs.CR).
CATEGORIES = {
    "ai": "cs.AI",
    "ml": "cs.LG",
    "security": "cs.CR",
    "crypto": "cs.CR",
    "cv": "cs.CV",
    "nlp": "cs.CL",
    "math": "math",
    "physics": "physics",
    "quantum": "quant-ph",
    "bio": "q-bio",
    "economics": "econ",
    "software": "cs.SE"
}

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _format_collapsible(title: str, content: str, expanded: bool = False) -> str:
    open_attr = ' open' if expanded else ''
    return f"<details{open_attr}>\n<summary>📚 {title}</summary>\n\n{content}\n\n</details>"


def _oxford_comma(items):
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    return f"{', '.join(items[:-1])}, and {items[-1]}"


def _format_paper(paper: Dict, index: int, include_abstract: bool = True) -> str:
    """Render one paper dict as an HTML ``<li>`` fragment.

    Args:
        paper: Dict with the keys ``title``, ``authors``, ``published``,
            ``categories``, ``id``, ``pdf_url`` and ``summary``.
        index: 1-based position shown in front of the title.
        include_abstract: When True, append the abstract (cut to 500
            characters) unless the summary is the "No abstract" placeholder.

    Returns:
        The ``<li>...</li>`` markup for the paper. All values taken from
        *paper* are HTML-escaped.
    """
    def esc(value) -> str:
        # Paper metadata comes from an external feed: characters such as
        # '<', '&' or quotes must not be able to break or inject markup.
        return html.escape(str(value), quote=True)

    authors = paper['authors']
    categories = paper['categories']
    pdf_url = esc(paper['pdf_url'])

    # Only the first three authors/categories are spelled out; the rest are
    # summarised as a count.
    author_line = _oxford_comma([esc(name) for name in authors[:3]])
    if len(authors) > 3:
        author_line += f" and {len(authors) - 3} others"

    category_line = ', '.join(esc(term) for term in categories[:3])
    if len(categories) > 3:
        category_line += f" +{len(categories) - 3}"

    parts = [
        f"<li>\n<strong>{index}. {esc(paper['title'])}</strong><br/>\n",
        f"👥 <strong>Authors:</strong> {author_line}<br/>\n",
        f"📅 <strong>Published:</strong> {esc(paper['published'])}<br/>\n",
        f"🏷️ <strong>Categories:</strong> {category_line}<br/>\n",
        f"🔗 <strong>arXiv ID:</strong> {esc(paper['id'])}<br/>\n",
        f"📄 <strong>PDF:</strong> <a href='{pdf_url}'>{pdf_url}</a><br/>\n",
    ]

    if include_abstract and paper['summary'] != "No abstract":
        abstract = paper['summary']
        # Truncate before escaping so an entity is never cut in half.
        if len(abstract) > 500:
            abstract = abstract[:497] + "..."
        parts.append(f"📝 <strong>Abstract:</strong><br/>{esc(abstract)}\n")

    parts.append("</li>")
    return "".join(parts)


# ---------------------------------------------------------------------------
# Rate-limiting state
# ---------------------------------------------------------------------------
# time.monotonic() timestamp associated with the most recent arXiv request.
# _search_arxiv reads and updates it to keep calls at least
# MIN_REQUEST_INTERVAL seconds apart. It lives in memory only, so it starts
# again at 0.0 whenever this module is loaded.
_last_request_time = 0.0


async def _search_arxiv(query: str, max_results: int = DEFAULT_RESULTS,
                        id_list: Optional[List[str]] = None) -> Tuple[Optional[List[Dict]], Optional[str]]:
    """
    Query the arXiv API, honouring the module-wide rate limit.

    Args:
        query: arXiv ``search_query`` expression. Ignored when *id_list* is given.
        max_results: Maximum number of entries to request.
        id_list: Optional arXiv identifiers to fetch directly instead of searching.

    Returns:
        (papers, error_message)
        - papers: list of paper dicts, or None on failure.
        - error_message: None on success, otherwise a user-friendly error string.
    """
    global _last_request_time

    # ----- Throttle -----
    # Reserve the next free slot *before* sleeping. If the timestamp were only
    # updated after the sleep, several concurrent callers would compute the
    # same wait, wake up together and hit the API at once.
    now = time.monotonic()
    slot = max(now, _last_request_time + MIN_REQUEST_INTERVAL)
    _last_request_time = slot
    wait = slot - now
    if wait > 0:
        logging.debug(f"arXiv throttling: waiting {wait:.1f}s")
        await asyncio.sleep(wait)

    base_url = "http://export.arxiv.org/api/query"
    headers = {"User-Agent": USER_AGENT}

    if id_list:
        # Use the API's dedicated id_list parameter (comma-separated). Building
        # "id:a+OR+id:b" inside search_query does not work here because the
        # HTTP client percent-encodes '+', so it never reaches arXiv as a
        # separator.
        params = {"id_list": ",".join(id_list), "max_results": max_results}
        target = f"ids: {','.join(id_list)[:60]}"
    else:
        params = {
            "search_query": query,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending"
        }
        target = f"query: {query[:60]}"

    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(base_url, params=params) as response:
                if response.status == 200:
                    text = await response.text()
                    papers = _parse_arxiv_response(text)
                    logging.info(f"arXiv returned {len(papers)} papers for {target}")
                    return papers, None
                elif response.status == 429:
                    retry_after = response.headers.get("Retry-After", "unknown")
                    logging.error(f"arXiv rate limited (429). Retry-After: {retry_after}")
                    return None, "⚠️ arXiv rate limit exceeded. Please wait a moment and try again."
                else:
                    text = await response.text()
                    logging.error(f"arXiv API error {response.status}: {text[:300]}")
                    return None, f"❌ arXiv API error (HTTP {response.status})."
    except Exception as e:
        # Boundary handler: network failures and malformed XML are both
        # reported to the user as one generic message; details go to the log.
        logging.error(f"Error searching arXiv: {e}")
        return None, "❌ Network or internal error while contacting arXiv."


async def _get_category_papers(category: str, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]:
    """Fetch up to *limit* papers filed under the arXiv category *category*.

    Returns the paper list produced by ``_search_arxiv`` for a ``cat:`` query,
    or None when that search failed. The accompanying error message is
    discarded.
    """
    category_query = f"cat:{category}"
    found, _error = await _search_arxiv(category_query, limit)
    return found


async def _get_recent_papers(category: Optional[str] = None, days: int = 7) -> Optional[List[Dict]]:
    """Fetch papers submitted within the last *days* days.

    Args:
        category: Optional arXiv category code (e.g. ``cs.AI``) to restrict
            the search to; None searches all categories.
        days: Size of the look-back window in days.

    Returns:
        Up to DEFAULT_RESULTS paper dicts, or None when the search failed
        (the error message from ``_search_arxiv`` is discarded).
    """
    # The arXiv API manual specifies submittedDate in GMT, so build the window
    # from a single UTC reading rather than from local time. One reading also
    # keeps both bounds consistent if the call straddles midnight.
    now = datetime.now(timezone.utc)
    start = (now - timedelta(days=days)).strftime("%Y%m%d")
    end = now.strftime("%Y%m%d")

    query = f"submittedDate:[{start}000000 TO {end}235959]"
    if category:
        query = f"cat:{category} AND {query}"

    papers, _ = await _search_arxiv(query, DEFAULT_RESULTS)
    return papers


async def _get_random_paper() -> Optional[Dict]:
    """Pick one paper at random from a search on a randomly chosen seed topic.

    Returns None when the search fails or yields no papers.
    """
    seed_topics = [
        "machine learning",
        "quantum",
        "neural network",
        "optimization",
        "algorithm",
        "security",
    ]
    topic = random.choice(seed_topics)
    candidates, _error = await _search_arxiv(topic, max_results=MAX_RESULTS)
    if not candidates:
        return None
    return random.choice(candidates)


def _parse_arxiv_response(xml_text: str) -> List[Dict]:
    namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(xml_text)
    entries = root.findall('atom:entry', namespaces)

    papers = []
    for entry in entries:
        title = entry.find('atom:title', namespaces)
        title_text = ' '.join(title.text.strip().split()) if title is not None else "No title"

        summary = entry.find('atom:summary', namespaces)
        summary_text = ' '.join(summary.text.strip().split()) if summary is not None else "No abstract"

        authors = []
        for author in entry.findall('atom:author', namespaces):
            name = author.find('atom:name', namespaces)
            if name is not None and name.text:
                authors.append(name.text)

        id_elem = entry.find('atom:id', namespaces)
        paper_id = id_elem.text.split('/')[-1] if id_elem is not None else "Unknown"

        pdf_link = None
        for link in entry.findall('atom:link', namespaces):
            if link.get('title') == 'pdf':
                pdf_link = link.get('href')
                break

        categories = []
        for category in entry.findall('atom:category', namespaces):
            term = category.get('term')
            if term:
                categories.append(term)

        published = entry.find('atom:published', namespaces)
        pub_date = published.text.split('T')[0] if published is not None else "Unknown"

        papers.append({
            'id': paper_id,
            'title': title_text,
            'summary': summary_text,
            'authors': authors,
            'pdf_url': pdf_link or f"http://arxiv.org/pdf/{paper_id}.pdf",
            'arxiv_url': f"http://arxiv.org/abs/{paper_id}",
            'categories': categories,
            'published': pub_date
        })
    return papers


# ---------------------------------------------------------------------------
# Command Handler
# ---------------------------------------------------------------------------

# Shape of an arXiv identifier: new style ("2101.00101", optionally with a
# version suffix such as "v2") or old style ("hep-th/9901001", "math.GT/0309136").
_ARXIV_ID_RE = re.compile(
    r"(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[a-z]{2})?/\d{7})(?:v\d+)?",
    re.IGNORECASE,
)


def _looks_like_arxiv_id(token: str) -> bool:
    """Return True if *token* is shaped like an arXiv identifier."""
    return _ARXIV_ID_RE.fullmatch(token) is not None


def _extract_limit(args: List[str]) -> Tuple[int, List[str]]:
    """Split an optional numeric result limit off the front or back of *args*.

    Returns ``(limit, remaining_args)`` with the limit clamped to
    1..MAX_RESULTS; DEFAULT_RESULTS is used when no number is present.
    """
    limit = DEFAULT_RESULTS
    # isdecimal() rather than isdigit(): the latter accepts characters such
    # as "²" that int() cannot convert.
    if args and args[0].isdecimal():
        limit, args = int(args[0]), args[1:]
    elif args and args[-1].isdecimal():
        limit, args = int(args[-1]), args[:-1]
    return max(1, min(limit, MAX_RESULTS)), args


def _build_recent_query(category: Optional[str], days: int = 7) -> str:
    """Build a submittedDate range query for the last *days* days."""
    # The arXiv API manual specifies submittedDate in GMT; take one UTC
    # reading so both bounds are consistent.
    now = datetime.now(timezone.utc)
    start = (now - timedelta(days=days)).strftime("%Y%m%d")
    end = now.strftime("%Y%m%d")
    date_range = f"submittedDate:[{start}000000 TO {end}235959]"
    return f"cat:{category} AND {date_range}" if category else date_range


async def handle_command(room, message, bot, prefix, config):
    """Handle the ``!arxiv`` command and reply in *room*.

    Sub-commands: ``list <query>``, ``category <cat>``, ``recent [cat]``,
    ``random``, one or more arXiv IDs, or a free-text query. A bare number as
    the first or last argument sets the result count (capped at MAX_RESULTS).
    Messages that are not an ``arxiv`` command, or that come from the bot
    itself, are ignored.

    Args:
        room: Room object; its ``room_id`` is used for all replies.
        message: Incoming message event.
        bot: Bot instance whose ``api`` sends the replies.
        prefix: Command prefix to match.
        config: Unused here; accepted for the plugin handler signature.
    """
    import simplematrixbotlib as botlib

    match = botlib.MessageMatch(room, message, bot, prefix)
    if not (match.is_not_from_this_bot() and match.prefix() and match.command("arxiv")):
        return

    args = match.args()
    room_id = room.room_id

    # No arguments → show help
    if not args:
        help_content = (
            "<strong>Commands:</strong><br/><br/>"
            "• <code>!arxiv &lt;query&gt;</code> - Search papers<br/>"
            "• <code>!arxiv list &lt;query&gt;</code> - List without abstracts<br/>"
            "• <code>!arxiv category &lt;cat&gt;</code> - Browse category<br/>"
            "• <code>!arxiv recent [cat]</code> - Recent papers<br/>"
            "• <code>!arxiv random</code> - Random paper<br/>"
            "• <code>!arxiv &lt;id&gt;</code> - Get by ID<br/><br/>"
            # Generated from CATEGORIES so the list cannot drift out of sync.
            f"<strong>Categories:</strong> {', '.join(CATEGORIES)}"
        )
        response = _format_collapsible("arXiv Help", help_content, expanded=True)
        await bot.api.send_markdown_message(room_id, response)
        return

    # Extract optional numeric limit (first or last argument)
    limit, args = _extract_limit(args)
    if not args:
        # Only a number was given; there is nothing to search for.
        await bot.api.send_text_message(room_id, "Usage: !arxiv <query>")
        return

    cmd = args[0].lower()
    include_abstract = True
    search_query = ""
    id_list = None

    # ---- LIST ----
    if cmd == "list":
        if len(args) < 2:
            await bot.api.send_text_message(room_id, "Usage: !arxiv list <query>")
            return
        include_abstract = False
        search_query = " ".join(args[1:])
        status = f"🔍 Listing: *{search_query[:50]}*..."
        # The title ends up inside HTML, so the user's text must be escaped.
        title = f"Search: '{html.escape(search_query[:50])}'"

    # ---- CATEGORY ----
    elif cmd == "category":
        if len(args) < 2:
            await bot.api.send_text_message(
                room_id,
                f"Usage: !arxiv category <category>. Available: {', '.join(CATEGORIES.keys())}")
            return
        cat_key = args[1].lower()
        if cat_key not in CATEGORIES:
            await bot.api.send_text_message(room_id,
                                            f"Unknown category. Available: {', '.join(CATEGORIES.keys())}")
            return
        search_query = f"cat:{CATEGORIES[cat_key]}"
        status = f"📚 Fetching {cat_key.upper()} papers..."
        title = f"Recent Papers in {cat_key.upper()}"

    # ---- RECENT ----
    elif cmd == "recent":
        cat_key = args[1].lower() if len(args) >= 2 else None
        if cat_key in CATEGORIES:
            category = CATEGORIES[cat_key]
            status = f"📚 Fetching recent {args[1].upper()} papers..."
            title = f"Recent Papers in {args[1].upper()} (7 Days)"
        else:
            # Missing or unrecognised category → recent papers from everywhere.
            category = None
            status = "📚 Fetching recent papers..."
            title = "Recent Papers (Last 7 Days)"
        search_query = _build_recent_query(category, days=7)

    # ---- RANDOM ----
    elif cmd == "random":
        await bot.api.send_text_message(room_id, "🎲 Fetching random paper...")
        paper = await _get_random_paper()
        if paper:
            content = f"<ul>\n{_format_paper(paper, 1, True)}\n</ul>"
            response = _format_collapsible("Random Paper", content, True)
            await bot.api.send_markdown_message(room_id, response)
        else:
            await bot.api.send_text_message(room_id, "❌ Failed to fetch random paper (rate limit or API error).")
        return   # early return – we already sent the result

    # ---- ID LOOKUP ----
    # A strict pattern is used so that ordinary queries which merely start
    # with a digit or contain a dot ("3d reconstruction", "node.js") are
    # still treated as searches.
    elif _looks_like_arxiv_id(args[0]):
        id_list = [arg for arg in args if _looks_like_arxiv_id(arg)]
        limit = len(id_list)
        status = "📚 Fetching paper(s)..."
        title = "Paper Details"

    # ---- DEFAULT SEARCH ----
    else:
        search_query = " ".join(args)
        status = f"🔍 Searching: *{search_query[:50]}*..."
        title = f"Search: '{html.escape(search_query[:50])}'"

    await bot.api.send_text_message(room_id, status)
    papers, error_msg = await _search_arxiv(search_query, limit, id_list=id_list)
    if error_msg:
        await bot.api.send_text_message(room_id, error_msg)
        return

    # If we get here, papers is a list (possibly empty)
    if not papers:
        await bot.api.send_text_message(room_id, "❌ No papers found.")
        return

    items = "".join(
        _format_paper(paper, i, include_abstract) + "\n"
        for i, paper in enumerate(papers, 1)
    )
    content = f"<ul>\n{items}</ul>\n\n<em>Found {len(papers)} papers</em>"

    response = _format_collapsible(title, content, False)
    await bot.api.send_markdown_message(room_id, response)
    logging.info("Sent arXiv search results")


# ---------------------------------------------------------------------------
# Plugin Setup
# ---------------------------------------------------------------------------

def setup(bot):
    logging.info("arXiv plugin loaded")


# ---------------------------------------------------------------------------
# Plugin Metadata
# ---------------------------------------------------------------------------

__version__ = "1.0.2"
__author__ = "Funguy Bot"
__description__ = "arXiv academic paper search (with rate limiting and error reporting)"
# HTML help snippet for this plugin. The category list must mirror the keys
# of CATEGORIES.
__help__ = """
<details>
<summary><strong>!arxiv</strong> – Search academic papers on arXiv</summary>
<ul>
<li><code>!arxiv &lt;query&gt;</code> – Search papers (shows abstracts)</li>
<li><code>!arxiv list &lt;query&gt;</code> – List without abstracts</li>
<li><code>!arxiv category &lt;category&gt;</code> – Browse recent papers by category</li>
<li><code>!arxiv recent [category]</code> – Most recent papers (7 days)</li>
<li><code>!arxiv random</code> – Random paper</li>
<li><code>!arxiv &lt;id&gt;</code> – Get paper by arXiv ID (e.g., 2101.00101)</li>
</ul>
<p><strong>Categories:</strong> ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, economics, software</p>
</details>
"""