""" arXiv Paper Search Plugin for Funguy Bot Searches academic papers in physics, mathematics, computer science, and more. Uses arXiv API - completely free, no API key required. Commands: !arxiv - Search for papers (shows abstract) !arxiv list - List papers without abstracts !arxiv category - Browse recent papers by category !arxiv recent [category] - Recent papers (last 7 days) !arxiv random - Random paper !arxiv - Get paper by arXiv ID """ import asyncio import logging import time import aiohttp import xml.etree.ElementTree as ET import random from typing import Optional, Dict, List, Tuple from datetime import datetime, timedelta # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- DEFAULT_RESULTS = 3 MAX_RESULTS = 10 # REQUIRED by arXiv API terms – identify your bot. # Use a descriptive string with contact info. A Firefox User-Agent is # also accepted, but the bot-specific one is recommended. # Example Firefox UA: "Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0" USER_AGENT = "FunguyBot/1.0 (mailto:your-email@example.com)" # Minimum delay between successive API calls (arXiv asks for ≥3 seconds) MIN_REQUEST_INTERVAL = 5 CATEGORIES = { "ai": "cs.AI", "ml": "cs.LG", "security": "cs.CR", "crypto": "cs.CR", "cv": "cs.CV", "nlp": "cs.CL", "math": "math", "physics": "physics", "quantum": "quant-ph", "bio": "q-bio", "economics": "econ", "software": "cs.SE" } # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _format_collapsible(title: str, content: str, expanded: bool = False) -> str: open_attr = ' open' if expanded else '' return f"\n📚 {title}\n\n{content}\n\n" def _oxford_comma(items): if not items: return "" if len(items) == 1: return items[0] if len(items) == 2: return f"{items[0]} and {items[1]}" return f"{', '.join(items[:-1])}, and {items[-1]}" def _format_paper(paper: Dict, index: int, include_abstract: bool = True) -> str: result = f"
  • \n{index}. {paper['title']}
    \n" result += f"👥 Authors: {_oxford_comma(paper['authors'][:3])}" if len(paper['authors']) > 3: result += f" and {len(paper['authors']) - 3} others" result += "
    \n" result += f"📅 Published: {paper['published']}
    \n" result += f"🏷️ Categories: {', '.join(paper['categories'][:3])}" if len(paper['categories']) > 3: result += f" +{len(paper['categories']) - 3}" result += "
    \n" result += f"🔗 arXiv ID: {paper['id']}
    \n" result += f"📄 PDF: {paper['pdf_url']}
    \n" if include_abstract and paper['summary'] != "No abstract": abstract = paper['summary'] if len(abstract) > 500: abstract = abstract[:497] + "..." result += f"📝 Abstract:
    {abstract}\n" result += "
  • " return result # --------------------------------------------------------------------------- # Persist last request timestamp for rate limiting # --------------------------------------------------------------------------- _last_request_time = 0.0 async def _search_arxiv(query: str, max_results: int = DEFAULT_RESULTS, id_list: List[str] = None) -> Tuple[Optional[List[Dict]], Optional[str]]: """ Search arXiv API. Returns (papers, error_message). - papers: list of paper dicts, or None on failure. - error_message: None on success, otherwise a user-friendly error string. """ global _last_request_time # ----- Throttle ----- now = time.monotonic() wait = _last_request_time + MIN_REQUEST_INTERVAL - now if wait > 0: logging.debug(f"arXiv throttling: waiting {wait:.1f}s") await asyncio.sleep(wait) _last_request_time = time.monotonic() base_url = "http://export.arxiv.org/api/query" headers = {"User-Agent": USER_AGENT} if id_list: id_query = "+OR+".join(f"id:{pid}" for pid in id_list) params = {"search_query": id_query, "max_results": max_results} else: params = { "search_query": query, "max_results": max_results, "sortBy": "relevance", "sortOrder": "descending" } try: async with aiohttp.ClientSession(headers=headers) as session: async with session.get(base_url, params=params) as response: if response.status == 200: text = await response.text() papers = _parse_arxiv_response(text) logging.info(f"arXiv returned {len(papers)} papers for query: {query[:60]}") return papers, None elif response.status == 429: retry_after = response.headers.get("Retry-After", "unknown") logging.error(f"arXiv rate limited (429). Retry-After: {retry_after}") return None, "⚠️ arXiv rate limit exceeded. Please wait a moment and try again." else: text = await response.text() logging.error(f"arXiv API error {response.status}: {text[:300]}") return None, f"❌ arXiv API error (HTTP {response.status})." except Exception as e: logging.error(f"Error searching arXiv: {e}") return None, "❌ Network or internal error while contacting arXiv." async def _get_category_papers(category: str, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]: papers, _ = await _search_arxiv(f"cat:{category}", limit) return papers async def _get_recent_papers(category: str = None, days: int = 7) -> Optional[List[Dict]]: date = (datetime.now() - timedelta(days=days)).strftime("%Y%m%d") if category: query = f"cat:{category} AND submittedDate:[{date}000000 TO {datetime.now().strftime('%Y%m%d')}235959]" else: query = f"submittedDate:[{date}000000 TO {datetime.now().strftime('%Y%m%d')}235959]" papers, _ = await _search_arxiv(query, DEFAULT_RESULTS) return papers async def _get_random_paper() -> Optional[Dict]: """Returns a single random paper or None.""" terms = ["machine learning", "quantum", "neural network", "optimization", "algorithm", "security"] query = random.choice(terms) papers, _ = await _search_arxiv(query, max_results=MAX_RESULTS) return random.choice(papers) if papers else None def _parse_arxiv_response(xml_text: str) -> List[Dict]: namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'} root = ET.fromstring(xml_text) entries = root.findall('atom:entry', namespaces) papers = [] for entry in entries: title = entry.find('atom:title', namespaces) title_text = ' '.join(title.text.strip().split()) if title is not None else "No title" summary = entry.find('atom:summary', namespaces) summary_text = ' '.join(summary.text.strip().split()) if summary is not None else "No abstract" authors = [] for author in entry.findall('atom:author', namespaces): name = author.find('atom:name', namespaces) if name is not None and name.text: authors.append(name.text) id_elem = entry.find('atom:id', namespaces) paper_id = id_elem.text.split('/')[-1] if id_elem is not None else "Unknown" pdf_link = None for link in entry.findall('atom:link', namespaces): if link.get('title') == 'pdf': pdf_link = link.get('href') break categories = [] for category in entry.findall('atom:category', namespaces): term = category.get('term') if term: categories.append(term) published = entry.find('atom:published', namespaces) pub_date = published.text.split('T')[0] if published is not None else "Unknown" papers.append({ 'id': paper_id, 'title': title_text, 'summary': summary_text, 'authors': authors, 'pdf_url': pdf_link or f"http://arxiv.org/pdf/{paper_id}.pdf", 'arxiv_url': f"http://arxiv.org/abs/{paper_id}", 'categories': categories, 'published': pub_date }) return papers # --------------------------------------------------------------------------- # Command Handler # --------------------------------------------------------------------------- async def handle_command(room, message, bot, prefix, config): import simplematrixbotlib as botlib match = botlib.MessageMatch(room, message, bot, prefix) if not (match.is_not_from_this_bot() and match.prefix() and match.command("arxiv")): return args = match.args() # No arguments → show help if not args: help_content = ( "Commands:

    " "• !arxiv <query> - Search papers
    " "• !arxiv list <query> - List without abstracts
    " "• !arxiv category <cat> - Browse category
    " "• !arxiv recent [cat] - Recent papers
    " "• !arxiv random - Random paper
    " "• !arxiv <id> - Get by ID

    " "Categories: ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, software" ) response = _format_collapsible("arXiv Help", help_content, expanded=True) await bot.api.send_markdown_message(room.room_id, response) return cmd = args[0].lower() limit = DEFAULT_RESULTS include_abstract = True # Extract optional numeric limit (first or last argument) if args and args[0].isdigit(): limit = min(int(args[0]), MAX_RESULTS) args = args[1:] cmd = args[0].lower() if args else None elif args and args[-1].isdigit(): limit = min(int(args[-1]), MAX_RESULTS) args = args[:-1] cmd = args[0].lower() if args else None # ---- LIST ---- if cmd == "list": include_abstract = False if len(args) >= 2: query = " ".join(args[1:]) else: await bot.api.send_text_message(room.room_id, "Usage: !arxiv list ") return await bot.api.send_text_message(room.room_id, f"🔍 Listing: *{query[:50]}*...") papers, error_msg = await _search_arxiv(query, limit) if error_msg: await bot.api.send_text_message(room.room_id, error_msg) return title = f"Search: '{query[:50]}'" # ---- CATEGORY ---- elif cmd == "category" and len(args) >= 2: cat_key = args[1].lower() if cat_key not in CATEGORIES: await bot.api.send_text_message(room.room_id, f"Unknown category. Available: {', '.join(CATEGORIES.keys())}") return category = CATEGORIES[cat_key] await bot.api.send_text_message(room.room_id, f"📚 Fetching {cat_key.upper()} papers...") papers, error_msg = await _search_arxiv(f"cat:{category}", limit) if error_msg: await bot.api.send_text_message(room.room_id, error_msg) return title = f"Recent Papers in {cat_key.upper()}" # ---- RECENT ---- elif cmd == "recent": category = None if len(args) >= 2 and args[1].lower() in CATEGORIES: category = CATEGORIES[args[1].lower()] await bot.api.send_text_message(room.room_id, f"📚 Fetching recent {args[1].upper()} papers...") title = f"Recent Papers in {args[1].upper()} (7 Days)" else: await bot.api.send_text_message(room.room_id, "📚 Fetching recent papers...") title = "Recent Papers (Last 7 Days)" papers, error_msg = await _search_arxiv( f"cat:{category} AND submittedDate:[{(datetime.now() - timedelta(days=7)).strftime('%Y%m%d')}000000 TO {datetime.now().strftime('%Y%m%d')}235959]" if category else f"submittedDate:[{(datetime.now() - timedelta(days=7)).strftime('%Y%m%d')}000000 TO {datetime.now().strftime('%Y%m%d')}235959]", limit ) if error_msg: await bot.api.send_text_message(room.room_id, error_msg) return # ---- RANDOM ---- elif cmd == "random": await bot.api.send_text_message(room.room_id, "🎲 Fetching random paper...") paper = await _get_random_paper() if paper: content = f"
      \n{_format_paper(paper, 1, True)}\n
    " response = _format_collapsible("Random Paper", content, True) await bot.api.send_markdown_message(room.room_id, response) else: await bot.api.send_text_message(room.room_id, "❌ Failed to fetch random paper (rate limit or API error).") return # early return – we already sent the result # ---- ID LOOKUP ---- elif cmd and (cmd[0].isdigit() or ('.' in cmd and len(cmd.split('.')) == 2)): paper_ids = [cmd] + [arg for arg in args[1:] if arg[0].isdigit() or ('.' in arg and len(arg.split('.')) == 2)] if not paper_ids: await bot.api.send_text_message(room.room_id, "❌ Invalid arXiv ID.") return await bot.api.send_text_message(room.room_id, "📚 Fetching paper(s)...") papers, error_msg = await _search_arxiv("", max_results=len(paper_ids), id_list=paper_ids) if error_msg: await bot.api.send_text_message(room.room_id, error_msg) return title = "Paper Details" # ---- DEFAULT SEARCH ---- else: query = " ".join(args) await bot.api.send_text_message(room.room_id, f"🔍 Searching: *{query[:50]}*...") papers, error_msg = await _search_arxiv(query, limit) if error_msg: await bot.api.send_text_message(room.room_id, error_msg) return title = f"Search: '{query[:50]}'" # If we get here, papers is a list (possibly empty) if not papers: await bot.api.send_text_message(room.room_id, "❌ No papers found.") return content = "
      \n" for i, paper in enumerate(papers, 1): content += _format_paper(paper, i, include_abstract) + "\n" content += f"
    \n\nFound {len(papers)} papers" response = _format_collapsible(title, content, False) await bot.api.send_markdown_message(room.room_id, response) logging.info("Sent arXiv search results") # --------------------------------------------------------------------------- # Plugin Setup # --------------------------------------------------------------------------- def setup(bot): logging.info("arXiv plugin loaded") # --------------------------------------------------------------------------- # Plugin Metadata # --------------------------------------------------------------------------- __version__ = "1.0.2" __author__ = "Funguy Bot" __description__ = "arXiv academic paper search (with rate limiting and error reporting)" __help__ = """
    !arxiv – Search academic papers on arXiv
    • !arxiv <query> – Search papers (shows abstracts)
    • !arxiv list <query> – List without abstracts
    • !arxiv category <category> – Browse recent papers by category
    • !arxiv recent [category] – Most recent papers (7 days)
    • !arxiv random – Random paper
    • !arxiv <id> – Get paper by arXiv ID (e.g., 2101.00101)

    Categories: ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, software

    """