Files
FunguyBot/plugins/arxiv.py
T

401 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
arXiv Paper Search Plugin for Funguy Bot
Searches academic papers in physics, mathematics, computer science, and more.
Uses arXiv API - completely free, no API key required.
Commands:
!arxiv <query> - Search for papers (shows abstract)
!arxiv list <query> - List papers without abstracts
!arxiv category <category> - Browse recent papers by category
!arxiv recent [category] - Recent papers (last 7 days)
!arxiv random - Random paper
!arxiv <id> - Get paper by arXiv ID
"""
import asyncio
import html
import logging
import random
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Tuple

import aiohttp
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Number of papers requested when the user does not supply a limit.
DEFAULT_RESULTS = 3
# Upper bound applied to any user-supplied limit.
MAX_RESULTS = 10
# REQUIRED by arXiv API terms: identify your bot.
# Use a descriptive string with contact info. A Firefox User-Agent is
# also accepted, but the bot-specific one is recommended.
# Example Firefox UA: "Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0"
USER_AGENT = "FunguyBot/1.0 (mailto:your-email@example.com)"
# Minimum delay, in seconds, between successive API calls (arXiv asks for ≥3 seconds)
MIN_REQUEST_INTERVAL = 5
# Maps the short category aliases accepted by the bot commands to the arXiv
# category identifiers used in "cat:" queries. Note that "security" and
# "crypto" are two aliases for the same arXiv category.
CATEGORIES = {
"ai": "cs.AI",
"ml": "cs.LG",
"security": "cs.CR",
"crypto": "cs.CR",
"cv": "cs.CV",
"nlp": "cs.CL",
"math": "math",
"physics": "physics",
"quantum": "quant-ph",
"bio": "q-bio",
"economics": "econ",
"software": "cs.SE"
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _format_collapsible(title: str, content: str, expanded: bool = False) -> str:
    """Wrap *content* in an HTML ``<details>`` element headed by *title*.

    Args:
        title: Text shown in the ``<summary>`` line (inserted verbatim).
        content: HTML placed inside the collapsible body (inserted verbatim).
        expanded: When true, the element carries the ``open`` attribute.

    Returns:
        The assembled HTML fragment.
    """
    opening_tag = "<details open>" if expanded else "<details>"
    # Blank entries reproduce the empty lines surrounding the content.
    pieces = [
        opening_tag,
        f"<summary>📚 {title}</summary>",
        "",
        content,
        "",
        "</details>",
    ]
    return "\n".join(pieces)
def _oxford_comma(items):
    """Join *items* into an English enumeration with an Oxford comma.

    Returns ``""`` for an empty or missing sequence, the sole element for a
    single item, ``"a and b"`` for two items and ``"a, b, and c"`` for more.
    """
    if not items:
        return ""

    *leading, final = items
    if not leading:
        # Single element: hand it back untouched.
        return final
    if len(leading) == 1:
        return f"{leading[0]} and {final}"
    return f"{', '.join(leading)}, and {final}"
def _format_paper(paper: Dict, index: int, include_abstract: bool = True) -> str:
    """Render one paper as an HTML ``<li>`` element.

    All values taken from *paper* originate from the remote API response, so
    they are HTML-escaped before being interpolated into the markup (a title
    such as ``a < b`` would otherwise corrupt the output).

    Args:
        paper: Paper dict; the keys ``title``, ``authors``, ``published``,
            ``categories``, ``id``, ``pdf_url`` and ``summary`` are read.
        index: Position number shown in front of the title.
        include_abstract: When true, append the abstract unless it equals the
            ``"No abstract"`` placeholder. Abstracts longer than 500
            characters are cut to 497 characters plus ``"..."``.

    Returns:
        The HTML fragment for this paper.
    """
    def esc(value) -> str:
        # Text-node escaping; quotes are left alone so names like O'Brien
        # stay readable.
        return html.escape(str(value), quote=False)

    authors = paper['authors']
    categories = paper['categories']

    # Only the first three authors are listed; the rest are summarised.
    author_text = _oxford_comma([esc(name) for name in authors[:3]])
    if len(authors) > 3:
        author_text += f" and {len(authors) - 3} others"

    category_text = ', '.join(esc(term) for term in categories[:3])
    if len(categories) > 3:
        category_text += f" +{len(categories) - 3}"

    # The href sits inside single quotes, so quotes must be escaped there.
    pdf_href = html.escape(str(paper['pdf_url']), quote=True)
    pdf_text = esc(paper['pdf_url'])

    lines = [
        "<li>",
        f"<strong>{index}. {esc(paper['title'])}</strong><br/>",
        f"👥 <strong>Authors:</strong> {author_text}<br/>",
        f"📅 <strong>Published:</strong> {esc(paper['published'])}<br/>",
        f"🏷️ <strong>Categories:</strong> {category_text}<br/>",
        f"🔗 <strong>arXiv ID:</strong> {esc(paper['id'])}<br/>",
        f"📄 <strong>PDF:</strong> <a href='{pdf_href}'>{pdf_text}</a><br/>",
    ]

    if include_abstract and paper['summary'] != "No abstract":
        abstract = paper['summary']
        if len(abstract) > 500:
            abstract = abstract[:497] + "..."
        # Escape after truncating so an entity is never cut in half.
        lines.append(f"📝 <strong>Abstract:</strong><br/>{esc(abstract)}")

    lines.append("</li>")
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Persist last request timestamp for rate limiting
# ---------------------------------------------------------------------------
# Monotonic-clock time at which the most recently scheduled request is (or
# was) allowed to start. Shared by every call to _search_arxiv.
_last_request_time = 0.0


async def _search_arxiv(query: str, max_results: int = DEFAULT_RESULTS,
                        id_list: Optional[List[str]] = None) -> Tuple[Optional[List[Dict]], Optional[str]]:
    """Query the arXiv API, honouring the minimum interval between requests.

    Args:
        query: Value for the ``search_query`` parameter. Ignored when
            *id_list* is given.
        max_results: Maximum number of entries to request.
        id_list: Optional arXiv identifiers to fetch directly. They are sent
            through the API's ``id_list`` parameter (comma separated).

    Returns:
        A ``(papers, error_message)`` tuple.
        - papers: list of paper dicts, or None on failure.
        - error_message: None on success, otherwise a user-friendly error string.
    """
    global _last_request_time

    # ----- Throttle -----
    # Reserve the next free slot *before* sleeping. There is no await between
    # reading and writing the timestamp, so concurrent callers each get a
    # distinct slot instead of all waking up and firing at the same moment.
    now = time.monotonic()
    slot = max(now, _last_request_time + MIN_REQUEST_INTERVAL)
    _last_request_time = slot
    wait = slot - now
    if wait > 0:
        logging.debug(f"arXiv throttling: waiting {wait:.1f}s")
        await asyncio.sleep(wait)

    base_url = "http://export.arxiv.org/api/query"
    headers = {"User-Agent": USER_AGENT}

    if id_list:
        # Building "id:a+OR+id:b" by hand does not survive URL encoding of
        # the query string ('+' is sent as %2B); id_list is the dedicated
        # parameter for direct lookups.
        params = {"id_list": ",".join(id_list), "max_results": max_results}
        description = f"ids {','.join(id_list)}"
    else:
        params = {
            "search_query": query,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }
        description = query

    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(base_url, params=params) as response:
                if response.status == 200:
                    text = await response.text()
                    papers = _parse_arxiv_response(text)
                    logging.info(f"arXiv returned {len(papers)} papers for query: {description[:60]}")
                    return papers, None
                if response.status == 429:
                    retry_after = response.headers.get("Retry-After", "unknown")
                    logging.error(f"arXiv rate limited (429). Retry-After: {retry_after}")
                    return None, "⚠️ arXiv rate limit exceeded. Please wait a moment and try again."
                text = await response.text()
                logging.error(f"arXiv API error {response.status}: {text[:300]}")
                return None, f"❌ arXiv API error (HTTP {response.status})."
    except Exception as e:
        # Boundary handler: network failures and malformed XML both end up
        # here and are reported to the user as a generic error.
        logging.error(f"Error searching arXiv: {e}", exc_info=True)
        return None, "❌ Network or internal error while contacting arXiv."
async def _get_category_papers(category: str, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]:
    """Fetch up to *limit* papers for an arXiv category identifier.

    Returns the list of paper dicts, or ``None`` when the search failed; the
    error message produced by the search is discarded.
    """
    outcome = await _search_arxiv(f"cat:{category}", limit)
    return outcome[0]
async def _get_recent_papers(category: Optional[str] = None, days: int = 7) -> Optional[List[Dict]]:
    """Fetch papers submitted within the last *days* days.

    Args:
        category: Optional arXiv category identifier used to narrow the
            search; when omitted, all categories are searched.
        days: Size of the look-back window in days.

    Returns:
        The list of paper dicts, or ``None`` when the search failed.
    """
    # The arXiv API expects submittedDate bounds as YYYYMMDDHHMM in GMT, so
    # take a single UTC snapshot and derive both ends of the range from it.
    now = datetime.now(timezone.utc)
    start = (now - timedelta(days=days)).strftime("%Y%m%d") + "0000"
    end = now.strftime("%Y%m%d") + "2359"

    query = f"submittedDate:[{start} TO {end}]"
    if category:
        query = f"cat:{category} AND {query}"

    papers, _ = await _search_arxiv(query, DEFAULT_RESULTS)
    return papers
async def _get_random_paper() -> Optional[Dict]:
    """Pick one paper at random from a search on a randomly chosen topic.

    Returns ``None`` when the search failed or produced no results.
    """
    topic = random.choice(
        ["machine learning", "quantum", "neural network", "optimization", "algorithm", "security"]
    )
    found, _error = await _search_arxiv(topic, max_results=MAX_RESULTS)
    if not found:
        return None
    return random.choice(found)
def _element_text(element, default: str) -> str:
    """Return *element*'s text with whitespace runs collapsed.

    Falls back to *default* when the element is missing, has no text
    (e.g. ``<summary/>``) or contains only whitespace.
    """
    if element is None or not element.text:
        return default
    return ' '.join(element.text.split()) or default


def _parse_arxiv_response(xml_text: str) -> List[Dict]:
    """Parse an arXiv Atom feed into a list of paper dicts.

    Args:
        xml_text: The raw Atom XML returned by the API.

    Returns:
        One dict per ``<entry>`` with the keys ``id``, ``title``, ``summary``,
        ``authors``, ``pdf_url``, ``arxiv_url``, ``categories`` and
        ``published``. Missing or empty elements are replaced by the
        placeholders ``"No title"``, ``"No abstract"`` and ``"Unknown"``.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(xml_text)

    papers = []
    for entry in root.findall('atom:entry', namespaces):
        title_text = _element_text(entry.find('atom:title', namespaces), "No title")
        summary_text = _element_text(entry.find('atom:summary', namespaces), "No abstract")

        authors = [
            name.text
            for name in entry.findall('atom:author/atom:name', namespaces)
            if name.text
        ]

        # Entry ids look like http://arxiv.org/abs/<identifier>. Old-style
        # identifiers contain a slash themselves (hep-th/9901001v1), so keep
        # everything after "/abs/" rather than only the last path segment.
        raw_id = _element_text(entry.find('atom:id', namespaces), "")
        if '/abs/' in raw_id:
            paper_id = raw_id.split('/abs/', 1)[1]
        elif raw_id:
            paper_id = raw_id.rsplit('/', 1)[-1]
        else:
            paper_id = "Unknown"

        # First link flagged as the PDF, if any.
        pdf_link = next(
            (link.get('href')
             for link in entry.findall('atom:link', namespaces)
             if link.get('title') == 'pdf'),
            None,
        )

        categories = [
            category.get('term')
            for category in entry.findall('atom:category', namespaces)
            if category.get('term')
        ]

        # Keep only the date part of the ISO timestamp.
        published_text = _element_text(entry.find('atom:published', namespaces), "")
        pub_date = published_text.split('T')[0] if published_text else "Unknown"

        papers.append({
            'id': paper_id,
            'title': title_text,
            'summary': summary_text,
            'authors': authors,
            'pdf_url': pdf_link or f"http://arxiv.org/pdf/{paper_id}.pdf",
            'arxiv_url': f"http://arxiv.org/abs/{paper_id}",
            'categories': categories,
            'published': pub_date,
        })
    return papers
# ---------------------------------------------------------------------------
# Command Handler
# ---------------------------------------------------------------------------
# New-style identifiers (2101.00101, 2101.00101v2) and old-style identifiers
# (hep-th/9901001, math.GT/0309136v1).
_ARXIV_ID_RE = re.compile(
    r"(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[a-z]{2})?/\d{7})(?:v\d+)?",
    re.IGNORECASE,
)


def _is_arxiv_id(token: str) -> bool:
    """Return True when *token* is a complete arXiv identifier."""
    return _ARXIV_ID_RE.fullmatch(token) is not None


def _recent_query(category: Optional[str], days: int = 7) -> str:
    """Build a submittedDate range query covering the last *days* days."""
    # The arXiv API expects YYYYMMDDHHMM bounds in GMT.
    now = datetime.now(timezone.utc)
    start = (now - timedelta(days=days)).strftime("%Y%m%d") + "0000"
    end = now.strftime("%Y%m%d") + "2359"
    date_range = f"submittedDate:[{start} TO {end}]"
    return f"cat:{category} AND {date_range}" if category else date_range


async def handle_command(room, message, bot, prefix, config):
    """Handle the ``!arxiv`` command family for one incoming message.

    Supported forms: no arguments (help), ``list <query>``,
    ``category <cat>``, ``recent [cat]``, ``random``, one or more arXiv IDs,
    or a free-text query. A purely numeric first or last argument is taken as
    the result limit and capped at ``MAX_RESULTS``.

    Args:
        room: Room the message was received in; ``room.room_id`` is used as
            the reply target.
        message: The incoming message event.
        bot: Bot instance whose ``api`` is used to send replies.
        prefix: Command prefix to match against.
        config: Not used by this handler.
    """
    import simplematrixbotlib as botlib

    match = botlib.MessageMatch(room, message, bot, prefix)
    if not (match.is_not_from_this_bot() and match.prefix() and match.command("arxiv")):
        return

    room_id = room.room_id
    send_text = bot.api.send_text_message
    args = match.args()

    # No arguments → show help
    if not args:
        help_content = (
            "<strong>Commands:</strong><br/><br/>"
            "• <code>!arxiv &lt;query&gt;</code> - Search papers<br/>"
            "• <code>!arxiv list &lt;query&gt;</code> - List without abstracts<br/>"
            "• <code>!arxiv category &lt;cat&gt;</code> - Browse category<br/>"
            "• <code>!arxiv recent [cat]</code> - Recent papers<br/>"
            "• <code>!arxiv random</code> - Random paper<br/>"
            "• <code>!arxiv &lt;id&gt;</code> - Get by ID<br/><br/>"
            # Derived from CATEGORIES so the list cannot drift out of date.
            f"<strong>Categories:</strong> {', '.join(CATEGORIES)}"
        )
        response = _format_collapsible("arXiv Help", help_content, expanded=True)
        await bot.api.send_markdown_message(room_id, response)
        return

    limit = DEFAULT_RESULTS
    include_abstract = True

    # Extract optional numeric limit (first or last argument); keep it within
    # 1..MAX_RESULTS so a limit of 0 cannot produce an empty request.
    if args[0].isdigit():
        limit = max(1, min(int(args[0]), MAX_RESULTS))
        args = args[1:]
    elif args[-1].isdigit():
        limit = max(1, min(int(args[-1]), MAX_RESULTS))
        args = args[:-1]

    # Only a limit was given: there is nothing to search for.
    if not args:
        await send_text(room_id, "Usage: !arxiv <query>")
        return

    cmd = args[0].lower()

    # ---- LIST ----
    if cmd == "list":
        include_abstract = False
        if len(args) < 2:
            await send_text(room_id, "Usage: !arxiv list <query>")
            return
        query = " ".join(args[1:])
        await send_text(room_id, f"🔍 Listing: *{query[:50]}*...")
        papers, error_msg = await _search_arxiv(query, limit)
        # The query is user input and ends up inside HTML, so escape it.
        title = f"Search: '{html.escape(query[:50], quote=False)}'"

    # ---- CATEGORY ----
    elif cmd == "category":
        if len(args) < 2:
            await send_text(
                room_id,
                f"Usage: !arxiv category <category>. Available: {', '.join(CATEGORIES.keys())}",
            )
            return
        cat_key = args[1].lower()
        if cat_key not in CATEGORIES:
            await send_text(room_id, f"Unknown category. Available: {', '.join(CATEGORIES.keys())}")
            return
        await send_text(room_id, f"📚 Fetching {cat_key.upper()} papers...")
        papers, error_msg = await _search_arxiv(f"cat:{CATEGORIES[cat_key]}", limit)
        title = f"Recent Papers in {cat_key.upper()}"

    # ---- RECENT ----
    elif cmd == "recent":
        category = None
        if len(args) >= 2 and args[1].lower() in CATEGORIES:
            category = CATEGORIES[args[1].lower()]
            await send_text(room_id, f"📚 Fetching recent {args[1].upper()} papers...")
            title = f"Recent Papers in {args[1].upper()} (7 Days)"
        else:
            await send_text(room_id, "📚 Fetching recent papers...")
            title = "Recent Papers (Last 7 Days)"
        papers, error_msg = await _search_arxiv(_recent_query(category, days=7), limit)

    # ---- RANDOM ----
    elif cmd == "random":
        await send_text(room_id, "🎲 Fetching random paper...")
        paper = await _get_random_paper()
        if paper:
            content = f"<ul>\n{_format_paper(paper, 1, True)}\n</ul>"
            response = _format_collapsible("Random Paper", content, True)
            await bot.api.send_markdown_message(room_id, response)
        else:
            await send_text(room_id, "❌ Failed to fetch random paper (rate limit or API error).")
        return  # early return: we already sent the result

    # ---- ID LOOKUP ----
    # Match on the original-case token: old-style IDs such as math.GT/0309136
    # are case sensitive, and a strict pattern keeps queries like "3d vision"
    # or "cs.AI" from being mistaken for identifiers.
    elif _is_arxiv_id(args[0]):
        paper_ids = [arg for arg in args if _is_arxiv_id(arg)]
        await send_text(room_id, "📚 Fetching paper(s)...")
        papers, error_msg = await _search_arxiv("", max_results=len(paper_ids), id_list=paper_ids)
        title = "Paper Details"

    # ---- DEFAULT SEARCH ----
    else:
        query = " ".join(args)
        await send_text(room_id, f"🔍 Searching: *{query[:50]}*...")
        papers, error_msg = await _search_arxiv(query, limit)
        title = f"Search: '{html.escape(query[:50], quote=False)}'"

    if error_msg:
        await send_text(room_id, error_msg)
        return

    # If we get here, papers is a list (possibly empty)
    if not papers:
        await send_text(room_id, "❌ No papers found.")
        return

    items = "\n".join(
        _format_paper(paper, i, include_abstract) for i, paper in enumerate(papers, 1)
    )
    content = f"<ul>\n{items}\n</ul>\n\n<em>Found {len(papers)} papers</em>"
    response = _format_collapsible(title, content, False)
    await bot.api.send_markdown_message(room_id, response)
    logging.info("Sent arXiv search results")
# ---------------------------------------------------------------------------
# Plugin Setup
# ---------------------------------------------------------------------------
def setup(bot):
logging.info("arXiv plugin loaded")
# ---------------------------------------------------------------------------
# Plugin Metadata
# ---------------------------------------------------------------------------
# Descriptive metadata for this plugin (presumably read by the bot's plugin
# loader; confirm against the loader).
__version__ = "1.0.2"
__author__ = "Funguy Bot"
__description__ = "arXiv academic paper search (with rate limiting and error reporting)"
# HTML help fragment. The category list must name every key of CATEGORIES;
# "economics" was previously missing.
__help__ = """
<details>
<summary><strong>!arxiv</strong> Search academic papers on arXiv</summary>
<ul>
<li><code>!arxiv &lt;query&gt;</code> Search papers (shows abstracts)</li>
<li><code>!arxiv list &lt;query&gt;</code> List without abstracts</li>
<li><code>!arxiv category &lt;category&gt;</code> Browse recent papers by category</li>
<li><code>!arxiv recent [category]</code> Most recent papers (7 days)</li>
<li><code>!arxiv random</code> Random paper</li>
<li><code>!arxiv &lt;id&gt;</code> Get paper by arXiv ID (e.g., 2101.00101)</li>
</ul>
<p><strong>Categories:</strong> ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, economics, software</p>
</details>
"""