Files
FunguyBot/plugins/arxiv.py
T

337 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
arXiv Paper Search Plugin for Funguy Bot
Searches academic papers in physics, mathematics, computer science, and more.
Uses arXiv API - completely free, no API key required.
Commands:
!arxiv <query> - Search for papers (shows abstract)
!arxiv list <query> - List papers without abstracts
!arxiv category <category> - Browse recent papers by category
!arxiv recent [category] - Recent papers (last 7 days)
!arxiv random - Random paper
!arxiv <id> - Get paper by arXiv ID
"""
import logging
import aiohttp
import xml.etree.ElementTree as ET
import random
from typing import Optional, Dict, List
from datetime import datetime, timedelta
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Number of papers returned when the user does not ask for a specific count.
DEFAULT_RESULTS = 3
# Upper bound applied to any user-supplied result count.
MAX_RESULTS = 10

# User-facing category aliases mapped to the identifiers used in `cat:` queries.
CATEGORIES = dict(
    ai="cs.AI",
    ml="cs.LG",
    security="cs.CR",
    crypto="cs.CR",  # same archive as "security"
    cv="cs.CV",
    nlp="cs.CL",
    math="math",
    physics="physics",
    quantum="quant-ph",
    bio="q-bio",
    economics="econ",
    software="cs.SE",
)
# ---------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------
def _format_collapsible(title: str, content: str, expanded: bool = False) -> str:
"""Format content in a collapsible details/summary block."""
open_attr = ' open' if expanded else ''
return f"<details{open_attr}>\n<summary>📚 {title}</summary>\n\n{content}\n\n</details>"
def _oxford_comma(items):
if not items:
return ""
if len(items) == 1:
return items[0]
if len(items) == 2:
return f"{items[0]} and {items[1]}"
return f"{', '.join(items[:-1])}, and {items[-1]}"
def _format_paper(paper: Dict, index: int, include_abstract: bool = True) -> str:
"""Format a paper as an HTML list item."""
result = f"<li>\n<strong>{index}. {paper['title']}</strong><br/>\n"
# Authors
result += f"👥 <strong>Authors:</strong> {_oxford_comma(paper['authors'][:3])}"
if len(paper['authors']) > 3:
result += f" and {len(paper['authors']) - 3} others"
result += "<br/>\n"
# Metadata
result += f"📅 <strong>Published:</strong> {paper['published']}<br/>\n"
result += f"🏷️ <strong>Categories:</strong> {', '.join(paper['categories'][:3])}"
if len(paper['categories']) > 3:
result += f" +{len(paper['categories']) - 3}"
result += "<br/>\n"
# Links
result += f"🔗 <strong>arXiv ID:</strong> {paper['id']}<br/>\n"
result += f"📄 <strong>PDF:</strong> <a href='{paper['pdf_url']}'>{paper['pdf_url']}</a><br/>\n"
# Abstract
if include_abstract and paper['summary'] != "No abstract":
abstract = paper['summary']
if len(abstract) > 500:
abstract = abstract[:497] + "..."
result += f"📝 <strong>Abstract:</strong><br/>{abstract}\n"
result += "</li>"
return result
async def _search_arxiv(query: str, max_results: int = DEFAULT_RESULTS, id_list: Optional[List[str]] = None) -> Optional[List[Dict]]:
    """Query the arXiv API and return the parsed papers.

    Args:
        query: Search expression sent as ``search_query``. Ignored when
            ``id_list`` is given.
        max_results: Maximum number of entries requested from the API.
        id_list: Optional arXiv identifiers to fetch directly.

    Returns:
        A list of paper dictionaries (possibly empty), or ``None`` when the
        request fails, the server answers with a non-200 status, or the
        response cannot be parsed.
    """
    base_url = "http://export.arxiv.org/api/query"
    if id_list:
        # Use the API's dedicated comma-separated ``id_list`` parameter.
        # Building "id:a+OR+id:b" by hand and passing it through ``params``
        # gets the "+" percent-encoded, so the server would not see it as
        # an OR between terms.
        params = {"id_list": ",".join(id_list), "max_results": max_results}
    else:
        params = {
            "search_query": query,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url, params=params) as response:
                if response.status != 200:
                    logging.error(f"arXiv API returned HTTP {response.status}")
                    return None
                text = await response.text()
        # Parsing stays inside the try so malformed XML also yields None.
        return _parse_arxiv_response(text)
    except Exception as e:
        # Boundary handler: callers treat None as "lookup failed".
        logging.error(f"Error searching arXiv: {e}")
        return None
async def _get_category_papers(category: str, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]:
    """Search arXiv for papers in *category*, returning at most *limit* results.

    The category is turned into a ``cat:<category>`` query and handed to
    ``_search_arxiv``; its result (a list of papers or ``None``) is returned
    unchanged.
    """
    category_query = "cat:{}".format(category)
    papers = await _search_arxiv(category_query, limit)
    return papers
async def _get_recent_papers(category: Optional[str] = None, days: int = 7, limit: int = DEFAULT_RESULTS) -> Optional[List[Dict]]:
    """Search for papers submitted within the last *days* days.

    Args:
        category: arXiv category identifier to restrict the search to, or
            ``None`` for all categories.
        days: Size of the look-back window, counted in whole days back from
            today.
        limit: Maximum number of papers to request.

    Returns:
        Whatever ``_search_arxiv`` returns for the date-range query: a list
        of papers, or ``None`` on failure.
    """
    from datetime import timezone  # function-scope: the module imports only datetime/timedelta

    # Take the clock reading once so both ends of the range agree; the
    # arXiv API documents submittedDate bounds as GMT.
    now = datetime.now(timezone.utc)
    start_day = (now - timedelta(days=days)).strftime("%Y%m%d")
    end_day = now.strftime("%Y%m%d")
    date_filter = f"submittedDate:[{start_day}000000 TO {end_day}235959]"

    query = f"cat:{category} AND {date_filter}" if category else date_filter
    return await _search_arxiv(query, limit)
async def _get_random_paper() -> Optional[Dict]:
    """Pick a paper at random from a search on a randomly chosen topic.

    One of a fixed set of search terms is chosen, up to ``MAX_RESULTS``
    papers are fetched for it, and one of those is returned. Returns
    ``None`` when the search fails or finds nothing.
    """
    seed_terms = ["machine learning", "quantum", "neural network", "optimization", "algorithm", "security"]
    candidates = await _search_arxiv(random.choice(seed_terms), max_results=MAX_RESULTS)
    if not candidates:
        return None
    return random.choice(candidates)
def _parse_arxiv_response(xml_text: str) -> List[Dict]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
root = ET.fromstring(xml_text)
entries = root.findall('atom:entry', namespaces)
papers = []
for entry in entries:
title = entry.find('atom:title', namespaces)
title_text = ' '.join(title.text.strip().split()) if title is not None else "No title"
summary = entry.find('atom:summary', namespaces)
summary_text = ' '.join(summary.text.strip().split()) if summary is not None else "No abstract"
authors = []
for author in entry.findall('atom:author', namespaces):
name = author.find('atom:name', namespaces)
if name is not None and name.text:
authors.append(name.text)
id_elem = entry.find('atom:id', namespaces)
paper_id = id_elem.text.split('/')[-1] if id_elem is not None else "Unknown"
pdf_link = None
for link in entry.findall('atom:link', namespaces):
if link.get('title') == 'pdf':
pdf_link = link.get('href')
break
categories = []
for category in entry.findall('atom:category', namespaces):
term = category.get('term')
if term:
categories.append(term)
published = entry.find('atom:published', namespaces)
pub_date = published.text.split('T')[0] if published is not None else "Unknown"
papers.append({
'id': paper_id,
'title': title_text,
'summary': summary_text,
'authors': authors,
'pdf_url': pdf_link or f"http://arxiv.org/pdf/{paper_id}.pdf",
'arxiv_url': f"http://arxiv.org/abs/{paper_id}",
'categories': categories,
'published': pub_date
})
return papers
# ---------------------------------------------------------------------------
# Command Handler
# ---------------------------------------------------------------------------
async def handle_command(room, message, bot, prefix, config):
    """Handle the ``arxiv`` chat command and reply in the originating room.

    Recognised forms (after the prefix and command name):
        (no arguments)          show help
        list <query>            search, omitting abstracts
        category <alias>        browse a category from ``CATEGORIES``
        recent [alias]          papers from the last 7 days
        random                  one random paper
        <id> [<id> ...]         fetch papers by arXiv identifier
        <query>                 free-text search

    A purely numeric first or last argument is taken as the desired number
    of results and clamped to the range 1..``MAX_RESULTS``.

    Args:
        room: Room object; ``room.room_id`` is used as the reply target.
        message: Incoming message passed to ``MessageMatch``.
        bot: Bot object; replies are sent through ``bot.api``.
        prefix: Command prefix passed to ``MessageMatch``.
        config: Unused here.
    """
    import re
    from html import escape

    import simplematrixbotlib as botlib

    match = botlib.MessageMatch(room, message, bot, prefix)
    if not (match.is_not_from_this_bot() and match.prefix() and match.command("arxiv")):
        return

    args = match.args()
    if not args:
        help_content = (
            "<strong>Commands:</strong><br/><br/>"
            "• <code>!arxiv &lt;query&gt;</code> - Search papers<br/>"
            "• <code>!arxiv list &lt;query&gt;</code> - List without abstracts<br/>"
            "• <code>!arxiv category &lt;cat&gt;</code> - Browse category<br/>"
            "• <code>!arxiv recent [cat]</code> - Recent papers<br/>"
            "• <code>!arxiv random</code> - Random paper<br/>"
            "• <code>!arxiv &lt;id&gt;</code> - Get by ID<br/><br/>"
            # Generated from CATEGORIES so the list cannot drift out of date.
            f"<strong>Categories:</strong> {', '.join(CATEGORIES)}"
        )
        response = _format_collapsible("arXiv Help", help_content, expanded=True)
        await bot.api.send_markdown_message(room.room_id, response)
        return

    # Optional result count as the first or last argument.
    limit = DEFAULT_RESULTS
    if args[0].isdecimal():
        limit = int(args[0])
        args = args[1:]
    elif args[-1].isdecimal():
        limit = int(args[-1])
        args = args[:-1]
    limit = max(1, min(limit, MAX_RESULTS))

    if not args:
        # Only a count was given; there is nothing to search for.
        await bot.api.send_text_message(room.room_id, "Usage: !arxiv <query> [count]")
        return

    # New-style (2101.00101, optional vN) and old-style (hep-th/9901001) ids.
    arxiv_id = re.compile(
        r"(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[a-z]{2})?/\d{7})(?:v\d+)?",
        re.IGNORECASE,
    )

    cmd = args[0].lower()
    include_abstract = True

    if cmd == "list":
        if len(args) < 2:
            await bot.api.send_text_message(room.room_id, "Usage: !arxiv list <query>")
            return
        include_abstract = False
        query = " ".join(args[1:])
        await bot.api.send_text_message(room.room_id, f"🔍 Searching: *{query[:50]}*...")
        papers = await _search_arxiv(query, limit)
        title = f"Search: '{escape(query[:50])}'"
    elif cmd == "category" and len(args) >= 2:
        cat_key = args[1].lower()
        if cat_key not in CATEGORIES:
            await bot.api.send_text_message(room.room_id, f"Unknown category. Available: {', '.join(CATEGORIES.keys())}")
            return
        category = CATEGORIES[cat_key]
        await bot.api.send_text_message(room.room_id, f"📚 Fetching {cat_key.upper()} papers...")
        papers = await _get_category_papers(category, limit)
        title = f"Recent Papers in {cat_key.upper()}"
    elif cmd == "recent":
        category = None
        if len(args) >= 2 and args[1].lower() in CATEGORIES:
            category = CATEGORIES[args[1].lower()]
            await bot.api.send_text_message(room.room_id, f"📚 Fetching recent {args[1].upper()} papers...")
            title = f"Recent Papers in {args[1].upper()} (7 Days)"
        else:
            await bot.api.send_text_message(room.room_id, "📚 Fetching recent papers...")
            title = "Recent Papers (Last 7 Days)"
        # The helper's second parameter is the look-back window in days,
        # not a result count, so the 7-day window is passed explicitly and
        # the requested count is applied to whatever comes back.
        papers = await _get_recent_papers(category, days=7)
        if papers:
            papers = papers[:limit]
    elif cmd == "random":
        await bot.api.send_text_message(room.room_id, "🎲 Fetching random paper...")
        paper = await _get_random_paper()
        if paper:
            content = f"<ul>\n{_format_paper(paper, 1, True)}\n</ul>"
            response = _format_collapsible("Random Paper", content, True)
            await bot.api.send_markdown_message(room.room_id, response)
        else:
            await bot.api.send_text_message(room.room_id, "❌ Failed to fetch random paper.")
        return
    elif arxiv_id.fullmatch(args[0]):
        # Use the arguments as typed; identifiers are not lower-cased.
        paper_ids = [arg for arg in args if arxiv_id.fullmatch(arg)][:MAX_RESULTS]
        await bot.api.send_text_message(room.room_id, "📚 Fetching paper(s)...")
        papers = await _search_arxiv("", max_results=len(paper_ids), id_list=paper_ids)
        title = "Paper Details"
    else:
        query = " ".join(args)
        await bot.api.send_text_message(room.room_id, f"🔍 Searching: *{query[:50]}*...")
        papers = await _search_arxiv(query, limit)
        # The query is user input and ends up inside HTML, so escape it.
        title = f"Search: '{escape(query[:50])}'"

    if not papers:
        await bot.api.send_text_message(room.room_id, "❌ No papers found.")
        return

    items = "".join(
        _format_paper(paper, i, include_abstract) + "\n"
        for i, paper in enumerate(papers, 1)
    )
    content = f"<ul>\n{items}</ul>\n\n<em>Found {len(papers)} papers</em>"
    response = _format_collapsible(title, content, False)
    await bot.api.send_markdown_message(room.room_id, response)
    logging.info("Sent arXiv search results")
# ---------------------------------------------------------------------------
# Plugin Setup
# ---------------------------------------------------------------------------
def setup(bot):
logging.info("arXiv plugin loaded")
# ---------------------------------------------------------------------------
# Plugin Metadata
# ---------------------------------------------------------------------------
__version__ = "1.0.0"
__author__ = "Funguy Bot"
__description__ = "arXiv academic paper search"
__help__ = """
<details>
<summary><strong>!arxiv</strong> Search academic papers on arXiv</summary>
<ul>
<li><code>!arxiv &lt;query&gt;</code> Search papers (shows abstracts)</li>
<li><code>!arxiv list &lt;query&gt;</code> List without abstracts</li>
<li><code>!arxiv category &lt;category&gt;</code> Browse recent papers by category</li>
<li><code>!arxiv recent [category]</code> Most recent papers (7 days)</li>
<li><code>!arxiv random</code> Random paper</li>
<li><code>!arxiv &lt;id&gt;</code> Get paper by arXiv ID (e.g., 2101.00101)</li>
</ul>
<p><strong>Categories:</strong> ai, ml, security, crypto, cv, nlp, math, physics, quantum, bio, software</p>
</details>
"""