Add Wikipedia plugin with MediaWiki API integration

- Added new !wp command for fetching Wikipedia summaries and images
- Uses aiohttp for async HTTP requests to Wikipedia's REST API
- Implements multiple search strategies for better accuracy:
  1. OpenSearch API for fast title resolution
  2. Full-text search as fallback
  3. Direct hyphenated title matching for compounds
- Returns clean article extracts with main images
- No BeautifulSoup or HTML scraping - pure API approach
- Added bs4 dependency to requirements.txt for any other plugins that might need it

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-30 22:48:28 -05:00
parent c24893e141
commit 76b0c87e0c
2 changed files with 283 additions and 1 deletions
+282
View File
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Wikipedia plugin for FunguyBot — uses MediaWiki APIs exclusively.
No HTML scraping, no BeautifulSoup, no regex on article HTML.
Commands:
!wp <search term> - Fetch summary + main image
!wp help - Show usage
Title resolution (strategies in order):
1. Action API `opensearch` — fast, returns the most likely title directly.
2. Action API `list=search` — fulltext search, good fallback.
3. Direct hyphenated title (spaces → hyphens) — works for many drugs/chemicals.
All errors are logged. No API key required.
"""
import html
import logging
import os
import re
import tempfile
from urllib.parse import quote

import aiohttp
import simplematrixbotlib as botlib
# ----------------------------------------------------------------------
# Plugin lifecycle
# ----------------------------------------------------------------------
def setup(bot):
logging.info("Wikipedia plugin (APIonly) loaded. !wp command ready.")
# ----------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------
# Descriptive User-Agent: Wikimedia API etiquette asks clients to identify
# themselves with contact information.
HEADERS = {'User-Agent': 'FunguyBot/2.0 (Matrix bot; educational; contact: your-email@example.com)'}
# Base URL for English Wikipedia. NOTE(review): not referenced by the
# functions below — each builds its own per-language URL; verify before removing.
BASE = 'https://en.wikipedia.org'
# ----------------------------------------------------------------------
# REST API: summary (the cleanest way to get extract + image)
# ----------------------------------------------------------------------
async def _fetch_summary(session: aiohttp.ClientSession, title: str, lang: str = 'en') -> dict | None:
    """
    Fetch the REST summary for *title*.

    GET /api/rest_v1/page/summary/<title>
    Wikipedia follows redirects automatically (e.g. 4-aco-dmt -> 4-AcO-DMT).
    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    slug = quote(title.replace(' ', '_'))
    endpoint = f'https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}'
    try:
        async with session.get(endpoint, timeout=aiohttp.ClientTimeout(total=12)) as resp:
            logging.info(f'Summary [{resp.status}] for {title!r}')
            if resp.status != 200:
                return None
            # content_type=None: accept any Content-Type the endpoint reports.
            return await resp.json(content_type=None)
    except Exception as e:
        logging.warning(f'Summary fetch error for {title!r}: {e}')
        return None
# ----------------------------------------------------------------------
# Strategy 1: OpenSearch Wikipedia's "suggest" API, returns exact title
# ----------------------------------------------------------------------
async def _strategy_opensearch(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """
    Strategy 1: resolve *query* via the Action API's opensearch endpoint.

    GET /w/api.php?action=opensearch&search=<query>&namespace=0&limit=1
    The response is a list: [query, [title1, ...], [desc1, ...], [url1, ...]];
    the first suggested title is usually the best match.

    Returns the suggested title, or None on error / no suggestion.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'opensearch',
        'search': query,
        'namespace': 0,
        'limit': 1,
        'format': 'json'
    }
    try:
        # CONSISTENCY FIX: use an explicit aiohttp.ClientTimeout like the
        # other fetch helpers in this file, instead of a bare integer.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'OpenSearch [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json()
                # data = [query, [title], [description], [url]]
                if len(data) >= 2 and data[1]:
                    title = data[1][0]
                    logging.info(f'OpenSearch found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'OpenSearch error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 2: Action API fulltext search (list=search)
# ----------------------------------------------------------------------
async def _strategy_action_search(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """
    Strategy 2: resolve *query* via the Action API fulltext search.

    GET /w/api.php?action=query&list=search&srsearch=<query>&srlimit=1
    Returns the top-ranked article title, or None on error / no results.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': query,
        'srlimit': 1,
        'srnamespace': 0,
        'format': 'json',
    }
    try:
        # CONSISTENCY FIX: use an explicit aiohttp.ClientTimeout like the
        # other fetch helpers in this file, instead of a bare integer.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'Action search [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json(content_type=None)
                results = data.get('query', {}).get('search', [])
                if results:
                    title = results[0].get('title')
                    logging.info(f'Action search found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'Action search error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 3: Direct hyphenated title (spaces → hyphens)
# ----------------------------------------------------------------------
async def _strategy_direct_hyphenated(session: aiohttp.ClientSession, query: str, lang: str) -> dict | None:
    """
    Strategy 3: try the query with spaces swapped for hyphens as a literal title.

    Wikipedia redirects mean "4-aco-dmt" -> "4-AcO-DMT" transparently.
    Returns the full summary dict when the page exists, is a standard
    article, and has an extract; otherwise None. Queries without spaces
    are skipped entirely.
    """
    if ' ' not in query:
        return None
    candidate = '-'.join(query.split(' '))
    summary = await _fetch_summary(session, candidate, lang)
    if not summary:
        return None
    if summary.get('type') == 'standard' and summary.get('extract'):
        logging.info(f'Direct hyphenated hit: {candidate!r} -> {summary.get("title")!r}')
        return summary
    return None
# ----------------------------------------------------------------------
# Main fetch (orchestration)
# ----------------------------------------------------------------------
async def fetch_wikipedia(query: str, lang: str = 'en') -> tuple:
    """
    Resolve *query* to a Wikipedia article and fetch its lead summary.

    Strategies run in order: opensearch title suggestion, fulltext search,
    then a direct hyphenated-title lookup (for compound names like
    "4 aco dmt").

    Returns (title, extract, article_url, image_path | None, error_msg | None).
    On failure the first four slots are None and error_msg explains why.
    """
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        summary_data = None
        title = None
        # Strategy 1: OpenSearch (fastest, most accurate)
        title = await _strategy_opensearch(session, query, lang)
        if title:
            summary_data = await _fetch_summary(session, title, lang)
        # Strategy 2: Fulltext search
        if not summary_data:
            title = await _strategy_action_search(session, query, lang)
            if title:
                summary_data = await _fetch_summary(session, title, lang)
        # Strategy 3: Hyphenated fallback (for compounds like "4-aco-dmt")
        if not summary_data:
            summary_data = await _strategy_direct_hyphenated(session, query, lang)
        if not summary_data:
            logging.error(f'All strategies exhausted for {query!r}')
            return None, None, None, None, f'No Wikipedia article found for "{query}".'
        page_type = summary_data.get('type', 'standard')
        if page_type == 'disambiguation':
            url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
            # FIX: restore the em dash lost to an encoding mishap — without it
            # the message read as a run-on sentence.
            return (
                None, None, None, None,
                f'"{query}" is a disambiguation page — try a more specific term.'
                + (f' See: {url}' if url else '')
            )
        if page_type == 'no-extract' or not summary_data.get('extract'):
            return None, None, None, None, f'No summary available for "{query}".'
        title = summary_data.get('title', query)
        extract = summary_data.get('extract', '').strip()
        url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
        # Trim long extracts (Matrix messages have ~4KB limit, but keep reasonable).
        # FIX: the truncation marker had degraded to an empty string (the
        # ellipsis was stripped by an encoding mishap), so trimmed extracts
        # ended mid-sentence with no visual cue. Append a real ellipsis.
        if len(extract) > 1200:
            extract = extract[:1200].rsplit(' ', 1)[0] + '…'
        # Image: prefer originalimage, fall back to thumbnail
        image_url = (
            summary_data.get('originalimage', {}).get('source')
            or summary_data.get('thumbnail', {}).get('source')
        )
        image_path = await _download_image(session, image_url) if image_url else None
        return title, extract, url, image_path, None
# ----------------------------------------------------------------------
# Image download helper
# ----------------------------------------------------------------------
async def _download_image(session: aiohttp.ClientSession, image_url: str) -> str | None:
    """
    Download *image_url* to a temporary file and return its path, or None.

    Protocol-relative URLs (//upload.wikimedia.org/...) are promoted to
    https. The file suffix is guessed from the Content-Type header
    (default .jpg); the caller is responsible for deleting the file.
    """
    if not image_url:
        return None
    if image_url.startswith('//'):
        image_url = 'https:' + image_url
    try:
        async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
            if resp.status != 200:
                return None
            content_type = resp.headers.get('Content-Type', '')
            if 'png' in content_type:
                suffix = '.png'
            elif 'gif' in content_type:
                suffix = '.gif'
            else:
                suffix = '.jpg'
            payload = await resp.read()
            # delete=False: the path outlives this function; the caller
            # unlinks it after sending.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(payload)
                return tmp.name
    except Exception as e:
        logging.warning(f'Image download failed: {e}')
        return None
# ----------------------------------------------------------------------
# Formatting (HTML for Matrix)
# ----------------------------------------------------------------------
def format_response(title: str, extract: str, url: str) -> str:
    """
    Render an article summary as a collapsible HTML snippet for Matrix.

    FIX: title, extract, and url were previously interpolated into HTML
    verbatim, so characters like '&' or '<' in article text corrupted the
    markup. All three are now HTML-escaped; newlines in the extract still
    become <br> tags (escaping happens first, so the tags survive).
    """
    safe_title = html.escape(title)
    safe_url = html.escape(url, quote=True)
    body = html.escape(extract).replace('\n', '<br>')
    return (
        f'<details>'
        f'<summary><strong>📖 Wikipedia: {safe_title}</strong></summary>'
        f'<p>{body}</p>'
        f'<p><a href="{safe_url}">🔗 Read full article</a></p>'
        f'</details>'
    )
# ----------------------------------------------------------------------
# Command handler (called by FunguyBot)
# ----------------------------------------------------------------------
async def handle_command(room, message, bot, prefix, config):
    """
    FunguyBot entry point for the !wp command.

    Shows help for `!wp help` or no arguments; otherwise resolves the
    query via fetch_wikipedia() and sends the formatted summary, plus
    the article's main image when one was downloaded.
    """
    match = botlib.MessageMatch(room, message, bot, prefix)
    # Ignore the bot's own messages and anything that isn't a prefixed !wp.
    if not (match.is_not_from_this_bot() and match.prefix() and match.command('wp')):
        return
    args = match.args()
    if not args or args[0].lower() == 'help':
        help_text = """
<details>
<summary><strong>📖 Wikipedia Help</strong></summary>
<p>
<strong>!wp &lt;search term&gt;</strong> Get the lead section and main image from Wikipedia.<br>
<strong>!wp help</strong> Show this help.<br><br>
<strong>Examples:</strong><br>
<code>!wp 4 aco dmt</code><br>
<code>!wp psilocybin</code><br>
<code>!wp Albert Einstein</code><br>
<code>!wp Python programming language</code>
</p>
</details>
"""
        await bot.api.send_markdown_message(room.room_id, help_text)
        return
    query = ' '.join(args).strip()
    # Immediate feedback: resolution can take several API round-trips.
    await bot.api.send_text_message(room.room_id, f'🔍 Searching Wikipedia for: {query}')
    try:
        title, extract, url, image_path, error = await fetch_wikipedia(query)
    except Exception as e:
        # Last-resort guard: report unexpected failures to the room rather
        # than dying silently.
        logging.exception('Unexpected error in wikipedia plugin')
        await bot.api.send_text_message(room.room_id, f'❌ Unexpected error: {e}')
        return
    if error:
        await bot.api.send_text_message(room.room_id, f'{error}')
        return
    await bot.api.send_markdown_message(room.room_id, format_response(title, extract, url))
    # Send the downloaded image (if any), then always delete the temp file.
    if image_path and os.path.exists(image_path):
        try:
            await bot.api.send_image_message(room_id=room.room_id, image_filepath=image_path)
        except Exception as e:
            logging.warning(f'Image send failed: {e}')
        finally:
            try:
                os.unlink(image_path)
            except OSError:
                pass