Files
FunguyBot/plugins/wikipedia.py
T

299 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Wikipedia plugin for FunguyBot uses MediaWiki APIs exclusively.
No HTML scraping, no BeautifulSoup, no regex on article HTML.
Commands:
!wp <search term> - Fetch summary + main image
!wp help - Show usage
Title resolution (strategies in order):
1. Action API `opensearch` — fast, returns the most likely title directly.
2. Action API `list=search` — fulltext search, good fallback.
3. Direct hyphenated title (spaces → hyphens) — works for many drugs/chemicals.
All errors are logged. No API key required.
"""
import html
import logging
import os
import re
import tempfile
from urllib.parse import quote

import aiohttp
import simplematrixbotlib as botlib
# ----------------------------------------------------------------------
# Plugin lifecycle
# ----------------------------------------------------------------------
def setup(bot):
    """Plugin entry point called by FunguyBot's loader; only logs readiness."""
    logging.info("Wikipedia plugin (APIonly) loaded. !wp command ready.")
# ----------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------
# Identify the bot per Wikimedia API etiquette; update the contact address before deploying.
HEADERS = {'User-Agent': 'FunguyBot/2.0 (Matrix bot; educational; contact: your-email@example.com)'}
# English Wikipedia base URL. NOTE(review): appears unused — the fetch helpers
# build per-language URLs themselves; confirm before removing.
BASE = 'https://en.wikipedia.org'
# ----------------------------------------------------------------------
# REST API: summary (the cleanest way to get extract + image)
# ----------------------------------------------------------------------
async def _fetch_summary(session: aiohttp.ClientSession, title: str, lang: str = 'en') -> dict | None:
    """Fetch the REST v1 page summary for *title*.

    GET /api/rest_v1/page/summary/<title>
    Wikipedia follows redirects automatically (e.g. 4-aco-dmt -> 4-AcO-DMT).
    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    path_title = quote(title.replace(' ', '_'))
    endpoint = f'https://{lang}.wikipedia.org/api/rest_v1/page/summary/{path_title}'
    try:
        async with session.get(endpoint, timeout=aiohttp.ClientTimeout(total=12)) as resp:
            logging.info(f'Summary [{resp.status}] for {title!r}')
            if resp.status != 200:
                return None
            # content_type=None: the REST endpoint may not label the body application/json.
            return await resp.json(content_type=None)
    except Exception as e:
        logging.warning(f'Summary fetch error for {title!r}: {e}')
        return None
# ----------------------------------------------------------------------
# Strategy 1: OpenSearch Wikipedia's "suggest" API, returns exact title
# ----------------------------------------------------------------------
async def _strategy_opensearch(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """Resolve *query* to an article title via the OpenSearch suggest API.

    GET /w/api.php?action=opensearch&search=<query>&namespace=0&limit=1
    The response is a list: [query, [title1, ...], [desc1, ...], [url1, ...]];
    the first suggested title is usually the best match.

    Returns:
        The title string, or None on no match, non-200 status, or exception.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'opensearch',
        'search': query,
        'namespace': 0,
        'limit': 1,
        'format': 'json'
    }
    try:
        # Use a ClientTimeout object instead of a bare int: passing a plain
        # number to `timeout` is deprecated in aiohttp, and this matches the
        # other fetch helpers in this file.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'OpenSearch [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json()
                # data = [query, [title], [description], [url]]
                if len(data) >= 2 and data[1]:
                    title = data[1][0]
                    logging.info(f'OpenSearch found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'OpenSearch error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 2: Action API fulltext search (list=search)
# ----------------------------------------------------------------------
async def _strategy_action_search(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """Resolve *query* via the Action API fulltext search (list=search).

    Fallback for when OpenSearch finds nothing; slower but fuzzier matching.

    Returns:
        The top result's title, or None on no match, non-200 status, or exception.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': query,
        'srlimit': 1,
        'srnamespace': 0,
        'format': 'json',
    }
    try:
        # ClientTimeout object rather than a bare int — consistent with the
        # other helpers, and the numeric form is deprecated in aiohttp.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'Action search [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json(content_type=None)
                results = data.get('query', {}).get('search', [])
                if results:
                    title = results[0].get('title')
                    logging.info(f'Action search found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'Action search error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 3: Direct hyphenated title (spaces → hyphens)
# ----------------------------------------------------------------------
async def _strategy_direct_hyphenated(session: aiohttp.ClientSession, query: str, lang: str) -> dict | None:
    """Try the query with spaces replaced by hyphens, hitting the summary API directly.

    Wikipedia's redirects make "4-aco-dmt" resolve to "4-AcO-DMT" transparently.
    Returns the full summary dict on a usable hit, otherwise None.
    """
    if ' ' not in query:
        return None
    hyphenated = query.replace(' ', '-')
    data = await _fetch_summary(session, hyphenated, lang)
    # Only accept a real article with a non-empty extract.
    usable = bool(data) and data.get('type') == 'standard' and bool(data.get('extract'))
    if not usable:
        return None
    logging.info(f'Direct hyphenated hit: {hyphenated!r} -> {data.get("title")!r}')
    return data
# ----------------------------------------------------------------------
# Main fetch (orchestration)
# ----------------------------------------------------------------------
async def fetch_wikipedia(query: str, lang: str = 'en') -> tuple:
    """Resolve *query* to a Wikipedia article and fetch its summary and image.

    Tries three title-resolution strategies in order (OpenSearch, fulltext
    search, direct hyphenated title).

    Returns:
        (title, extract, article_url, image_path | None, error_msg | None).
        On failure the first four elements are None and error_msg explains why.
    """
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        summary_data = None
        title = None
        # Strategy 1: OpenSearch (fastest, most accurate)
        title = await _strategy_opensearch(session, query, lang)
        if title:
            summary_data = await _fetch_summary(session, title, lang)
        # Strategy 2: Fulltext search
        if not summary_data:
            title = await _strategy_action_search(session, query, lang)
            if title:
                summary_data = await _fetch_summary(session, title, lang)
        # Strategy 3: Hyphenated fallback (for compounds like "4-aco-dmt")
        if not summary_data:
            summary_data = await _strategy_direct_hyphenated(session, query, lang)
        if not summary_data:
            logging.error(f'All strategies exhausted for {query!r}')
            return None, None, None, None, f'No Wikipedia article found for "{query}".'
        page_type = summary_data.get('type', 'standard')
        if page_type == 'disambiguation':
            url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
            # Fixed garbled message: the dash between "page" and "try" had been
            # lost to an encoding mangle ("...page try a more specific term.").
            return (
                None, None, None, None,
                f'"{query}" is a disambiguation page — try a more specific term.'
                + (f' See: {url}' if url else '')
            )
        if page_type == 'no-extract' or not summary_data.get('extract'):
            return None, None, None, None, f'No summary available for "{query}".'
        title = summary_data.get('title', query)
        extract = summary_data.get('extract', '').strip()
        url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
        # Trim long extracts (Matrix messages have ~4KB limit, but keep reasonable).
        # Cut at the last word boundary and append an ellipsis — the original
        # concatenated an empty string, evidently a '…' lost to encoding.
        if len(extract) > 1200:
            extract = extract[:1200].rsplit(' ', 1)[0] + '…'
        # Image: prefer originalimage, fallback to thumbnail
        image_url = (
            summary_data.get('originalimage', {}).get('source')
            or summary_data.get('thumbnail', {}).get('source')
        )
        image_path = await _download_image(session, image_url) if image_url else None
        return title, extract, url, image_path, None
# ----------------------------------------------------------------------
# Image download helper
# ----------------------------------------------------------------------
async def _download_image(session: aiohttp.ClientSession, image_url: str) -> str | None:
    """Download *image_url* to a temporary file and return its path, or None on failure.

    The caller is responsible for deleting the returned file.
    """
    if not image_url:
        return None
    # Protocol-relative URLs (//upload.wikimedia.org/...) need a scheme prepended.
    if image_url.startswith('//'):
        image_url = 'https:' + image_url
    try:
        async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=15)) as response:
            if response.status != 200:
                return None
            # Pick a file suffix from the Content-Type; default to .jpg.
            content_type = response.headers.get('Content-Type', '')
            if 'png' in content_type:
                ext = '.png'
            elif 'gif' in content_type:
                ext = '.gif'
            else:
                ext = '.jpg'
            payload = await response.read()
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as handle:
                handle.write(payload)
                return handle.name
    except Exception as e:
        logging.warning(f'Image download failed: {e}')
        return None
# ----------------------------------------------------------------------
# Formatting (HTML for Matrix)
# ----------------------------------------------------------------------
def format_response(title: str, extract: str, url: str) -> str:
    """Build the collapsible HTML snippet posted to Matrix.

    Escapes title, extract, and URL before interpolation — article titles and
    extracts routinely contain '&', '<', '>' (e.g. "AT&T"), which would
    otherwise break the markup or inject HTML into the room.

    Returns:
        An HTML string wrapped in <details>/<summary>.
    """
    safe_title = html.escape(title)
    # Escape first, then restore intended line breaks as <br>.
    body = html.escape(extract).replace('\n', '<br>')
    safe_url = html.escape(url, quote=True)
    return (
        f'<details>'
        f'<summary><strong>📖 Wikipedia: {safe_title}</strong></summary>'
        f'<p>{body}</p>'
        f'<p><a href="{safe_url}">🔗 Read full article</a></p>'
        f'</details>'
    )
# ----------------------------------------------------------------------
# Command handler (called by FunguyBot)
# ----------------------------------------------------------------------
async def handle_command(room, message, bot, prefix, config):
    """Handle the !wp command: resolve a query, then post summary HTML and image.

    Called by FunguyBot's dispatcher for every room message; filters for the
    'wp' command itself. `config` is accepted but unused here.
    """
    match = botlib.MessageMatch(room, message, bot, prefix)
    # Ignore the bot's own messages and anything that isn't a prefixed !wp command.
    if not (match.is_not_from_this_bot() and match.prefix() and match.command('wp')):
        return
    args = match.args()
    # No arguments, or explicit "help": show usage and stop.
    if not args or args[0].lower() == 'help':
        help_text = """
<details>
<summary><strong>📖 Wikipedia Help</strong></summary>
<p>
<strong>!wp &lt;search term&gt;</strong> Get the lead section and main image from Wikipedia.<br>
<strong>!wp help</strong> Show this help.<br><br>
<strong>Examples:</strong><br>
<code>!wp 4 aco dmt</code><br>
<code>!wp psilocybin</code><br>
<code>!wp Albert Einstein</code><br>
<code>!wp Python programming language</code>
</p>
</details>
"""
        await bot.api.send_markdown_message(room.room_id, help_text)
        return
    query = ' '.join(args).strip()
    # Immediate acknowledgement — the API round-trips below can take seconds.
    await bot.api.send_text_message(room.room_id, f'🔍 Searching Wikipedia for: {query}')
    try:
        title, extract, url, image_path, error = await fetch_wikipedia(query)
    except Exception as e:
        # Last-resort guard: fetch_wikipedia handles its own errors, but a bug
        # here must not kill the dispatcher loop.
        logging.exception('Unexpected error in wikipedia plugin')
        await bot.api.send_text_message(room.room_id, f'❌ Unexpected error: {e}')
        return
    if error:
        await bot.api.send_text_message(room.room_id, f'{error}')
        return
    await bot.api.send_markdown_message(room.room_id, format_response(title, extract, url))
    # Image is optional; send it separately and always clean up the temp file.
    if image_path and os.path.exists(image_path):
        try:
            await bot.api.send_image_message(room_id=room.room_id, image_filepath=image_path)
        except Exception as e:
            logging.warning(f'Image send failed: {e}')
        finally:
            try:
                os.unlink(image_path)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# Plugin Metadata
# ---------------------------------------------------------------------------
__version__ = "1.0.0"
__author__ = "Funguy Bot"
__description__ = "Wikipedia article summary"
__help__ = """
<details>
<summary><strong>!wp</strong> Wikipedia summary</summary>
<p><code>!wp &lt;search term&gt;</code> Returns the lead section and main image from Wikipedia.<br>
Uses MediaWiki APIs, no scraping.</p>
</details>
"""