Add Wikipedia plugin with MediaWiki API integration

- Added new !wp command for fetching Wikipedia summaries and images
- Uses aiohttp for async HTTP requests to Wikipedia's REST API
- Implements multiple search strategies for better accuracy:
  1. OpenSearch API for fast title resolution
  2. Full-text search as fallback
  3. Direct hyphenated title matching for compounds
- Returns clean article extracts with main images
- No BeautifulSoup or HTML scraping - pure API approach
- Added bs4 dependency to requirements.txt for any other plugins that might need it

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-30 22:48:28 -05:00
parent c24893e141
commit 76b0c87e0c
2 changed files with 283 additions and 1 deletions
+282
View File
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Wikipedia plugin for FunguyBot — uses MediaWiki APIs exclusively.
No HTML scraping, no BeautifulSoup, no regex on article HTML.
Commands:
!wp <search term> - Fetch summary + main image
!wp help - Show usage
Title resolution (strategies in order):
1. Action API `opensearch` — fast, returns the most likely title directly.
2. Action API `list=search` — fulltext search, good fallback.
3. Direct hyphenated title (spaces → hyphens) — works for many drugs/chemicals.
All errors are logged. No API key required.
"""
import html
import logging
import os
import re
import tempfile
from urllib.parse import quote

import aiohttp
import simplematrixbotlib as botlib
# ----------------------------------------------------------------------
# Plugin lifecycle
# ----------------------------------------------------------------------
def setup(bot):
logging.info("Wikipedia plugin (APIonly) loaded. !wp command ready.")
# ----------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------
# Descriptive User-Agent: Wikimedia API etiquette asks clients to identify
# themselves with contact information.
HEADERS = {'User-Agent': 'FunguyBot/2.0 (Matrix bot; educational; contact: your-email@example.com)'}
# Base URL for English Wikipedia. NOTE(review): not referenced by the
# functions below — each builds its own per-language URL; verify before removing.
BASE = 'https://en.wikipedia.org'
# ----------------------------------------------------------------------
# REST API: summary (the cleanest way to get extract + image)
# ----------------------------------------------------------------------
async def _fetch_summary(session: aiohttp.ClientSession, title: str, lang: str = 'en') -> dict | None:
    """
    Fetch the REST summary for *title*.

    GET /api/rest_v1/page/summary/<title>
    Wikipedia follows redirects automatically (e.g. 4-aco-dmt -> 4-AcO-DMT).
    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    slug = quote(title.replace(' ', '_'))
    endpoint = f'https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}'
    try:
        async with session.get(endpoint, timeout=aiohttp.ClientTimeout(total=12)) as resp:
            logging.info(f'Summary [{resp.status}] for {title!r}')
            if resp.status != 200:
                return None
            # content_type=None: accept any Content-Type the endpoint reports.
            return await resp.json(content_type=None)
    except Exception as e:
        logging.warning(f'Summary fetch error for {title!r}: {e}')
        return None
# ----------------------------------------------------------------------
# Strategy 1: OpenSearch Wikipedia's "suggest" API, returns exact title
# ----------------------------------------------------------------------
async def _strategy_opensearch(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """
    Strategy 1: resolve *query* via the Action API's opensearch endpoint.

    GET /w/api.php?action=opensearch&search=<query>&namespace=0&limit=1
    The response is a list: [query, [title1, ...], [desc1, ...], [url1, ...]];
    the first suggested title is usually the best match.

    Returns the suggested title, or None on error / no suggestion.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'opensearch',
        'search': query,
        'namespace': 0,
        'limit': 1,
        'format': 'json'
    }
    try:
        # CONSISTENCY FIX: use an explicit aiohttp.ClientTimeout like the
        # other fetch helpers in this file, instead of a bare integer.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'OpenSearch [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json()
                # data = [query, [title], [description], [url]]
                if len(data) >= 2 and data[1]:
                    title = data[1][0]
                    logging.info(f'OpenSearch found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'OpenSearch error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 2: Action API fulltext search (list=search)
# ----------------------------------------------------------------------
async def _strategy_action_search(session: aiohttp.ClientSession, query: str, lang: str) -> str | None:
    """
    Strategy 2: resolve *query* via the Action API fulltext search.

    GET /w/api.php?action=query&list=search&srsearch=<query>&srlimit=1
    Returns the top-ranked article title, or None on error / no results.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': query,
        'srlimit': 1,
        'srnamespace': 0,
        'format': 'json',
    }
    try:
        # CONSISTENCY FIX: use an explicit aiohttp.ClientTimeout like the
        # other fetch helpers in this file, instead of a bare integer.
        async with session.get(url, params=params, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            logging.info(f'Action search [{resp.status}] for {query!r}')
            if resp.status == 200:
                data = await resp.json(content_type=None)
                results = data.get('query', {}).get('search', [])
                if results:
                    title = results[0].get('title')
                    logging.info(f'Action search found: {title!r}')
                    return title
    except Exception as e:
        logging.warning(f'Action search error: {e}')
    return None
# ----------------------------------------------------------------------
# Strategy 3: Direct hyphenated title (spaces → hyphens)
# ----------------------------------------------------------------------
async def _strategy_direct_hyphenated(session: aiohttp.ClientSession, query: str, lang: str) -> dict | None:
    """
    Strategy 3: try the query with spaces swapped for hyphens as a literal title.

    Wikipedia redirects mean "4-aco-dmt" -> "4-AcO-DMT" transparently.
    Returns the full summary dict when the page exists, is a standard
    article, and has an extract; otherwise None. Queries without spaces
    are skipped entirely.
    """
    if ' ' not in query:
        return None
    candidate = '-'.join(query.split(' '))
    summary = await _fetch_summary(session, candidate, lang)
    if not summary:
        return None
    if summary.get('type') == 'standard' and summary.get('extract'):
        logging.info(f'Direct hyphenated hit: {candidate!r} -> {summary.get("title")!r}')
        return summary
    return None
# ----------------------------------------------------------------------
# Main fetch (orchestration)
# ----------------------------------------------------------------------
async def fetch_wikipedia(query: str, lang: str = 'en') -> tuple:
    """
    Resolve *query* to a Wikipedia article and fetch its lead summary.

    Strategies run in order: opensearch title suggestion, fulltext search,
    then a direct hyphenated-title lookup (for compound names like
    "4 aco dmt").

    Returns (title, extract, article_url, image_path | None, error_msg | None).
    On failure the first four slots are None and error_msg explains why.
    """
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        summary_data = None
        title = None
        # Strategy 1: OpenSearch (fastest, most accurate)
        title = await _strategy_opensearch(session, query, lang)
        if title:
            summary_data = await _fetch_summary(session, title, lang)
        # Strategy 2: Fulltext search
        if not summary_data:
            title = await _strategy_action_search(session, query, lang)
            if title:
                summary_data = await _fetch_summary(session, title, lang)
        # Strategy 3: Hyphenated fallback (for compounds like "4-aco-dmt")
        if not summary_data:
            summary_data = await _strategy_direct_hyphenated(session, query, lang)
        if not summary_data:
            logging.error(f'All strategies exhausted for {query!r}')
            return None, None, None, None, f'No Wikipedia article found for "{query}".'
        page_type = summary_data.get('type', 'standard')
        if page_type == 'disambiguation':
            url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
            # FIX: restore the em dash lost to an encoding mishap — without it
            # the message read as a run-on sentence.
            return (
                None, None, None, None,
                f'"{query}" is a disambiguation page — try a more specific term.'
                + (f' See: {url}' if url else '')
            )
        if page_type == 'no-extract' or not summary_data.get('extract'):
            return None, None, None, None, f'No summary available for "{query}".'
        title = summary_data.get('title', query)
        extract = summary_data.get('extract', '').strip()
        url = summary_data.get('content_urls', {}).get('desktop', {}).get('page', '')
        # Trim long extracts (Matrix messages have ~4KB limit, but keep reasonable).
        # FIX: the truncation marker had degraded to an empty string (the
        # ellipsis was stripped by an encoding mishap), so trimmed extracts
        # ended mid-sentence with no visual cue. Append a real ellipsis.
        if len(extract) > 1200:
            extract = extract[:1200].rsplit(' ', 1)[0] + '…'
        # Image: prefer originalimage, fall back to thumbnail
        image_url = (
            summary_data.get('originalimage', {}).get('source')
            or summary_data.get('thumbnail', {}).get('source')
        )
        image_path = await _download_image(session, image_url) if image_url else None
        return title, extract, url, image_path, None
# ----------------------------------------------------------------------
# Image download helper
# ----------------------------------------------------------------------
async def _download_image(session: aiohttp.ClientSession, image_url: str) -> str | None:
    """
    Download *image_url* to a temporary file and return its path, or None.

    Protocol-relative URLs (//upload.wikimedia.org/...) are promoted to
    https. The file suffix is guessed from the Content-Type header
    (default .jpg); the caller is responsible for deleting the file.
    """
    if not image_url:
        return None
    if image_url.startswith('//'):
        image_url = 'https:' + image_url
    try:
        async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
            if resp.status != 200:
                return None
            content_type = resp.headers.get('Content-Type', '')
            if 'png' in content_type:
                suffix = '.png'
            elif 'gif' in content_type:
                suffix = '.gif'
            else:
                suffix = '.jpg'
            payload = await resp.read()
            # delete=False: the path outlives this function; the caller
            # unlinks it after sending.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(payload)
                return tmp.name
    except Exception as e:
        logging.warning(f'Image download failed: {e}')
        return None
# ----------------------------------------------------------------------
# Formatting (HTML for Matrix)
# ----------------------------------------------------------------------
def format_response(title: str, extract: str, url: str) -> str:
    """
    Render an article summary as a collapsible HTML snippet for Matrix.

    FIX: title, extract, and url were previously interpolated into HTML
    verbatim, so characters like '&' or '<' in article text corrupted the
    markup. All three are now HTML-escaped; newlines in the extract still
    become <br> tags (escaping happens first, so the tags survive).
    """
    safe_title = html.escape(title)
    safe_url = html.escape(url, quote=True)
    body = html.escape(extract).replace('\n', '<br>')
    return (
        f'<details>'
        f'<summary><strong>📖 Wikipedia: {safe_title}</strong></summary>'
        f'<p>{body}</p>'
        f'<p><a href="{safe_url}">🔗 Read full article</a></p>'
        f'</details>'
    )
# ----------------------------------------------------------------------
# Command handler (called by FunguyBot)
# ----------------------------------------------------------------------
async def handle_command(room, message, bot, prefix, config):
    """
    FunguyBot entry point for the !wp command.

    Shows help for `!wp help` or no arguments; otherwise resolves the
    query via fetch_wikipedia() and sends the formatted summary, plus
    the article's main image when one was downloaded.
    """
    match = botlib.MessageMatch(room, message, bot, prefix)
    # Ignore the bot's own messages and anything that isn't a prefixed !wp.
    if not (match.is_not_from_this_bot() and match.prefix() and match.command('wp')):
        return
    args = match.args()
    if not args or args[0].lower() == 'help':
        help_text = """
<details>
<summary><strong>📖 Wikipedia Help</strong></summary>
<p>
<strong>!wp &lt;search term&gt;</strong> Get the lead section and main image from Wikipedia.<br>
<strong>!wp help</strong> Show this help.<br><br>
<strong>Examples:</strong><br>
<code>!wp 4 aco dmt</code><br>
<code>!wp psilocybin</code><br>
<code>!wp Albert Einstein</code><br>
<code>!wp Python programming language</code>
</p>
</details>
"""
        await bot.api.send_markdown_message(room.room_id, help_text)
        return
    query = ' '.join(args).strip()
    # Immediate feedback: resolution can take several API round-trips.
    await bot.api.send_text_message(room.room_id, f'🔍 Searching Wikipedia for: {query}')
    try:
        title, extract, url, image_path, error = await fetch_wikipedia(query)
    except Exception as e:
        # Last-resort guard: report unexpected failures to the room rather
        # than dying silently.
        logging.exception('Unexpected error in wikipedia plugin')
        await bot.api.send_text_message(room.room_id, f'❌ Unexpected error: {e}')
        return
    if error:
        await bot.api.send_text_message(room.room_id, f'{error}')
        return
    await bot.api.send_markdown_message(room.room_id, format_response(title, extract, url))
    # Send the downloaded image (if any), then always delete the temp file.
    if image_path and os.path.exists(image_path):
        try:
            await bot.api.send_image_message(room_id=room.room_id, image_filepath=image_path)
        except Exception as e:
            logging.warning(f'Image send failed: {e}')
        finally:
            try:
                os.unlink(image_path)
            except OSError:
                pass