Files
FunguyBot/plugins/quote.py
T

200 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Goodreads Quote Scraper — Playwright (headless Chromium)
No external APIs, no keys; scrapes directly from goodreads.com
"""
import logging
import random
import re
import asyncio
import simplematrixbotlib as botlib
from bs4 import BeautifulSoup
from urllib.parse import urlencode
logger = logging.getLogger("quote")
GR_POPULAR = "https://www.goodreads.com/quotes"
GR_SEARCH = "https://www.goodreads.com/quotes/search"
QUOTES_PER_PAGE = 30
MAX_SEARCH_PAGES = 3
# ---------------------------------------------------------------------------
# Playwright browser (shared, launched once)
# ---------------------------------------------------------------------------
_browser = None
_playwright = None
async def _get_browser():
    """Return the shared headless Chromium browser, launching it on first use.

    Lazily starts Playwright and a headless Chromium instance, caching both
    in the module globals ``_browser`` / ``_playwright`` so every later call
    reuses the same browser.

    NOTE(review): first-call initialization is not guarded by a lock — two
    tasks racing here could each launch a browser; confirm callers serialize
    the first command.
    """
    global _browser, _playwright
    if _browser is None:
        # Imported lazily so the module can be loaded even before Playwright
        # is actually needed.
        from playwright.async_api import async_playwright

        _playwright = await async_playwright().start()
        _browser = await _playwright.chromium.launch(headless=True)
        logger.info("Playwright browser started")
    return _browser
async def _close_browser():
    """Shut down the shared browser and Playwright driver, if running.

    Idempotent: both globals are reset to ``None`` after closing, so calling
    this twice is harmless and a later ``_get_browser()`` relaunches cleanly.
    """
    global _browser, _playwright
    if _browser:
        await _browser.close()
        _browser = None
    if _playwright:
        await _playwright.stop()
        _playwright = None
# ---------------------------------------------------------------------------
# HTML parsing (Goodreads specific)
# ---------------------------------------------------------------------------
def _extract_quotes(html: str) -> list[dict]:
    """Parse Goodreads HTML and return a list of {content, author} dicts.

    Primary path: each ``div.quoteText`` contains the quote between curly
    quotes, followed by an em-dash and the author.  Fallback path: parse
    ``div.quoteDetails`` wrappers — but only when the primary path found
    nothing, since those wrappers typically contain the same ``quoteText``
    divs and running both unconditionally duplicated every quote.
    """
    soup = BeautifulSoup(html, "lxml")
    quotes = []
    for div in soup.find_all("div", class_="quoteText"):
        full_text = div.get_text(" ", strip=True)
        # Prefer the text between curly quotes; otherwise take everything
        # before the horizontal-bar dash that introduces the author.
        m = re.search(r"“(.+?)”", full_text)
        if not m:
            m = re.search(r"(.+?)\s*―", full_text)
        if not m:
            continue
        content = m.group(1).strip()
        author_span = div.find("span", class_="authorOrTitle")
        author = author_span.get_text(strip=True).rstrip(",") if author_span else "Unknown"
        quotes.append({"content": content, "author": author})
    # Alternative layout — only if the first method yielded nothing
    # (fixes duplicate quotes: the original ran this loop unconditionally).
    if not quotes:
        for div in soup.find_all("div", class_="quoteDetails"):
            text_elem = div.find("div", class_="quoteText")
            if not text_elem:
                continue
            content = text_elem.get_text(strip=True).strip("“”")
            author_elem = div.find("span", class_="authorOrTitle")
            author = author_elem.get_text(strip=True).rstrip(",") if author_elem else "Unknown"
            quotes.append({"content": content, "author": author})
    return quotes
# ---------------------------------------------------------------------------
# Page fetching
# ---------------------------------------------------------------------------
async def _scrape(url: str, params: dict | None = None) -> str:
    """Load *url* in a fresh browser context and return the rendered HTML.

    params: optional query parameters URL-encoded onto *url*.
    Returns "" on any navigation failure (logged, never raised), so callers
    can treat an empty string as "no quotes".
    """
    browser = await _get_browser()
    # Fresh context per request: isolated cookies, plus a desktop UA string
    # so Goodreads serves the full site.
    context = await browser.new_context(
        user_agent=(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/125.0.0.0 Safari/537.36"
        )
    )
    page = await context.new_page()
    # Build the URL before the try block: the except handler logs full_url,
    # which previously could NameError if URL construction itself failed.
    full_url = f"{url}?{urlencode(params)}" if params else url
    try:
        # networkidle: wait until the page's network activity settles so
        # JS-rendered content is present before grabbing the HTML.
        await page.goto(full_url, wait_until="networkidle", timeout=15000)
        return await page.content()
    except Exception as e:
        logger.error(f"Failed to load {full_url}: {e}")
        return ""
    finally:
        # Always release the page and context, even on timeout/navigation
        # errors, to avoid leaking browser resources.
        await page.close()
        await context.close()
async def get_random_popular() -> list[dict]:
    """Fetch Goodreads' popular-quotes page and return the parsed quotes."""
    page_html = await _scrape(GR_POPULAR)
    return _extract_quotes(page_html)
async def get_author_quotes(author: str) -> list[dict]:
    """Search Goodreads for quotes by *author*, collecting across pages.

    Walks result pages up to MAX_SEARCH_PAGES, stopping early as soon as a
    page returns fewer than QUOTES_PER_PAGE quotes (i.e. the last page).
    """
    collected: list[dict] = []
    page_no = 1
    while page_no <= MAX_SEARCH_PAGES:
        html = await _scrape(
            GR_SEARCH, {"q": author, "commit": "Search", "page": page_no}
        )
        found = _extract_quotes(html)
        collected.extend(found)
        if len(found) < QUOTES_PER_PAGE:
            break
        page_no += 1
    return collected
# ---------------------------------------------------------------------------
# Formatting
# ---------------------------------------------------------------------------
def format_quote(q: dict) -> str:
    """Render a quote dict as message text: quoted content, blank line, author."""
    body = q["content"]
    who = q["author"]
    return f'"{body}"\n\n{who}'
# ---------------------------------------------------------------------------
# Command handler
# ---------------------------------------------------------------------------
async def handle_command(room, message, bot, prefix, config):
    """Handle ``!quote`` messages: popular quote, author search, or help.

    room/message/bot: simplematrixbotlib objects for the incoming event.
    prefix: the bot's command prefix.
    config: part of the plugin handler signature; unused in this handler.
    """
    match = botlib.MessageMatch(room, message, bot, prefix)
    # Ignore our own messages and anything that isn't a prefixed !quote.
    if not (match.is_not_from_this_bot() and match.prefix() and match.command("quote")):
        return
    args = match.args()

    # Help request
    if args and args[0].lower() in ("help", "-h", "--help"):
        help_html = (
            "<details><summary><strong>📖 !quote help</strong></summary>"
            "<ul>"
            "<li><code>!quote</code> random popular quote from Goodreads</li>"
            "<li><code>!quote &lt;author&gt;</code> random quote by that author</li>"
            "<li><code>!quote help</code> this</li>"
            "</ul>"
            "<p><b>Examples:</b><br><code>!quote</code><br>"
            "<code>!quote Terence McKenna</code><br>"
            "<code>!quote Oscar Wilde</code></p>"
            "<p>Scraped with Playwright (headless browser).</p>"
            "</details>"
        )
        await bot.api.send_markdown_message(room.room_id, help_html)
        return

    try:
        if args:
            # Author search: all args joined form the author name.
            author = " ".join(args).strip()
            await bot.api.send_text_message(
                room.room_id, f"🔍 Searching Goodreads for quotes by **{author}**…"
            )
            quotes = await get_author_quotes(author)
            if not quotes:
                await bot.api.send_text_message(
                    room.room_id,
                    f"❌ No quotes found for '**{author}**'. Try a different spelling."
                )
                return
        else:
            await bot.api.send_text_message(room.room_id, "✨ Fetching a random popular quote…")
            quotes = await get_random_popular()
            if not quotes:
                await bot.api.send_text_message(room.room_id, "❌ Could not fetch any quotes.")
                return
        # Common tail (deduplicated from both branches): pick one quote
        # from whichever source succeeded and send it.
        chosen = random.choice(quotes)
        await bot.api.send_markdown_message(room.room_id, format_quote(chosen))
        logger.info(f"Quote sent: {chosen['author']}")
    except Exception as e:
        # Top-level boundary: report scraping failures to the room rather
        # than letting the exception escape the bot's event loop.
        logger.exception("Unexpected error in quote plugin")
        await bot.api.send_text_message(
            room.room_id, f"❌ Scraping error: {e}"
        )
# ---------------------------------------------------------------------------
# Plugin metadata
# ---------------------------------------------------------------------------
# Plugin metadata — presumably read by the bot's plugin loader / help
# system (TODO confirm against the loader).
__version__ = "1.0.1"
__author__ = "Funguy Bot"
__description__ = "Goodreads quotes via Playwright (headless browser)"
# HTML help blob; same content as the inline !quote help response.
__help__ = """
<details>
<summary><strong>!quote</strong> Quotes from Goodreads (scraped with Playwright)</summary>
<ul>
<li><code>!quote</code> random popular quote</li>
<li><code>!quote &lt;author&gt;</code> random quote by that author</li>
<li><code>!quote help</code></li>
</ul>
<p>No API keys, no JSON files just a real browser fetching from Goodreads.</p>
</details>
"""