From 3c3eae0c399ec888658d6db2dc1d58a52e9c67f5 Mon Sep 17 00:00:00 2001 From: Aciid <703382+Aciid@users.noreply.github.com> Date: Wed, 8 Mar 2023 14:57:26 +0200 Subject: [PATCH] wikipedia.py: add smart trimming for extract, use redirects, use canonical URL in content, convert spaces to underscore in urls. --- modules/wikipedia.py | 45 +++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/modules/wikipedia.py b/modules/wikipedia.py index e59be5a..bc1d08b 100644 --- a/modules/wikipedia.py +++ b/modules/wikipedia.py @@ -4,6 +4,7 @@ import requests from modules.common.module import BotModule + # This module searches wikipedia for query, returns page summary and link. class MatrixModule(BotModule): def __init__(self, name): @@ -14,32 +15,36 @@ class MatrixModule(BotModule): args = event.body.split() if len(args) > 1: - query = event.body[len(args[0])+1:] + query = event.body[len(args[0]) + 1:] try: response = requests.get(self.api_url, params={ 'action': 'query', - 'prop': 'extracts', + 'format': 'json', 'exintro': True, 'explaintext': True, + 'prop': 'extracts', + 'redirects': 1, 'titles': query, - 'format': 'json', - 'formatversion': 2 }) response.raise_for_status() data = response.json() - if 'query' not in data or 'pages' not in data['query'] or len(data['query']['pages']) == 0: + + # Get the page id + page_id = list(data['query']['pages'].keys())[0] + + if page_id == '-1': await bot.send_text(room, 'No results found') return - page = data['query']['pages'][0] + # Get the page title + title = data['query']['pages'][page_id]['title'] - if 'extract' not in page: - await bot.send_text(room, 'No results found') - return + # Get the page summary + summary = data['query']['pages'][page_id]['extract'] # Remove all html tags - extract = re.sub('<[^<]+?>', '', page['extract']) + extract = re.sub('<[^<]+?>', '', summary) # Remove any multiple spaces extract = re.sub(' +', ' ', extract) # Remove any new lines @@ -47,13 +52,23 @@ class MatrixModule(BotModule): # Remove any tabs extract = re.sub('\t', '', extract) - # Truncate to 500 chars - extract = extract[:500] + # Truncate the extract, Element URL preview contains nonsense Wikipedia meta content + if len(extract) <= 256: + pass + else: + extract = ' '.join(extract[:256 + 1].split(' ')[0:-1]) + '...' - # Add a link to the page - extract = extract + '\nhttps://en.wikipedia.org/?curid=' + str(page['pageid']) + # Get the page url + url = f'https://en.wikipedia.org/wiki/{title}' - await bot.send_text(room, extract) + # Convert all spaces to underscores in url + url = re.sub(r'\s', '_', url) + + # Format the response + response = f'{title}: {extract} \n{url}' + + # Send the response + await bot.send_text(room, response) return except Exception as exc: await bot.send_text(room, str(exc))