Merge pull request #235 from Aciid/wikipedia
wikipedia.py: fixes smart trim, redirects, canonical urls
This commit is contained in:
commit
bf3e4a7c7c
|
@ -4,6 +4,7 @@ import requests
|
||||||
|
|
||||||
from modules.common.module import BotModule
|
from modules.common.module import BotModule
|
||||||
|
|
||||||
|
|
||||||
# This module searches wikipedia for query, returns page summary and link.
|
# This module searches wikipedia for query, returns page summary and link.
|
||||||
class MatrixModule(BotModule):
|
class MatrixModule(BotModule):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
|
@ -14,32 +15,36 @@ class MatrixModule(BotModule):
|
||||||
args = event.body.split()
|
args = event.body.split()
|
||||||
|
|
||||||
if len(args) > 1:
|
if len(args) > 1:
|
||||||
query = event.body[len(args[0])+1:]
|
query = event.body[len(args[0]) + 1:]
|
||||||
try:
|
try:
|
||||||
response = requests.get(self.api_url, params={
|
response = requests.get(self.api_url, params={
|
||||||
'action': 'query',
|
'action': 'query',
|
||||||
'prop': 'extracts',
|
'format': 'json',
|
||||||
'exintro': True,
|
'exintro': True,
|
||||||
'explaintext': True,
|
'explaintext': True,
|
||||||
|
'prop': 'extracts',
|
||||||
|
'redirects': 1,
|
||||||
'titles': query,
|
'titles': query,
|
||||||
'format': 'json',
|
|
||||||
'formatversion': 2
|
|
||||||
})
|
})
|
||||||
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
if 'query' not in data or 'pages' not in data['query'] or len(data['query']['pages']) == 0:
|
|
||||||
|
# Get the page id
|
||||||
|
page_id = list(data['query']['pages'].keys())[0]
|
||||||
|
|
||||||
|
if page_id == '-1':
|
||||||
await bot.send_text(room, 'No results found')
|
await bot.send_text(room, 'No results found')
|
||||||
return
|
return
|
||||||
|
|
||||||
page = data['query']['pages'][0]
|
# Get the page title
|
||||||
|
title = data['query']['pages'][page_id]['title']
|
||||||
|
|
||||||
if 'extract' not in page:
|
# Get the page summary
|
||||||
await bot.send_text(room, 'No results found')
|
summary = data['query']['pages'][page_id]['extract']
|
||||||
return
|
|
||||||
|
|
||||||
# Remove all html tags
|
# Remove all html tags
|
||||||
extract = re.sub('<[^<]+?>', '', page['extract'])
|
extract = re.sub('<[^<]+?>', '', summary)
|
||||||
# Remove any multiple spaces
|
# Remove any multiple spaces
|
||||||
extract = re.sub(' +', ' ', extract)
|
extract = re.sub(' +', ' ', extract)
|
||||||
# Remove any new lines
|
# Remove any new lines
|
||||||
|
@ -47,13 +52,23 @@ class MatrixModule(BotModule):
|
||||||
# Remove any tabs
|
# Remove any tabs
|
||||||
extract = re.sub('\t', '', extract)
|
extract = re.sub('\t', '', extract)
|
||||||
|
|
||||||
# Truncate to 500 chars
|
# Truncate the extract, Element URL preview contains nonsense Wikipedia meta content
|
||||||
extract = extract[:500]
|
if len(extract) <= 256:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
extract = ' '.join(extract[:256 + 1].split(' ')[0:-1]) + '...'
|
||||||
|
|
||||||
# Add a link to the page
|
# Get the page url
|
||||||
extract = extract + '\nhttps://en.wikipedia.org/?curid=' + str(page['pageid'])
|
url = f'https://en.wikipedia.org/wiki/{title}'
|
||||||
|
|
||||||
await bot.send_text(room, extract)
|
# Convert all spaces to underscores in url
|
||||||
|
url = re.sub(r'\s', '_', url)
|
||||||
|
|
||||||
|
# Format the response
|
||||||
|
response = f'{title}: {extract} \n{url}'
|
||||||
|
|
||||||
|
# Send the response
|
||||||
|
await bot.send_text(room, response)
|
||||||
return
|
return
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await bot.send_text(room, str(exc))
|
await bot.send_text(room, str(exc))
|
||||||
|
|
Loading…
Reference in New Issue