Url: support blacklisting certain urls (to work around youtube suckiness)

This commit is contained in:
Ville Ranki 2021-04-08 20:47:49 +03:00
parent 2bc62c7a37
commit 35593de3b9
2 changed files with 74 additions and 46 deletions

View File

@ -260,20 +260,24 @@ Defaults to off and needs to be activated on every room you want this.
You can choose to send titles as notices (as in Matrix spec) or normal You can choose to send titles as notices (as in Matrix spec) or normal
messages (IRC users might prefer this). This is a global setting currently. messages (IRC users might prefer this). This is a global setting currently.
You can set a blacklist: any URL containing one of the blacklisted substrings is ignored.
Commands: Commands:
* !url status - show current status * !url status - show current status
* !url title - spam titles to room * !url title - spam titles to room
* !url description - spam descriptions * !url description - spam descriptions
* !url both - spam both title and description * !url both - spam both title and description
* !url off - stop spamming * !url off - stop spamming
* !url text - send titles as normal text (must be owner) * !url text - send titles as normal text (must be owner)
* !url notice - sends titles as notices (must be owner) * !url notice - sends titles as notices (must be owner)
* !url blacklist list - set the blacklist, where list is a comma-separated list of URL substrings (must be owner)
* !url blacklist clear - clear the blacklist (must be owner)
Example: Example:
* !url status * !url status
* !url blacklist www.youtube.com,www.somethingelse.com
NOTE: Disabled by default, i.e. you also need to enable it before activating it NOTE: Disabled by default, i.e. you also need to enable it before activating it

View File

@ -33,7 +33,7 @@ class MatrixModule(BotModule):
"DESCRIPTION": "Spamming this channel with descriptions", "DESCRIPTION": "Spamming this channel with descriptions",
"BOTH": "Spamming this channel with both title and description", "BOTH": "Spamming this channel with both title and description",
} }
self.blacklist = [ ]
self.enabled = False self.enabled = False
def matrix_start(self, bot): def matrix_start(self, bot):
@ -45,9 +45,6 @@ class MatrixModule(BotModule):
bot.client.add_event_callback(self.text_cb, RoomMessageText) bot.client.add_event_callback(self.text_cb, RoomMessageText)
# extend the useragent string to contain version and bot name # extend the useragent string to contain version and bot name
self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)" self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)"
# Actually no - for example Youtube doesn't serve titles for proper Hemppa user agent!
# Lie and say we are generic Firefox. Blame Youtube..
self.useragent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
self.logger.debug(f"useragent: {self.useragent}") self.logger.debug(f"useragent: {self.useragent}")
@ -83,47 +80,59 @@ class MatrixModule(BotModule):
if status == "OFF": if status == "OFF":
return return
# extract possible urls from message try:
urls = re.findall(r"(https?://\S+)", event.body) # extract possible urls from message
urls = re.findall(r"(https?://\S+)", event.body)
# no urls, nothing to do # no urls, nothing to do
if len(urls) == 0: if len(urls) == 0:
return return
# fetch the urls and if we can see a title spit it out # fetch the urls and if we can see a title spit it out
for url in urls: for url in urls:
# fix for #98 a bit ugly, but skip all matrix.to urls # fix for #98 a bit ugly, but skip all matrix.to urls
# those are 99.99% pills and should not # those are 99.99% pills and should not
# spam the channel with matrix.to titles # spam the channel with matrix.to titles
if url.startswith("https://matrix.to/#/"): if url.startswith("https://matrix.to/#/"):
self.logger.debug(f"Skipping matrix.to url (#98): {url}") self.logger.debug(f"Skipping matrix.to url (#98): {url}")
continue continue
try: url_blacklisted = False
title, description = self.get_content_from_url(url) for blacklisted in self.blacklist:
except Exception as e: if blacklisted in url:
self.logger.warning(f"could not fetch url: {e}") url_blacklisted = True
traceback.print_exc(file=sys.stderr) if url_blacklisted:
# failed fetching, give up self.logger.debug(f"Skipping blacklisted url {url}")
continue continue
msg = None try:
title, description = self.get_content_from_url(url)
except Exception as e:
self.logger.warning(f"could not fetch url: {e}")
traceback.print_exc(file=sys.stderr)
# failed fetching, give up
continue
if status == "TITLE" and title is not None: msg = None
msg = f"Title: {title}"
elif status == "DESCRIPTION" and description is not None:
msg = f"Description: {description}"
elif status == "BOTH" and title is not None and description is not None: if status == "TITLE" and title is not None:
msg = f"Title: {title}\nDescription: {description}" msg = f"Title: {title}"
elif status == "DESCRIPTION" and description is not None:
msg = f"Description: {description}"
elif status == "BOTH" and title is not None: elif status == "BOTH" and title is not None and description is not None:
msg = f"Title: {title}" msg = f"Title: {title}\nDescription: {description}"
elif status == "BOTH" and description is not None:
msg = f"Description: {description}"
if msg is not None: elif status == "BOTH" and title is not None:
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True) msg = f"Title: {title}"
elif status == "BOTH" and description is not None:
msg = f"Description: {description}"
if msg is not None:
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True)
except Exception as e:
self.logger.warning(f"Unexpected error in url module text_cb: {e}")
traceback.print_exc(file=sys.stderr)
@lru_cache(maxsize=128) @lru_cache(maxsize=128)
def get_content_from_url(self, url): def get_content_from_url(self, url):
@ -178,7 +187,7 @@ class MatrixModule(BotModule):
title_tag = soup.find("meta", attrs={"name": "title"}) title_tag = soup.find("meta", attrs={"name": "title"})
ogtitle = soup.find("meta", property="og:title") ogtitle = soup.find("meta", property="og:title")
if title_tag: if title_tag:
title = descr_tag.get("content", None) title = title_tag.get("content", None)
elif ogtitle: elif ogtitle:
title = ogtitle["content"] title = ogtitle["content"]
elif soup.head and soup.head.title: elif soup.head and soup.head.title:
@ -217,8 +226,9 @@ class MatrixModule(BotModule):
# show status # show status
elif len(args) == 1 and args[0] == "status": elif len(args) == 1 and args[0] == "status":
status = self.STATUSES.get(self.status.get(room.room_id, "OFF")) + f', URL blacklist: {self.blacklist}'
await bot.send_text( await bot.send_text(
room, self.STATUSES.get(self.status.get(room.room_id, "OFF")) room, status
) )
return return
@ -238,6 +248,17 @@ class MatrixModule(BotModule):
await bot.send_text(room, "Sending titles as text from now on.") await bot.send_text(room, "Sending titles as text from now on.")
return return
# set blacklist
elif len(args) == 2 and args[0] == "blacklist":
bot.must_be_owner(event)
if args[1] == 'clear':
self.blacklist = []
else:
self.blacklist = args[1].split(',')
bot.save_settings()
await bot.send_text(room, f"Blacklisted URLs set to {self.blacklist}")
return
# invalid command # invalid command
await bot.send_text( await bot.send_text(
room, room,
@ -250,6 +271,7 @@ class MatrixModule(BotModule):
data = super().get_settings() data = super().get_settings()
data["status"] = self.status data["status"] = self.status
data["type"] = self.type data["type"] = self.type
data["blacklist"] = self.blacklist
return data return data
def set_settings(self, data): def set_settings(self, data):
@ -258,6 +280,8 @@ class MatrixModule(BotModule):
self.status = data["status"] self.status = data["status"]
if data.get("type"): if data.get("type"):
self.type = data["type"] self.type = data["type"]
if data.get("blacklist"):
self.blacklist = data["blacklist"]
def help(self): def help(self):
return "If I see a url in a message I will try to get the title from the page and spit it out" return "If I see a url in a message I will try to get the title from the page and spit it out"