Url: support blacklisting certain urls (to work around youtube suckiness)

This commit is contained in:
Ville Ranki 2021-04-08 20:47:49 +03:00
parent 2bc62c7a37
commit 35593de3b9
2 changed files with 74 additions and 46 deletions

View File

@ -260,6 +260,7 @@ Defaults to off and needs to be activated on every room you want this.
You can choose to send titles as notices (as in Matrix spec) or normal You can choose to send titles as notices (as in Matrix spec) or normal
messages (IRC users might prefer this). This is a global setting currently. messages (IRC users might prefer this). This is a global setting currently.
You can set a blacklist to ignore URLs containing words from the blacklist.
Commands: Commands:
@ -270,10 +271,13 @@ Commands:
* !url off - stop spamming * !url off - stop spamming
* !url text - send titles as normal text (must be owner) * !url text - send titles as normal text (must be owner)
* !url notice - sends titles as notices (must be owner) * !url notice - sends titles as notices (must be owner)
* !url blacklist list - set blacklist to a comma separated list of url substrings (must be owner)
* !url blacklist clear - clear blacklist
Example: Example:
* !url status * !url status
* !url blacklist www.youtube.com,www.somethingelse.com
NOTE: Disabled by default, i.e. you also need to enable it before activating it NOTE: Disabled by default, i.e. you also need to enable it before activating it

View File

@ -33,7 +33,7 @@ class MatrixModule(BotModule):
"DESCRIPTION": "Spamming this channel with descriptions", "DESCRIPTION": "Spamming this channel with descriptions",
"BOTH": "Spamming this channel with both title and description", "BOTH": "Spamming this channel with both title and description",
} }
self.blacklist = [ ]
self.enabled = False self.enabled = False
def matrix_start(self, bot): def matrix_start(self, bot):
@ -45,9 +45,6 @@ class MatrixModule(BotModule):
bot.client.add_event_callback(self.text_cb, RoomMessageText) bot.client.add_event_callback(self.text_cb, RoomMessageText)
# extend the useragent string to contain version and bot name # extend the useragent string to contain version and bot name
self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)" self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)"
# Actually no - for example Youtube doesn't serve titles for proper Hemppa user agent!
# Lie and say we are generic Firefox. Blame Youtube..
self.useragent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
self.logger.debug(f"useragent: {self.useragent}") self.logger.debug(f"useragent: {self.useragent}")
@ -83,6 +80,7 @@ class MatrixModule(BotModule):
if status == "OFF": if status == "OFF":
return return
try:
# extract possible urls from message # extract possible urls from message
urls = re.findall(r"(https?://\S+)", event.body) urls = re.findall(r"(https?://\S+)", event.body)
@ -99,6 +97,14 @@ class MatrixModule(BotModule):
self.logger.debug(f"Skipping matrix.to url (#98): {url}") self.logger.debug(f"Skipping matrix.to url (#98): {url}")
continue continue
url_blacklisted = False
for blacklisted in self.blacklist:
if blacklisted in url:
url_blacklisted = True
if url_blacklisted:
self.logger.debug(f"Skipping blacklisted url {url}")
continue
try: try:
title, description = self.get_content_from_url(url) title, description = self.get_content_from_url(url)
except Exception as e: except Exception as e:
@ -124,6 +130,9 @@ class MatrixModule(BotModule):
if msg is not None: if msg is not None:
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True) await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True)
except Exception as e:
self.logger.warning(f"Unexpected error in url module text_cb: {e}")
traceback.print_exc(file=sys.stderr)
@lru_cache(maxsize=128) @lru_cache(maxsize=128)
def get_content_from_url(self, url): def get_content_from_url(self, url):
@ -178,7 +187,7 @@ class MatrixModule(BotModule):
title_tag = soup.find("meta", attrs={"name": "title"}) title_tag = soup.find("meta", attrs={"name": "title"})
ogtitle = soup.find("meta", property="og:title") ogtitle = soup.find("meta", property="og:title")
if title_tag: if title_tag:
title = descr_tag.get("content", None) title = title_tag.get("content", None)
elif ogtitle: elif ogtitle:
title = ogtitle["content"] title = ogtitle["content"]
elif soup.head and soup.head.title: elif soup.head and soup.head.title:
@ -217,8 +226,9 @@ class MatrixModule(BotModule):
# show status # show status
elif len(args) == 1 and args[0] == "status": elif len(args) == 1 and args[0] == "status":
status = self.STATUSES.get(self.status.get(room.room_id, "OFF")) + f', URL blacklist: {self.blacklist}'
await bot.send_text( await bot.send_text(
room, self.STATUSES.get(self.status.get(room.room_id, "OFF")) room, status
) )
return return
@ -238,6 +248,17 @@ class MatrixModule(BotModule):
await bot.send_text(room, "Sending titles as text from now on.") await bot.send_text(room, "Sending titles as text from now on.")
return return
# set blacklist
elif len(args) == 2 and args[0] == "blacklist":
bot.must_be_owner(event)
if args[1] == 'clear':
self.blacklist = []
else:
self.blacklist = args[1].split(',')
bot.save_settings()
await bot.send_text(room, f"Blacklisted URLs set to {self.blacklist}")
return
# invalid command # invalid command
await bot.send_text( await bot.send_text(
room, room,
@ -250,6 +271,7 @@ class MatrixModule(BotModule):
data = super().get_settings() data = super().get_settings()
data["status"] = self.status data["status"] = self.status
data["type"] = self.type data["type"] = self.type
data["blacklist"] = self.blacklist
return data return data
def set_settings(self, data): def set_settings(self, data):
@ -258,6 +280,8 @@ class MatrixModule(BotModule):
self.status = data["status"] self.status = data["status"]
if data.get("type"): if data.get("type"):
self.type = data["type"] self.type = data["type"]
if data.get("blacklist"):
self.blacklist = data["blacklist"]
def help(self): def help(self):
return "If I see a url in a message I will try to get the title from the page and spit it out" return "If I see a url in a message I will try to get the title from the page and spit it out"