Url: support blacklisting certain urls (to work around youtube suckiness)

This commit is contained in:
Ville Ranki 2021-04-08 20:47:49 +03:00
parent 2bc62c7a37
commit 35593de3b9
2 changed files with 74 additions and 46 deletions

View File

@ -260,6 +260,7 @@ Defaults to off and needs to be activated on every room you want this.
You can choose to send titles as notices (as in Matrix spec) or normal
messages (IRC users might prefer this). This is a global setting currently.
You can set a blacklist to ignore any URL that contains one of the blacklisted substrings.
Commands:
@ -270,10 +271,13 @@ Commands:
* !url off - stop spamming
* !url text - send titles as normal text (must be owner)
* !url notice - sends titles as notices (must be owner)
* !url blacklist list - set the blacklist, where list is a comma-separated list of URL substrings (must be owner)
* !url blacklist clear - clear the blacklist (must be owner)
Example:
* !url status
* !url blacklist www.youtube.com,www.somethingelse.com
NOTE: This module is disabled by default, i.e. you need to enable the module itself before activating it in a room.

View File

@ -33,7 +33,7 @@ class MatrixModule(BotModule):
"DESCRIPTION": "Spamming this channel with descriptions",
"BOTH": "Spamming this channel with both title and description",
}
self.blacklist = [ ]
self.enabled = False
def matrix_start(self, bot):
@ -45,9 +45,6 @@ class MatrixModule(BotModule):
bot.client.add_event_callback(self.text_cb, RoomMessageText)
# extend the useragent string to contain version and bot name
self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)"
# Actually no - for example Youtube doesn't serve titles for the proper Hemppa user agent!
# Lie and say we are generic Firefox. Blame Youtube..
self.useragent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
self.logger.debug(f"useragent: {self.useragent}")
@ -83,6 +80,7 @@ class MatrixModule(BotModule):
if status == "OFF":
return
try:
# extract possible urls from message
urls = re.findall(r"(https?://\S+)", event.body)
@ -99,6 +97,14 @@ class MatrixModule(BotModule):
self.logger.debug(f"Skipping matrix.to url (#98): {url}")
continue
url_blacklisted = False
for blacklisted in self.blacklist:
if blacklisted in url:
url_blacklisted = True
if url_blacklisted:
self.logger.debug(f"Skipping blacklisted url {url}")
continue
try:
title, description = self.get_content_from_url(url)
except Exception as e:
@ -124,6 +130,9 @@ class MatrixModule(BotModule):
if msg is not None:
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True)
except Exception as e:
self.logger.warning(f"Unexpected error in url module text_cb: {e}")
traceback.print_exc(file=sys.stderr)
@lru_cache(maxsize=128)
def get_content_from_url(self, url):
@ -178,7 +187,7 @@ class MatrixModule(BotModule):
title_tag = soup.find("meta", attrs={"name": "title"})
ogtitle = soup.find("meta", property="og:title")
if title_tag:
title = descr_tag.get("content", None)
title = title_tag.get("content", None)
elif ogtitle:
title = ogtitle["content"]
elif soup.head and soup.head.title:
@ -217,8 +226,9 @@ class MatrixModule(BotModule):
# show status
elif len(args) == 1 and args[0] == "status":
status = self.STATUSES.get(self.status.get(room.room_id, "OFF")) + f', URL blacklist: {self.blacklist}'
await bot.send_text(
room, self.STATUSES.get(self.status.get(room.room_id, "OFF"))
room, status
)
return
@ -238,6 +248,17 @@ class MatrixModule(BotModule):
await bot.send_text(room, "Sending titles as text from now on.")
return
# set blacklist
elif len(args) == 2 and args[0] == "blacklist":
bot.must_be_owner(event)
if args[1] == 'clear':
self.blacklist = []
else:
self.blacklist = args[1].split(',')
bot.save_settings()
await bot.send_text(room, f"Blacklisted URLs set to {self.blacklist}")
return
# invalid command
await bot.send_text(
room,
@ -250,6 +271,7 @@ class MatrixModule(BotModule):
data = super().get_settings()
data["status"] = self.status
data["type"] = self.type
data["blacklist"] = self.blacklist
return data
def set_settings(self, data):
@ -258,6 +280,8 @@ class MatrixModule(BotModule):
self.status = data["status"]
if data.get("type"):
self.type = data["type"]
if data.get("blacklist"):
self.blacklist = data["blacklist"]
# Return a short, human-readable summary of what this module does:
# it reacts to URLs seen in messages by trying to fetch and post the page title.
def help(self):
return "If I see a url in a message I will try to get the title from the page and spit it out"