Url: support blacklisting certain urls (to work around youtube suckiness)
This commit is contained in:
parent
2bc62c7a37
commit
35593de3b9
18
README.md
18
README.md
|
@ -260,20 +260,24 @@ Defaults to off and needs to be activated on every room you want this.
|
||||||
|
|
||||||
You can choose to send titles as notices (as in Matrix spec) or normal
|
You can choose to send titles as notices (as in Matrix spec) or normal
|
||||||
messages (IRC users might prefer this). This is a global setting currently.
|
messages (IRC users might prefer this). This is a global setting currently.
|
||||||
|
You can set a blacklist: any URL that contains one of the blacklisted substrings is ignored.
|
||||||
|
|
||||||
Commands:
|
Commands:
|
||||||
|
|
||||||
* !url status - show current status
|
* !url status - show current status
|
||||||
* !url title - spam titles to room
|
* !url title - spam titles to room
|
||||||
* !url description - spam descriptions
|
* !url description - spam descriptions
|
||||||
* !url both - spam both title and description
|
* !url both - spam both title and description
|
||||||
* !url off - stop spamming
|
* !url off - stop spamming
|
||||||
* !url text - send titles as normal text (must be owner)
|
* !url text - send titles as normal text (must be owner)
|
||||||
* !url notice - sends titles as notices (must be owner)
|
* !url notice - sends titles as notices (must be owner)
|
||||||
|
* !url blacklist [list] - set the blacklist to a comma-separated list of URL substrings (must be owner)
|
||||||
|
* !url blacklist clear - clear the blacklist (must be owner)
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
* !url status
|
* !url status
|
||||||
|
* !url blacklist www.youtube.com,www.somethingelse.com
|
||||||
|
|
||||||
NOTE: Disabled by default, i.e. you also need to enable it before activating it
|
NOTE: Disabled by default, i.e. you also need to enable it before activating it
|
||||||
|
|
||||||
|
|
102
modules/url.py
102
modules/url.py
|
@ -33,7 +33,7 @@ class MatrixModule(BotModule):
|
||||||
"DESCRIPTION": "Spamming this channel with descriptions",
|
"DESCRIPTION": "Spamming this channel with descriptions",
|
||||||
"BOTH": "Spamming this channel with both title and description",
|
"BOTH": "Spamming this channel with both title and description",
|
||||||
}
|
}
|
||||||
|
self.blacklist = [ ]
|
||||||
self.enabled = False
|
self.enabled = False
|
||||||
|
|
||||||
def matrix_start(self, bot):
|
def matrix_start(self, bot):
|
||||||
|
@ -45,9 +45,6 @@ class MatrixModule(BotModule):
|
||||||
bot.client.add_event_callback(self.text_cb, RoomMessageText)
|
bot.client.add_event_callback(self.text_cb, RoomMessageText)
|
||||||
# extend the useragent string to contain version and bot name
|
# extend the useragent string to contain version and bot name
|
||||||
self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)"
|
self.useragent = f"Mozilla/5.0 (compatible; Hemppa/{self.bot.version}; {self.bot.client.user}; +https://github.com/vranki/hemppa/)"
|
||||||
# Actually no - for example Youtube doesn't serve titles for proper Hemppa user agent!
|
|
||||||
# Lie and say we are generic Firefox. Blame Youtube..
|
|
||||||
self.useragent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
|
||||||
self.logger.debug(f"useragent: {self.useragent}")
|
self.logger.debug(f"useragent: {self.useragent}")
|
||||||
|
|
||||||
|
|
||||||
|
@ -83,47 +80,59 @@ class MatrixModule(BotModule):
|
||||||
if status == "OFF":
|
if status == "OFF":
|
||||||
return
|
return
|
||||||
|
|
||||||
# extract possible urls from message
|
try:
|
||||||
urls = re.findall(r"(https?://\S+)", event.body)
|
# extract possible urls from message
|
||||||
|
urls = re.findall(r"(https?://\S+)", event.body)
|
||||||
|
|
||||||
# no urls, nothing to do
|
# no urls, nothing to do
|
||||||
if len(urls) == 0:
|
if len(urls) == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
# fetch the urls and if we can see a title spit it out
|
# fetch the urls and if we can see a title spit it out
|
||||||
for url in urls:
|
for url in urls:
|
||||||
# fix for #98 a bit ugly, but skip all matrix.to urls
|
# fix for #98 a bit ugly, but skip all matrix.to urls
|
||||||
# those are 99.99% pills and should not
|
# those are 99.99% pills and should not
|
||||||
# spam the channel with matrix.to titles
|
# spam the channel with matrix.to titles
|
||||||
if url.startswith("https://matrix.to/#/"):
|
if url.startswith("https://matrix.to/#/"):
|
||||||
self.logger.debug(f"Skipping matrix.to url (#98): {url}")
|
self.logger.debug(f"Skipping matrix.to url (#98): {url}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
url_blacklisted = False
|
||||||
title, description = self.get_content_from_url(url)
|
for blacklisted in self.blacklist:
|
||||||
except Exception as e:
|
if blacklisted in url:
|
||||||
self.logger.warning(f"could not fetch url: {e}")
|
url_blacklisted = True
|
||||||
traceback.print_exc(file=sys.stderr)
|
if url_blacklisted:
|
||||||
# failed fetching, give up
|
self.logger.debug(f"Skipping blacklisted url {url}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
msg = None
|
try:
|
||||||
|
title, description = self.get_content_from_url(url)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"could not fetch url: {e}")
|
||||||
|
traceback.print_exc(file=sys.stderr)
|
||||||
|
# failed fetching, give up
|
||||||
|
continue
|
||||||
|
|
||||||
if status == "TITLE" and title is not None:
|
msg = None
|
||||||
msg = f"Title: {title}"
|
|
||||||
elif status == "DESCRIPTION" and description is not None:
|
|
||||||
msg = f"Description: {description}"
|
|
||||||
|
|
||||||
elif status == "BOTH" and title is not None and description is not None:
|
if status == "TITLE" and title is not None:
|
||||||
msg = f"Title: {title}\nDescription: {description}"
|
msg = f"Title: {title}"
|
||||||
|
elif status == "DESCRIPTION" and description is not None:
|
||||||
|
msg = f"Description: {description}"
|
||||||
|
|
||||||
elif status == "BOTH" and title is not None:
|
elif status == "BOTH" and title is not None and description is not None:
|
||||||
msg = f"Title: {title}"
|
msg = f"Title: {title}\nDescription: {description}"
|
||||||
elif status == "BOTH" and description is not None:
|
|
||||||
msg = f"Description: {description}"
|
|
||||||
|
|
||||||
if msg is not None:
|
elif status == "BOTH" and title is not None:
|
||||||
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True)
|
msg = f"Title: {title}"
|
||||||
|
elif status == "BOTH" and description is not None:
|
||||||
|
msg = f"Description: {description}"
|
||||||
|
|
||||||
|
if msg is not None:
|
||||||
|
await self.bot.send_text(room, msg, msgtype=self.type, bot_ignore=True)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Unexpected error in url module text_cb: {e}")
|
||||||
|
traceback.print_exc(file=sys.stderr)
|
||||||
|
|
||||||
@lru_cache(maxsize=128)
|
@lru_cache(maxsize=128)
|
||||||
def get_content_from_url(self, url):
|
def get_content_from_url(self, url):
|
||||||
|
@ -178,7 +187,7 @@ class MatrixModule(BotModule):
|
||||||
title_tag = soup.find("meta", attrs={"name": "title"})
|
title_tag = soup.find("meta", attrs={"name": "title"})
|
||||||
ogtitle = soup.find("meta", property="og:title")
|
ogtitle = soup.find("meta", property="og:title")
|
||||||
if title_tag:
|
if title_tag:
|
||||||
title = descr_tag.get("content", None)
|
title = title_tag.get("content", None)
|
||||||
elif ogtitle:
|
elif ogtitle:
|
||||||
title = ogtitle["content"]
|
title = ogtitle["content"]
|
||||||
elif soup.head and soup.head.title:
|
elif soup.head and soup.head.title:
|
||||||
|
@ -217,8 +226,9 @@ class MatrixModule(BotModule):
|
||||||
|
|
||||||
# show status
|
# show status
|
||||||
elif len(args) == 1 and args[0] == "status":
|
elif len(args) == 1 and args[0] == "status":
|
||||||
|
status = self.STATUSES.get(self.status.get(room.room_id, "OFF")) + f', URL blacklist: {self.blacklist}'
|
||||||
await bot.send_text(
|
await bot.send_text(
|
||||||
room, self.STATUSES.get(self.status.get(room.room_id, "OFF"))
|
room, status
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -238,6 +248,17 @@ class MatrixModule(BotModule):
|
||||||
await bot.send_text(room, "Sending titles as text from now on.")
|
await bot.send_text(room, "Sending titles as text from now on.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# set blacklist
|
||||||
|
elif len(args) == 2 and args[0] == "blacklist":
|
||||||
|
bot.must_be_owner(event)
|
||||||
|
if args[1] == 'clear':
|
||||||
|
self.blacklist = []
|
||||||
|
else:
|
||||||
|
self.blacklist = args[1].split(',')
|
||||||
|
bot.save_settings()
|
||||||
|
await bot.send_text(room, f"Blacklisted URLs set to {self.blacklist}")
|
||||||
|
return
|
||||||
|
|
||||||
# invalid command
|
# invalid command
|
||||||
await bot.send_text(
|
await bot.send_text(
|
||||||
room,
|
room,
|
||||||
|
@ -250,6 +271,7 @@ class MatrixModule(BotModule):
|
||||||
data = super().get_settings()
|
data = super().get_settings()
|
||||||
data["status"] = self.status
|
data["status"] = self.status
|
||||||
data["type"] = self.type
|
data["type"] = self.type
|
||||||
|
data["blacklist"] = self.blacklist
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def set_settings(self, data):
|
def set_settings(self, data):
|
||||||
|
@ -258,6 +280,8 @@ class MatrixModule(BotModule):
|
||||||
self.status = data["status"]
|
self.status = data["status"]
|
||||||
if data.get("type"):
|
if data.get("type"):
|
||||||
self.type = data["type"]
|
self.type = data["type"]
|
||||||
|
if data.get("blacklist"):
|
||||||
|
self.blacklist = data["blacklist"]
|
||||||
|
|
||||||
def help(self):
|
def help(self):
|
||||||
return "If I see a url in a message I will try to get the title from the page and spit it out"
|
return "If I see a url in a message I will try to get the title from the page and spit it out"
|
||||||
|
|
Loading…
Reference in New Issue