From 3df9c52512aca95fefcac39a77bc16b4f612fb22 Mon Sep 17 00:00:00 2001 From: Tatu Wikman Date: Wed, 29 Jan 2020 11:43:23 +0200 Subject: [PATCH] Use Beautiful Soup for content parsing This will help with pages that have broken HTML. Closes #29 --- modules/url.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/modules/url.py b/modules/url.py index 65e2b82..6b8f5a1 100644 --- a/modules/url.py +++ b/modules/url.py @@ -3,7 +3,7 @@ import shlex from functools import lru_cache import httpx -from lxml.html.soupparser import fromstring +from bs4 import BeautifulSoup from nio import RoomMessageText @@ -90,32 +90,24 @@ class MatrixModule: try: r = httpx.get(url) except Exception as e: - # if it failed then it failed, no point in trying anything fancy - # this is just a title spitting bot :) + print(f"Failed fetching url {url}. Error: {e}") return (title, description) if r.status_code != 200: + print(f"Failed fetching url {url}. Status code: {r.status_code}") return (title, description) # try parse and get the title try: - titleelem = fromstring(r.text).find(".//head/title") - descriptionelem = fromstring(r.text).find( - './/head/meta[@name="description"]' - ) - except Exception: - # again, no point in trying anything else + soup = BeautifulSoup(r.text, "html.parser") + title = soup.title.string + descr_tag = soup.find("meta", attrs={"name": "description"}) + if descr_tag: + description = descr_tag.get("content", None) + except Exception as e: + print(f"Failed parsing response from url {url}. Error: {e}") return (title, description) - try: - if titleelem is not None: - title = titleelem.text - if descriptionelem is not None: - description = descriptionelem.attrib.get("content") - except Exception: - # if it fails it fails - pass - return (title, description) async def matrix_message(self, bot, room, event):