Merge pull request #30 from tswfi/url_use_BeautifulSoup_for_parsing

Use Beautiful Soup for content parsing
This commit is contained in:
Ville Ranki 2020-01-29 11:49:10 +02:00 committed by GitHub
commit 760100bf51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 10 additions and 18 deletions

View File

@ -3,7 +3,7 @@ import shlex
from functools import lru_cache
import httpx
from lxml.html.soupparser import fromstring
from bs4 import BeautifulSoup
from nio import RoomMessageText
@ -90,32 +90,24 @@ class MatrixModule:
try:
r = httpx.get(url)
except Exception as e:
# if it failed then it failed, no point in trying anything fancy
# this is just a title spitting bot :)
print(f"Failed fetching url {url}. Error: {e}")
return (title, description)
if r.status_code != 200:
print(f"Failed fetching url {url}. Status code: {r.status_code}")
return (title, description)
# try parse and get the title
try:
titleelem = fromstring(r.text).find(".//head/title")
descriptionelem = fromstring(r.text).find(
'.//head/meta[@name="description"]'
)
except Exception:
# again, no point in trying anything else
soup = BeautifulSoup(r.text, "html.parser")
title = soup.title.string
descr_tag = soup.find("meta", attrs={"name": "description"})
if descr_tag:
description = descr_tag.get("content", None)
except Exception as e:
print(f"Failed parsing response from url {url}. Error: {e}")
return (title, description)
try:
if titleelem is not None:
title = titleelem.text
if descriptionelem is not None:
description = descriptionelem.attrib.get("content")
except Exception:
# if it fails it fails
pass
return (title, description)
async def matrix_message(self, bot, room, event):