Merge pull request #30 from tswfi/url_use_BeautifulSoup_for_parsing
Use Beautiful Soup for content parsing
This commit is contained in:
commit
760100bf51
|
@@ -3,7 +3,7 @@ import shlex
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from lxml.html.soupparser import fromstring
|
from bs4 import BeautifulSoup
|
||||||
from nio import RoomMessageText
|
from nio import RoomMessageText
|
||||||
|
|
||||||
|
|
||||||
|
@@ -90,32 +90,24 @@ class MatrixModule:
|
||||||
try:
|
try:
|
||||||
r = httpx.get(url)
|
r = httpx.get(url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# if it failed then it failed, no point in trying anything fancy
|
print(f"Failed fetching url {url}. Error: {e}")
|
||||||
# this is just a title spitting bot :)
|
|
||||||
return (title, description)
|
return (title, description)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
|
print(f"Failed fetching url {url}. Status code: {r.status_code}")
|
||||||
return (title, description)
|
return (title, description)
|
||||||
|
|
||||||
# try parse and get the title
|
# try parse and get the title
|
||||||
try:
|
try:
|
||||||
titleelem = fromstring(r.text).find(".//head/title")
|
soup = BeautifulSoup(r.text, "html.parser")
|
||||||
descriptionelem = fromstring(r.text).find(
|
title = soup.title.string
|
||||||
'.//head/meta[@name="description"]'
|
descr_tag = soup.find("meta", attrs={"name": "description"})
|
||||||
)
|
if descr_tag:
|
||||||
except Exception:
|
description = descr_tag.get("content", None)
|
||||||
# again, no point in trying anything else
|
except Exception as e:
|
||||||
|
print(f"Failed parsing response from url {url}. Error: {e}")
|
||||||
return (title, description)
|
return (title, description)
|
||||||
|
|
||||||
try:
|
|
||||||
if titleelem is not None:
|
|
||||||
title = titleelem.text
|
|
||||||
if descriptionelem is not None:
|
|
||||||
description = descriptionelem.attrib.get("content")
|
|
||||||
except Exception:
|
|
||||||
# if it fails it fails
|
|
||||||
pass
|
|
||||||
|
|
||||||
return (title, description)
|
return (title, description)
|
||||||
|
|
||||||
async def matrix_message(self, bot, room, event):
|
async def matrix_message(self, bot, room, event):
|
||||||
|
|
Loading…
Reference in New Issue