Merge pull request #30 from tswfi/url_use_BeautifulSoup_for_parsing

Use Beautiful Soup for content parsing
This commit is contained in:
Ville Ranki 2020-01-29 11:49:10 +02:00 committed by GitHub
commit 760100bf51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 10 additions and 18 deletions

View File

@@ -3,7 +3,7 @@ import shlex
from functools import lru_cache from functools import lru_cache
import httpx import httpx
from lxml.html.soupparser import fromstring from bs4 import BeautifulSoup
from nio import RoomMessageText from nio import RoomMessageText
@@ -90,32 +90,24 @@ class MatrixModule:
try: try:
r = httpx.get(url) r = httpx.get(url)
except Exception as e: except Exception as e:
# if it failed then it failed, no point in trying anything fancy print(f"Failed fetching url {url}. Error: {e}")
# this is just a title spitting bot :)
return (title, description) return (title, description)
if r.status_code != 200: if r.status_code != 200:
print(f"Failed fetching url {url}. Status code: {r.status_code}")
return (title, description) return (title, description)
# try parse and get the title # try parse and get the title
try: try:
titleelem = fromstring(r.text).find(".//head/title") soup = BeautifulSoup(r.text, "html.parser")
descriptionelem = fromstring(r.text).find( title = soup.title.string
'.//head/meta[@name="description"]' descr_tag = soup.find("meta", attrs={"name": "description"})
) if descr_tag:
except Exception: description = descr_tag.get("content", None)
# again, no point in trying anything else except Exception as e:
print(f"Failed parsing response from url {url}. Error: {e}")
return (title, description) return (title, description)
try:
if titleelem is not None:
title = titleelem.text
if descriptionelem is not None:
description = descriptionelem.attrib.get("content")
except Exception:
# if it fails it fails
pass
return (title, description) return (title, description)
async def matrix_message(self, bot, room, event): async def matrix_message(self, bot, room, event):