From b818353885a98897c872be4dd4f2600239862974 Mon Sep 17 00:00:00 2001
From: Tatu Wikman <tatu.wikman@gmail.com>
Date: Sat, 10 Oct 2020 21:42:53 +0300
Subject: [PATCH] url: fix reading streaming responses forever, fixes #79

---
 modules/url.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/modules/url.py b/modules/url.py
index d69fced..980d64b 100644
--- a/modules/url.py
+++ b/modules/url.py
@@ -4,7 +4,7 @@ from functools import lru_cache
 
 import httpx
 from bs4 import BeautifulSoup
-from nio import RoomMessageText, AsyncClient
+from nio import RoomMessageText
 
 from modules.common.module import BotModule
 
@@ -56,7 +56,7 @@ class MatrixModule(BotModule):
         # skip edited content to prevent spamming the same thing multiple times
         if "content" in event.source:
             if "m.new_content" in event.source["content"]:
-                self.logger.debug(f"Skipping edited event to prevent spam")
+                self.logger.debug("Skipping edited event to prevent spam")
                 return
 
         # are we on in this room?
@@ -114,9 +114,29 @@ class MatrixModule(BotModule):
         """
         title = None
         description = None
+        # timeout will still handle network timeouts
         timeout = httpx.Timeout(10.0, connect=2.0, read=5.0)
+        responsetext = ""  # read our response here
         try:
-            r = httpx.get(url, timeout=timeout)
+            self.logger.debug(f"start streaming {url}")
+            # stream the response so that we can set an upper limit on how much we want to fetch.
+            # as we are streaming, r.text won't be available, so we save the data we read ourselves
+
+            # maximum number of characters to read from the response (this prevents us from reading a stream forever)
+            maxsize = 100000
+            with httpx.stream("GET", url, timeout=timeout) as r:
+                for part in r.iter_text():
+                    self.logger.debug(
+                        f"reading response stream, limiting in {maxsize} bytes"
+                    )
+
+                    responsetext += part
+                    maxsize -= len(part)
+
+                    if maxsize < 0:
+                        break
+
+            self.logger.debug(f"end streaming {url}")
         except Exception as e:
             self.logger.warning(f"Failed fetching url {url}. Error: {e}")
             return (title, description)
@@ -129,7 +149,7 @@ class MatrixModule(BotModule):
 
         # try parse and get the title
         try:
-            soup = BeautifulSoup(r.text, "html.parser")
+            soup = BeautifulSoup(responsetext, "html.parser")
             # Prefer og:title first (for example Youtube uses this)
             ogtitle = soup.find("meta", property="og:title")
             if ogtitle: