v0.75055 - html sanitizer fixes; beautifulsoup4

FlyingFathead · Oct 21, 2024 · 8c5d8d8 · 8c5d8d8
1 parent c3c6379
commit 8c5d8d8
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -236,6 +236,7 @@ If you run into any issues, consult the logs or reach out on the repository's [I
 ---
 
 # Changelog
+- v0.75055 - fixes to the HTML sanitizer (for Telegram's API; better handling of malformed HTML); parsing is now done with BeautifulSoup4
 - v0.75054 - small fixes and more error catching in `calc_module.py`
 - v0.75053 - only include eligible territories in U.S. NWS queries
   - list of queried / eligible territories can be set in `config.ini` under the `NWS` section

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4>=4.12.3
 configparser>=6.0.0
 elastic-transport>=8.15.0
 elasticsearch>=8.15.1

diff --git a/src/main.py b/src/main.py
@@ -8,7 +8,7 @@
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # version of this program
-version_number = "0.75054"
+version_number = "0.75055"
 
 # Add the project root directory to Python's path
 import sys

diff --git a/src/text_message_handler.py b/src/text_message_handler.py
@@ -19,6 +19,7 @@
 from utils import holiday_replacements
 import holidays
 import pytz
+from bs4 import BeautifulSoup
 
 from telegram import Update
 from telegram.ext import CallbackContext
@@ -607,13 +608,14 @@ async def handle_message(bot, update: Update, context: CallbackContext, logger)
                         # Ensure the bot has a substantive response to send
                         if bot_reply:
                             # Function to clean unsupported tags
-                            def sanitize_html(content):
-                                # Remove unsupported HTML tags
-                                for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
-                                    content = content.replace(tag, '')
-                                # Optionally: Replace line breaks with "\n" to preserve formatting
-                                content = content.replace('<br>', '\n').replace('<br/>', '\n')
-                                return content
+                            # # // old method
+                            # def sanitize_html(content):
+                            #     # Remove unsupported HTML tags
+                            #     for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
+                            #         content = content.replace(tag, '')
+                            #     # Optionally: Replace line breaks with "\n" to preserve formatting
+                            #     content = content.replace('<br>', '\n').replace('<br/>', '\n')
+                            #     return content
 
                             # Convert markdown to HTML
                             escaped_reply = markdown_to_html(bot_reply)
@@ -679,13 +681,13 @@ def sanitize_html(content):
                         # Ensure the bot has a substantive response to send
                         if bot_reply:
                             # Function to clean unsupported tags
-                            def sanitize_html(content):
-                                # Remove unsupported HTML tags
-                                for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
-                                    content = content.replace(tag, '')
-                                # Optionally: Replace line breaks with "\n" to preserve formatting
-                                content = content.replace('<br>', '\n').replace('<br/>', '\n')
-                                return content
+                            # def sanitize_html(content):
+                            #     # Remove unsupported HTML tags
+                            #     for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
+                            #         content = content.replace(tag, '')
+                            #     # Optionally: Replace line breaks with "\n" to preserve formatting
+                            #     content = content.replace('<br>', '\n').replace('<br/>', '\n')
+                            #     return content
 
                             # Convert markdown to HTML
                             escaped_reply = markdown_to_html(bot_reply)
@@ -1019,6 +1021,8 @@ def sanitize_html(content):
                 #     parse_mode=ParseMode.HTML
                 # )
 
+                escaped_reply = sanitize_html(escaped_reply)
+
                 message_parts = split_message(escaped_reply)
 
                 for part in message_parts:
@@ -1280,6 +1284,19 @@ def split_message(message, max_length=4000):
 
     return message_parts
 
+# Sanitize HTML: keep only an allow-list of tags, unwrap everything else
+def sanitize_html(content):
+    soup = BeautifulSoup(content, 'html.parser')
+
+    # Unwrap every tag whose name is not in the allow-list below: the tag itself is dropped, its children (text included) stay in place
+    for tag in soup.find_all():
+        if tag.name not in ['b', 'i', 'u', 's', 'a', 'code', 'pre']:
+            tag.unwrap()  # NOTE(review): <br> is not allow-listed, so it is removed without inserting '\n' -- confirm this is intended
+
+    # Serialize the tree back to a string; no explicit re-nesting is done here -- any repair of malformed markup is presumably left to the parser (TODO confirm)
+    content = str(soup)
+    return content
+
 # # // (old request type)
 # async def make_api_request(bot, chat_history, timeout=30):
 #     # Prepare the payload for the API request with updated chat history