Skip to content

Commit

Permalink
v0.75055 - html sanitizer fixes; beautifulsoup4
Browse files Browse the repository at this point in the history
  • Loading branch information
FlyingFathead committed Oct 21, 2024
1 parent c3c6379 commit 8c5d8d8
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 15 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ If you run into any issues, consult the logs or reach out on the repository's [I
---

# Changelog
- v0.75055 - fixes to the HTML sanitizer (for Telegram's API; better handling of malformed HTML); now uses BeautifulSoup4 for parsing
- v0.75054 - small fixes and more error catching in `calc_module.py`
- v0.75053 - only include eligible territories in U.S. NWS queries
- list of queried / eligible territories can be set in `config.ini` under the `NWS` section
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
beautifulsoup4>=4.12.3
configparser>=6.0.0
elastic-transport>=8.15.0
elasticsearch>=8.15.1
Expand Down
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# version of this program
version_number = "0.75054"
version_number = "0.75055"

# Add the project root directory to Python's path
import sys
Expand Down
45 changes: 31 additions & 14 deletions src/text_message_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from utils import holiday_replacements
import holidays
import pytz
from bs4 import BeautifulSoup

from telegram import Update
from telegram.ext import CallbackContext
Expand Down Expand Up @@ -607,13 +608,14 @@ async def handle_message(bot, update: Update, context: CallbackContext, logger)
# Ensure the bot has a substantive response to send
if bot_reply:
# Function to clean unsupported tags
def sanitize_html(content):
# Remove unsupported HTML tags
for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
content = content.replace(tag, '')
# Optionally: Replace line breaks with "\n" to preserve formatting
content = content.replace('<br>', '\n').replace('<br/>', '\n')
return content
# # // old method
# def sanitize_html(content):
# # Remove unsupported HTML tags
# for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
# content = content.replace(tag, '')
# # Optionally: Replace line breaks with "\n" to preserve formatting
# content = content.replace('<br>', '\n').replace('<br/>', '\n')
# return content

# Convert markdown to HTML
escaped_reply = markdown_to_html(bot_reply)
Expand Down Expand Up @@ -679,13 +681,13 @@ def sanitize_html(content):
# Ensure the bot has a substantive response to send
if bot_reply:
# Function to clean unsupported tags
def sanitize_html(content):
# Remove unsupported HTML tags
for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
content = content.replace(tag, '')
# Optionally: Replace line breaks with "\n" to preserve formatting
content = content.replace('<br>', '\n').replace('<br/>', '\n')
return content
# def sanitize_html(content):
# # Remove unsupported HTML tags
# for tag in ['<pre>', '</pre>', '<br>', '<br/>', '</br>', '<div>', '</div>', '<span>', '</span>', '<p>', '</p>']:
# content = content.replace(tag, '')
# # Optionally: Replace line breaks with "\n" to preserve formatting
# content = content.replace('<br>', '\n').replace('<br/>', '\n')
# return content

# Convert markdown to HTML
escaped_reply = markdown_to_html(bot_reply)
Expand Down Expand Up @@ -1019,6 +1021,8 @@ def sanitize_html(content):
# parse_mode=ParseMode.HTML
# )

escaped_reply = sanitize_html(escaped_reply)

message_parts = split_message(escaped_reply)

for part in message_parts:
Expand Down Expand Up @@ -1280,6 +1284,19 @@ def split_message(message, max_length=4000):

return message_parts

# sanitize html
def sanitize_html(content):
    """Reduce *content* to the HTML subset usable with Telegram's HTML parse mode.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    then re-serialized, so the returned string is built from a parsed tree
    (every kept element is written out with a matching closing tag) rather
    than from raw string replacement.

    Handling per element:

    * tags in the allow-list below are kept together with their attributes;
    * ``<span>`` is kept only when it carries the ``tg-spoiler`` class;
    * ``<br>`` is replaced by a newline character so line breaks survive;
    * every other tag is unwrapped, i.e. the tag itself is dropped while
      its children/text stay in place.

    Args:
        content: HTML string (possibly malformed) to clean up.

    Returns:
        The sanitized HTML as a string.
    """
    # Allow-list taken from the "HTML style" section of the Telegram Bot API
    # formatting options. NOTE(review): re-check against the Bot API docs
    # when Telegram adds or removes supported entities.
    allowed_tags = frozenset((
        'b', 'strong',
        'i', 'em',
        'u', 'ins',
        's', 'strike', 'del',
        'a', 'code', 'pre',
        'blockquote',
        'tg-spoiler', 'tg-emoji',
    ))

    soup = BeautifulSoup(content, 'html.parser')

    # find_all(True) returns a list snapshot of every tag, so the tree can
    # be modified safely while looping over it.
    for tag in soup.find_all(True):
        name = tag.name
        if name in allowed_tags:
            continue

        if name == 'br':
            # Telegram has no <br>; keep the visual line break as text.
            tag.replace_with('\n')
            continue

        # <span class="tg-spoiler"> is the only accepted form of <span>.
        if name == 'span' and 'tg-spoiler' in (tag.get('class') or []):
            continue

        # Unsupported tag: drop the markup but keep whatever it contained.
        tag.unwrap()

    return str(soup)

# # // (old request type)
# async def make_api_request(bot, chat_history, timeout=30):
# # Prepare the payload for the API request with updated chat history
Expand Down

0 comments on commit 8c5d8d8

Please sign in to comment.