From 682841b7d3814b27ae211bb1dc3fed30e19ab910 Mon Sep 17 00:00:00 2001 From: Hinrich Mahler <22366557+Bibo-Joshi@users.noreply.github.com> Date: Sat, 20 Apr 2024 10:29:47 +0200 Subject: [PATCH] Automatically Delete Long Code Snippets and Ask for Pastebin --- components/callbacks.py | 48 +++++++++++++++++++++++++++++++++++++++++ rules_bot.py | 9 +++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/components/callbacks.py b/components/callbacks.py index f411cc0..9a6e13e 100644 --- a/components/callbacks.py +++ b/components/callbacks.py @@ -1,6 +1,7 @@ import asyncio import logging import random +import re import time from collections import deque from copy import deepcopy @@ -522,3 +523,50 @@ async def compat_warning(update: Update, _: ContextTypes.DEFAULT_TYPE) -> None: hint.html_markup(), reply_markup=hint.inline_keyboard, ) + + +async def long_code_handling(update: Update, _: ContextTypes.DEFAULT_TYPE) -> None: + """When someone posts a long code snippet: + Reply with the /pastebin taghint. + Because we do the regexing in here rather than in the filter, the corresponding handler + will have to be in a lower group. + """ + message = cast(Message, update.effective_message) + text = cast(str, message.text) + has_long_code = False + + # We make some educated guesses about the message's content. This is nothing more than + # a few simple heuristics, but it should catch the most common cases. + # If we have a code block longer than 15 lines, we assume it's a long code snippet + parsed_entities = message.parse_entities(types=[MessageEntity.CODE, MessageEntity.PRE]) + if any(len(text.split("\n")) >= 15 for text in parsed_entities.values()): + has_long_code = True + + # if the text contains more than 5 import lines, we assume it's a long code snippet + # regex from https://stackoverflow.com/a/44988666/10606962 + pattern = re.compile(r"(?m)^(?:from +(\S+) +)?import +(\S+)(?: +as +\S+)? *$") + if not has_long_code and len(pattern.findall(text)) >= 5: + has_long_code = True + + # if the text contains more than 3 class or function definitions, ... + pattern = re.compile(r"(class|def) [a-zA-Z]+[a-zA-Z0-9_]*\(") + if not has_long_code and len(pattern.findall(text)) >= 3: + has_long_code = True + + if not has_long_code: + return + + # Get the long_code hint + hint = TAG_HINTS["pastebin"] + + # the leading ". " is important here since html_markup() splits on whitespaces! + mention = f". {update.effective_user.mention_html()}" if update.effective_user else None + + await message.reply_text( + hint.html_markup(mention), + reply_markup=hint.inline_keyboard, + ) + await try_to_delete(message) + + # We don't want this message to be processed any further + raise ApplicationHandlerStop diff --git a/rules_bot.py b/rules_bot.py index 7cd8c43..1e24478 100644 --- a/rules_bot.py +++ b/rules_bot.py @@ -31,6 +31,7 @@ compat_warning, delete_message, leave_chat, + long_code_handling, off_on_topic, raise_app_handler_stop, regex_token_warning, @@ -137,7 +138,13 @@ def main() -> None: group=-2, ) - application.add_handler(MessageHandler(~filters.COMMAND, rate_limit_tracker), group=-1) + application.add_handler(MessageHandler(~filters.COMMAND, rate_limit_tracker), group=-2) + + # We need several different patterns, so filters.REGEX doesn't do the trick + # therefore we catch everything and do regex ourselves. In case the message contains a + # long code block, we'll raise AppHandlerStop to prevent further processing. + application.add_handler(MessageHandler(filters.TEXT, long_code_handling), group=-1) + application.add_handler( MessageHandler( filters.SenderChat.CHANNEL