From 682841b7d3814b27ae211bb1dc3fed30e19ab910 Mon Sep 17 00:00:00 2001
From: Hinrich Mahler <22366557+Bibo-Joshi@users.noreply.github.com>
Date: Sat, 20 Apr 2024 10:29:47 +0200
Subject: [PATCH] Automatically Delete Long Code Snippets and Ask for Pastebin

---
 components/callbacks.py | 48 +++++++++++++++++++++++++++++++++++++++++
 rules_bot.py            |  9 +++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/components/callbacks.py b/components/callbacks.py
index f411cc0..9a6e13e 100644
--- a/components/callbacks.py
+++ b/components/callbacks.py
@@ -1,6 +1,7 @@
 import asyncio
 import logging
 import random
+import re
 import time
 from collections import deque
 from copy import deepcopy
@@ -522,3 +523,50 @@ async def compat_warning(update: Update, _: ContextTypes.DEFAULT_TYPE) -> None:
         hint.html_markup(),
         reply_markup=hint.inline_keyboard,
     )
+
+
+async def long_code_handling(update: Update, _: ContextTypes.DEFAULT_TYPE) -> None:
+    """When someone posts a long code snippet:
+    Reply with the /pastebin taghint, try to delete the offending message and stop any
+    further processing of the update by raising ``ApplicationHandlerStop``.
+    Because we do the regexing in here rather than in the filter, the corresponding handler
+    will have to be in a lower group.
+    """
+    message = cast(Message, update.effective_message)
+    text = cast(str, message.text)
+
+    # We make some educated guesses about the message's content. This is nothing more than
+    # a few simple heuristics, but it should catch the most common cases.
+    # If we have a code entity spanning at least 15 lines, we assume it's a long code snippet
+    parsed_entities = message.parse_entities(types=[MessageEntity.CODE, MessageEntity.PRE])
+    has_long_code = any(len(code.split("\n")) >= 15 for code in parsed_entities.values())
+
+    # if the text contains at least 5 import lines, we assume it's a long code snippet
+    # regex from https://stackoverflow.com/a/44988666/10606962
+    pattern = re.compile(r"(?m)^(?:from +(\S+) +)?import +(\S+)(?: +as +\S+)? *$")
+    if not has_long_code and len(pattern.findall(text)) >= 5:
+        has_long_code = True
+
+    # if the text contains at least 3 class or function definitions, ...
+    # Names may start with an underscore, so that e.g. `def __init__(` is counted as well
+    pattern = re.compile(r"(class|def) [a-zA-Z_][a-zA-Z0-9_]*\(")
+    if not has_long_code and len(pattern.findall(text)) >= 3:
+        has_long_code = True
+
+    if not has_long_code:
+        return
+
+    # Get the pastebin hint
+    hint = TAG_HINTS["pastebin"]
+
+    # the leading ". " is important here since html_markup() splits on whitespaces!
+    mention = f". {update.effective_user.mention_html()}" if update.effective_user else None
+
+    await message.reply_text(
+        hint.html_markup(mention),
+        reply_markup=hint.inline_keyboard,
+    )
+    await try_to_delete(message)
+
+    # We don't want this message to be processed any further
+    raise ApplicationHandlerStop
diff --git a/rules_bot.py b/rules_bot.py
index 7cd8c43..1e24478 100644
--- a/rules_bot.py
+++ b/rules_bot.py
@@ -31,6 +31,7 @@
     compat_warning,
     delete_message,
     leave_chat,
+    long_code_handling,
     off_on_topic,
     raise_app_handler_stop,
     regex_token_warning,
@@ -137,7 +138,13 @@ def main() -> None:
         group=-2,
     )
 
-    application.add_handler(MessageHandler(~filters.COMMAND, rate_limit_tracker), group=-1)
+    application.add_handler(MessageHandler(~filters.COMMAND, rate_limit_tracker), group=-2)
+
+    # We need several different patterns, so filters.REGEX doesn't do the trick.
+    # Therefore we catch all text messages and do the regex matching ourselves. In case the
+    # message contains long code, we'll raise ApplicationHandlerStop to stop further processing.
+    application.add_handler(MessageHandler(filters.TEXT, long_code_handling), group=-1)
+
     application.add_handler(
         MessageHandler(
             filters.SenderChat.CHANNEL