fix(feedback): enforce a max message size from all sources #79326

Open: wants to merge 12 commits into master
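A minimal, hypothetical condensation of the change below: the helper name enforce_feedback_max_size is illustrative only, but the option name, its default, and the truncate/skip-spam-check behavior are taken from the diff.

# Hypothetical condensation of this PR's behavior; not code from the diff itself.
from sentry import options

def enforce_feedback_max_size(message: str) -> tuple[str, bool]:
    """Return the (possibly truncated) message and whether it exceeded the limit."""
    max_size = options.get("feedback.message.max-size")  # registered in this PR with default=4096
    too_large = len(message) > max_size
    # In create_feedback_issue, oversized messages skip the LLM spam check, are marked
    # as spam, and are truncated before being produced to the issue platform.
    return (message[:max_size] if too_large else message), too_large

The user-report path (save_userreport) applies the same cap via UserReport._meta.get_field("comments").max_length, which the new migration sets to 4096.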
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
@@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
nodestore: 0002_nodestore_no_dictfield
remote_subscriptions: 0003_drop_remote_subscription
replays: 0004_index_together
sentry: 0777_add_related_name_to_dashboard_permissions
sentry: 0778_userreport_comments_max_length
social_auth: 0002_default_auto_field
uptime: 0017_unique_on_timeout
workflow_engine: 0009_detector_type
111 changes: 70 additions & 41 deletions src/sentry/feedback/usecases/create_feedback.py
@@ -11,7 +11,7 @@
from sentry import features, options
from sentry.constants import DataCategory
from sentry.eventstore.models import Event, GroupEvent
from sentry.feedback.usecases.spam_detection import is_spam
from sentry.feedback.usecases.spam_detection import is_spam, spam_detection_enabled
from sentry.issues.grouptype import FeedbackGroup
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.json_schemas import EVENT_PAYLOAD_SCHEMA, LEGACY_EVENT_PAYLOAD_SCHEMA
@@ -141,7 +141,6 @@ def fix_for_issue_platform(event_data):
# If no user email was provided specify the contact-email as the user-email.
feedback_obj = event_data.get("contexts", {}).get("feedback", {})
contact_email = feedback_obj.get("contact_email")

if not ret_event["user"].get("email", ""):
ret_event["user"]["email"] = contact_email

@@ -168,6 +167,21 @@ def fix_for_issue_platform(event_data):
return ret_event


def validate_issue_platform_event_schema(event_data):
"""
The issue platform schema validation does not run in dev atm so we have to do the validation
ourselves, or else our tests are not representative of what happens in prod.
"""
try:
jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
try:
jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
metrics.incr("feedback.create_feedback_issue.invalid_schema")
raise


def should_filter_feedback(event, project_id, source: FeedbackCreationSource):
# Right now all unreal error events without a feedback
# actually get sent a feedback with this message
@@ -222,26 +236,43 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
if should_filter_feedback(event, project_id, source):
return

feedback_message = event["contexts"]["feedback"]["message"]
max_msg_size = options.get("feedback.message.max-size") # Note options are cached.
project = Project.objects.get_from_cache(id=project_id)

# Spam detection.
is_message_spam = None
if features.has(
"organizations:user-feedback-spam-filter-ingest", project.organization
) and project.get_option("sentry:feedback_ai_spam_detection"):
try:
is_message_spam = is_spam(event["contexts"]["feedback"]["message"])
except Exception:
# until we have LLM error types ironed out, just catch all exceptions
logger.exception("Error checking if message is spam", extra={"project_id": project_id})
metrics.incr(
"feedback.create_feedback_issue.spam_detection",
if spam_detection_enabled(project):
if len(feedback_message) <= max_msg_size:
try:
is_message_spam = is_spam(feedback_message)
except Exception:
# until we have LLM error types ironed out, just catch all exceptions
logger.exception(
"Error checking if message is spam", extra={"project_id": project_id}
)
metrics.incr(
"feedback.create_feedback_issue.spam_detection",
tags={
"is_spam": is_message_spam,
"referrer": source.value,
"client_source": event["contexts"]["feedback"].get("source"),
},
sample_rate=1.0,
)
else:
is_message_spam = True

if len(feedback_message) > max_msg_size:
metrics.distribution(
"feedback.large_message",
len(feedback_message),
tags={
"is_spam": is_message_spam,
"entrypoint": "create_feedback_issue",
"referrer": source.value,
"client_source": event["contexts"]["feedback"].get("source"),
},
sample_rate=1.0,
)
feedback_message = feedback_message[:max_msg_size]

# Note that some of the fields below like title and subtitle
# are not used by the feedback UI, but are required.
@@ -257,7 +288,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
project_id=project_id,
fingerprint=issue_fingerprint, # random UUID for fingerprint so feedbacks are grouped individually
issue_title="User Feedback",
subtitle=event["contexts"]["feedback"]["message"],
subtitle=feedback_message,
resource_id=None,
evidence_data=evidence_data,
evidence_display=evidence_display,
@@ -279,6 +310,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
# make sure event data is valid for issue platform
validate_issue_platform_event_schema(event_fixed)

# Analytics
if not project.flags.has_feedbacks:
first_feedback_received.send_robust(project=project, sender=Project)

@@ -291,9 +323,11 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
):
first_new_feedback_received.send_robust(project=project, sender=Project)

# Send to issue platform for processing.
produce_occurrence_to_kafka(
payload_type=PayloadType.OCCURRENCE, occurrence=occurrence, event_data=event_fixed
)
# Mark as spam. We need this since IP doesn't currently support an initial status of IGNORED.
if is_message_spam:
auto_ignore_spam_feedbacks(project, issue_fingerprint)
metrics.incr(
@@ -304,6 +338,7 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
},
sample_rate=1.0,
)

track_outcome(
org_id=project.organization_id,
project_id=project_id,
@@ -317,19 +352,27 @@ def create_feedback_issue(event, project_id: int, source: FeedbackCreationSource
)


def validate_issue_platform_event_schema(event_data):
def auto_ignore_spam_feedbacks(project, issue_fingerprint):
"""
The issue platform schema validation does not run in dev atm so we have to do the validation
ourselves, or else our tests are not representative of what happens in prod.
Marks an issue as spam with a STATUS_CHANGE kafka message. The IGNORED status allows the occurrence to skip alerts
and be picked up by frontend spam queries.
"""
try:
jsonschema.validate(event_data, EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
try:
jsonschema.validate(event_data, LEGACY_EVENT_PAYLOAD_SCHEMA)
except jsonschema.exceptions.ValidationError:
metrics.incr("feedback.create_feedback_issue.invalid_schema")
raise
if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
metrics.incr("feedback.spam-detection-actions.set-ignored")
produce_occurrence_to_kafka(
payload_type=PayloadType.STATUS_CHANGE,
status_change=StatusChangeMessage(
fingerprint=issue_fingerprint,
project_id=project.id,
new_status=GroupStatus.IGNORED, # we use ignored in the UI for the spam tab
new_substatus=GroupSubStatus.FOREVER,
),
)


###########
# Shim code
###########


class UserReportShimDict(TypedDict):
@@ -390,19 +433,5 @@ def shim_to_feedback(
metrics.incr("feedback.shim_to_feedback.failed", tags={"referrer": source.value})


def auto_ignore_spam_feedbacks(project, issue_fingerprint):
if features.has("organizations:user-feedback-spam-filter-actions", project.organization):
metrics.incr("feedback.spam-detection-actions.set-ignored")
produce_occurrence_to_kafka(
payload_type=PayloadType.STATUS_CHANGE,
status_change=StatusChangeMessage(
fingerprint=issue_fingerprint,
project_id=project.id,
new_status=GroupStatus.IGNORED, # we use ignored in the UI for the spam tab
new_substatus=GroupSubStatus.FOREVER,
),
)


def is_in_feedback_denylist(organization):
return organization.slug in options.get("feedback.organizations.slug-denylist")
8 changes: 8 additions & 0 deletions src/sentry/feedback/usecases/spam_detection.py
@@ -1,6 +1,8 @@
import logging

from sentry import features
from sentry.llm.usecases import LLMUseCase, complete_prompt
from sentry.models.project import Project
from sentry.utils import metrics

logger = logging.getLogger(__name__)
@@ -68,3 +70,9 @@ def trim_response(text):
return True, trimmed_text
else:
return False, trimmed_text


def spam_detection_enabled(project: Project) -> bool:
return features.has(
"organizations:user-feedback-spam-filter-ingest", project.organization
) and project.get_option("sentry:feedback_ai_spam_detection")
12 changes: 12 additions & 0 deletions src/sentry/ingest/userreport.py
@@ -39,6 +39,18 @@ def save_userreport(
if should_filter_user_report(report["comments"]):
return

max_comment_length = UserReport._meta.get_field("comments").max_length
if max_comment_length and len(report["comments"]) > max_comment_length:
metrics.distribution(
"feedback.large_message",
len(report["comments"]),
tags={
"entrypoint": "save_userreport",
"referrer": source.value,
},
)
report["comments"] = report["comments"][:max_comment_length]

if start_time is None:
start_time = timezone.now()

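Why the explicit slice above is needed (an assumption about Django semantics, not stated in the diff): max_length on a TextField is enforced only when form/model validation runs, not as a database constraint, so ingest paths that call save() directly must cap the value themselves. A minimal sketch of that guard, mirroring the code above:

# Assumption: TextField(max_length=...) is validation-only in Django, so the column
# stays unbounded text and direct saves need an explicit cap.
max_len = UserReport._meta.get_field("comments").max_length  # 4096 after this PR's migration
if max_len is not None and len(report["comments"]) > max_len:
    report["comments"] = report["comments"][:max_len]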
33 changes: 33 additions & 0 deletions src/sentry/migrations/0778_userreport_comments_max_length.py
@@ -0,0 +1,33 @@
# Generated by Django 5.1.1 on 2024-10-17 23:33

from django.db import migrations, models

from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
# This flag is used to mark that a migration shouldn't be automatically run in production.
# This should only be used for operations where it's safe to run the migration after your
# code has deployed. So this should not be used for most operations that alter the schema
# of a table.
# Here are some things that make sense to mark as post deployment:
# - Large data migrations. Typically we want these to be run manually so that they can be
# monitored and not block the deploy for a long period of time while they run.
# - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
# run this outside deployments so that we don't block them. Note that while adding an index
# is a schema change, it's completely safe to run the operation after the code has deployed.
# Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

is_post_deployment = False

dependencies = [
("sentry", "0777_add_related_name_to_dashboard_permissions"),
]

operations = [
migrations.AlterField(
model_name="userreport",
name="comments",
field=models.TextField(max_length=4096),
),
]
4 changes: 3 additions & 1 deletion src/sentry/models/userreport.py
@@ -15,7 +15,9 @@ class UserReport(Model):
environment_id = BoundedBigIntegerField(null=True, db_index=True)
name = models.CharField(max_length=128)
email = models.EmailField(max_length=75)
comments = models.TextField()
comments = models.TextField(
max_length=4096
) # Keep max_length <= "feedback.message.max-size" sentry option.
date_added = models.DateTimeField(default=timezone.now, db_index=True)

class Meta:
6 changes: 6 additions & 0 deletions src/sentry/options/defaults.py
@@ -474,6 +474,12 @@
default=[],
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)
register(
"feedback.message.max-size",
type=Int,
default=4096,
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

# Dev Toolbar Options
register(
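A small usage sketch for the feedback.message.max-size option registered above; per the "options are cached" note in create_feedback_issue, reading it per event on the ingest path is cheap:

# Usage sketch only; mirrors how create_feedback_issue reads the option.
from sentry import options

max_msg_size = options.get("feedback.message.max-size")  # default 4096
# Per the comment on UserReport.comments, that field's max_length should stay <= this value.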
8 changes: 5 additions & 3 deletions src/sentry/web/frontend/error_page_embed.py
@@ -58,14 +58,16 @@

class UserReportForm(forms.ModelForm):
name = forms.CharField(
max_length=128, widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")})
max_length=UserReport._meta.get_field("name").max_length,
widget=forms.TextInput(attrs={"placeholder": _("Jane Bloggs")}),
)
email = forms.EmailField(
max_length=75,
max_length=UserReport._meta.get_field("email").max_length,
widget=forms.TextInput(attrs={"placeholder": _("jane@example.com"), "type": "email"}),
)
comments = forms.CharField(
widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")})
max_length=UserReport._meta.get_field("comments").max_length,
widget=forms.Textarea(attrs={"placeholder": _("I clicked on 'X' and then hit 'Confirm'")}),
)

class Meta:
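A hedged note on the form change above: deriving the field limits from UserReport keeps the crash-report embed, model validation, and ingest truncation aligned on one number. The snippet below only illustrates that assumed invariant; it is not code from this PR.

# Illustration of the invariant this PR sets up (assumed, not asserted anywhere in the diff).
from sentry import options
from sentry.models.userreport import UserReport

assert UserReport._meta.get_field("comments").max_length == 4096
assert options.get("feedback.message.max-size") >= 4096  # the model comment asks for max_length <= option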
53 changes: 50 additions & 3 deletions tests/sentry/feedback/usecases/test_create_feedback.py
@@ -1,7 +1,7 @@
from __future__ import annotations

import time
from datetime import datetime
from datetime import UTC, datetime
from typing import Any
from unittest.mock import Mock

@@ -773,8 +773,55 @@ def test_create_feedback_spam_detection_set_status_ignored(
assert group.substatus == GroupSubStatus.FOREVER


# Unit tests for shim_to_feedback error cases. The typical behavior of this function is tested in
# test_project_user_reports, test_post_process, and test_update_user_reports.
@django_db_all
def test_create_feedback_large_message_truncated(
default_project, mock_produce_occurrence_to_kafka, set_sentry_option
):
"""Large messages are truncated before producing to kafka."""
with set_sentry_option("feedback.message.max-size", 4096):
event = mock_feedback_event(default_project.id, datetime.now(UTC))
event["contexts"]["feedback"]["message"] = "a" * 7007
create_feedback_issue(
event, default_project.id, FeedbackCreationSource.NEW_FEEDBACK_ENVELOPE
)

kwargs = mock_produce_occurrence_to_kafka.call_args[1]
assert len(kwargs["occurrence"].subtitle) == 4096


@django_db_all
def test_create_feedback_large_message_skips_spam_detection(
default_project, set_sentry_option, monkeypatch
):
"""If spam is enabled, large messages are marked as spam without making an LLM request."""
with Feature(
{
"organizations:user-feedback-spam-filter-actions": True,
"organizations:user-feedback-spam-filter-ingest": True,
"organizations:feedback-ingest": True,
}
), set_sentry_option("feedback.message.max-size", 4096):

event = mock_feedback_event(default_project.id, datetime.now(UTC))
event["contexts"]["feedback"]["message"] = "a" * 7007

mock_complete_prompt = Mock()
monkeypatch.setattr("sentry.llm.usecases.complete_prompt", mock_complete_prompt)

create_feedback_issue(
event, default_project.id, FeedbackCreationSource.NEW_FEEDBACK_ENVELOPE
)
assert mock_complete_prompt.call_count == 0

group = Group.objects.get()
assert group.status == GroupStatus.IGNORED
assert group.substatus == GroupSubStatus.FOREVER


"""
Unit tests for shim_to_feedback error cases. The typical behavior of this function is tested in
test_project_user_reports, test_post_process, and test_update_user_reports.
"""


@django_db_all