From 0ff3c6130e67287cc778b168d178b641eaded1ab Mon Sep 17 00:00:00 2001 From: Steven Eubank Date: Mon, 2 Mar 2026 17:40:25 +0100 Subject: [PATCH] feat(o11y): add Sentry error reporting Adds a fast Sentry MVP to Realtime by wiring backend error reporting with request context and lightweight operational captures, while keeping tracing on the existing OpenTelemetry path. Configuration is env-driven (SENTRY_DSN, SENTRY_ENVIRONMENT, SENTRY_RELEASE, plus SENTRY_ERROR_SAMPLE_RATE, SENTRY_CHANNEL_ERROR_SAMPLE_RATE and SENTRY_CONTROLLER_ERROR_SAMPLE_RATE for sampling) with initial sampling and basic sensitive-data redaction to make early rollout safer. ### Test Plan - Trigger a controller error and confirm the event appears in Sentry with request context. - Send a malformed websocket payload and confirm grouped channel error capture. - Enable OTEL exporter and verify trace generation for /healthcheck. --- config/config.exs | 9 ++++ config/runtime.exs | 29 +++++++++- lib/realtime/sentry_event_filter.ex | 54 +++++++++++++++++++ .../channels/realtime_channel/logging.ex | 28 ++++++++++ .../controllers/fallback_controller.ex | 38 +++++++++++-- lib/realtime_web/endpoint.ex | 2 + mix.exs | 4 +- 7 files changed, 159 insertions(+), 5 deletions(-) create mode 100644 lib/realtime/sentry_event_filter.ex diff --git a/config/config.exs b/config/config.exs index 8550e0d72..b59f6c884 100644 --- a/config/config.exs +++ b/config/config.exs @@ -56,6 +56,15 @@ config :logger, :console, format: "$time $metadata[$level] $message\n", metadata: [:request_id, :project, :external_id, :application_name, :error_code, :sub, :iss, :exp] +config :sentry, + dsn: nil, + environment_name: Mix.env(), + release: nil, + sample_rate: 1.0, + enable_source_code_context: true, + root_source_code_paths: [File.cwd!()], + before_send: &Realtime.SentryEventFilter.before_send/1 + # Use Jason for JSON parsing in Phoenix config :phoenix, :json_library, Jason diff --git a/config/runtime.exs b/config/runtime.exs index c6040da51..81e19d061 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -15,6 +15,19 @@ defmodule Env do value = System.get_env(env) if
value, do: value |> String.downcase() |> String.to_existing_atom(), else: default end + + # Reads `env` as a float. + # Surrounding whitespace is trimmed before parsing, so a value such as "0.1 " is accepted. + # Returns `default` when the variable is unset or is not a complete float literal + # (for example "abc" or "0.5x"). + def get_float(env, default) do + with value when is_binary(value) <- System.get_env(env), + {parsed, ""} <- value |> String.trim() |> Float.parse() do + parsed + else + _ -> default + end + end end app_name = System.get_env("APP_NAME", "") @@ -93,6 +106,12 @@ metrics_pusher_auth = System.get_env("METRICS_PUSHER_AUTH") metrics_pusher_interval_ms = Env.get_integer("METRICS_PUSHER_INTERVAL_MS", :timer.seconds(30)) metrics_pusher_timeout_ms = Env.get_integer("METRICS_PUSHER_TIMEOUT_MS", :timer.seconds(15)) metrics_pusher_compress = Env.get_boolean("METRICS_PUSHER_COMPRESS", true) +sentry_dsn = System.get_env("SENTRY_DSN") +sentry_environment = System.get_env("SENTRY_ENVIRONMENT", Atom.to_string(config_env())) +sentry_release = System.get_env("SENTRY_RELEASE") +sentry_error_sample_rate = Env.get_float("SENTRY_ERROR_SAMPLE_RATE", 1.0) +sentry_channel_error_sample_rate = Env.get_float("SENTRY_CHANNEL_ERROR_SAMPLE_RATE", 0.1) +sentry_controller_error_sample_rate = Env.get_float("SENTRY_CONTROLLER_ERROR_SAMPLE_RATE", 1.0) if !(db_version in [nil, "ipv6", "ipv4"]), do: raise("Invalid IP version, please set either ipv6 or ipv4") @@ -316,6 +335,12 @@ if config_env() != :test do config :logger, level: System.get_env("LOG_LEVEL", "info") |> String.to_existing_atom() + config :sentry, + dsn: sentry_dsn, + environment_name: sentry_environment, + release: sentry_release, + sample_rate: sentry_error_sample_rate + config :realtime, request_id_baggage_key: System.get_env("REQUEST_ID_BAGGAGE_KEY", "request-id"), jwt_claim_validators: System.get_env("JWT_CLAIM_VALIDATORS", "{}"), @@ -327,7 +352,9 @@ if config_env() != :test do region: region, prom_poll_rate: Env.get_integer("PROM_POLL_RATE", 5000), slot_name_suffix: slot_name_suffix, - max_gen_rpc_clients: max_gen_rpc_clients + max_gen_rpc_clients: max_gen_rpc_clients, + sentry_channel_error_sample_rate: sentry_channel_error_sample_rate, +
sentry_controller_error_sample_rate: sentry_controller_error_sample_rate end # Setup Production diff --git a/lib/realtime/sentry_event_filter.ex b/lib/realtime/sentry_event_filter.ex new file mode 100644 index 000000000..d4443648e --- /dev/null +++ b/lib/realtime/sentry_event_filter.ex @@ -0,0 +1,57 @@ +defmodule Realtime.SentryEventFilter do + @moduledoc false + + @redacted "[REDACTED]" + # Header names (compared lower-cased) whose values are redacted. `apikey` mirrors the entry in @sensitive_keys. + @sensitive_headers ~w(authorization cookie x-api-key apikey) + # An `extra` key is redacted when its lower-cased name contains any of these substrings. + @sensitive_keys ~w(access_token token api_key apikey jwt secret password) + + # Redacts sensitive request headers and top-level `extra` values. Non-map events pass through unchanged. + def before_send(event) when is_map(event) do + event + |> sanitize_request_headers() + |> sanitize_extra() + end + + def before_send(event), do: event + + defp sanitize_request_headers(event) do + case Map.get(event, :request) do + request when is_map(request) -> + headers = + case Map.get(request, :headers) do + list when is_list(list) -> Enum.map(list, &sanitize_header/1) + headers when is_map(headers) -> Map.new(headers, fn {key, value} -> {key, maybe_redact_header(key, value)} end) + other -> other + end + + Map.put(event, :request, Map.put(request, :headers, headers)) + + _ -> + event + end + end + + defp sanitize_extra(event) do + case Map.get(event, :extra) do + extra when is_map(extra) -> + Map.put(event, :extra, Map.new(extra, fn {key, value} -> {key, maybe_redact_key(key, value)} end)) + + _ -> + event + end + end + + defp sanitize_header({key, value}), do: {key, maybe_redact_header(key, value)} + defp sanitize_header(other), do: other + + defp maybe_redact_header(key, value) do + if String.downcase(to_string(key)) in @sensitive_headers, do: @redacted, else: value + end + + defp maybe_redact_key(key, value) do + key = String.downcase(to_string(key)) + if Enum.any?(@sensitive_keys, &String.contains?(key, &1)), do: @redacted, else: value + end +end diff --git a/lib/realtime_web/channels/realtime_channel/logging.ex b/lib/realtime_web/channels/realtime_channel/logging.ex index 2f6c91fdb..0d5b8d647 100644 ---
a/lib/realtime_web/channels/realtime_channel/logging.ex +++ b/lib/realtime_web/channels/realtime_channel/logging.ex @@ -22,6 +22,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do msg = build_msg(code, msg) emit_system_error(:error, code) log(socket, :error, code, msg) + maybe_capture_sentry_error(socket, code, msg) {:error, %{reason: msg}} end @@ -69,6 +70,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do msg = build_msg(code, msg) emit_system_error(level, code) if Logger.compare_levels(log_level, level) != :gt, do: log(socket, level, code, msg) + if level == :error, do: maybe_capture_sentry_error(socket, code, msg) if level in [:error, :warning], do: {:error, %{reason: msg}}, else: :ok end @@ -103,4 +105,41 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do nil end end + + # Error codes that are also reported to Sentry. Other codes are not sent by this helper. + @sentry_capture_codes MapSet.new([ + "MalformedWebSocketMessage", + "UnknownErrorOnChannel", + "InitializingProjectConnection", + "DatabaseConnectionIssue", + "UnableToSetPolicies" + ]) + + # Reports an allow-listed channel error to Sentry, subject to the + # :sentry_channel_error_sample_rate application setting (0.1 when unset). + defp maybe_capture_sentry_error(socket, code, msg) do + if MapSet.member?(@sentry_capture_codes, code) and + sampled?(Application.get_env(:realtime, :sentry_channel_error_sample_rate, 0.1)) do + # Pattern match instead of get_in/2: structs do not implement Access, so + # get_in/2 raises when socket is a struct and this runs on the error path. + tenant = + case socket do + %{assigns: %{tenant: tenant}} -> tenant + _ -> nil + end + + topic = Map.get(socket, :topic) + + Sentry.capture_message(msg, + level: :error, + tags: %{error_code: code, source: "channel"}, + extra: %{tenant: tenant, topic: topic} + ) + end + end + + # :rand.uniform/0 returns a float in the range 0.0 =< X < 1.0, so the strict comparison never samples at a rate of 0.0. + defp sampled?(rate) when is_float(rate), do: rate >= 1.0 or :rand.uniform() < rate + defp sampled?(rate) when is_integer(rate), do: sampled?(rate / 1) + defp sampled?(_), do: false end diff --git a/lib/realtime_web/controllers/fallback_controller.ex b/lib/realtime_web/controllers/fallback_controller.ex index 75f31b85d..e5d64de92 100644 --- a/lib/realtime_web/controllers/fallback_controller.ex +++ b/lib/realtime_web/controllers/fallback_controller.ex @@ -9,9 +9,11 @@ defmodule RealtimeWeb.FallbackController do use Realtime.Logs import RealtimeWeb.ErrorHelpers + require Logger
def call(conn, {:error, :not_found}) do log_error("TenantNotFound", "Tenant not found") + maybe_capture_sentry_error(conn, "TenantNotFound", "Tenant not found") conn |> put_status(:not_found) @@ -20,10 +22,13 @@ defmodule RealtimeWeb.FallbackController do end def call(conn, {:error, %Ecto.Changeset{} = changeset}) do + details = Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + log_error( "UnprocessableEntity", - Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + details ) + maybe_capture_sentry_error(conn, "UnprocessableEntity", details) conn |> put_status(:unprocessable_entity) @@ -33,6 +38,7 @@ defmodule RealtimeWeb.FallbackController do def call(conn, {:error, status, message}) when is_atom(status) and is_binary(message) do log_error("UnprocessableEntity", message) + maybe_capture_sentry_error(conn, "UnprocessableEntity", message) conn |> put_status(status) @@ -41,10 +47,13 @@ defmodule RealtimeWeb.FallbackController do end def call(conn, {:error, %Ecto.Changeset{valid?: false} = changeset}) do + details = Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + log_error( "UnprocessableEntity", - Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + details ) + maybe_capture_sentry_error(conn, "UnprocessableEntity", details) conn |> put_status(:unprocessable_entity) @@ -60,10 +69,13 @@ defmodule RealtimeWeb.FallbackController do end def call(conn, %Ecto.Changeset{valid?: false} = changeset) do + details = Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + log_error( "UnprocessableEntity", - Ecto.Changeset.traverse_errors(changeset, &translate_error/1) + details ) + maybe_capture_sentry_error(conn, "UnprocessableEntity", details) conn |> put_status(:unprocessable_entity) @@ -73,10 +85,33 @@ defmodule RealtimeWeb.FallbackController do def call(conn, response) do log_error("UnknownErrorOnController", response) + maybe_capture_sentry_error(conn, "UnknownErrorOnController", response) conn |>
put_status(:unprocessable_entity) |> put_view(RealtimeWeb.ErrorView) |> render("error.json", message: "Unknown error") end + + # Reports a controller error to Sentry with the request method, path, request id and details, + # subject to the :sentry_controller_error_sample_rate application setting (1.0 when unset). + defp maybe_capture_sentry_error(conn, code, details) do + if sampled?(Application.get_env(:realtime, :sentry_controller_error_sample_rate, 1.0)) do + Sentry.capture_message("#{code}: controller error", + level: :error, + tags: %{error_code: code, source: "controller"}, + extra: %{ + method: conn.method, + path: conn.request_path, + request_id: Logger.metadata()[:request_id], + details: details + } + ) + end + end + + # :rand.uniform/0 returns a float in the range 0.0 =< X < 1.0, so the strict comparison never samples at a rate of 0.0. + defp sampled?(rate) when is_float(rate), do: rate >= 1.0 or :rand.uniform() < rate + defp sampled?(rate) when is_integer(rate), do: sampled?(rate / 1) + defp sampled?(_), do: false end diff --git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 7552b9873..4668c32e7 100644 --- a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -1,4 +1,5 @@ defmodule RealtimeWeb.Endpoint do + use Sentry.PlugCapture use Phoenix.Endpoint, otp_app: :realtime alias RealtimeWeb.Plugs.BaggageRequestId @@ -88,6 +89,7 @@ defmodule RealtimeWeb.Endpoint do pass: ["*/*"], json_decoder: Phoenix.json_library() + plug Sentry.PlugContext plug Plug.MethodOverride plug Plug.Head plug Plug.Session, @session_options diff --git a/mix.exs b/mix.exs index 549603957..da4e3aa25 100644 --- a/mix.exs +++ b/mix.exs @@ -40,7 +40,7 @@ defmodule Realtime.MixProject do def application do [ mod: {Realtime.Application, []}, - extra_applications: [:logger, :runtime_tools, :prom_ex, :mix, :os_mon] + extra_applications: [:logger, :runtime_tools, :prom_ex, :mix, :os_mon, :sentry] ] end @@ -69,6 +69,8 @@ defmodule Realtime.MixProject do {:telemetry_poller, "~> 1.0"}, {:gettext, "~> 0.19"}, {:jason, "~> 1.3"}, + {:sentry, "~> 12.0.2"}, + {:hackney, "~> 1.25"}, {:plug_cowboy, "~> 2.6"}, {:libcluster, "~> 3.3"}, {:libcluster_postgres, "~> 0.2"},