Do not trim span descriptions. (#1983)

antonpirker · web-flow · commit c4d03846cb3f · 2023-04-03T12:54:48.000+02:00
- Made sure that span descriptions are never trimmed. (for all op values, not just db spans.)
- Removed the experimental smart_transaction_trimming option
- Also removed some dead code that was never executed because the experimental option defaults to False.
diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py
@@ -320,12 +320,7 @@ def _prepare_event(
         # Postprocess the event here so that annotated types do
         # generally not surface in before_send
         if event is not None:
-            event = serialize(
-                event,
-                smart_transaction_trimming=self.options["_experiments"].get(
-                    "smart_transaction_trimming"
-                ),
-            )
+            event = serialize(event)
 
         before_send = self.options["before_send"]
         if before_send is not None and event.get("type") != "transaction":
diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py
@@ -33,8 +33,7 @@
         {
             "max_spans": Optional[int],
             "record_sql_params": Optional[bool],
-            "smart_transaction_trimming": Optional[bool],
-            # TODO: Remvoe these 2 profiling related experiments
+            # TODO: Remove these 2 profiling related experiments
             "profiles_sample_rate": Optional[float],
             "profiler_mode": Optional[ProfilerMode],
         },
diff --git a/sentry_sdk/serializer.py b/sentry_sdk/serializer.py
@@ -8,13 +8,9 @@
     capture_internal_exception,
     disable_capture_event,
     format_timestamp,
-    json_dumps,
     safe_repr,
     strip_string,
 )
-
-import sentry_sdk.utils
-
 from sentry_sdk._compat import (
     text_type,
     PY2,
@@ -23,12 +19,9 @@
     iteritems,
     binary_sequence_types,
 )
-
 from sentry_sdk._types import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from datetime import timedelta
-
     from types import TracebackType
 
     from typing import Any
@@ -37,7 +30,6 @@
     from typing import Dict
     from typing import List
     from typing import Optional
-    from typing import Tuple
     from typing import Type
     from typing import Union
 
@@ -120,12 +112,11 @@ def __exit__(
         self._ids.pop(id(self._objs.pop()), None)
 
 
-def serialize(event, smart_transaction_trimming=False, **kwargs):
-    # type: (Event, bool, **Any) -> Event
+def serialize(event, **kwargs):
+    # type: (Event, **Any) -> Event
     memo = Memo()
     path = []  # type: List[Segment]
     meta_stack = []  # type: List[Dict[str, Any]]
-    span_description_bytes = []  # type: List[int]
 
     def _annotate(**meta):
         # type: (**Any) -> None
@@ -365,113 +356,23 @@ def _serialize_node_impl(
             if not isinstance(obj, string_types):
                 obj = safe_repr(obj)
 
-        # Allow span descriptions to be longer than other strings.
-        #
-        # For database auto-instrumented spans, the description contains
-        # potentially long SQL queries that are most useful when not truncated.
-        # Because arbitrarily large events may be discarded by the server as a
-        # protection mechanism, we dynamically limit the description length
-        # later in _truncate_span_descriptions.
-        if (
-            smart_transaction_trimming
-            and len(path) == 3
-            and path[0] == "spans"
-            and path[-1] == "description"
-        ):
-            span_description_bytes.append(len(obj))
+        is_span_description = (
+            len(path) == 3 and path[0] == "spans" and path[-1] == "description"
+        )
+        if is_span_description:
             return obj
-        return _flatten_annotated(strip_string(obj))
 
-    def _truncate_span_descriptions(serialized_event, event, excess_bytes):
-        # type: (Event, Event, int) -> None
-        """
-        Modifies serialized_event in-place trying to remove excess_bytes from
-        span descriptions. The original event is used read-only to access the
-        span timestamps (represented as RFC3399-formatted strings in
-        serialized_event).
-
-        It uses heuristics to prioritize preserving the description of spans
-        that might be the most interesting ones in terms of understanding and
-        optimizing performance.
-        """
-        # When truncating a description, preserve a small prefix.
-        min_length = 10
-
-        def shortest_duration_longest_description_first(args):
-            # type: (Tuple[int, Span]) -> Tuple[timedelta, int]
-            i, serialized_span = args
-            span = event["spans"][i]
-            now = datetime.utcnow()
-            start = span.get("start_timestamp") or now
-            end = span.get("timestamp") or now
-            duration = end - start
-            description = serialized_span.get("description") or ""
-            return (duration, -len(description))
-
-        # Note: for simplicity we sort spans by exact duration and description
-        # length. If ever needed, we could have a more involved heuristic, e.g.
-        # replacing exact durations with "buckets" and/or looking at other span
-        # properties.
-        path.append("spans")
-        for i, span in sorted(
-            enumerate(serialized_event.get("spans") or []),
-            key=shortest_duration_longest_description_first,
-        ):
-            description = span.get("description") or ""
-            if len(description) <= min_length:
-                continue
-            excess_bytes -= len(description) - min_length
-            path.extend([i, "description"])
-            # Note: the last time we call strip_string we could preserve a few
-            # more bytes up to a total length of MAX_EVENT_BYTES. Since that's
-            # not strictly required, we leave it out for now for simplicity.
-            span["description"] = _flatten_annotated(
-                strip_string(description, max_length=min_length)
-            )
-            del path[-2:]
-            del meta_stack[len(path) + 1 :]
-
-            if excess_bytes <= 0:
-                break
-        path.pop()
-        del meta_stack[len(path) + 1 :]
+        return _flatten_annotated(strip_string(obj))
 
+    #
+    # Start of serialize() function
+    #
     disable_capture_event.set(True)
     try:
-        rv = _serialize_node(event, **kwargs)
-        if meta_stack and isinstance(rv, dict):
-            rv["_meta"] = meta_stack[0]
-
-        sum_span_description_bytes = sum(span_description_bytes)
-        if smart_transaction_trimming and sum_span_description_bytes > 0:
-            span_count = len(event.get("spans") or [])
-            # This is an upper bound of how many bytes all descriptions would
-            # consume if the usual string truncation in _serialize_node_impl
-            # would have taken place, not accounting for the metadata attached
-            # as event["_meta"].
-            descriptions_budget_bytes = span_count * sentry_sdk.utils.MAX_STRING_LENGTH
-
-            # If by not truncating descriptions we ended up with more bytes than
-            # per the usual string truncation, check if the event is too large
-            # and we need to truncate some descriptions.
-            #
-            # This is guarded with an if statement to avoid JSON-encoding the
-            # event unnecessarily.
-            if sum_span_description_bytes > descriptions_budget_bytes:
-                original_bytes = len(json_dumps(rv))
-                excess_bytes = original_bytes - MAX_EVENT_BYTES
-                if excess_bytes > 0:
-                    # Event is too large, will likely be discarded by the
-                    # server. Trim it down before sending.
-                    _truncate_span_descriptions(rv, event, excess_bytes)
-
-                    # Span descriptions truncated, set or reset _meta.
-                    #
-                    # We run the same code earlier because we want to account
-                    # for _meta when calculating original_bytes, the number of
-                    # bytes in the JSON-encoded event.
-                    if meta_stack and isinstance(rv, dict):
-                        rv["_meta"] = meta_stack[0]
-        return rv
+        serialized_event = _serialize_node(event, **kwargs)
+        if meta_stack and isinstance(serialized_event, dict):
+            serialized_event["_meta"] = meta_stack[0]
+
+        return serialized_event
     finally:
         disable_capture_event.set(False)
diff --git a/tests/integrations/sqlalchemy/test_sqlalchemy.py b/tests/integrations/sqlalchemy/test_sqlalchemy.py
@@ -8,8 +8,8 @@
 
 from sentry_sdk import capture_message, start_transaction, configure_scope
 from sentry_sdk.integrations.sqlalchemy import SqlalchemyIntegration
-from sentry_sdk.utils import json_dumps, MAX_STRING_LENGTH
 from sentry_sdk.serializer import MAX_EVENT_BYTES
+from sentry_sdk.utils import json_dumps, MAX_STRING_LENGTH
 
 
 def test_orm_queries(sentry_init, capture_events):
@@ -143,7 +143,6 @@ def test_long_sql_query_preserved(sentry_init, capture_events):
     sentry_init(
         traces_sample_rate=1,
         integrations=[SqlalchemyIntegration()],
-        _experiments={"smart_transaction_trimming": True},
     )
     events = capture_events()
 
@@ -158,11 +157,10 @@ def test_long_sql_query_preserved(sentry_init, capture_events):
     assert description.endswith("SELECT 98 UNION SELECT 99")
 
 
-def test_too_large_event_truncated(sentry_init, capture_events):
+def test_large_event_not_truncated(sentry_init, capture_events):
     sentry_init(
         traces_sample_rate=1,
         integrations=[SqlalchemyIntegration()],
-        _experiments={"smart_transaction_trimming": True},
     )
     events = capture_events()
 
@@ -178,36 +176,26 @@ def processor(event, hint):
     engine = create_engine("sqlite:///:memory:")
     with start_transaction(name="test"):
         with engine.connect() as con:
-            for _ in range(2000):
+            for _ in range(1500):
                 con.execute(" UNION ".join("SELECT {}".format(i) for i in range(100)))
 
     (event,) = events
 
-    # Because of attached metadata in the "_meta" key, we may send out a little
-    # bit more than MAX_EVENT_BYTES.
-    max_bytes = 1.2 * MAX_EVENT_BYTES
-    assert len(json_dumps(event)) < max_bytes
+    assert len(json_dumps(event)) > MAX_EVENT_BYTES
 
     # Some spans are discarded.
     assert len(event["spans"]) == 1000
 
-    for i, span in enumerate(event["spans"]):
-        description = span["description"]
-
-        assert description.startswith("SELECT ")
-        if str(i) in event["_meta"]["spans"]:
-            # Description must have been truncated
-            assert len(description) == 10
-            assert description.endswith("...")
-        else:
-            # Description was not truncated, check for original length
-            assert len(description) == 1583
-            assert description.endswith("SELECT 98 UNION SELECT 99")
-
-    # Smoke check the meta info for one of the spans.
-    assert next(iter(event["_meta"]["spans"].values())) == {
-        "description": {"": {"len": 1583, "rem": [["!limit", "x", 7, 10]]}}
-    }
+    # Span descriptions are not truncated.
+    description = event["spans"][0]["description"]
+    assert len(description) == 1583
+    assert description.startswith("SELECT 0")
+    assert description.endswith("SELECT 98 UNION SELECT 99")
+
+    description = event["spans"][999]["description"]
+    assert len(description) == 1583
+    assert description.startswith("SELECT 0")
+    assert description.endswith("SELECT 98 UNION SELECT 99")
 
     # Smoke check that truncation of other fields has not changed.
     assert len(event["message"]) == MAX_STRING_LENGTH