From adce981bcce4273bdf435a7421ba006081b1ebb7 Mon Sep 17 00:00:00 2001 From: vinothkumarr227 Date: Wed, 10 Sep 2025 12:27:54 +0530 Subject: [PATCH 1/4] stats/opentelemetry: record retry attempts from clientStream (#8342) Fixes: https://github.com/grpc/grpc-go/issues/8299 RELEASE NOTES: - stats/opentelemetry: Retry attempts (`grpc.previous-rpc-attempts`) are now recorded as span attributes for non-transparent client retries. --- stats/opentelemetry/client_tracing.go | 25 +- stats/opentelemetry/e2e_test.go | 566 ++++++++++++++------------ stats/opentelemetry/opentelemetry.go | 8 +- stats/opentelemetry/trace.go | 14 - 4 files changed, 335 insertions(+), 278 deletions(-) diff --git a/stats/opentelemetry/client_tracing.go b/stats/opentelemetry/client_tracing.go index 868d6a2fc9c1..6b725fb67366 100644 --- a/stats/opentelemetry/client_tracing.go +++ b/stats/opentelemetry/client_tracing.go @@ -21,6 +21,7 @@ import ( "log" "strings" + "go.opentelemetry.io/otel/attribute" otelcodes "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" @@ -83,7 +84,10 @@ func (h *clientTracingHandler) finishTrace(err error, ts trace.Span) { // It creates a new outgoing carrier which serializes information about this // span into gRPC Metadata, if TextMapPropagator is provided in the trace // options. if TextMapPropagator is not provided, it returns the context as is. -func (h *clientTracingHandler) traceTagRPC(ctx context.Context, ai *attemptInfo, nameResolutionDelayed bool) (context.Context, *attemptInfo) { +// +// Note: The passed attemptInfo pointer (ai) is mutated in-place. Fields such as +// ai.traceSpan are updated directly. No new attemptInfo is returned. +func (h *clientTracingHandler) traceTagRPC(ctx context.Context, ai *attemptInfo, nameResolutionDelayed bool) context.Context { // Add a "Delayed name resolution complete" event to the call span // if there was name resolution delay. In case of multiple retry attempts, // ensure that event is added only once. @@ -98,7 +102,7 @@ func (h *clientTracingHandler) traceTagRPC(ctx context.Context, ai *attemptInfo, carrier := otelinternaltracing.NewOutgoingCarrier(ctx) h.options.TraceOptions.TextMapPropagator.Inject(ctx, carrier) ai.traceSpan = span - return carrier.Context(), ai + return carrier.Context() } // createCallTraceSpan creates a call span to put in the provided context using @@ -121,7 +125,12 @@ func (h *clientTracingHandler) HandleConn(context.Context, stats.ConnStats) {} // TagRPC implements per RPC attempt context management for traces. func (h *clientTracingHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context { ctx, ai := getOrCreateRPCAttemptInfo(ctx) - ctx, ai = h.traceTagRPC(ctx, ai, info.NameResolutionDelay) + ci := getCallInfo(ctx) + if ci == nil { + logger.Error("context passed into client side stats handler (TagRPC) has no call info") + return ctx + } + ctx = h.traceTagRPC(ctx, ai, info.NameResolutionDelay) return setRPCInfo(ctx, &rpcInfo{ai: ai}) } @@ -132,5 +141,15 @@ func (h *clientTracingHandler) HandleRPC(ctx context.Context, rs stats.RPCStats) logger.Error("ctx passed into client side tracing handler trace event handling has no client attempt data present") return } + + // Client-specific Begin attributes. + if begin, ok := rs.(*stats.Begin); ok { + ci := getCallInfo(ctx) + previousRPCAttempts := ci.previousRPCAttempts.Add(1) - 1 + ri.ai.traceSpan.SetAttributes( + attribute.Int64("previous-rpc-attempts", int64(previousRPCAttempts)), + attribute.Bool("transparent-retry", begin.IsTransparentRetryAttempt), + ) + } populateSpan(rs, ri.ai) } diff --git a/stats/opentelemetry/e2e_test.go b/stats/opentelemetry/e2e_test.go index 2d575bfe06c5..9f345517e8c7 100644 --- a/stats/opentelemetry/e2e_test.go +++ b/stats/opentelemetry/e2e_test.go @@ -89,13 +89,7 @@ type traceSpanInfo struct { name string events []trace.Event attributes []attribute.KeyValue -} - -// traceSpanInfoMapKey is the key struct for constructing a map of trace spans -// retrievable by span name and span kind -type traceSpanInfoMapKey struct { - spanName string - spanKind string + status otelcodes.Code } // defaultMetricsOptions creates default metrics options @@ -276,46 +270,33 @@ func validateTraces(t *testing.T, spans tracetest.SpanStubs, wantSpanInfos []tra } } - // Constructs a map from a slice of traceSpanInfo to retrieve the - // corresponding expected span info based on span name and span kind - // for comparison. - wantSpanInfosMap := make(map[traceSpanInfoMapKey]traceSpanInfo) - for _, info := range wantSpanInfos { - key := traceSpanInfoMapKey{spanName: info.name, spanKind: info.spanKind} - wantSpanInfosMap[key] = info - } - - // Compare retrieved spans with expected spans. - for _, span := range spans { - // Check that the attempt span has the correct status. - if got, want := span.Status.Code, otelcodes.Ok; got != want { - t.Errorf("Got status code %v, want %v", got, want) - } - - // Retrieve the corresponding expected span info based on span name and - // span kind to compare. - want, ok := wantSpanInfosMap[traceSpanInfoMapKey{spanName: span.Name, spanKind: span.SpanKind.String()}] - if !ok { - t.Errorf("Unexpected span: %v", span) - continue - } - - // comparers - attributesSort := cmpopts.SortSlices(func(a, b attribute.KeyValue) bool { - return a.Key < b.Key - }) - attributesValueComparable := cmpopts.EquateComparable(attribute.KeyValue{}.Value) - eventsTimeIgnore := cmpopts.IgnoreFields(trace.Event{}, "Time") - - // attributes - if diff := cmp.Diff(want.attributes, span.Attributes, attributesSort, attributesValueComparable); diff != "" { - t.Errorf("Attributes mismatch for span %s (-want +got):\n%s", span.Name, diff) - } - // events - if diff := cmp.Diff(want.events, span.Events, attributesSort, attributesValueComparable, eventsTimeIgnore); diff != "" { - t.Errorf("Events mismatch for span %s (-want +got):\n%s", span.Name, diff) + // Convert spans to traceSpanInfo for cmp.Diff comparison. + actualSpanInfos := make([]traceSpanInfo, len(spans)) + for i, s := range spans { + actualSpanInfos[i] = traceSpanInfo{ + name: s.Name, + spanKind: s.SpanKind.String(), + attributes: s.Attributes, + events: s.Events, + status: s.Status.Code, } } + opts := []cmp.Option{ + cmpopts.SortSlices(func(a, b traceSpanInfo) bool { + if a.name == b.name { + return a.spanKind < b.spanKind + } + return a.name < b.name + }), + cmpopts.SortSlices(func(a, b trace.Event) bool { return a.Name < b.Name }), + cmpopts.IgnoreFields(trace.Event{}, "Time"), + cmpopts.EquateComparable(attribute.KeyValue{}, attribute.Value{}, attribute.Set{}), + cmpopts.IgnoreFields(tracetest.SpanStub{}, "InstrumentationScope"), + cmp.AllowUnexported(traceSpanInfo{}), + } + if diff := cmp.Diff(wantSpanInfos, actualSpanInfos, opts...); diff != "" { + t.Errorf("Spans mismatch (-want +got):\n%s", diff) + } } // TestMethodAttributeFilter tests the method attribute filter. The method @@ -857,26 +838,10 @@ func (s) TestMetricsAndTracesOptionEnabled(t *testing.T) { wantSpanInfos := []traceSpanInfo{ { - name: "Recv.grpc.testing.TestService.UnaryCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, events: []trace.Event{ { Name: "Inbound message", @@ -917,15 +882,8 @@ func (s) TestMetricsAndTracesOptionEnabled(t *testing.T) { { name: "Attempt.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -975,50 +933,29 @@ func (s) TestMetricsAndTracesOptionEnabled(t *testing.T) { { name: "Sent.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { - name: "Recv.grpc.testing.TestService.FullDuplexCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, - events: nil, + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, + events: nil, }, { name: "Sent.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { name: "Attempt.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -1081,26 +1018,10 @@ func (s) TestSpan(t *testing.T) { wantSpanInfos := []traceSpanInfo{ { - name: "Recv.grpc.testing.TestService.UnaryCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, events: []trace.Event{ { Name: "Inbound message", @@ -1133,15 +1054,8 @@ func (s) TestSpan(t *testing.T) { { name: "Attempt.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -1183,50 +1097,29 @@ func (s) TestSpan(t *testing.T) { { name: "Sent.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { - name: "Recv.grpc.testing.TestService.FullDuplexCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, - events: nil, + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, + events: nil, }, { name: "Sent.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { name: "Attempt.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -1291,26 +1184,10 @@ func (s) TestSpan_WithW3CContextPropagator(t *testing.T) { wantSpanInfos := []traceSpanInfo{ { - name: "Recv.grpc.testing.TestService.UnaryCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, events: []trace.Event{ { Name: "Inbound message", @@ -1343,15 +1220,8 @@ func (s) TestSpan_WithW3CContextPropagator(t *testing.T) { { name: "Attempt.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -1393,50 +1263,29 @@ func (s) TestSpan_WithW3CContextPropagator(t *testing.T) { { name: "Sent.grpc.testing.TestService.UnaryCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { - name: "Recv.grpc.testing.TestService.FullDuplexCall", - spanKind: oteltrace.SpanKindServer.String(), - attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(false), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(false), - }, - { - Key: "previous-rpc-attempts", - Value: attribute.IntValue(0), - }, - { - Key: "transparent-retry", - Value: attribute.BoolValue(false), - }, - }, - events: nil, + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, + events: nil, }, { name: "Sent.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, attributes: nil, events: nil, }, { name: "Attempt.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, attributes: []attribute.KeyValue{ - { - Key: "Client", - Value: attribute.BoolValue(true), - }, - { - Key: "FailFast", - Value: attribute.BoolValue(true), - }, { Key: "previous-rpc-attempts", Value: attribute.IntValue(0), @@ -1550,10 +1399,10 @@ const delayedResolutionEventName = "Delayed name resolution complete" // only once if any of the retry attempt encountered a delay in name resolution func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { tests := []struct { - name string - setupStub func() *stubserver.StubServer - doCall func(context.Context, testgrpc.TestServiceClient) error - spanName string + name string + setupStub func() *stubserver.StubServer + doCall func(context.Context, testgrpc.TestServiceClient) error + wantSpanInfos []traceSpanInfo }{ { name: "unary", @@ -1576,7 +1425,136 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { _, err := client.UnaryCall(ctx, &testpb.SimpleRequest{}) return err }, - spanName: "Sent.grpc.testing.TestService.UnaryCall", + wantSpanInfos: []traceSpanInfo{ + { + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Error, + attributes: nil, + events: []trace.Event{ + { + Name: "Inbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + // RPC attempt #1 + { + name: "Attempt.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Error, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 0), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Delayed LB pick complete", + }, + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Error, + attributes: nil, + events: []trace.Event{ + { + Name: "Inbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + // RPC attempt #2 + { + name: "Attempt.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Error, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 1), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Recv.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, + events: []trace.Event{ + { + Name: "Inbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + // RPC attempt #3 + { + name: "Attempt.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 2), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + { + Name: "Inbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Sent.grpc.testing.TestService.UnaryCall", + spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, + attributes: nil, + events: []trace.Event{ + {Name: delayedResolutionEventName}, + }, + }, + }, }, { name: "streaming", @@ -1620,7 +1598,106 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { } return nil }, - spanName: "Sent.grpc.testing.TestService.FullDuplexCall", + wantSpanInfos: []traceSpanInfo{ + { + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Error, + attributes: nil, + events: nil, + }, + // RPC attempt #1 + { + name: "Attempt.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Error, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 0), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Delayed LB pick complete", + }, + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Error, + attributes: nil, + events: nil, + }, + // RPC attempt #2 + { + name: "Attempt.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Error, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 1), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + attributes: nil, + events: []trace.Event{ + { + Name: "Inbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + // RPC attempt #3 + { + name: "Attempt.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, + attributes: []attribute.KeyValue{ + attribute.Int("previous-rpc-attempts", 2), + attribute.Bool("transparent-retry", false), + }, + events: []trace.Event{ + { + Name: "Outbound message", + Attributes: []attribute.KeyValue{ + attribute.Int("sequence-number", 0), + attribute.Int("message-size", 0), + }, + }, + }, + }, + { + name: "Sent.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, + attributes: nil, + events: []trace.Event{ + {Name: delayedResolutionEventName}, + }, + }, + }, }, } @@ -1676,38 +1753,15 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { if err := tt.doCall(ctx, client); err != nil { t.Fatalf("%s call failed: %v", tt.name, err) } - - wantSpanInfo := traceSpanInfo{ - name: tt.spanName, - spanKind: oteltrace.SpanKindClient.String(), - events: []trace.Event{{Name: delayedResolutionEventName}}, - } - spans, err := waitForTraceSpans(ctx, exporter, []traceSpanInfo{wantSpanInfo}) + spans, err := waitForTraceSpans(ctx, exporter, tt.wantSpanInfos) if err != nil { t.Fatal(err) } - verifyTrace(t, spans, wantSpanInfo) + validateTraces(t, spans, tt.wantSpanInfos) }) } } -func verifyTrace(t *testing.T, spans tracetest.SpanStubs, want traceSpanInfo) { - match := false - for _, span := range spans { - if span.Name == want.name && span.SpanKind.String() == want.spanKind { - match = true - if diff := cmp.Diff(want.events, span.Events, cmpopts.IgnoreFields(trace.Event{}, "Time")); diff != "" { - t.Errorf("Span event mismatch for %q (kind: %s) (-want +got):\n%s", - want.name, want.spanKind, diff) - } - break - } - } - if !match { - t.Errorf("Expected span not found: %q (kind: %s)", want.name, want.spanKind) - } -} - // TestStreamingRPC_TraceSequenceNumbers verifies that sequence numbers // are incremented correctly for multiple messages sent and received // during a streaming RPC. @@ -1756,27 +1810,23 @@ func (s) TestStreamingRPC_TraceSequenceNumbers(t *testing.T) { { name: "Sent.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindClient.String(), + status: otelcodes.Ok, events: nil, attributes: nil, }, { - name: "Recv.grpc.testing.TestService.FullDuplexCall", - spanKind: oteltrace.SpanKindServer.String(), - events: wantInboundEvents, - attributes: []attribute.KeyValue{ - attribute.Bool("Client", false), - attribute.Bool("FailFast", false), - attribute.Int("previous-rpc-attempts", 0), - attribute.Bool("transparent-retry", false), - }, + name: "Recv.grpc.testing.TestService.FullDuplexCall", + spanKind: oteltrace.SpanKindServer.String(), + status: otelcodes.Ok, + events: wantInboundEvents, + attributes: nil, }, { name: "Attempt.grpc.testing.TestService.FullDuplexCall", spanKind: oteltrace.SpanKindInternal.String(), + status: otelcodes.Ok, events: wantOutboundEvents, attributes: []attribute.KeyValue{ - attribute.Bool("Client", true), - attribute.Bool("FailFast", true), attribute.Int("previous-rpc-attempts", 0), attribute.Bool("transparent-retry", false), }, diff --git a/stats/opentelemetry/opentelemetry.go b/stats/opentelemetry/opentelemetry.go index 2a9cb5e57d77..925da96781d5 100644 --- a/stats/opentelemetry/opentelemetry.go +++ b/stats/opentelemetry/opentelemetry.go @@ -179,6 +179,9 @@ type callInfo struct { // nameResolutionEventAdded is set when the resolver delay trace event // is added. Prevents duplicate events, since it is reported per-attempt. nameResolutionEventAdded atomic.Bool + // previousRPCAttempts holds the count of RPC attempts that have happened + // before current attempt. Transparent retries are excluded. + previousRPCAttempts atomic.Uint32 } type callInfoKey struct{} @@ -239,9 +242,8 @@ type attemptInfo struct { // message counters for sent and received messages (used for // generating message IDs), and the number of previous RPC attempts for the // associated call. - countSentMsg uint32 - countRecvMsg uint32 - previousRPCAttempts uint32 + countSentMsg uint32 + countRecvMsg uint32 } type clientMetrics struct { diff --git a/stats/opentelemetry/trace.go b/stats/opentelemetry/trace.go index 40ac7a1b6ef5..3ee66d1e8cc7 100644 --- a/stats/opentelemetry/trace.go +++ b/stats/opentelemetry/trace.go @@ -17,8 +17,6 @@ package opentelemetry import ( - "sync/atomic" - "go.opentelemetry.io/otel/attribute" otelcodes "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/trace" @@ -40,18 +38,6 @@ func populateSpan(rs stats.RPCStats, ai *attemptInfo) { span := ai.traceSpan switch rs := rs.(type) { - case *stats.Begin: - // Note: Go always added Client and FailFast attributes even though they are not - // defined by the OpenCensus gRPC spec. Thus, they are unimportant for - // correctness. - span.SetAttributes( - attribute.Bool("Client", rs.Client), - attribute.Bool("FailFast", rs.FailFast), - attribute.Int64("previous-rpc-attempts", int64(ai.previousRPCAttempts)), - attribute.Bool("transparent-retry", rs.IsTransparentRetryAttempt), - ) - // increment previous rpc attempts applicable for next attempt - atomic.AddUint32(&ai.previousRPCAttempts, 1) case *stats.DelayedPickComplete: span.AddEvent("Delayed LB pick complete") case *stats.InPayload: From b1732c68f393d281651e3cdc6485dbbc6eef7845 Mon Sep 17 00:00:00 2001 From: ulascansenturk Date: Thu, 20 Nov 2025 13:52:56 +0300 Subject: [PATCH 2/4] stats/opentelemetry: fix flaky TestTraceSpan_WithRetriesAndNameResolutionDelay This commit fixes the flaky test TestTraceSpan_WithRetriesAndNameResolutionDelay which was introduced in the previous commit and caused PR #8342 to be reverted. Root Cause: The test had race conditions related to timing: 1. The goroutine that updates resolver state could complete before or after the delayed resolution event was fully processed and recorded in spans 2. Span export timing was not synchronized with test validation, causing the test to sometimes check spans before they were fully exported Fix: 1. Added 'stateUpdated' event to synchronize between the resolver state update completing and span validation beginning 2. Added explicit wait for the stateUpdated event before validating spans 3. Added a 50ms sleep after RPC completion to give the span exporter time to process and export all spans before validation Testing: - Test now passes consistently (10+ consecutive runs) - Passes with race detector enabled (-race flag) - No data races detected Fixes #8700 --- stats/opentelemetry/e2e_test.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/stats/opentelemetry/e2e_test.go b/stats/opentelemetry/e2e_test.go index 9f345517e8c7..e24da6d06e01 100644 --- a/stats/opentelemetry/e2e_test.go +++ b/stats/opentelemetry/e2e_test.go @@ -1704,6 +1704,7 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { resolutionWait := grpcsync.NewEvent() + stateUpdated := grpcsync.NewEvent() prevHook := internal.NewStreamWaitingForResolver internal.NewStreamWaitingForResolver = func() { resolutionWait.Fire() } defer func() { internal.NewStreamWaitingForResolver = prevHook }() @@ -1746,13 +1747,32 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() + // Start the goroutine that will update resolver state once the stream + // is waiting for resolution. Use stateUpdated event to ensure the + // resolver state is updated before we start validating spans. go func() { <-resolutionWait.Done() rb.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: ss.Address}}}) + stateUpdated.Fire() }() + if err := tt.doCall(ctx, client); err != nil { t.Fatalf("%s call failed: %v", tt.name, err) } + + // Wait for the resolver state to be updated to ensure the delayed + // resolution event has been processed. + select { + case <-stateUpdated.Done(): + case <-ctx.Done(): + t.Fatal("Timed out waiting for resolver state update") + } + + // Give the span exporter a small amount of time to process and export + // all spans from the completed RPC. This reduces flakiness by ensuring + // all trace events have been fully recorded before validation. + time.Sleep(50 * time.Millisecond) + spans, err := waitForTraceSpans(ctx, exporter, tt.wantSpanInfos) if err != nil { t.Fatal(err) From 6d30a3efdb7a90511b9f70de7dfc697fbf8d5984 Mon Sep 17 00:00:00 2001 From: ulascansenturk Date: Mon, 24 Nov 2025 17:21:57 +0300 Subject: [PATCH 3/4] address review --- stats/opentelemetry/e2e_test.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/stats/opentelemetry/e2e_test.go b/stats/opentelemetry/e2e_test.go index e24da6d06e01..529f35fa4392 100644 --- a/stats/opentelemetry/e2e_test.go +++ b/stats/opentelemetry/e2e_test.go @@ -1768,11 +1768,6 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { t.Fatal("Timed out waiting for resolver state update") } - // Give the span exporter a small amount of time to process and export - // all spans from the completed RPC. This reduces flakiness by ensuring - // all trace events have been fully recorded before validation. - time.Sleep(50 * time.Millisecond) - spans, err := waitForTraceSpans(ctx, exporter, tt.wantSpanInfos) if err != nil { t.Fatal(err) From 890acb3da246a317b1b9f9ddec580f0288800170 Mon Sep 17 00:00:00 2001 From: ulascansenturk Date: Thu, 27 Nov 2025 10:19:26 +0300 Subject: [PATCH 4/4] stats/opentelemetry: properly wait for span completion in tests The previous fix removed time.Sleep based on review feedback that waitForTraceSpans should be sufficient. However, waitForTraceSpans only checked for span presence (name + kind), not span completion. This led to test flakiness where spans were captured before all their events and status were fully recorded. Fix: Enhanced waitForTraceSpans to also check that spans have ended (non-zero EndTime) before returning, ensuring all span data is complete before validation. Testing: - Passes 10+ consecutive runs without flakiness - Passes with -race flag - All opentelemetry tests pass --- stats/opentelemetry/e2e_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/stats/opentelemetry/e2e_test.go b/stats/opentelemetry/e2e_test.go index 529f35fa4392..94b6def7b6eb 100644 --- a/stats/opentelemetry/e2e_test.go +++ b/stats/opentelemetry/e2e_test.go @@ -164,7 +164,12 @@ func waitForTraceSpans(ctx context.Context, exporter *tracetest.InMemoryExporter missingAnySpan := false for _, wantSpan := range wantSpans { if !slices.ContainsFunc(spans, func(span tracetest.SpanStub) bool { - return span.Name == wantSpan.name && span.SpanKind.String() == wantSpan.spanKind + // Check both name/kind match AND that the span has ended (is complete). + // An ended span has a non-zero EndTime, indicating all events and + // status have been recorded. + return span.Name == wantSpan.name && + span.SpanKind.String() == wantSpan.spanKind && + !span.EndTime.IsZero() }) { missingAnySpan = true }