Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion balancer/endpointsharding/endpointsharding.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@
}()
children := es.children.Load()
for _, child := range children.Values() {
child.(balancer.Balancer).ResolverError(err)
child.(*balancerWrapper).resolverErrorLocked(err)
}
}

Expand Down Expand Up @@ -355,3 +355,10 @@
bw.child.Close()
bw.isClosed = true
}

// resolverErrorLocked forwards err to the wrapped child balancer's
// ResolverError method. If this wrapper has already been marked closed
// (bw.isClosed), the error is dropped and the child is not called.
//
// NOTE(review): the "Locked" suffix presumably means the caller must already
// hold the lock guarding bw.isClosed — confirm against the call sites.
func (bw *balancerWrapper) resolverErrorLocked(err error) {
if bw.isClosed {
return
}

Check warning on line 362 in balancer/endpointsharding/endpointsharding.go

View check run for this annotation

Codecov / codecov/patch

balancer/endpointsharding/endpointsharding.go#L361-L362

Added lines #L361 - L362 were not covered by tests
bw.child.ResolverError(err)
}
48 changes: 45 additions & 3 deletions balancer/endpointsharding/endpointsharding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,27 @@ package endpointsharding_test
import (
"context"
"encoding/json"
"errors"
"fmt"
"log"
"strings"
"testing"
"time"

"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
"google.golang.org/grpc/balancer"
"google.golang.org/grpc/balancer/endpointsharding"
"google.golang.org/grpc/balancer/pickfirst/pickfirstleaf"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/grpclog"
"google.golang.org/grpc/internal"
"google.golang.org/grpc/internal/balancer/stub"
"google.golang.org/grpc/internal/grpctest"
"google.golang.org/grpc/internal/stubserver"
"google.golang.org/grpc/internal/testutils"
"google.golang.org/grpc/internal/testutils/roundrobin"
"google.golang.org/grpc/peer"
"google.golang.org/grpc/resolver"
Expand Down Expand Up @@ -125,7 +129,9 @@ func (fp *fakePetiole) UpdateState(state balancer.State) {
// special picker, so it should fallback to the default behavior, which is to
// round_robin amongst the endpoint children that are in the aggregated state.
// It also verifies the petiole has access to the raw child state in case it
// wants to implement a custom picker.
// wants to implement a custom picker. The test sends a resolver error to the
// endpointsharding balancer and verifies an error picker from the children
// is used while making an RPC.
func (s) TestEndpointShardingBasic(t *testing.T) {
backend1 := stubserver.StartTestService(t, nil)
defer backend1.Stop()
Expand All @@ -135,7 +141,7 @@ func (s) TestEndpointShardingBasic(t *testing.T) {
mr := manual.NewBuilderWithScheme("e2e-test")
defer mr.Close()

json := `{"loadBalancingConfig": [{"fake_petiole":{}}]}`
json := fmt.Sprintf(`{"loadBalancingConfig": [{"%s":{}}]}`, fakePetioleName)
sc := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(json)
mr.InitialState(resolver.State{
Endpoints: []resolver.Endpoint{
Expand All @@ -145,7 +151,20 @@ func (s) TestEndpointShardingBasic(t *testing.T) {
ServiceConfig: sc,
})

cc, err := grpc.NewClient(mr.Scheme()+":///", grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials()))
dOpts := []grpc.DialOption{
grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials()),
// Use a large backoff dealy to avoid the error picker being updated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/dealy/delay

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed the typo.

// too quickly.
grpc.WithConnectParams(grpc.ConnectParams{
Backoff: backoff.Config{
BaseDelay: 100 * time.Second,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit/Optional: This could be 2*defaultTestTimeout instead of an arbitrarily large value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed.

Multiplier: float64(0),
Jitter: float64(0),
MaxDelay: 100 * time.Second,
},
}),
}
cc, err := grpc.NewClient(mr.Scheme()+":///", dOpts...)
if err != nil {
log.Fatalf("Failed to create new client: %v", err)
}
Expand All @@ -159,6 +178,29 @@ func (s) TestEndpointShardingBasic(t *testing.T) {
if err = roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{{Addr: backend1.Address}, {Addr: backend2.Address}}); err != nil {
t.Fatalf("error in expected round robin: %v", err)
}

// Stopping both the backends should make the channel enter
// TransientFailure.
backend1.Stop()
backend2.Stop()
testutils.AwaitState(ctx, t, cc, connectivity.TransientFailure)

// When the resolver reports an error, the picker should get updated to
// return the resolver error.
mr.ReportError(errors.New("test error"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This tests that the operation was a no-op, which isn't really a good test of the functionality: the child might never even receive the call and the test would still pass. I think it would be better to have a test that calls ResolverError before it's working and ensures that it results in pick errors / TF state.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The children of endpointsharding wouldn't get created if the resolver doesn't produce a good list of endpoints first.
I can use a stub child balancer which modifies pickfirst's behaviour of ignoring resolver errors when it has a working state.

Copy link
Member

@dfawley dfawley Feb 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Or you should be able to make it start failing (disconnect subchannel) and then produce a ResolverError. The errors from pick should change IIUC

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I stopped the backends, reported a resolver error and verified the RPC error message.

testutils.AwaitState(ctx, t, cc, connectivity.TransientFailure)
for ; ctx.Err() == nil; <-time.After(time.Millisecond) {
_, err := client.EmptyCall(ctx, &testpb.Empty{})
if err == nil {
t.Fatalf("EmptyCall returned unexpected error: <nil>, want %q", "test error")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: instead of saying unexpected error nil, it could say succeeded when expected to fail with "test error"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed the error message.

}
if strings.Contains(err.Error(), "test error") {
break
}
}
if ctx.Err() != nil {
t.Fatalf("Context timed out waiting for picker with resolver error.")
}
}

// Tests that endpointsharding doesn't automatically re-connect IDLE children.
Expand Down