feat: [shard-distributor] Send "draining" heartbeat on executor shutdown #7498
```diff
@@ -5,9 +5,11 @@ import (
	"time"

	"github.com/golang/mock/gomock"
	"github.com/stretchr/testify/assert"
	"github.com/uber-go/tally"
	"go.uber.org/fx"
	"go.uber.org/fx/fxtest"
	ubergomock "go.uber.org/mock/gomock"
```
> **Member:** I don't think we need both `github.com/golang/mock/gomock` and `go.uber.org/mock/gomock`.
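If every mock constructed in this test accepts the controller from the maintained fork (an assumption — the yarpc `transporttest` mocks may still be generated against `github.com/golang/mock`), the file could settle on a single gomock import, roughly:

```go
import (
	"testing"

	"go.uber.org/mock/gomock" // the only gomock import once all mocks accept this controller
)

// newTestController is a hypothetical helper: one controller from the single
// import drives every mock in the file.
func newTestController(t *testing.T) *gomock.Controller {
	return gomock.NewController(t)
}
```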
```diff
	"go.uber.org/yarpc"
	"go.uber.org/yarpc/api/transport/transporttest"
	"go.uber.org/yarpc/transport/grpc"
```
```diff
@@ -16,7 +18,10 @@ import (
	"github.com/uber/cadence/common/clock"
	"github.com/uber/cadence/common/log"
	"github.com/uber/cadence/service/sharddistributor/canary/processor"
	"github.com/uber/cadence/service/sharddistributor/canary/processorephemeral"
	"github.com/uber/cadence/service/sharddistributor/client/clientcommon"
	"github.com/uber/cadence/service/sharddistributor/client/executorclient"
)

func TestModule(t *testing.T) {
```
```diff
@@ -50,9 +55,69 @@ func TestModule(t *testing.T) {
			fx.Annotate(clock.NewMockedTimeSource(), fx.As(new(clock.TimeSource))),
			fx.Annotate(log.NewNoop(), fx.As(new(log.Logger))),
			fx.Annotate(mockClientConfigProvider, fx.As(new(yarpc.ClientConfig))),
			yarpc.Config{Name: "shard-distributor-canary-test"},
			zaptest.NewLogger(t),
			config,
		),
		fx.Provide(yarpc.NewDispatcher),
		Module(NamespacesNames{FixedNamespace: "shard-distributor-canary", EphemeralNamespace: "shard-distributor-canary-ephemeral", ExternalAssignmentNamespace: "test-external-assignment", SharddistributorServiceName: "cadence-shard-distributor"}),
	).RequireStart().RequireStop()
}

type mockLifecycle struct {
	hookCount int
}

func (m *mockLifecycle) Append(hook fx.Hook) {
	m.hookCount++
}

func TestRegisterExecutorLifecycle(t *testing.T) {
	ctrl := ubergomock.NewController(t)
	defer ctrl.Finish()

	tests := []struct {
		name              string
		params            lifecycleParams
		expectedHookCount int
	}{
		{
			name: "multiple executors",
			params: lifecycleParams{
				Lifecycle: &mockLifecycle{},
				Dispatcher: yarpc.NewDispatcher(yarpc.Config{
					Name: "test-dispatcher",
				}),
				FixedExecutors: []executorclient.Executor[*processor.ShardProcessor]{
					executorclient.NewMockExecutor[*processor.ShardProcessor](ctrl),
					executorclient.NewMockExecutor[*processor.ShardProcessor](ctrl),
				},
				EphemeralExecutors: []executorclient.Executor[*processorephemeral.ShardProcessor]{
					executorclient.NewMockExecutor[*processorephemeral.ShardProcessor](ctrl),
				},
			},
			expectedHookCount: 1,
		},
		{
			name: "no executors",
			params: lifecycleParams{
				Lifecycle: &mockLifecycle{},
				Dispatcher: yarpc.NewDispatcher(yarpc.Config{
					Name: "test-dispatcher",
				}),
				FixedExecutors:     []executorclient.Executor[*processor.ShardProcessor]{},
				EphemeralExecutors: []executorclient.Executor[*processorephemeral.ShardProcessor]{},
			},
			expectedHookCount: 1,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockLifecycle := tt.params.Lifecycle.(*mockLifecycle)

			registerExecutorLifecycle(tt.params)
			assert.Equal(t, tt.expectedHookCount, mockLifecycle.hookCount)
```
> **Member** (on lines +117 to +120): I would recommend also checking the `.Start` and `.Stop` calls of the executors.
```diff
		})
	}
}
```
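A sketch of what the review comment above could look like in practice, reusing this file's imports plus `context`. It assumes the registered hook calls `Start(ctx)` on each executor during `OnStart` and `Stop()` during `OnStop`, and that the generated mock exposes matching `EXPECT()` methods — none of that is confirmed by the diff above.

```go
// hookCapturingLifecycle is a hypothetical test double that records the fx
// hooks so the test can invoke them and exercise the Start/Stop expectations.
type hookCapturingLifecycle struct {
	hooks []fx.Hook
}

func (l *hookCapturingLifecycle) Append(hook fx.Hook) { l.hooks = append(l.hooks, hook) }

func TestRegisterExecutorLifecycle_StartStop(t *testing.T) {
	ctrl := ubergomock.NewController(t)
	executor := executorclient.NewMockExecutor[*processor.ShardProcessor](ctrl)
	executor.EXPECT().Start(ubergomock.Any()) // assumed signature: Start(ctx)
	executor.EXPECT().Stop()                  // assumed signature: Stop()

	lc := &hookCapturingLifecycle{}
	registerExecutorLifecycle(lifecycleParams{
		Lifecycle:      lc,
		Dispatcher:     yarpc.NewDispatcher(yarpc.Config{Name: "test-dispatcher"}),
		FixedExecutors: []executorclient.Executor[*processor.ShardProcessor]{executor},
	})

	// Drive the captured hooks so the Start/Stop expectations are actually checked.
	for _, hook := range lc.hooks {
		assert.NoError(t, hook.OnStart(context.Background()))
		assert.NoError(t, hook.OnStop(context.Background()))
	}
}
```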
---
```diff
@@ -34,7 +34,8 @@ const (
 )

 const (
-	heartbeatJitterCoeff = 0.1 // 10% jitter
+	heartbeatJitterCoeff     = 0.1 // 10% jitter
+	drainingHeartbeatTimeout = 5 * time.Second
 )

 type managedProcessor[SP ShardProcessor] struct {
```
```diff
@@ -101,6 +102,7 @@ type executorImpl[SP ShardProcessor] struct {
 	metrics                tally.Scope
 	migrationMode          atomic.Int32
 	metadata               syncExecutorMetadata
+	hasSuccessfulHeartbeat atomic.Bool
 }

 func (e *executorImpl[SP]) setMigrationMode(mode types.MigrationMode) {
```
```diff
@@ -124,6 +126,17 @@ func (e *executorImpl[SP]) Stop() {
 	e.logger.Info("stopping shard distributor executor", tag.ShardNamespace(e.namespace))
 	close(e.stopC)
 	e.processLoopWG.Wait()
+
+	if !e.shouldSendFinalHeartbeat() {
+		return
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), e.finalHeartbeatTimeout())
+	defer cancel()
+
+	if err := e.sendDrainingHeartbeat(ctx); err != nil {
+		e.logger.Error("failed to send draining heartbeat", tag.Error(err))
+	}
 }

 func (e *executorImpl[SP]) GetShardProcess(ctx context.Context, shardID string) (SP, error) {
```
```diff
@@ -269,6 +282,10 @@ func (e *executorImpl[SP]) updateShardAssignmentMetered(ctx context.Context, sha
 }

 func (e *executorImpl[SP]) heartbeat(ctx context.Context) (shardAssignments map[string]*types.ShardAssignment, migrationMode types.MigrationMode, err error) {
+	return e.sendHeartbeat(ctx, types.ExecutorStatusACTIVE)
+}
+
+func (e *executorImpl[SP]) sendHeartbeat(ctx context.Context, status types.ExecutorStatus) (map[string]*types.ShardAssignment, types.MigrationMode, error) {
 	// Fill in the shard status reports
 	shardStatusReports := make(map[string]*types.ShardStatusReport)
 	e.managedProcessors.Range(func(shardID string, managedProcessor *managedProcessor[SP]) bool {
```
```diff
@@ -289,7 +306,7 @@ func (e *executorImpl[SP]) heartbeat(ctx context.Context) (shardAssignments map[
 	request := &types.ExecutorHeartbeatRequest{
 		Namespace:          e.namespace,
 		ExecutorID:         e.executorID,
-		Status:             types.ExecutorStatusACTIVE,
+		Status:             status,
 		ShardStatusReports: shardStatusReports,
 		Metadata:           e.metadata.Get(),
 	}
```
```diff
@@ -299,6 +316,7 @@ func (e *executorImpl[SP]) heartbeat(ctx context.Context) (shardAssignments map[
 	if err != nil {
 		return nil, types.MigrationModeINVALID, fmt.Errorf("send heartbeat: %w", err)
 	}
+	e.hasSuccessfulHeartbeat.Store(true)

 	previousMode := e.getMigrationMode()
 	currentMode := response.MigrationMode
```
```diff
@@ -314,6 +332,22 @@ func (e *executorImpl[SP]) heartbeat(ctx context.Context) (shardAssignments map[
 	return response.ShardAssignments, response.MigrationMode, nil
 }

+func (e *executorImpl[SP]) sendDrainingHeartbeat(ctx context.Context) error {
+	_, _, err := e.sendHeartbeat(ctx, types.ExecutorStatusDRAINING)
+	return err
+}
+
+func (e *executorImpl[SP]) shouldSendFinalHeartbeat() bool {
+	return e.shardDistributorClient != nil && e.hasSuccessfulHeartbeat.Load()
+}
+
+func (e *executorImpl[SP]) finalHeartbeatTimeout() time.Duration {
+	if e.heartBeatInterval > 0 && e.heartBeatInterval < drainingHeartbeatTimeout {
+		return e.heartBeatInterval
```
> **Member** (on lines +345 to +346): By the way, is it possible that `e.heartBeatInterval == 0`? Would that mean no heartbeat at all?
```diff
+	}
+	return drainingHeartbeatTimeout
+}
+
 func (e *executorImpl[SP]) updateShardAssignment(ctx context.Context, shardAssignments map[string]*types.ShardAssignment) {
 	wg := sync.WaitGroup{}
```
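To make the zero-interval behaviour the reviewer asks about explicit: with the code as written, `heartBeatInterval == 0` fails the `> 0` check, so the final draining heartbeat still gets the 5-second default timeout (whether a zero interval also disables the periodic heartbeat loop is a separate question). A self-contained sketch of the same selection rule — the names below are local to the example, not the package's:

```go
package main

import (
	"fmt"
	"time"
)

const drainingHeartbeatTimeout = 5 * time.Second

// finalHeartbeatTimeout mirrors the rule from the diff: use the heartbeat
// interval when it is set and shorter than the default, otherwise fall back
// to the 5s default. A zero interval therefore yields 5s, not a zero timeout.
func finalHeartbeatTimeout(heartBeatInterval time.Duration) time.Duration {
	if heartBeatInterval > 0 && heartBeatInterval < drainingHeartbeatTimeout {
		return heartBeatInterval
	}
	return drainingHeartbeatTimeout
}

func main() {
	for _, interval := range []time.Duration{0, time.Second, time.Minute} {
		fmt.Printf("interval=%v -> timeout=%v\n", interval, finalHeartbeatTimeout(interval))
	}
}
```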
> It is quite unexpected that the canary needs to explicitly `.Start` and `.Stop` the executors and care about the order. Does that mean other clients (executors) should do that as well?
> cc @jakobht
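On the lifecycle question above: a sketch of how such a hook is commonly wired with fx, so that the canary registers one hook and the start/stop ordering lives in a single place. This is not the implementation from this PR; it assumes the package is named `canary` and that `Executor` exposes `Start(ctx)` with no return value and `Stop()`.

```go
package canary // assumed package name

import (
	"context"

	"go.uber.org/fx"
	"go.uber.org/yarpc"

	"github.com/uber/cadence/service/sharddistributor/canary/processor"
	"github.com/uber/cadence/service/sharddistributor/client/executorclient"
)

// registerLifecycleSketch is illustrative only; it is not registerExecutorLifecycle.
func registerLifecycleSketch(
	lc fx.Lifecycle,
	dispatcher *yarpc.Dispatcher,
	executors []executorclient.Executor[*processor.ShardProcessor],
) {
	lc.Append(fx.Hook{
		OnStart: func(ctx context.Context) error {
			// Bring the dispatcher up first so executors can heartbeat immediately.
			if err := dispatcher.Start(); err != nil {
				return err
			}
			for _, e := range executors {
				e.Start(ctx) // assumed signature: Start(ctx) with no error
			}
			return nil
		},
		OnStop: func(ctx context.Context) error {
			// Stop executors before the dispatcher: with this PR, Stop() sends a
			// final DRAINING heartbeat that still needs a working outbound.
			for _, e := range executors {
				e.Stop()
			}
			return dispatcher.Stop()
		},
	})
}
```

The ordering is the interesting part: stopping the executors while the dispatcher is still up is what allows the draining heartbeat added in this PR to reach the shard distributor.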