From 918949e0f83a97b1d70089f054745f1956dc52dd Mon Sep 17 00:00:00 2001 From: Manuel Alejandro de Brito Fontes Date: Thu, 21 Jul 2022 18:56:26 -0400 Subject: [PATCH] Refactor Manager StartWorkspace --- .../typescript/src/promisified-client.ts | 15 ++++++++------- components/ws-manager/pkg/manager/manager.go | 16 +++++++++++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/components/ws-manager-api/typescript/src/promisified-client.ts b/components/ws-manager-api/typescript/src/promisified-client.ts index 2eef04b6bd82ed..27d48d5d7acaf3 100644 --- a/components/ws-manager-api/typescript/src/promisified-client.ts +++ b/components/ws-manager-api/typescript/src/promisified-client.ts @@ -117,16 +117,17 @@ export class PromisifiedWorkspaceManagerClient implements Disposable { this.client.startWorkspace( request, withTracing({ span }), - this.getDefaultUnaryOptions(), + { + // Important!!!!: client timeout must be higher than ws-manager to be able to process any error + // https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager/pkg/manager/manager.go#L171 + deadline: new Date(new Date().getTime() + 60000*11), + interceptors: this.interceptor, + }, (err, resp) => { span.finish(); if (err) { - if (attempt < 3 && err.message.indexOf("already exists") !== -1) { - // lets wait a bit more - } else { - TraceContext.setError(ctx, err); - reject(err); - } + TraceContext.setError(ctx, err); + reject(err); } else { resolve(resp); } diff --git a/components/ws-manager/pkg/manager/manager.go b/components/ws-manager/pkg/manager/manager.go index 545976fb2c5857..206958460c4dbb 100644 --- a/components/ws-manager/pkg/manager/manager.go +++ b/components/ws-manager/pkg/manager/manager.go @@ -164,7 +164,13 @@ func (m *Manager) Close() { } // StartWorkspace creates a new running workspace within the manager's cluster -func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) { +func (m *Manager) 
StartWorkspace(_ context.Context, req *api.StartWorkspaceRequest) (res *api.StartWorkspaceResponse, err error) { + // We cannot use the passed context because we need to decouple the timeouts + // Create a context with a high timeout value to be able to wait for scale-up events in the cluster (slow operation) + // Important!!!: this timeout must be lower than https://github.com/gitpod-io/gitpod/blob/main/components/ws-manager-api/typescript/src/promisified-client.ts#L122 + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + owi := log.LogContext(req.Metadata.Owner, req.Metadata.MetaId, req.Id, req.Metadata.GetProject(), req.Metadata.GetTeam()) clog := log.WithFields(owi) span, ctx := tracing.FromContext(ctx, "StartWorkspace") @@ -314,8 +320,12 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq return nil, err } - err = wait.PollWithContext(ctx, 100*time.Millisecond, 10*time.Minute, podRunning(m.Clientset, pod.Name, pod.Namespace)) + // if we reach this point the pod has been created + // TODO: if the context is canceled or times out from here on, should we delete the pod? 
+ + err = wait.PollImmediateWithContext(ctx, 100*time.Millisecond, 7*time.Minute, podRunning(m.Clientset, pod.Name, pod.Namespace)) if err != nil { + clog.WithError(err).WithField("req", req).WithField("pod", pod.Name).Warn("was unable to start workspace") return nil, xerrors.Errorf("workspace pod never reached Running state: %w", err) } @@ -326,7 +336,7 @@ func (m *Manager) StartWorkspace(ctx context.Context, req *api.StartWorkspaceReq return nil, xerrors.Errorf("unable to get workspace pod %s: %w", pod.Name, err) } - err = wait.PollWithContext(ctx, 100*time.Millisecond, 5*time.Minute, pvcRunning(m.Clientset, pvc.Name, pvc.Namespace)) + err = wait.PollImmediateWithContext(ctx, 100*time.Millisecond, 5*time.Minute, pvcRunning(m.Clientset, pvc.Name, pvc.Namespace)) if err != nil { if startContext.VolumeSnapshot != nil && startContext.VolumeSnapshot.VolumeSnapshotName != "" { m.eventRecorder.Eventf(pod, corev1.EventTypeWarning, "PersistentVolumeClaim", "PVC %q restore from volume snapshot %q failed %v", pvc.Name, startContext.VolumeSnapshot.VolumeSnapshotName, err)