diff --git a/components/ws-manager/pkg/manager/metrics.go b/components/ws-manager/pkg/manager/metrics.go
index 6a0602b7506a74..3dbc4d6c42c9d3 100644
--- a/components/ws-manager/pkg/manager/metrics.go
+++ b/components/ws-manager/pkg/manager/metrics.go
@@ -45,12 +45,13 @@ type metrics struct {
 	volumeRestoreTimeHistVec  *prometheus.HistogramVec
 
 	// Counter
-	totalStartsCounterVec         *prometheus.CounterVec
-	totalStopsCounterVec          *prometheus.CounterVec
-	totalBackupCounterVec         *prometheus.CounterVec
-	totalBackupFailureCounterVec  *prometheus.CounterVec
-	totalRestoreCounterVec        *prometheus.CounterVec
-	totalRestoreFailureCounterVec *prometheus.CounterVec
+	totalStartsCounterVec                     *prometheus.CounterVec
+	totalStopsCounterVec                      *prometheus.CounterVec
+	totalBackupCounterVec                     *prometheus.CounterVec
+	totalBackupFailureCounterVec              *prometheus.CounterVec
+	totalRestoreCounterVec                    *prometheus.CounterVec
+	totalRestoreFailureCounterVec             *prometheus.CounterVec
+	totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec
 
 	// Gauge
 	totalOpenPortGauge prometheus.GaugeFunc
@@ -135,6 +136,12 @@ func newMetrics(m *Manager) *metrics {
 			Name:      "workspace_restores_failure_total",
 			Help:      "total number of workspace restore failures",
 		}, []string{"type", "class"}),
+		totalUnintentionalWorkspaceStopCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: metricsNamespace,
+			Subsystem: metricsWorkspaceSubsystem,
+			Name:      "workspace_unintentional_stop_total",
+			Help:      "total number of workspaces when container stopped without being deleted prior",
+		}, []string{"type", "class"}),
 		totalOpenPortGauge: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsWorkspaceSubsystem,
@@ -197,6 +204,7 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
 		m.totalBackupFailureCounterVec,
 		m.totalRestoreCounterVec,
 		m.totalRestoreFailureCounterVec,
+		m.totalUnintentionalWorkspaceStopCounterVec,
 		m.totalOpenPortGauge,
 	}
 	for _, c := range collectors {
diff --git a/components/ws-manager/pkg/manager/status.go b/components/ws-manager/pkg/manager/status.go
index ad4ca44a4d4031..e84437d45db945 100644
--- a/components/ws-manager/pkg/manager/status.go
+++ b/components/ws-manager/pkg/manager/status.go
@@ -315,7 +315,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
 	result.Spec.ExposedPorts = extractExposedPorts(pod).Ports
 
 	// check failure states, i.e. determine value of result.Failed
-	failure, phase := extractFailure(wso)
+	failure, phase := extractFailure(wso, m.metrics)
 	result.Conditions.Failed = failure
 	if phase != nil {
 		result.Phase = *phase
@@ -350,11 +350,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
 		result.Phase = api.WorkspacePhase_STOPPING
 
 		_, podFailedBeforeBeingStopped := pod.Annotations[workspaceFailedBeforeStoppingAnnotation]
-		if !podFailedBeforeBeingStopped {
-			// While the pod is being deleted we do not care or want to know about any failure state.
-			// If the pod got stopped because it failed we will have sent out a Stopping status with a "failure"
-			result.Conditions.Failed = ""
-		} else {
+		if podFailedBeforeBeingStopped {
 			if _, ok := pod.Annotations[workspaceNeverReadyAnnotation]; ok {
 				// The workspace is never ready, so there is no need for a stopping phase.
 				result.Phase = api.WorkspacePhase_STOPPED
@@ -529,7 +525,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
 
 // extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
 // one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
-func extractFailure(wso workspaceObjects) (string, *api.WorkspacePhase) {
+func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
 	pod := wso.Pod
 
 	// if the workspace was explicitely marked as failed that also constitutes a failure reason
@@ -591,11 +587,15 @@ func extractFailure(wso workspaceObjects) (string, *api.WorkspacePhase) {
 				return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), &phase
 			}
 		} else if terminationState.Reason == "Completed" {
-			if wso.IsWorkspaceHeadless() {
-				// default way for headless workspaces to be done
-				return "", nil
+			// container terminated successfully - this is not a failure
+			if !isPodBeingDeleted(pod) {
+				wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
+				wsClass := pod.Labels[workspaceClassLabel]
+				if metrics != nil && !wso.IsWorkspaceHeadless() {
+					metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
+				}
 			}
-			return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that. Reason: %s", cs.Name, terminationState.Message), nil
+			return "", nil
 		} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
 			// if a container is terminated and it wasn't because of either:
 			//   - regular shutdown
diff --git a/components/ws-manager/pkg/manager/testdata/actOnPodEvent_wsstartup_Creating00.golden b/components/ws-manager/pkg/manager/testdata/actOnPodEvent_wsstartup_Creating00.golden
index 9813a541cbda58..5981ded50b76f0 100644
--- a/components/ws-manager/pkg/manager/testdata/actOnPodEvent_wsstartup_Creating00.golden
+++ b/components/ws-manager/pkg/manager/testdata/actOnPodEvent_wsstartup_Creating00.golden
@@ -1,24 +1,3 @@
 {
-    "actions": [
-        {
-            "Func": "markWorkspace",
-            "Params": {
-                "annotations": [
-                    {
-                        "Name": "gitpod/failedBeforeStopping",
-                        "Value": "true",
-                        "Delete": false
-                    }
-                ],
-                "workspaceID": "foobar"
-            }
-        },
-        {
-            "Func": "stopWorkspace",
-            "Params": {
-                "gracePeriod": 30000000000,
-                "workspaceID": "foobar"
-            }
-        }
-    ]
-}
\ No newline at end of file
+    "actions": null
+}
diff --git a/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPED01.golden b/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPED01.golden
index aad17de163c639..c1a5fabfd50be3 100644
--- a/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPED01.golden
+++ b/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPED01.golden
@@ -20,7 +20,7 @@
     },
     "phase": 6,
     "conditions": {
-        "failed": "last backup failed: testing the backup failure mode.",
+        "failed": "ungraceful shutdown - teardown was unsuccessful: socket did not appear before context was canceled; last backup failed: testing the backup failure mode.",
         "final_backup_complete": 1,
         "volume_snapshot": {}
     },
diff --git a/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPING02.golden b/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPING02.golden
index d4c83c4131df60..c587b035d330ed 100644
--- a/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPING02.golden
+++ b/components/ws-manager/pkg/manager/testdata/status_disposal_STOPPING02.golden
@@ -20,6 +20,7 @@
     },
     "phase": 6,
     "conditions": {
+        "failed": "ungraceful shutdown - teardown was unsuccessful: socket did not appear before context was canceled",
         "final_backup_complete": 1,
         "volume_snapshot": {}
     },
diff --git a/components/ws-manager/pkg/manager/testdata/status_wsstartup_Creating00.golden b/components/ws-manager/pkg/manager/testdata/status_wsstartup_Creating00.golden
index 7f63416be2bf8a..88710f6bc9fc45 100644
--- a/components/ws-manager/pkg/manager/testdata/status_wsstartup_Creating00.golden
+++ b/components/ws-manager/pkg/manager/testdata/status_wsstartup_Creating00.golden
@@ -16,7 +16,6 @@
     },
     "phase": 3,
    "conditions": {
-        "failed": "container sync completed; containers of a workspace pod are not supposed to do that. Reason: ",
         "volume_snapshot": {}
     },
     "message": "workspace initializer is running",