Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions components/ws-manager/pkg/manager/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,13 @@ type metrics struct {
volumeRestoreTimeHistVec *prometheus.HistogramVec

// Counter
totalStartsCounterVec *prometheus.CounterVec
totalStopsCounterVec *prometheus.CounterVec
totalBackupCounterVec *prometheus.CounterVec
totalBackupFailureCounterVec *prometheus.CounterVec
totalRestoreCounterVec *prometheus.CounterVec
totalRestoreFailureCounterVec *prometheus.CounterVec
totalStartsCounterVec *prometheus.CounterVec
totalStopsCounterVec *prometheus.CounterVec
totalBackupCounterVec *prometheus.CounterVec
totalBackupFailureCounterVec *prometheus.CounterVec
totalRestoreCounterVec *prometheus.CounterVec
totalRestoreFailureCounterVec *prometheus.CounterVec
totalUnintentionalWorkspaceStopCounterVec *prometheus.CounterVec

// Gauge
totalOpenPortGauge prometheus.GaugeFunc
Expand Down Expand Up @@ -135,6 +136,12 @@ func newMetrics(m *Manager) *metrics {
Name: "workspace_restores_failure_total",
Help: "total number of workspace restore failures",
}, []string{"type", "class"}),
totalUnintentionalWorkspaceStopCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: "workspace_unintentional_stop_total",
Help: "total number of workspaces when container stopped without being deleted prior",
}, []string{"type", "class"}),
totalOpenPortGauge: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Expand Down Expand Up @@ -197,6 +204,7 @@ func (m *metrics) Register(reg prometheus.Registerer) error {
m.totalBackupFailureCounterVec,
m.totalRestoreCounterVec,
m.totalRestoreFailureCounterVec,
m.totalUnintentionalWorkspaceStopCounterVec,
m.totalOpenPortGauge,
}
for _, c := range collectors {
Expand Down
22 changes: 11 additions & 11 deletions components/ws-manager/pkg/manager/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
result.Spec.ExposedPorts = extractExposedPorts(pod).Ports

// check failure states, i.e. determine value of result.Failed
failure, phase := extractFailure(wso)
failure, phase := extractFailure(wso, m.metrics)
result.Conditions.Failed = failure
if phase != nil {
result.Phase = *phase
Expand Down Expand Up @@ -350,11 +350,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac
result.Phase = api.WorkspacePhase_STOPPING

_, podFailedBeforeBeingStopped := pod.Annotations[workspaceFailedBeforeStoppingAnnotation]
if !podFailedBeforeBeingStopped {
// While the pod is being deleted we do not care or want to know about any failure state.
// If the pod got stopped because it failed we will have sent out a Stopping status with a "failure"
result.Conditions.Failed = ""
} else {
if podFailedBeforeBeingStopped {
if _, ok := pod.Annotations[workspaceNeverReadyAnnotation]; ok {
// The workspace is never ready, so there is no need for a stopping phase.
result.Phase = api.WorkspacePhase_STOPPED
Expand Down Expand Up @@ -529,7 +525,7 @@ func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspac

// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
func extractFailure(wso workspaceObjects) (string, *api.WorkspacePhase) {
func extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
pod := wso.Pod

// if the workspace was explicitly marked as failed that also constitutes a failure reason
Expand Down Expand Up @@ -591,11 +587,15 @@ func extractFailure(wso workspaceObjects) (string, *api.WorkspacePhase) {
return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), &phase
}
} else if terminationState.Reason == "Completed" {
if wso.IsWorkspaceHeadless() {
// default way for headless workspaces to be done
return "", nil
// container terminated successfully - this is not a failure
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original intent with this branch was to cover the case that a regular workspace stops without being deleted. An unintentional stop, so to speak. That said, the failure mode is actually nicer if your workspace just stops compared to getting an unactionable error message ("containers of a workspace pod are not supposed to do that" 🤷).

The failed condition on workspace status indeed isn't stable (but should be). ws-manager-bridge should protect us against that by not resetting that condition if it ever was set. Because we have nothing like an at-least-once delivery semantic, we might just miss the "failed" message though.

if !isPodBeingDeleted(pod) {
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
wsClass := pod.Labels[workspaceClassLabel]
if metrics != nil && !wso.IsWorkspaceHeadless() {
metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
}
}
return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that. Reason: %s", cs.Name, terminationState.Message), nil
return "", nil
} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
// if a container is terminated and it wasn't because of either:
// - regular shutdown
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,3 @@
{
"actions": [
{
"Func": "markWorkspace",
"Params": {
"annotations": [
{
"Name": "gitpod/failedBeforeStopping",
"Value": "true",
"Delete": false
}
],
"workspaceID": "foobar"
}
},
{
"Func": "stopWorkspace",
"Params": {
"gracePeriod": 30000000000,
"workspaceID": "foobar"
}
}
]
}
"actions": null
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
},
"phase": 6,
"conditions": {
"failed": "last backup failed: testing the backup failure mode.",
"failed": "ungraceful shutdown - teardown was unsuccessful: socket did not appear before context was canceled; last backup failed: testing the backup failure mode.",
"final_backup_complete": 1,
"volume_snapshot": {}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
},
"phase": 6,
"conditions": {
"failed": "ungraceful shutdown - teardown was unsuccessful: socket did not appear before context was canceled",
"final_backup_complete": 1,
"volume_snapshot": {}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
},
"phase": 3,
"conditions": {
"failed": "container sync completed; containers of a workspace pod are not supposed to do that. Reason: ",
"volume_snapshot": {}
},
"message": "workspace initializer is running",
Expand Down