From 04f124b47141f71c35f27dd940c3c038506c501b Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Thu, 3 Mar 2022 13:08:25 -0600 Subject: [PATCH 1/5] Change "status.instances.replicas" to include all pods We added this field while implementing rolling updates and chose to populate it with semantics similar to the appsv1.Deployment controller which ignores terminating Pods. However, our controller does *not* ignore terminating PostgreSQL Pods because they are still writing to disk and still participating in HA quorum. So, this field now reflects the number of observed Pods, regardless of their state. This is the same approach taken by the appsv1.StatefulSet controller. When this field is zero, we can be relatively certain that all PostgreSQL processes have stopped and disks are idle. Issue: [sc-13696] See: https://issue.k8s.io/107920 See: e0a885ccb842a735fa007604576a2c31a80b2f7a --- ...res-operator.crunchydata.com_postgresclusters.yaml | 5 ++--- docs/content/references/crd.md | 4 ++-- internal/controller/postgrescluster/instance.go | 11 +++++------ .../v1beta1/postgrescluster_types.go | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml b/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml index 33462b46b..1c725de9f 100644 --- a/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml +++ b/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml @@ -9151,12 +9151,11 @@ spec: format: int32 type: integer replicas: - description: Total number of non-terminated pods. + description: Total number of pods. format: int32 type: integer updatedReplicas: - description: Total number of non-terminated pods that have the - desired specification. + description: Total number of pods that have the desired specification. format: int32 type: integer required: diff --git a/docs/content/references/crd.md b/docs/content/references/crd.md index 3ec75ad58..78d004cfd 100644 --- a/docs/content/references/crd.md +++ b/docs/content/references/crd.md @@ -13658,12 +13658,12 @@ Condition contains details for one aspect of the current state of this API Resou replicas integer - Total number of non-terminated pods. + Total number of pods. false updatedReplicas integer - Total number of non-terminated pods that have the desired specification. + Total number of pods that have the desired specification. false diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 86033b568..01bb75eef 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -353,16 +353,15 @@ func (r *Reconciler) observeInstances( cluster.Status.InstanceSets = cluster.Status.InstanceSets[:0] for _, name := range observed.setNames.List() { status := v1beta1.PostgresInstanceSetStatus{Name: name} + for _, instance := range observed.bySet[name] { + status.Replicas += int32(len(instance.Pods)) + if ready, known := instance.IsReady(); known && ready { status.ReadyReplicas++ } - if terminating, known := instance.IsTerminating(); known && !terminating { - status.Replicas++ - - if matches, known := instance.PodMatchesPodTemplate(); known && matches { - status.UpdatedReplicas++ - } + if matches, known := instance.PodMatchesPodTemplate(); known && matches { + status.UpdatedReplicas++ } } diff --git a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/postgrescluster_types.go b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/postgrescluster_types.go index e713e9ae2..f86e6494b 100644 --- a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/postgrescluster_types.go +++ b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/postgrescluster_types.go @@ -493,11 +493,11 @@ type PostgresInstanceSetStatus struct { // +optional ReadyReplicas int32 `json:"readyReplicas,omitempty"` - // Total number of non-terminated pods. + // Total number of pods. // +optional Replicas int32 `json:"replicas,omitempty"` - // Total number of non-terminated pods that have the desired specification. + // Total number of pods that have the desired specification. // +optional UpdatedReplicas int32 `json:"updatedReplicas,omitempty"` } From a8ea06cff35281119c5ca400ccb8138c43aa10d0 Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Tue, 8 Mar 2022 16:21:09 -0600 Subject: [PATCH 2/5] Use PostgresCluster status in deletion tests The tests were written before the status fields existed. This also reduces the scope of variables used only for failure messages. --- .../controller/postgrescluster/delete_test.go | 73 +++++++------------ 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/internal/controller/postgrescluster/delete_test.go b/internal/controller/postgrescluster/delete_test.go index 1bd0a1b69..5bd6e45e9 100644 --- a/internal/controller/postgrescluster/delete_test.go +++ b/internal/controller/postgrescluster/delete_test.go @@ -28,7 +28,6 @@ import ( "go.opentelemetry.io/otel" "gotest.tools/v3/assert" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -201,29 +200,20 @@ func TestReconcilerHandleDelete(t *testing.T) { "cluster should immediately have a finalizer") // Continue until instances are healthy. - var instances []appsv1.StatefulSet - var ready int32 - assert.NilError(t, wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { - mustReconcile(t, cluster) - - list := appsv1.StatefulSetList{} - selector, err := labels.Parse(strings.Join([]string{ - "postgres-operator.crunchydata.com/cluster=" + cluster.Name, - "postgres-operator.crunchydata.com/instance", - }, ",")) - assert.NilError(t, err) - assert.NilError(t, cc.List(ctx, &list, - client.InNamespace(cluster.Namespace), - client.MatchingLabelsSelector{Selector: selector})) - - instances = list.Items - - ready = int32(0) - for i := range instances { - ready += instances[i].Status.ReadyReplicas - } - return ready >= test.waitForRunningInstances, nil - }), "expected %v instances to be ready, got:\n%v", test.waitForRunningInstances, ready) + if ready := int32(0); !assert.Check(t, + wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { + mustReconcile(t, cluster) + assert.NilError(t, cc.Get(ctx, client.ObjectKeyFromObject(cluster), cluster)) + + ready = 0 + for _, set := range cluster.Status.InstanceSets { + ready += set.ReadyReplicas + } + return ready >= test.waitForRunningInstances, nil + }), "expected %v instances to be ready, got: %v", test.waitForRunningInstances, ready, + ) { + t.FailNow() + } if test.beforeDelete != nil { test.beforeDelete(t, cluster) @@ -412,27 +402,20 @@ func TestReconcilerHandleDeleteNamespace(t *testing.T) { client.Merge.Type(), []byte(`{"metadata":{"finalizers":[]}}`))))) }) - var instances []appsv1.StatefulSet - var ready int32 - assert.NilError(t, wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { - list := appsv1.StatefulSetList{} - selector, err := labels.Parse(strings.Join([]string{ - "postgres-operator.crunchydata.com/cluster=" + cluster.Name, - "postgres-operator.crunchydata.com/instance", - }, ",")) - assert.NilError(t, err) - assert.NilError(t, cc.List(ctx, &list, - client.InNamespace(cluster.Namespace), - client.MatchingLabelsSelector{Selector: selector})) - - instances = list.Items - - ready = 0 - for i := range instances { - ready += instances[i].Status.ReadyReplicas - } - return ready >= 2, nil - }), "expected 2 instances to be ready, got:\n%v", ready) + // Wait until instances are healthy. + if ready := int32(0); !assert.Check(t, + wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { + assert.NilError(t, cc.Get(ctx, client.ObjectKeyFromObject(cluster), cluster)) + + ready = 0 + for _, set := range cluster.Status.InstanceSets { + ready += set.ReadyReplicas + } + return ready >= 2, nil + }), "expected 2 instances to be ready, got: %v", ready, + ) { + t.FailNow() + } // Delete the namespace. assert.NilError(t, cc.Delete(ctx, ns)) From 68bf3d9956a88cf8b31e8371674e65142b64309d Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Thu, 27 Jan 2022 22:56:21 -0600 Subject: [PATCH 3/5] SKIP: always test --- .github/workflows/test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 05c57aaa6..11ddaa2b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -45,7 +45,6 @@ jobs: retention-days: 1 kubernetes-k3d: - if: "${{ github.repository == 'CrunchyData/postgres-operator' }}" runs-on: ubuntu-latest needs: [go-test] strategy: From ca236ecd6d9d15eb7efe8051de268d050e2e0be0 Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Tue, 8 Mar 2022 16:37:51 -0600 Subject: [PATCH 4/5] Log events when deletion tests fail These tests are flaky in CI and the current failure messages do not show why. Add a testing package that can format, sort, and filter corev1.Events. --- .../controller/postgrescluster/delete_test.go | 17 +++- internal/testing/events/format.go | 92 +++++++++++++++++++ 2 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 internal/testing/events/format.go diff --git a/internal/controller/postgrescluster/delete_test.go b/internal/controller/postgrescluster/delete_test.go index 5bd6e45e9..c79dd5ec5 100644 --- a/internal/controller/postgrescluster/delete_test.go +++ b/internal/controller/postgrescluster/delete_test.go @@ -41,6 +41,7 @@ import ( "sigs.k8s.io/yaml" "github.com/crunchydata/postgres-operator/internal/patroni" + "github.com/crunchydata/postgres-operator/internal/testing/events" "github.com/crunchydata/postgres-operator/internal/testing/require" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) @@ -200,7 +201,7 @@ func TestReconcilerHandleDelete(t *testing.T) { "cluster should immediately have a finalizer") // Continue until instances are healthy. - if ready := int32(0); !assert.Check(t, + if ready, start := int32(0), time.Now(); !assert.Check(t, wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { mustReconcile(t, cluster) assert.NilError(t, cc.Get(ctx, client.ObjectKeyFromObject(cluster), cluster)) @@ -212,7 +213,11 @@ func TestReconcilerHandleDelete(t *testing.T) { return ready >= test.waitForRunningInstances, nil }), "expected %v instances to be ready, got: %v", test.waitForRunningInstances, ready, ) { - t.FailNow() + list := corev1.EventList{} + assert.NilError(t, cc.List(ctx, &list, client.InNamespace(cluster.Namespace))) + + events.SortByTimestamp(list.Items) + t.Fatalf("Events:\n%v", events.FormatList(events.Since(list.Items, start))) } if test.beforeDelete != nil { @@ -403,7 +408,7 @@ func TestReconcilerHandleDeleteNamespace(t *testing.T) { }) // Wait until instances are healthy. - if ready := int32(0); !assert.Check(t, + if ready, start := int32(0), time.Now(); !assert.Check(t, wait.Poll(time.Second, Scale(time.Minute), func() (bool, error) { assert.NilError(t, cc.Get(ctx, client.ObjectKeyFromObject(cluster), cluster)) @@ -414,7 +419,11 @@ func TestReconcilerHandleDeleteNamespace(t *testing.T) { return ready >= 2, nil }), "expected 2 instances to be ready, got: %v", ready, ) { - t.FailNow() + list := corev1.EventList{} + assert.NilError(t, cc.List(ctx, &list, client.InNamespace(cluster.Namespace))) + + events.SortByTimestamp(list.Items) + t.Fatalf("Events:\n%v", events.FormatList(events.Since(list.Items, start))) } // Delete the namespace. diff --git a/internal/testing/events/format.go b/internal/testing/events/format.go new file mode 100644 index 000000000..b2da1b394 --- /dev/null +++ b/internal/testing/events/format.go @@ -0,0 +1,92 @@ +/* + Copyright 2022 Crunchy Data Solutions, Inc. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package events + +import ( + "fmt" + "sort" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/duration" +) + +// Format returns event in a format similar to `kubectl describe`. +func Format(event corev1.Event) string { + source := event.Source.Component + if source == "" { + source = event.ReportingController + } + + timestamp := event.EventTime.Time + if timestamp.IsZero() { + timestamp = event.FirstTimestamp.Time + } + + interval := duration.HumanDuration(time.Since(timestamp)) + if event.Series != nil { + interval = fmt.Sprintf("%s (x%d over %s)", + duration.HumanDuration(time.Since(event.Series.LastObservedTime.Time)), + event.Series.Count, interval) + } else if event.Count > 1 { + interval = fmt.Sprintf("%s (x%d over %s)", + duration.HumanDuration(time.Since(event.LastTimestamp.Time)), + event.Count, interval) + } + + return fmt.Sprintf("%s\t%-8s\t%s\t%-8s\t%s", + event.Type, event.Reason, interval, source, event.Message) +} + +// FormatList returns events formatted and separated by newlines. +func FormatList(events []corev1.Event) string { + var buffer strings.Builder + + if len(events) > 0 { + _, _ = buffer.WriteString(Format(events[0])) + + for _, event := range events[1:] { + _, _ = buffer.WriteString("\n" + Format(event)) + } + } + + return buffer.String() +} + +// Since returns events that occurred after t. +func Since(events []corev1.Event, t time.Time) []corev1.Event { + var result []corev1.Event + + for _, event := range events { + if event.EventTime.After(t) || event.FirstTimestamp.After(t) { + result = append(result, event) + } else if event.Series != nil && event.Series.LastObservedTime.After(t) { + result = append(result, event) + } else if event.Count > 1 && event.LastTimestamp.After(t) { + result = append(result, event) + } + } + + return result +} + +// SortByTimestamp sorts events by LastTimestamp, oldest first. +func SortByTimestamp(events []corev1.Event) { + sort.Slice(events, func(i, j int) bool { + return events[i].LastTimestamp.Time.Before(events[j].LastTimestamp.Time) + }) +} From d952b84f227ea45334478943548d8bcbe13eaab9 Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Wed, 9 Mar 2022 13:26:30 -0600 Subject: [PATCH 5/5] WIP: keep prefetch containers running --- .github/workflows/test.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 11ddaa2b3..5183c8c24 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -83,8 +83,9 @@ jobs: template: { metadata: { labels: $labels }, spec: { - initContainers: to_entries | map({ name: "c\(.key)", command: ["true"], image: .value }), - containers: [{ name: "pause", image: "k8s.gcr.io/pause:3.5" }] + containers: to_entries | map({ + name: "c\(.key)", image: .value, command: ["sleep", "15m"], + }), } } }