Skip to content

flakiness - fix cert rotation tests #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 49 additions & 2 deletions docker/mongodb-enterprise-tests/kubetester/certs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,27 @@
from kubeobject import CustomObject
from kubernetes import client
from kubernetes.client.rest import ApiException
from kubetester import create_secret, delete_secret, random_k8s_name, read_secret
from kubetester import (
create_secret,
delete_secret,
kubetester,
random_k8s_name,
read_secret,
)
from kubetester.kubetester import KubernetesTester
from kubetester.mongodb import Phase
from kubetester.mongodb_multi import MongoDBMulti, MultiClusterClient
from opentelemetry import trace
from tests import test_logger
from tests.vaultintegration import (
store_secret_in_vault,
vault_namespace_name,
vault_sts_name,
)

TRACER = trace.get_tracer("evergreen-agent")
logger = test_logger.get_test_logger(__name__)

ISSUER_CA_NAME = "ca-issuer"

SUBJECT = {
Expand Down Expand Up @@ -161,11 +173,13 @@ def generate_cert(
return secret_name


def rotate_cert(namespace, certificate_name):
def rotate_cert(namespace, certificate_name, should_block_until_ready=False):
cert = Certificate(name=certificate_name, namespace=namespace)
cert.load()
cert["spec"]["dnsNames"].append("foo") # Append DNS to cert to rotate the certificate
cert.update()
if should_block_until_ready:
cert.block_until_ready()


def create_tls_certs(
Expand Down Expand Up @@ -879,3 +893,36 @@ def yield_existing_csrs(csr_names: List[str], timeout: int = 300) -> Generator[s
raise AssertionError(
f"Expected to find {total_csrs} csrs, but only found {seen_csrs} after {timeout} seconds. Expected csrs {csr_names}"
)


@TRACER.start_as_current_span("assert_certificate_rotation")
def rotate_and_assert_certificates(mdb, namespace, certificate_name):
"""
Verifies certificate rotation completes successfully.

Rotates the specified certificate and validates that:
1. Automation config version increases, as cert changes causes a new ac version
2. All MongoDB processes reach the new goal version
3. MongoDB instance returns/stays to Running state

"""

old_ac_version = KubernetesTester.get_automation_config()["version"]
rotate_cert(namespace, certificate_name, should_block_until_ready=True)
Comment on lines +910 to +911
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assert_certificate_rotation gives an impression that it does not make any changes and only asserts when in fact it does.

What do you think about the following suggestion?

  1. Moving these two lines into the caller functions
  2. Make assert_certificate_rotation accept old_ac_version as an argument

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed! bf2d183

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit of context for transparency. We discussed it in DM: while I think it is good to keep mutation (cert rotation) separate from assertions, @nammn wanted to to keep everything together to avoid forgetting to do part of the checks.

At the same time I think it is more important to address the flake than worry about semantics here. We landed on the middle ground: renaming the function to rotate_and_assert_certificates to more clearly indicate the purpose.


# Create named function to check version and process status
def check_version_increased():

current_version = KubernetesTester.get_automation_config()["version"]
version_increased = current_version > old_ac_version

return version_increased

timeout = 600
KubernetesTester.wait_until(
check_version_increased,
timeout=timeout,
)
kubetester.wait_processes_ready()

mdb.assert_reaches_phase(Phase.Running, timeout=1200)
23 changes: 21 additions & 2 deletions docker/mongodb-enterprise-tests/kubetester/kubetester.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,13 @@
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream
from kubetester.crypto import wait_for_certs_to_be_issued
from opentelemetry import trace
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
from tests import test_logger

TRACER = trace.get_tracer("evergreen-agent")
logger = test_logger.get_test_logger(__name__)

SSL_CA_CERT = "/var/run/secrets/kubernetes.io/serviceaccount/..data/ca.crt"
EXTERNALLY_MANAGED_TAG = "EXTERNALLY_MANAGED_BY_KUBERNETES"
MAX_TAG_LEN = 32
Expand Down Expand Up @@ -188,12 +192,17 @@ def decode_secret(cls, data: Dict[str, str]) -> Dict[str, str]:
return {k: b64decode(v).decode("utf-8") for (k, v) in data.items()}

@classmethod
def read_configmap(cls, namespace: str, name: str, api_client: Optional[client.ApiClient] = None) -> Dict[str, str]:
def read_configmap(
cls, namespace: str, name: str, api_client: Optional[client.ApiClient] = None, with_metadata=False
) -> Dict[str, str]:
corev1 = cls.clients("corev1")
if api_client is not None:
corev1 = client.CoreV1Api(api_client=api_client)

return corev1.read_namespaced_config_map(name, namespace).data
cm = corev1.read_namespaced_config_map(name, namespace)
if with_metadata:
return cm
return cm.data

@classmethod
def read_pod(cls, namespace: str, name: str) -> Dict[str, str]:
Expand Down Expand Up @@ -962,6 +971,16 @@ def get_automation_status(group_id=None, group_name=None):

return response.json()

@staticmethod
def get_automation_status(group_id=None, group_name=None):
if group_id is None:
group_id = KubernetesTester.get_om_group_id(group_name=group_name)

url = build_automation_status_endpoint(KubernetesTester.get_om_base_url(), group_id)
response = KubernetesTester.om_request("get", url)

return response.json()

@staticmethod
def get_monitoring_config(group_id=None):
if group_id is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
from kubetester.mongodb import MongoDB, Phase
from kubetester.mongotester import ShardedClusterTester
from kubetester.omtester import get_sc_cert_names
from opentelemetry import trace
from pytest import fixture
from tests import test_logger

TRACER = trace.get_tracer("evergreen-agent")

MDB_RESOURCE = "sharded-cluster-x509-to-scram-256"
USER_NAME = "mms-user-1"
PASSWORD_SECRET_NAME = "mms-user-1-password"
Expand Down Expand Up @@ -97,11 +100,6 @@ def test_ops_manager_state_updated_correctly(self):
tester.assert_authentication_disabled()


from opentelemetry import trace

TRACER = trace.get_tracer("evergreen-agent")


@pytest.mark.e2e_sharded_cluster_x509_to_scram_transition
class TestCanEnableScramSha256:
@TRACER.start_as_current_span("test_can_enable_scram_sha_256")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
ISSUER_CA_NAME,
create_x509_agent_tls_certs,
create_x509_mongodb_tls_certs,
rotate_cert,
rotate_and_assert_certificates,
)
from kubetester.kubetester import KubernetesTester
from kubetester.kubetester import fixture as load_fixture
Expand Down Expand Up @@ -47,20 +47,10 @@ def test_ops_manager_state_correctly_updated(self):
ac_tester.assert_internal_cluster_authentication_enabled()
ac_tester.assert_authentication_enabled()

def test_rotate_certificate(self, mdb: MongoDB, namespace: str):
rotate_cert(namespace, "{}-cert".format(MDB_RESOURCE))
mdb.assert_abandons_phase(Phase.Running, timeout=900)
mdb.assert_reaches_phase(Phase.Running, timeout=900)

def test_rotate_certificate_with_sts_restarting(self, mdb: MongoDB, namespace: str):
mdb.trigger_sts_restart()
assert_certificate_rotation(mdb, namespace, "{}-cert".format(MDB_RESOURCE))
rotate_and_assert_certificates(mdb, namespace, "{}-cert".format(MDB_RESOURCE))

def test_rotate_clusterfile_with_sts_restarting(self, mdb: MongoDB, namespace: str):
mdb.trigger_sts_restart()
assert_certificate_rotation(mdb, namespace, "{}-clusterfile".format(MDB_RESOURCE))


def assert_certificate_rotation(mdb, namespace, certificate_name):
rotate_cert(namespace, certificate_name)
mdb.assert_reaches_phase(Phase.Running, timeout=900)
rotate_and_assert_certificates(mdb, namespace, "{}-clusterfile".format(MDB_RESOURCE))
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
from kubetester.certs import (
create_sharded_cluster_certs,
create_x509_agent_tls_certs,
rotate_cert,
rotate_and_assert_certificates,
)
from kubetester.kubetester import KubernetesTester, ensure_ent_version, is_multi_cluster
from kubetester.kubetester import KubernetesTester, is_multi_cluster
from kubetester.mongodb import MongoDB, Phase
from kubetester.operator import Operator
from pytest import fixture
from tests import test_logger
from tests.shardedcluster.conftest import enable_multi_cluster_deployment

MDB_RESOURCE_NAME = "test-x509-all-options-sc"

logger = test_logger.get_test_logger(__name__)


@fixture(scope="module")
def agent_certs(issuer: str, namespace: str) -> str:
Expand Down Expand Up @@ -86,58 +89,14 @@ def test_ops_manager_state_correctly_updated(self):
ac_tester.assert_authentication_enabled()
ac_tester.assert_expected_users(0)

def test_rotate_shard_cert(self, sc: MongoDB, namespace: str):
rotate_cert(namespace, f"{MDB_RESOURCE_NAME}-0-cert")
sc.assert_abandons_phase(Phase.Running, timeout=900)
sc.assert_reaches_phase(Phase.Running, timeout=1200)

def test_rotate_config_cert(self, sc: MongoDB, namespace: str):
rotate_cert(namespace, f"{MDB_RESOURCE_NAME}-config-cert")
sc.assert_abandons_phase(Phase.Running, timeout=900)
sc.assert_reaches_phase(Phase.Running, timeout=1200)

def test_rotate_mongos_cert(self, sc: MongoDB, namespace: str):
rotate_cert(namespace, f"{MDB_RESOURCE_NAME}-mongos-cert")
sc.assert_abandons_phase(Phase.Running, timeout=900)
sc.assert_reaches_phase(Phase.Running, timeout=1200)

def test_rotate_shard_cert_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("shard")
assert_certificate_rotation(sc, namespace, f"{MDB_RESOURCE_NAME}-0-cert")
rotate_and_assert_certificates(sc, namespace, f"{MDB_RESOURCE_NAME}-0-cert")

def test_rotate_config_cert_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("config")
assert_certificate_rotation(sc, namespace, f"{MDB_RESOURCE_NAME}-config-cert")
rotate_and_assert_certificates(sc, namespace, f"{MDB_RESOURCE_NAME}-config-cert")

def test_rotate_mongos_cert_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("mongos")
assert_certificate_rotation(sc, namespace, f"{MDB_RESOURCE_NAME}-mongos-cert")

def test_rotate_shard_certfile_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("shard")
assert_certificate_rotation(
sc,
namespace,
f"{MDB_RESOURCE_NAME}-0-clusterfile",
)

def test_rotate_config_certfile_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("config")
assert_certificate_rotation(
sc,
namespace,
f"{MDB_RESOURCE_NAME}-config-clusterfile",
)

def test_rotate_mongos_certfile_with_sts_restarting(self, sc: MongoDB, namespace: str):
sc.trigger_sts_restart("mongos")
assert_certificate_rotation(
sc,
namespace,
f"{MDB_RESOURCE_NAME}-mongos-clusterfile",
)


def assert_certificate_rotation(sharded_cluster, namespace, certificate_name):
rotate_cert(namespace, certificate_name)
sharded_cluster.assert_reaches_phase(Phase.Running, timeout=1200)
rotate_and_assert_certificates(sc, namespace, f"{MDB_RESOURCE_NAME}-mongos-cert")