Skip to content

ci: mtu check for cilium e2e #3624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ stages:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"
- template: ../../templates/cilium-mtu-check.yaml
- template: ../k8s-e2e/k8s-e2e-job-template.yaml
parameters:
sub: $(BUILD_VALIDATIONS_SERVICE_CONNECTION)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,5 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,5 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ steps:
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml

- script: |
ARTIFACT_DIR=$(Build.ArtifactStagingDirectory)/test-output/
echo $ARTIFACT_DIR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml

- script: |
ARTIFACT_DIR=$(Build.ArtifactStagingDirectory)/test-output/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,5 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,7 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ../../templates/cilium-mtu-check.yaml


7 changes: 7 additions & 0 deletions .pipelines/templates/cilium-mtu-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Reusable step template: runs the Cilium MTU validation script that lives in
# hack/scripts. The relative `cd` means the step must start from the
# repository root.
steps:
  - script: |
      cd hack/scripts
      chmod +x cilium-mtu-validation.sh
      ./cilium-mtu-validation.sh
    displayName: "Run Cilium MTU Validation"
    name: "CiliumMTUValidation"
2 changes: 2 additions & 0 deletions .pipelines/templates/cilium-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,5 @@ steps:
fi
name: "testAsyncDelete"
displayName: "Verify Async Delete when CNS is down"

- template: ./cilium-mtu-check.yaml
29 changes: 29 additions & 0 deletions hack/manifests/nginx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# nginx test workload deployed by hack/scripts/cilium-mtu-validation.sh.
# The script looks the deployment up with `-n kube-system` and selects its
# pods with the `app=nginx` label, so both must stay in sync with it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx
  namespace: kube-system
  labels:
    app: nginx
spec:
  replicas: 4
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      # Spread replicas across nodes: the topology key is the hostname label,
      # whose value is each unique node name. ScheduleAnyway keeps this a
      # preference rather than a hard scheduling requirement.
      topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: ScheduleAnyway
          labelSelector:
            matchLabels:
              app: nginx
      containers:
        - name: nginx
          image: mcr.microsoft.com/azurelinux/base/nginx:1
          ports:
            - containerPort: 80
103 changes: 103 additions & 0 deletions hack/scripts/cilium-mtu-validation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/bin/bash
# Cilium MTU validation.
#
# Deploys nginx test pods, then for every node compares the eth0 MTU reported
# by the Cilium agent pod, by an nginx pod and by the node itself.
#
# `set -e` is intentionally not enabled: the rest of the script checks for
# empty command output itself so it can print "##[error]" pipeline messages.
NAMESPACE="kube-system"

# Resolve the manifest relative to this script so the result does not depend
# on the caller's working directory.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print a message both as plain log output and as a pipeline error
# annotation, then abort the run.
fail() {
  echo "$1"
  echo "##[error]$1"
  exit 1
}

echo "Deploy nginx pods for MTU testing"
kubectl apply -f "${script_dir}/../manifests/nginx.yaml" \
  || fail "Failed to deploy nginx pods for MTU testing"
kubectl wait --for=condition=available --timeout=60s -n "$NAMESPACE" deployment/nginx \
  || fail "nginx deployment did not become available"

# Check node count
node_count=$(kubectl get nodes --no-headers | wc -l)

# In the CNI release test scenario (more than one node) scale the deployment
# to 3 * node count so that every node ends up hosting at least one replica.
if [ "$node_count" -gt 1 ]; then
  replicas=$((3 * node_count))
  echo "Scaling nginx deployment to ${replicas} replicas"
  kubectl scale deployment nginx --replicas="${replicas}" -n "$NAMESPACE" \
    || fail "Failed to scale nginx deployment"
fi

# Wait for the nginx pods to be ready. `rollout status` is used instead of the
# "available" condition because it waits for all requested replicas, whereas
# the condition (per the Kubernetes docs) can be true with some replicas
# still missing.
kubectl rollout status deployment/nginx -n "$NAMESPACE" --timeout=60s \
  || fail "nginx pods did not become ready"

echo "Checking MTU for pods in namespace: $NAMESPACE using Cilium agent and nginx MTU"

# Get all nodes. An empty list would make the per-node loop a no-op and the
# script would report success without validating anything, so fail instead.
nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')
if [ -z "$nodes" ]; then
  fail "Failed to list cluster nodes"
fi

for node in $nodes; do
echo "Checking node: $node"

# Get the Cilium agent pod running on this node
cilium_pod=$(kubectl get pods -n "$NAMESPACE" \
  --field-selector "spec.nodeName=${node}" -l k8s-app=cilium \
  -o jsonpath='{.items[0].metadata.name}')

if [ -z "$cilium_pod" ]; then
  echo "Failed to find Cilium agent pod on node $node"
  echo "##[error]Failed to find Cilium agent pod on node $node"
  exit 1
fi

# Get the MTU of eth0 in the Cilium agent pod
cilium_mtu=$(kubectl exec -n "$NAMESPACE" "$cilium_pod" -- cat /sys/class/net/eth0/mtu 2>/dev/null)

if [ -z "$cilium_mtu" ]; then
  echo "Failed to get MTU from Cilium agent pod on node $node"
  echo "##[error]Failed to get MTU from Cilium agent pod on node $node"
  exit 1
fi

echo "Cilium agent eth0 MTU: $cilium_mtu"

# Get an nginx pod running on this node. Only Running pods are considered:
# `kubectl exec` cannot enter a pod that is still Pending, and picking one
# would fail the validation for reasons unrelated to the MTU.
nginx_pod=$(kubectl get pods -n "$NAMESPACE" \
  --field-selector "spec.nodeName=${node},status.phase=Running" -l app=nginx \
  -o jsonpath='{.items[0].metadata.name}')
if [ -z "$nginx_pod" ]; then
  echo "Failed to find nginx pod on node $node"
  echo "##[error]Failed to find nginx pod on node $node"
  exit 1
fi

# Get the MTU of eth0 in the nginx pod
nginx_mtu=$(kubectl exec -n "$NAMESPACE" "$nginx_pod" -- cat /sys/class/net/eth0/mtu 2>/dev/null)
if [ -z "$nginx_mtu" ]; then
  echo "Failed to get MTU from nginx pod on node $node"
  echo "##[error]Failed to get MTU from nginx pod on node $node"
  exit 1
fi
echo "Nginx pod eth0 MTU: $nginx_mtu"

# Get the node's eth0 MTU through a node debug pod. Creating the debug pod
# prints extra lines before the command output, so the output is filtered:
# carriage returns (emitted when a TTY is attached) are stripped and only the
# last purely numeric line is kept, which keeps the later integer comparison
# from receiving stray text.
node_mtu=$(kubectl debug "node/${node}" -it --image=busybox -- sh -c "cat /sys/class/net/eth0/mtu" 2>/dev/null \
  | tr -d '\r' \
  | grep -E '^[0-9]+$' \
  | tail -n 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason why we need to `tail` this one and not the others?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's some extra output from creating the debug pod that needs to be filtered out.


# Abort when the node debug pod produced no MTU value.
if [ -z "$node_mtu" ]; then
  echo "Failed to get MTU from node $node"
  echo "##[error]Failed to get MTU from node $node"
  exit 1
fi
echo "Node eth0 MTU: $node_mtu"

# All three MTUs (Cilium agent, nginx pod, node) must be equal; anything else,
# including a value that is not an integer, fails the run.
if ! { [ "$cilium_mtu" -eq "$nginx_mtu" ] && [ "$nginx_mtu" -eq "$node_mtu" ]; }; then
  echo "MTU validation failed for node $node"
  echo "Cilium agent MTU: $cilium_mtu, Nginx pod MTU: $nginx_mtu, Node MTU: $node_mtu"
  echo "##[error]MTU validation failed. MTUs do not match."
  exit 1
fi
echo "MTU validation passed for node $node"

echo "----------------------------------------"

done

# Remove the nginx test deployment.
kubectl delete deployment nginx -n "$NAMESPACE"
echo "Cleaned up nginx deployment"

# Remove the node debug pods: every pod in the current namespace whose
# resource name contains "node-debugger".
debug_pod=$(kubectl get pods -o name | grep "node-debugger")
if [ -z "$debug_pod" ]; then
  echo "No debug pod found"
else
  # $debug_pod is left unquoted on purpose: it may hold several pod names,
  # one per line, and each must be passed as a separate argument.
  kubectl delete $debug_pod
  if ! kubectl wait --for=delete $debug_pod --timeout=60s; then
    echo "Failed to clean up debug pod $debug_pod"
  fi
fi
Loading