Recovery procedures depend on whether the first replica (the pod ending with `-0`) is among the failing replicas.

:::note
The recovery procedure for the first replica also depends on your StatefulSet's `podManagementPolicy` configuration (`Parallel` or `OrderedReady`). See the [first replica recovery section](#migrate-to-parallel) for details on migrating between policies if needed.
:::

:::info Find your vCluster namespace
If using VirtualClusterInstance (platform), the vCluster StatefulSet runs in a different namespace than the VirtualClusterInstance itself. Find the StatefulSet namespace with:

```bash
kubectl get virtualclusterinstance <instance-name> -n <vci-namespace> -o jsonpath='{.spec.clusterRef.namespace}'
```

For example, if your VirtualClusterInstance is named `my-vcluster` in the `p-default` namespace, the StatefulSet might be in `vcluster-my-vcluster-p-default`.

If using Helm, the namespace is what you specified during installation (e.g., `vcluster-my-team`).
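
If you are unsure which namespace that was, one way to locate it is to list StatefulSets across all namespaces. This is a sketch that assumes the default `app=vcluster` label used throughout this guide; adjust the selector if you customized labels:

```shell
# List vCluster StatefulSets in every namespace; the NAMESPACE column
# shows where each virtual cluster's control plane runs.
kubectl get statefulsets --all-namespaces -l app=vcluster
```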
:::

<PageVariables
VCLUSTER_NAME="my-vcluster"
NAMESPACE="vcluster-my-team"
VCLUSTER_LABEL="app=vcluster"
/>

Use the following procedures when some replicas are still functioning:

<br />

<TabItem value="first-replica-failing" label="First replica is failing">

:::warning
Before attempting any recovery procedure, [create a backup](../../../../../../manage/backup-restore/backup.mdx) of your virtual cluster using `vcluster snapshot create --include-volumes`. This ensures both the virtual cluster's etcd data and persistent volumes are backed up.

If the virtual cluster's etcd is in a bad state and the snapshot command fails, you can still back up from the host cluster (which has its own functioning etcd). Use your preferred backup solution (e.g., Velero, Kasten, or cloud-native backup tools) to back up the host cluster namespace containing the vCluster resources. Ensure the backup includes:

- All Kubernetes resources in the vCluster namespace (StatefulSet, Services, etc.)
- PersistentVolumeClaims and their associated volume data (contains the virtual cluster's etcd data)
- Secrets and ConfigMaps

When restored, the vCluster pods will restart and the virtual cluster will be recreated from the backed-up etcd data.

If using namespace syncing, back up all synced namespaces on the host cluster as well.
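
As an illustrative sketch only, assuming Velero is installed on the host cluster and the vCluster namespace is `vcluster-my-team`, a host-side backup covering both resources and volume data might look like:

```shell
# Back up the host namespace containing the vCluster resources,
# capturing PV data via volume snapshots.
velero backup create vcluster-my-team-backup \
  --include-namespaces vcluster-my-team \
  --snapshot-volumes
```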
:::

The recovery procedure depends on your StatefulSet `podManagementPolicy` configuration. vCluster versions 0.20 and later use `Parallel` by default; earlier versions used `OrderedReady`.

:::info
If more than one pod is down with `podManagementPolicy: OrderedReady`, you must first [migrate to `Parallel`](#migrate-to-parallel) before attempting recovery.
:::

Check your configuration:

<InterpolatedCodeBlock
code={`kubectl get statefulset [[GLOBAL:VCLUSTER_NAME]] -n [[GLOBAL:NAMESPACE]] -o jsonpath='{.spec.podManagementPolicy}'`}
language="bash"
/>

<Flow>
<Step title="Delete the failed pod and PVC">
First, identify the PVC for replica-0:

<InterpolatedCodeBlock
code={`kubectl get pvc -l [[GLOBAL:VCLUSTER_LABEL]] -n [[GLOBAL:NAMESPACE]]`}
language="bash"
/>

<br />

The PVC name typically follows the pattern `data-<vcluster-name>-0` but may vary if customized in your configuration. Note the exact name from the output above, then delete the corrupted pod and its PVC:

<InterpolatedCodeBlock
code={`kubectl delete pod [[GLOBAL:VCLUSTER_NAME]]-0 -n [[GLOBAL:NAMESPACE]]
kubectl delete pvc data-[[GLOBAL:VCLUSTER_NAME]]-0 -n [[GLOBAL:NAMESPACE]]`}
language="bash"
/>

The pod restarts with a new empty PVC. The initial attempts fail because the new member tries to join the existing etcd cluster but lacks the required data. After 1-3 pod restarts, vCluster's automatic recovery detects the empty member and properly adds it as a new learner, allowing it to sync data from healthy members and join the cluster.
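
To watch the member actually rejoin, you can inspect etcd membership from a healthy replica. This is a sketch that assumes `etcdctl` is available inside the vCluster pod and that any TLS or endpoint flags required by your deployment are added; the exact invocation varies by setup:

```shell
# Ask a healthy member (replica 1 here) for the current member list;
# the rejoining member first appears as a learner before being promoted.
kubectl exec my-vcluster-1 -n vcluster-my-team -- etcdctl member list -w table
```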
</Step>

<Step title="Monitor recovery">
Monitor the recovery process:
193
236
194
237
<InterpolatedCodeBlock
code={`kubectl get pods -l [[GLOBAL:VCLUSTER_LABEL]] -n [[GLOBAL:NAMESPACE]] -w`}
language="bash"
/>

Check the logs to verify the pod rejoins successfully:
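
A sketch of one way to follow the logs, assuming the pod and namespace names used elsewhere in this guide; the exact log lines to look for vary by vCluster version:

```shell
# Stream logs from the restarted replica and surface etcd-related lines.
kubectl logs my-vcluster-0 -n vcluster-my-team -f | grep -i etcd
```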
</Step>

<Step title="Delete the failed pod and PVC">
Now follow the same procedure as for `Parallel` mode.

First, identify the PVC for replica-0:

<InterpolatedCodeBlock
code={`kubectl get pvc -l [[GLOBAL:VCLUSTER_LABEL]] -n [[GLOBAL:NAMESPACE]]`}
language="bash"
/>
345
+
346
+
<br />
347
+
348
+
The PVC name typically follows the pattern `data-<vcluster-name>-0` but may vary if customized in your configuration. Note the exact name from the output above, then delete the corrupted pod and its PVC:

<InterpolatedCodeBlock
code={`kubectl delete pod [[GLOBAL:VCLUSTER_NAME]]-0 -n [[GLOBAL:NAMESPACE]]
kubectl delete pvc data-[[GLOBAL:VCLUSTER_NAME]]-0 -n [[GLOBAL:NAMESPACE]]`}
language="bash"
/>

The pod restarts with a new empty PVC. The initial attempts fail because the new member tries to join the existing etcd cluster but lacks the required data. After 1-3 pod restarts, vCluster's automatic recovery detects the empty member and properly adds it as a new learner, allowing it to sync data from healthy members and join the cluster.
</Step>
</Flow>

Verify all PVCs are corrupted or inaccessible:

<InterpolatedCodeBlock
code={`kubectl get pvc -l [[GLOBAL:VCLUSTER_LABEL]] -n [[GLOBAL:NAMESPACE]]`}
language="bash"
/>