Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 196 additions & 9 deletions heartbeat/podman-etcd
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
OCF_RESKEY_oom_default="-997"
OCF_RESKEY_config_location_default="/var/lib/etcd"
OCF_RESKEY_backup_location_default="/var/lib/etcd"
OCF_RESKEY_max_backup_snapshots_default="3"

: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
Expand All @@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}


#######################################################################
Expand Down Expand Up @@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
</parameter>

<parameter name="max_backup_snapshots" required="0" unique="0">
<longdesc lang="en">
Maximum number of etcd database snapshots to retain. When a new snapshot is created,
older snapshots will be automatically removed to maintain this limit. This helps
control storage usage while ensuring recent backups are available for recovery.
Set max_backup_snapshots=0 to disable backups.
</longdesc>
<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
</parameter>

</parameters>

<actions>
Expand Down Expand Up @@ -719,20 +732,190 @@ EOF
return $OCF_SUCCESS
}

# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
#
# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
# data inconsistencies. This function removes the member directory and syncs to disk.
#
# Globals:
#   ETCD_MEMBER_DIR - path of the etcd member directory to wipe
#
# Returns:
#   OCF_SUCCESS - Member directory successfully removed
#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
wipe_data_folder_for_learner()
{
	local rc
	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"
	# Capture rm's real exit status explicitly: inside an "if ! cmd" branch,
	# $? reflects the negated pipeline (always 0 when the branch is taken),
	# so the previous code always logged "error code: 0".
	rm -rf "$ETCD_MEMBER_DIR"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $rc"
		return $OCF_ERR_GENERIC
	fi
	# Flush filesystem buffers so the removal is durable before the rejoin attempt.
	sync
	return $OCF_SUCCESS
}


# Calculate available disk space in bytes for a given directory.
#
# This function queries the filesystem and returns available space in bytes.
# It converts df output (KB, via -P for stable POSIX columns) to bytes for
# consistent size comparisons.
#
# Arguments:
#   $1 - Target directory path to check
#
# Returns:
#   OCF_SUCCESS - Available space in bytes (via stdout)
#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
get_available_space_in_directory()
{
	local target_dir=$1
	local available_space_kb
	local available_space_bytes

	# NOTE: previously "2>&1" was attached to awk rather than df, so df
	# failures leaked raw onto the agent's stderr while the captured value
	# was silently empty. Discard df's stderr instead; failure is detected
	# by the numeric validation below (no second output line -> empty value).
	available_space_kb=$(df -P "$target_dir" 2>/dev/null | awk 'NR==2 {print $4}')

	# Validate output is numeric (also catches empty output on df failure)
	if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then
		echo "df command failed or returned invalid value: $available_space_kb"
		return $OCF_ERR_GENERIC
	fi

	available_space_bytes=$((available_space_kb*1024))
	echo "$available_space_bytes"
	return $OCF_SUCCESS
}

# Archive etcd database with backup and cleanup
#
# This function creates a backup copy of the etcd database, validates it, and
# removes old backups according to the retention policy. Backups are optional
# and can be disabled by setting max_backup_snapshots=0.
#
# Error handling strategy:
# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
# Backups are beneficial but not critical for recovery operations.
#
# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
# server is not running when this backup is performed.
archive_data_folder()
{
# TODO: use etcd snapshots
local dest_dir_name
local data_dir="/var/lib/etcd/member"
local backup_dir="$OCF_RESKEY_backup_location"
local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"

dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
if [ ! -d $data_dir ]; then
ocf_log info "no data dir to backup"
if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then
ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
return $OCF_SUCCESS
fi
ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
sync

# Check if the etcd database file exists
if [ ! -f "$etcd_db_path" ]; then
ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
return $OCF_SUCCESS
fi

# Ensure backup directory exists
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to make sure there is enough disk space before committing to the backup? I worry about users setting a high rotation threshold and filling the disk

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point!
I wonder how to estimate the threshold. From one side, I can use the other backups (if any, of course) + some guard, but if 1 single new backup can fill up the disk, it is probably too late already 😁. I wonder if we want to use a percentage of the disk available instead 🤔

Copy link
Contributor

@fonta-rh fonta-rh Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm torn. The simple "will we fill the disk" check is very straightforward, but as you say, might be useless and doesn't give any options to the user.

Adding a config parameter for the threshold would be the more complete option, but it's a lot more code. Just the check function might be something like this (Claude dixit), and you still need to call it, document it, etc.

# Check if filesystem has sufficient space for backup based on usage threshold
#
# Args:
#   $1 - Directory path to check
#
# Returns:
#   0 - Sufficient space available
#   1 - Insufficient space (usage above threshold)
check_backup_disk_space()
{
	local backup_dir="$1"
	local threshold="$OCF_RESKEY_backup_disk_usage_threshold"
	local usage_percent
	local fs_path

	# Validate threshold is a number between 0-100
	if ! echo "$threshold" | grep -q '^[0-9]\{1,3\}$' || [ "$threshold" -gt 100 ]; then
		ocf_log warn "invalid backup_disk_usage_threshold value: $threshold (must be 0-100), using default"
		threshold="$OCF_RESKEY_backup_disk_usage_threshold_default"
	fi

	# Get filesystem usage percentage (without % sign)
	# df output: Filesystem Size Used Avail Use% Mounted
	usage_percent=$(df -P "$backup_dir" 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}')

	if [ -z "$usage_percent" ]; then
		ocf_log warn "could not determine disk usage for $backup_dir, skipping space check"
		return 0
	fi

	if [ "$usage_percent" -ge "$threshold" ]; then
		ocf_log warn "backup filesystem usage ${usage_percent}% exceeds threshold ${threshold}%, skipping backup"
		return 1
	fi

	ocf_log debug "backup filesystem usage ${usage_percent}% is below threshold ${threshold}%"
	return 0
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can use the other backups (if any, of course) + some guard

of course I was hallucinating here... we are backing up 1 specific file, I know what size it has 🤦

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The simple "will we fill the disk" check is very straightforward, but as you say, might be useless and doesn't give any options to the user.

Considering that this is going to go on a kubernetes node that has better ways to detect disk pressure, I don't think we need to complicate this change further. What do you think?

Copy link
Contributor

@fonta-rh fonta-rh Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, what about just making we're not the ones filling the disk with something like:

# Check available disk space before creating backup
local available_kb
available_kb=$(df -P "$backup_dir" | awk 'NR==2 {print $4}')
local db_size_kb
db_size_kb=$(($(stat -c %s "$etcd_db_path") / 1024))

if [ "$available_kb" -lt "$((db_size_kb * 2))" ]; then
    ocf_log warn "insufficient disk space in $backup_dir (available: ${available_kb}KB, needed: ~$((db_size_kb * 2))KB), skipping backup"
    return $OCF_SUCCESS
fi

and it's out of our hands!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I came up with a similar code 😁 🤖

if [ ! -d "$backup_dir" ]; then
ocf_log debug "creating backup directory: '$backup_dir'"
if ! mkdir -p "$backup_dir"; then
ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
return $OCF_SUCCESS
fi
fi

ocf_log debug "checking disk space: backup_dir=$backup_dir"
local available_space_bytes
if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
return $OCF_SUCCESS
fi

local required_space_bytes
required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
return $OCF_SUCCESS
fi

if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
return $OCF_SUCCESS
fi

# Generate timestamp and backup filename
local timestamp
timestamp=$(date +%Y%m%d-%H%M%S)

local backup_file
backup_file="$backup_dir/snapshot-$timestamp.db"

ocf_log info "creating etcd database backup: '$backup_file'"

# Create the backup by copying the database file (enable Copy-on-Write copy)
if ! cp --reflink=auto "$etcd_db_path" "$backup_file"; then
ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?"
return $OCF_SUCCESS
fi

# Validate the backup file exists and has the expected size
if [ ! -f "$backup_file" ]; then
ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
return $OCF_SUCCESS
fi

local backup_size_bytes
backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
rm -f "$backup_file"
return $OCF_SUCCESS
fi

ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"

# Cleanup old backups based on retention policy
cleanup_old_backups "$backup_dir"

return $OCF_SUCCESS
}

cleanup_old_backups()
{
local backup_dir="$1"
local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
local backup_count
local backups_to_remove
local old_backups

# Validate max_snapshots is a positive integer
if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Positive integer (1-99) to make the error clearer?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Positive because we handle elsewhere the case the user doesn't want snapshots

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, nevermind, I just realized that we allow for any positive integer, I thought it was only from 1 to 99 :)

return $OCF_SUCCESS
fi

# Count existing backup files
backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)

if [ "$backup_count" -le "$max_snapshots" ]; then
ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
return $OCF_SUCCESS
fi

# Calculate how many backups to remove
backups_to_remove=$((backup_count - max_snapshots))
ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

# Find oldest backups sorted by modification time
# -t sorts by modification time, -r reverses (oldest first)
# -print0 and -0 handle filenames with spaces/special characters
old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude tells me maybe we don't need -print0 because ls -tr is already going to produce newlines. Not sure it's worth retesting, though, as it does work

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the code was using print0 for safety with special characters. Since we aren't meant to use any, I like the idea of simplifying this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see now why this print0 was suggested

https://www.shellcheck.net/wiki/SC2038

Use -print0/-0 or find -exec + to allow for non-alphanumeric filenames.

xargs -0 -r ls -tr | \
head -n "$backups_to_remove")

if [ -n "$old_backups" ]; then
ocf_log info "removing old backups: $old_backups"
if ! echo "$old_backups" | xargs -r rm -f; then
ocf_log warn "failed to remove some old backups, error code: $?"
fi
fi

return $OCF_SUCCESS
}

etcd_pod_container_exists() {
Expand Down Expand Up @@ -1901,6 +2084,9 @@ podman_start()
fi

archive_data_folder
if ! wipe_data_folder_for_learner; then
return "$OCF_ERR_GENERIC"
fi
fi

ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
Expand Down Expand Up @@ -2250,6 +2436,7 @@ CONTAINER=$OCF_RESKEY_name
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
ETCD_MEMBER_DIR="/var/lib/etcd/member"
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
ETCD_REVISION_BUMP_PERCENTAGE=0.2
ETCD_BUMP_REV_DEFAULT=1000000000
Expand Down