Skip to content

Commit 628cdce

Browse files
committed
PGPRO-2425: previously successfull restore was relying on that there are no failed backups between base_full_backup and dest_backup, it could lead to false-positive validation or restore errors despite the fact that parent chain is valid. There even was a small chance of data corruption: if between base_full_backup and dest_backup were located backups from parallel chain. TLDR: restore was rolling backups blindly. See test test_restore_chain_with_corrupted_backup() for an example.
1 parent 2028275 commit 628cdce

File tree

3 files changed

+47
-24
lines changed

3 files changed

+47
-24
lines changed

src/catalog.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,9 @@ is_parent(time_t parent_backup_time, pgBackup *child_backup, bool inclusive)
981981
if (!child_backup)
982982
elog(ERROR, "Target backup cannot be NULL");
983983

984+
if (inclusive && child_backup->start_time == parent_backup_time)
985+
return true;
986+
984987
while (child_backup->parent_backup_link &&
985988
child_backup->parent_backup != parent_backup_time)
986989
{
@@ -990,8 +993,8 @@ is_parent(time_t parent_backup_time, pgBackup *child_backup, bool inclusive)
990993
if (child_backup->parent_backup == parent_backup_time)
991994
return true;
992995

993-
if (inclusive && child_backup->start_time == parent_backup_time)
994-
return true;
996+
//if (inclusive && child_backup->start_time == parent_backup_time)
997+
// return true;
995998

996999
return false;
9971000
}

src/dir.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,11 +1047,11 @@ create_data_directories(const char *data_dir, const char *backup_dir,
10471047
}
10481048

10491049
if (link_sep)
1050-
elog(LOG, "create directory \"%s\" and symbolic link \"%.*s\"",
1050+
elog(VERBOSE, "create directory \"%s\" and symbolic link \"%.*s\"",
10511051
linked_path,
10521052
(int) (link_sep - relative_ptr), relative_ptr);
10531053
else
1054-
elog(LOG, "create directory \"%s\" and symbolic link \"%s\"",
1054+
elog(VERBOSE, "create directory \"%s\" and symbolic link \"%s\"",
10551055
linked_path, relative_ptr);
10561056

10571057
/* Firstly, create linked directory */
@@ -1082,7 +1082,7 @@ create_data_directories(const char *data_dir, const char *backup_dir,
10821082
}
10831083

10841084
create_directory:
1085-
elog(LOG, "create directory \"%s\"", relative_ptr);
1085+
elog(VERBOSE, "create directory \"%s\"", relative_ptr);
10861086

10871087
/* This is not symlink, create directory */
10881088
join_path_components(to_path, data_dir, relative_ptr);

src/restore.c

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
5757
int base_full_backup_index = 0;
5858
int corrupted_backup_index = 0;
5959
char *action = is_restore ? "Restore":"Validate";
60+
parray *parent_chain = NULL;
6061

6162
if (is_restore)
6263
{
@@ -285,34 +286,53 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
285286
if (is_restore)
286287
check_tablespace_mapping(dest_backup);
287288

289+
/* At this point we are sure that parent chain is whole
290+
* so we can build separate array, containing all needed backups,
291+
* to simplify validation and restore
292+
*/
293+
parent_chain = parray_new();
294+
295+
/* Take every backup that is a child of base_backup AND parent of dest_backup
296+
* including base_backup and dest_backup
297+
*/
298+
for (i = base_full_backup_index; i >= dest_backup_index; i--)
299+
{
300+
tmp_backup = (pgBackup *) parray_get(backups, i);
301+
302+
if (is_parent(base_full_backup->start_time, tmp_backup, true) &&
303+
is_parent(tmp_backup->start_time, dest_backup, true))
304+
{
305+
parray_append(parent_chain, tmp_backup);
306+
}
307+
}
308+
309+
/* for validation or restore with enabled validation */
288310
if (!is_restore || !rt->restore_no_validate)
289311
{
290312
if (dest_backup->backup_mode != BACKUP_MODE_FULL)
291313
elog(INFO, "Validating parents for backup %s", base36enc(dest_backup->start_time));
292314

293315
/*
294316
* Validate backups from base_full_backup to dest_backup.
295-
* At this point we are sure that parent chain is intact.
296317
*/
297-
for (i = base_full_backup_index; i >= dest_backup_index; i--)
318+
for (i = 0; i < parray_num(parent_chain); i++)
298319
{
299-
tmp_backup = (pgBackup *) parray_get(backups, i);
320+
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
300321

301-
if (is_parent(base_full_backup->start_time, tmp_backup, true))
322+
pgBackupValidate(tmp_backup);
323+
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
324+
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
302325
{
303-
304-
pgBackupValidate(tmp_backup);
305-
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
306-
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
307-
{
308-
corrupted_backup = tmp_backup;
309-
corrupted_backup_index = i;
310-
break;
311-
}
312-
/* We do not validate WAL files of intermediate backups
313-
* It`s done to speed up restore
326+
corrupted_backup = tmp_backup;
327+
/* we need corrupted backup index from 'backups' not parent_chain
328+
* so we can properly orphanize all its descendants
314329
*/
330+
corrupted_backup_index = get_backup_index_number(backups, corrupted_backup);
331+
break;
315332
}
333+
/* We do not validate WAL files of intermediate backups
334+
* It`s done to speed up restore
335+
*/
316336
}
317337

318338
/* There is no point in wal validation of corrupted backups */
@@ -355,7 +375,6 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
355375
}
356376
}
357377

358-
// TODO: rewrite restore to use parent_chain
359378
/*
360379
* If dest backup is corrupted or was orphaned in previous check
361380
* produce corresponding error message
@@ -376,13 +395,12 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
376395
base36enc(dest_backup->start_time), status2str(dest_backup->status));
377396

378397
/* We ensured that all backups are valid, now restore if required
379-
* TODO: use parent_link
380398
*/
381399
if (is_restore)
382400
{
383-
for (i = base_full_backup_index; i >= dest_backup_index; i--)
401+
for (i = 0; i < parray_num(parent_chain); i++)
384402
{
385-
pgBackup *backup = (pgBackup *) parray_get(backups, i);
403+
pgBackup *backup = (pgBackup *) parray_get(parent_chain, i);
386404

387405
if (rt->lsn_specified && parse_server_version(backup->server_version) < 100000)
388406
elog(ERROR, "Backup %s was created for version %s which doesn't support recovery_target_lsn",
@@ -405,6 +423,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
405423
/* cleanup */
406424
parray_walk(backups, pgBackupFree);
407425
parray_free(backups);
426+
parray_free(parent_chain);
408427

409428
elog(INFO, "%s of backup %s completed.",
410429
action, base36enc(dest_backup->start_time));
@@ -480,6 +499,7 @@ restore_backup(pgBackup *backup)
480499
/* By default there are some error */
481500
threads_args[i].ret = 1;
482501

502+
/* Useless message TODO: rewrite */
483503
elog(LOG, "Start thread for num:%zu", parray_num(files));
484504

485505
pthread_create(&threads[i], NULL, restore_files, arg);

0 commit comments

Comments
 (0)