From 79acfa9de607c31dd5fff78fa2bef68ea87f477a Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Mon, 25 Sep 2023 14:00:16 +0500 Subject: [PATCH 1/4] Fix gprestore with --resize-cluster to a smaller one When restoring a large cluster to a smaller one with the --resize-cluster option, the error "Found incorrect number of backup files" occurs. This error occurs when checking a backup, when the pgrestore expects an increased set of data in the backup directories for certain segments, but in fact this is not the case. Moreover, if you disable the checking, another error will appear indicating that not all expected data was restored. This patch fixes both of these bugs. To correct the first error, the patch collects actual data not only from the directory for one segment, but also from the remaining directories that will be restored on this segment. To correct the second error, the patch changes the path to the file being restored, replacing not only the file name, but also the directory name. This can be done because the path always contains the sequence "...SegmentNumber/backups/..." Tests for restoring with the --resize-cluster option to a smaller cluster contained the moveSegmentBackupFiles function, which moved the backup data to the required directories. Now you don't need to do this anymore. --- end_to_end/end_to_end_suite_test.go | 45 ----------------------------- helper/restore_helper.go | 1 + restore/remote.go | 20 +++++++++++-- 3 files changed, 19 insertions(+), 47 deletions(-) diff --git a/end_to_end/end_to_end_suite_test.go b/end_to_end/end_to_end_suite_test.go index e72fff592..8ad3fbb03 100644 --- a/end_to_end/end_to_end_suite_test.go +++ b/end_to_end/end_to_end_suite_test.go @@ -420,43 +420,6 @@ func extractSavedTarFile(backupDir string, tarBaseName string) string { return extractDirectory } -// Move extracted data files to the proper directory for a larger-to-smaller restore, if necessary -// Assumes all saved backups have a name in the format "N-segment-db-..." where N is the original cluster size -func moveSegmentBackupFiles(tarBaseName string, extractDirectory string, isMultiNode bool, timestamps ...string) { - re := regexp.MustCompile("^([0-9]+)-.*") - origSize, _ := strconv.Atoi(re.FindStringSubmatch(tarBaseName)[1]) - for _, ts := range timestamps { - if ts != "" { - baseDir := fmt.Sprintf("%s/demoDataDir%s/backups/%s/%s", extractDirectory, "%d", ts[0:8], ts) - if isMultiNode { - remoteOutput := backupCluster.GenerateAndExecuteCommand("Create backup directories on segments", cluster.ON_SEGMENTS, func(contentID int) string { - return fmt.Sprintf("mkdir -p %s", fmt.Sprintf(baseDir, contentID)) - }) - backupCluster.CheckClusterError(remoteOutput, "Unable to create directories", func(contentID int) string { - return "" - }) - for i := 0; i < origSize; i++ { - origDir := fmt.Sprintf(baseDir, i) - destDir := fmt.Sprintf(baseDir, i%segmentCount) - _, err := backupCluster.ExecuteLocalCommand(fmt.Sprintf(`rsync -r -e ssh %s/ %s:%s`, origDir, backupCluster.GetHostForContent(i%segmentCount), destDir)) - if err != nil { - Fail(fmt.Sprintf("Could not copy %s to %s: %v", origDir, destDir, err)) - } - } - } else { - for i := segmentCount; i < origSize; i++ { - origDir := fmt.Sprintf(baseDir, i) - destDir := fmt.Sprintf(baseDir, i%segmentCount) - files, _ := path.Glob(fmt.Sprintf("%s/*", origDir)) - for _, dataFile := range files { - os.Rename(dataFile, fmt.Sprintf("%s/%s", destDir, path.Base(dataFile))) - } - } - } - } - } -} - func TestEndToEnd(t *testing.T) { format.MaxLength = 0 RegisterFailHandler(Fail) @@ -1948,11 +1911,6 @@ LANGUAGE plpgsql NO SQL;`) defer testhelper.AssertQueryRuns(restoreConn, `DROP SCHEMA IF EXISTS schematwo CASCADE;`) defer testhelper.AssertQueryRuns(restoreConn, `DROP SCHEMA IF EXISTS schemathree CASCADE;`) - if !testUsesPlugin { // No need to manually move files when using a plugin - isMultiNode := (backupCluster.GetHostForContent(0) != backupCluster.GetHostForContent(-1)) - moveSegmentBackupFiles(tarBaseName, extractDirectory, isMultiNode, fullTimestamp, incrementalTimestamp) - } - // This block stops the test if it hangs. It was introduced to prevent hangs causing timeout failures in Concourse CI. // These hangs are still being observed only in CI, and a definitive RCA has not yet been accomplished completed := make(chan bool) @@ -2097,9 +2055,6 @@ LANGUAGE plpgsql NO SQL;`) extractDirectory := extractSavedTarFile(backupDir, tarBaseName) defer testhelper.AssertQueryRuns(restoreConn, `DROP SCHEMA IF EXISTS schemaone CASCADE;`) - isMultiNode := (backupCluster.GetHostForContent(0) != backupCluster.GetHostForContent(-1)) - moveSegmentBackupFiles(tarBaseName, extractDirectory, isMultiNode, fullTimestamp) - gprestoreArgs := []string{ "--timestamp", fullTimestamp, "--redirect-db", "restoredb", diff --git a/helper/restore_helper.go b/helper/restore_helper.go index 3a9e2c938..a385ae942 100644 --- a/helper/restore_helper.go +++ b/helper/restore_helper.go @@ -382,6 +382,7 @@ func doRestoreAgent() error { func constructSingleTableFilename(name string, contentToRestore int, oid int) string { name = strings.ReplaceAll(name, fmt.Sprintf("gpbackup_%d", *content), fmt.Sprintf("gpbackup_%d", contentToRestore)) + name = strings.ReplaceAll(name, fmt.Sprintf("%d/backups/", *content), fmt.Sprintf("%d/backups/", contentToRestore)) nameParts := strings.Split(name, ".") filename := fmt.Sprintf("%s_%d", nameParts[0], oid) if len(nameParts) > 1 { // We only expect filenames ending in ".gz" or ".zst", but they can contain dots so handle arbitrary numbers of dots diff --git a/restore/remote.go b/restore/remote.go index 1a2c49b2b..2b8990f78 100644 --- a/restore/remote.go +++ b/restore/remote.go @@ -38,8 +38,25 @@ func VerifyBackupDirectoriesExistOnAllHosts() { } func VerifyBackupFileCountOnSegments() { + origSize, destSize, isResizeRestore := GetResizeClusterInfo() + cmdMap := make(map[int][]int, destSize) + for i := 0; i < origSize; i++ { + cmdMap[i%destSize] = append(cmdMap[i%destSize], []int{i}...) + } + for i := origSize; i < destSize; i++ { + cmdMap[i] = append(cmdMap[i], []int{i}...) + } + + GetDirsForContent := func(i int) string { + dirs := "" + for _, e := range cmdMap[i] { + dirs += " " + fmt.Sprintf(`"%s"`, globalFPInfo.GetDirForContent(e)) + } + return dirs + } + remoteOutput := globalCluster.GenerateAndExecuteCommand("Verifying backup file count", cluster.ON_SEGMENTS, func(contentID int) string { - return fmt.Sprintf("find %s -type f | wc -l", globalFPInfo.GetDirForContent(contentID)) + return fmt.Sprintf(`find %s -type f | wc -l`, GetDirsForContent(contentID)) }) globalCluster.CheckClusterError(remoteOutput, "Could not verify backup file count", func(contentID int) string { return "Could not verify backup file count" @@ -51,7 +68,6 @@ func VerifyBackupFileCountOnSegments() { fileCount = len(globalTOC.DataEntries) } - origSize, destSize, isResizeRestore := GetResizeClusterInfo() batchMap := make(map[int]int, len(remoteOutput.Commands)) for i := 0; i < origSize; i++ { batchMap[i%destSize] += fileCount From 324c0150050a19dd7863b95172e0a9f650e74178 Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Mon, 27 Nov 2023 20:25:11 +0500 Subject: [PATCH 2/4] simplify --- restore/remote.go | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/restore/remote.go b/restore/remote.go index 2b8990f78..726907233 100644 --- a/restore/remote.go +++ b/restore/remote.go @@ -39,24 +39,14 @@ func VerifyBackupDirectoriesExistOnAllHosts() { func VerifyBackupFileCountOnSegments() { origSize, destSize, isResizeRestore := GetResizeClusterInfo() - cmdMap := make(map[int][]int, destSize) - for i := 0; i < origSize; i++ { - cmdMap[i%destSize] = append(cmdMap[i%destSize], []int{i}...) - } - for i := origSize; i < destSize; i++ { - cmdMap[i] = append(cmdMap[i], []int{i}...) - } - GetDirsForContent := func(i int) string { + remoteOutput := globalCluster.GenerateAndExecuteCommand("Verifying backup file count", cluster.ON_SEGMENTS, func(contentID int) string { dirs := "" - for _, e := range cmdMap[i] { - dirs += " " + fmt.Sprintf(`"%s"`, globalFPInfo.GetDirForContent(e)) + for contentID < origSize || contentID < destSize { + dirs += " " + fmt.Sprintf(`"%s"`, globalFPInfo.GetDirForContent(contentID)) + contentID += destSize } - return dirs - } - - remoteOutput := globalCluster.GenerateAndExecuteCommand("Verifying backup file count", cluster.ON_SEGMENTS, func(contentID int) string { - return fmt.Sprintf(`find %s -type f | wc -l`, GetDirsForContent(contentID)) + return fmt.Sprintf(`find %s -type f | wc -l`, dirs) }) globalCluster.CheckClusterError(remoteOutput, "Could not verify backup file count", func(contentID int) string { return "Could not verify backup file count" From 25d0ecec2698318c18ceb82cd1b2e32f56d0c264 Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Mon, 27 Nov 2023 21:32:39 +0500 Subject: [PATCH 3/4] remove double quote --- restore/remote.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/restore/remote.go b/restore/remote.go index 726907233..5beba174b 100644 --- a/restore/remote.go +++ b/restore/remote.go @@ -43,10 +43,10 @@ func VerifyBackupFileCountOnSegments() { remoteOutput := globalCluster.GenerateAndExecuteCommand("Verifying backup file count", cluster.ON_SEGMENTS, func(contentID int) string { dirs := "" for contentID < origSize || contentID < destSize { - dirs += " " + fmt.Sprintf(`"%s"`, globalFPInfo.GetDirForContent(contentID)) + dirs += " " + fmt.Sprintf("%s", globalFPInfo.GetDirForContent(contentID)) contentID += destSize } - return fmt.Sprintf(`find %s -type f | wc -l`, dirs) + return fmt.Sprintf("find %s -type f | wc -l", dirs) }) globalCluster.CheckClusterError(remoteOutput, "Could not verify backup file count", func(contentID int) string { return "Could not verify backup file count" From 217a3b8d0dd7453035ecb5c4bdeffc03b39bb34c Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Tue, 28 Nov 2023 08:15:56 +0500 Subject: [PATCH 4/4] comments --- helper/restore_helper.go | 1 + restore/remote.go | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/helper/restore_helper.go b/helper/restore_helper.go index a385ae942..527997e22 100644 --- a/helper/restore_helper.go +++ b/helper/restore_helper.go @@ -382,6 +382,7 @@ func doRestoreAgent() error { func constructSingleTableFilename(name string, contentToRestore int, oid int) string { name = strings.ReplaceAll(name, fmt.Sprintf("gpbackup_%d", *content), fmt.Sprintf("gpbackup_%d", contentToRestore)) + // change the path to the file being restored, replacing not only the file name, but also the directory name name = strings.ReplaceAll(name, fmt.Sprintf("%d/backups/", *content), fmt.Sprintf("%d/backups/", contentToRestore)) nameParts := strings.Split(name, ".") filename := fmt.Sprintf("%s_%d", nameParts[0], oid) diff --git a/restore/remote.go b/restore/remote.go index 5beba174b..bdae535c3 100644 --- a/restore/remote.go +++ b/restore/remote.go @@ -42,6 +42,10 @@ func VerifyBackupFileCountOnSegments() { remoteOutput := globalCluster.GenerateAndExecuteCommand("Verifying backup file count", cluster.ON_SEGMENTS, func(contentID int) string { dirs := "" + /* + * collect actual data not only from the directory for one segment, but also + * from the remaining directories that will be restored on this segment + */ for contentID < origSize || contentID < destSize { dirs += " " + fmt.Sprintf("%s", globalFPInfo.GetDirForContent(contentID)) contentID += destSize