Skip to content

Commit

Permalink
Fix hanging in case the restore helper fails to get oid list from a file
Browse files Browse the repository at this point in the history
  • Loading branch information
whitehawk committed Jun 24, 2024
1 parent c0fd30a commit f6c13e6
Showing 1 changed file with 22 additions and 0 deletions.
22 changes: 22 additions & 0 deletions restore/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"sync"
"sync/atomic"
"time"

"github.com/greenplum-db/gp-common-go-libs/cluster"
"github.com/greenplum-db/gp-common-go-libs/dbconn"
Expand Down Expand Up @@ -226,6 +227,27 @@ func restoreDataFromTimestamp(fpInfo filepath.FilePathInfo, dataEntries []toc.Co
ctx, cancel := context.WithCancel(context.Background())
defer cancel() // Make sure it's called to release resources even if no errors

// Launch a checker that polls whether the restore helper has ended with an error.
// It is our 'ultima ratio regum' (last resort): if the restore helper could not read the file with the oid list, it is
// not aware of the pipes pre-created by gprestore, so it cannot close them. Therefore we cancel all pending COPY commands
// from here, after giving the restore helper a chance to close the pipes on its own.
if backupConfig.SingleDataFile || resizeCluster {
go func() {
for {
time.Sleep(5 * time.Second)
remoteOutput := globalCluster.GenerateAndExecuteCommand("Checking gpbackup_helper agent failure", cluster.ON_SEGMENTS, func(contentID int) string {
helperErrorFileName := fmt.Sprintf("%s_error", fpInfo.GetSegmentPipeFilePath(contentID))
return fmt.Sprintf("! ls %s", helperErrorFileName)
})
if remoteOutput.NumErrors != 0 {
// the delay below is to give the restore helper a chance to close all pipes, if it can...
time.Sleep(5 * time.Second)
cancel()
}
}
}()
}

for i := 0; i < connectionPool.NumConns; i++ {
workerPool.Add(1)
go func(whichConn int) {
Expand Down

0 comments on commit f6c13e6

Please sign in to comment.