// parseDurationString converts a duration string such as `1s` or `10m` (any
// form accepted by time.ParseDuration) into a time.Duration.
//
// Surrounding whitespace is ignored. It returns 0 when value is empty, cannot
// be parsed, or is negative. A non-empty value that is rejected is reported on
// stderr so that a misconfigured environment variable does not silently fall
// back to the zero value.
func parseDurationString(value string) time.Duration {
	value = strings.TrimSpace(value)
	if value == "" {
		return 0
	}

	d, err := time.ParseDuration(value)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ignoring invalid duration %q: %v\n", value, err)
		return 0
	}

	// A negative timeout is meaningless and would make time.After fire
	// immediately, so treat it the same as an unset value.
	if d < 0 {
		fmt.Fprintf(os.Stderr, "ignoring negative duration %q\n", value)
		return 0
	}

	return d
}
@@ -293,6 +293,30 @@ func (m *SupportBundleManager) verifyNodeBundle(file string) error { return err } +func (m *SupportBundleManager) printTimeoutNodes() { + for node := range m.expectedNodes { + logrus.Warnf("Collection timed out for node: %s", node) + } +} + +func (m *SupportBundleManager) waitNodesCompleted() { + select { + case <-m.ch: + logrus.Info("All node bundles are received.") + case <-m.timeout(): + logrus.Info("Some nodes are timeout, not all node bundles are received.") + m.printTimeoutNodes() + } +} + +func (m *SupportBundleManager) timeout() <-chan time.Time { + if m.NodeTimeout == 0 { + return time.After(30 * time.Minute) // default time out + } + + return time.After(m.NodeTimeout) +} + func (m *SupportBundleManager) completeNode(node string) { m.nodesLock.Lock() defer m.nodesLock.Unlock() @@ -308,7 +332,7 @@ func (m *SupportBundleManager) completeNode(node string) { if len(m.expectedNodes) == 0 { if !m.done { logrus.Debugf("All nodes are completed") - m.ch <- struct{}{} + close(m.ch) m.done = true } }