
Commit

Enhance error handling for rate limits; introduce RetryableAssignmentOperation and RetryableOperation functions to manage throttling and implement exponential backoff with jitter. Update state mapping in settings catalog to remove unused fields, improving clarity and maintainability.
ShocOne committed Dec 1, 2024
1 parent d4f72bb commit 5c533d9
Showing 9 changed files with 820 additions and 334 deletions.
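
The retry helpers added in this commit are meant to wrap individual Graph SDK calls at resource call sites. A minimal sketch of a hypothetical call site follows; the client variable, the request builder chain, the operation label, and the requiredPermissions argument are illustrative, not taken from this commit:

// Hypothetical call site: RetryableOperation re-invokes the wrapped call on
// 429 responses, sleeping with exponential backoff and jitter between tries.
// The builder chain mimics msgraph-sdk-go style but is only illustrative.
err := retry.RetryableOperation(ctx, "create_device_configuration", func() error {
    _, err := client.DeviceManagement().DeviceConfigurations().Post(ctx, requestBody, nil)
    return err
})
if err != nil {
    errors.HandleGraphError(ctx, err, resp, "Create", requiredPermissions)
}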
25 changes: 25 additions & 0 deletions internal/resources/common/errors/error_handling.go
@@ -26,6 +26,7 @@ type GraphErrorInfo struct {
    AdditionalData map[string]interface{}
    Headers        *abstractions.ResponseHeaders
    RequestDetails string
    RetryAfter     string
}

// standardErrorDescriptions provides consistent error messaging across the provider
@@ -85,6 +86,9 @@ func HandleGraphError(ctx context.Context, err error, resp interface{}, operatio
    case 401, 403:
        handlePermissionError(ctx, errorInfo, resp, operation, requiredPermissions)

    case 429:
        handleRateLimitError(ctx, errorInfo, resp)

    default:
        // Handle all other cases
        addErrorToDiagnostics(ctx, resp, errorDesc.Summary,
@@ -216,6 +220,27 @@ func handlePermissionError(ctx context.Context, errorInfo GraphErrorInfo, resp i
    addErrorToDiagnostics(ctx, resp, errorDesc.Summary, detail)
}

// handleRateLimitError processes rate limit errors and adds retry information to the error message
func handleRateLimitError(ctx context.Context, errorInfo GraphErrorInfo, resp interface{}) GraphErrorInfo {
    if headers := errorInfo.Headers; headers != nil {
        retryValues := headers.Get("Retry-After")
        if len(retryValues) > 0 {
            errorInfo.RetryAfter = retryValues[0]
        }
    }

    tflog.Warn(ctx, "Rate limit exceeded", map[string]interface{}{
        "retry_after": errorInfo.RetryAfter,
        "details":     errorInfo.ErrorMessage,
    })

    errorDesc := getErrorDescription(429)
    detail := constructErrorDetail(errorDesc.Detail, errorInfo.ErrorMessage)
    addErrorToDiagnostics(ctx, resp, errorDesc.Summary, detail)

    return errorInfo
}

// addErrorToDiagnostics adds an error to the response diagnostics
func addErrorToDiagnostics(ctx context.Context, resp interface{}, summary, detail string) {
    switch r := resp.(type) {
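
The handler above only records the throttling details (in diagnostics and on the returned GraphErrorInfo); the waiting happens in the new retry package shown below. One detail worth calling out: Graph sends Retry-After as a bare seconds count, which is why the retry helpers append "s" before parsing. A standalone sketch, assuming an illustrative header value of "120":

package main

import (
    "fmt"
    "time"
)

func main() {
    // A Retry-After header of "120" parses to a 2-minute wait once "s" is
    // appended, turning the bare count into a valid time.ParseDuration input.
    retryAfter := "120" // illustrative header value
    if d, err := time.ParseDuration(retryAfter + "s"); err == nil {
        fmt.Println(d) // prints 2m0s
    }
}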
97 changes: 97 additions & 0 deletions internal/resources/common/retry/retry_assignments.go
@@ -0,0 +1,97 @@
// REF: https://learn.microsoft.com/en-us/graph/throttling-limits#assignment-service-limits

package retry

import (
    "context"
    "time"

    "github.com/deploymenttheory/terraform-provider-microsoft365/internal/resources/common/errors"
    "github.com/hashicorp/terraform-plugin-log/tflog"
    "golang.org/x/exp/rand"
)

// RetryableAssignmentOperation executes an assignment operation with specific rate limiting
func RetryableAssignmentOperation(ctx context.Context, operation string, fn func() error) error {
    var attempt int
    r := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))

    const (
        tenSecondLimit = 500   // requests per 10 seconds per app per tenant
        hourlyLimit    = 15000 // requests per hour per app per tenant
        maxBackoff     = 10 * time.Second
        baseDelay      = 3 * time.Second // Higher base delay for assignments
    )

    for {
        err := fn()
        if err == nil {
            return nil
        }

        graphError := errors.GraphError(ctx, err)
        if graphError.StatusCode != 429 {
            return err
        }

        // Parse throttle scope and throttle information if the response
        // included headers; guard against a nil header set to avoid a panic.
        var throttleScope ThrottleScope
        var throttleInfo string
        if graphError.Headers != nil {
            if scope := graphError.Headers.Get("x-ms-throttle-scope"); len(scope) > 0 {
                throttleScope = parseThrottleScope(scope[0])
            }
            if info := graphError.Headers.Get("x-ms-throttle-information"); len(info) > 0 {
                throttleInfo = info[0]
            }
        }

        // Use Retry-After if provided, otherwise fall back to exponential
        // backoff. Retry-After arrives as whole seconds, so appending "s"
        // makes it a valid time.ParseDuration input.
        var backoffDelay time.Duration
        if graphError.RetryAfter != "" {
            if seconds, err := time.ParseDuration(graphError.RetryAfter + "s"); err == nil {
                backoffDelay = seconds
            }
        }

        if backoffDelay == 0 {
            shift := attempt
            if shift > 5 {
                shift = 5 // cap the exponent; the delay is clamped to maxBackoff anyway
            }
            backoffDelay = baseDelay * time.Duration(1<<shift)
            if backoffDelay > maxBackoff {
                backoffDelay = maxBackoff
            }
        }

        // Add jitter: randomly between 50-100% of calculated delay
        jitterDelay := backoffDelay/2 + time.Duration(r.Int63n(int64(backoffDelay/2)))
        attempt++

        logDetails := map[string]interface{}{
            "operation":      operation,
            "attempt":        attempt,
            "delay_seconds":  jitterDelay.Seconds(),
            "status_code":    graphError.StatusCode,
            "rate_limit_10s": tenSecondLimit,
            "rate_limit_1h":  hourlyLimit,
        }

        if throttleInfo != "" {
            logDetails["throttle_reason"] = throttleInfo
        }
        if throttleScope != (ThrottleScope{}) {
            logDetails["throttle_scope"] = throttleScope.Scope
            logDetails["throttle_limit"] = throttleScope.Limit
        }

        tflog.Info(ctx, "Microsoft Graph assignment rate limit encountered", logDetails)

        // Stop the timer inline rather than with defer; defers would pile up
        // across retry iterations for the lifetime of the call.
        timer := time.NewTimer(jitterDelay)
        select {
        case <-ctx.Done():
            timer.Stop()
            return ctx.Err()
        case <-timer.C:
            continue
        }
    }
}
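
Assignment endpoints get a dedicated wrapper because of the tighter service budget documented in the REF link (500 requests per 10 seconds, 15,000 per hour, hence the higher 3-second base delay). A hypothetical call site; assignBuilder and assignBody are illustrative stand-ins for a real msgraph-sdk-go request:

// Hypothetical usage of RetryableAssignmentOperation around an assignment POST.
err := retry.RetryableAssignmentOperation(ctx, "assign_device_configuration", func() error {
    _, err := assignBuilder.Post(ctx, assignBody, nil)
    return err
})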
114 changes: 114 additions & 0 deletions internal/resources/common/retry/retry_global.go
@@ -0,0 +1,114 @@
// REF: https://learn.microsoft.com/en-us/graph/throttling

package retry

import (
    "context"
    "strings"
    "time"

    "github.com/deploymenttheory/terraform-provider-microsoft365/internal/resources/common/errors"
    "github.com/hashicorp/terraform-plugin-log/tflog"
    "golang.org/x/exp/rand"
)

// ThrottleScope represents the scope of throttling from x-ms-throttle-scope header
type ThrottleScope struct {
    Scope         string
    Limit         string
    ApplicationID string
    ResourceID    string
}

// parseThrottleScope parses the x-ms-throttle-scope header
func parseThrottleScope(scope string) ThrottleScope {
    parts := strings.Split(scope, "/")
    if len(parts) != 4 {
        return ThrottleScope{}
    }
    return ThrottleScope{
        Scope:         parts[0],
        Limit:         parts[1],
        ApplicationID: parts[2],
        ResourceID:    parts[3],
    }
}

// RetryableOperation executes an operation with automatic retry on rate limit errors
func RetryableOperation(ctx context.Context, operation string, fn func() error) error {
    var attempt int
    r := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))

    for {
        err := fn()
        if err == nil {
            return nil
        }

        graphError := errors.GraphError(ctx, err)
        if graphError.StatusCode != 429 {
            return err
        }

        // Parse throttle scope and throttle information if the response
        // included headers; guard against a nil header set to avoid a panic.
        var throttleScope ThrottleScope
        var throttleInfo string
        if graphError.Headers != nil {
            if scope := graphError.Headers.Get("x-ms-throttle-scope"); len(scope) > 0 {
                throttleScope = parseThrottleScope(scope[0])
            }
            if info := graphError.Headers.Get("x-ms-throttle-information"); len(info) > 0 {
                throttleInfo = info[0]
            }
        }

        const maxBackoff = 10 * time.Second
        baseDelay := 2 * time.Second

        // Use Retry-After if provided, otherwise fall back to exponential
        // backoff. Retry-After arrives as whole seconds, so appending "s"
        // makes it a valid time.ParseDuration input.
        var backoffDelay time.Duration
        if graphError.RetryAfter != "" {
            if seconds, err := time.ParseDuration(graphError.RetryAfter + "s"); err == nil {
                backoffDelay = seconds
            }
        }

        if backoffDelay == 0 {
            shift := attempt
            if shift > 5 {
                shift = 5 // cap the exponent; the delay is clamped to maxBackoff anyway
            }
            backoffDelay = baseDelay * time.Duration(1<<shift)
            if backoffDelay > maxBackoff {
                backoffDelay = maxBackoff
            }
        }

        // Add jitter: randomly between 50-100% of calculated delay
        jitterDelay := backoffDelay/2 + time.Duration(r.Int63n(int64(backoffDelay/2)))
        attempt++

        logDetails := map[string]interface{}{
            "operation":     operation,
            "attempt":       attempt,
            "delay_seconds": jitterDelay.Seconds(),
            "status_code":   graphError.StatusCode,
        }

        if throttleInfo != "" {
            logDetails["throttle_reason"] = throttleInfo
        }
        if throttleScope != (ThrottleScope{}) {
            logDetails["throttle_scope"] = throttleScope.Scope
            logDetails["throttle_limit"] = throttleScope.Limit
        }

        tflog.Info(ctx, "Microsoft Graph rate limit encountered", logDetails)

        // Stop the timer inline rather than with defer; defers would pile up
        // across retry iterations for the lifetime of the call.
        timer := time.NewTimer(jitterDelay)
        select {
        case <-ctx.Done():
            timer.Stop()
            return ctx.Err()
        case <-timer.C:
            continue
        }
    }
}
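
A hypothetical in-package test illustrating the header shape parseThrottleScope expects: four '/'-separated segments (scope, limit policy, application ID, resource ID). The policy name and GUID below are placeholders, not values observed from Graph:

package retry

import "testing"

func TestParseThrottleScope(t *testing.T) {
    ts := parseThrottleScope("Tenant/HighLimit/00000000-0000-0000-0000-000000000000/users")
    if ts.Scope != "Tenant" || ts.Limit != "HighLimit" {
        t.Fatalf("unexpected parse result: %+v", ts)
    }
    // Anything that does not split into exactly four segments yields the zero value.
    if parseThrottleScope("malformed") != (ThrottleScope{}) {
        t.Fatal("expected zero value for a malformed header")
    }
}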
121 changes: 121 additions & 0 deletions internal/resources/common/retry/retry_intune.go
@@ -0,0 +1,121 @@
// REF: https://learn.microsoft.com/en-us/graph/throttling-limits#intune-service-limits

package retry

import (
    "context"
    "time"

    "github.com/deploymenttheory/terraform-provider-microsoft365/internal/resources/common/errors"
    "github.com/hashicorp/terraform-plugin-log/tflog"
    "golang.org/x/exp/rand"
)

// IntuneOperationType defines the type of Intune operation
type IntuneOperationType string

const (
    IntuneWrite IntuneOperationType = "Write" // POST, PUT, DELETE, PATCH
    IntuneRead  IntuneOperationType = "Read"  // GET and others
)

// RetryableIntuneOperation executes an Intune operation with specific rate limiting
func RetryableIntuneOperation(ctx context.Context, operation string, opType IntuneOperationType, fn func() error) error {
    var attempt int
    r := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))

    const (
        // Write operations (POST, PUT, DELETE, PATCH)
        writePerAppLimit = 100 // requests per 20 seconds
        writeTenantLimit = 200 // requests per 20 seconds

        // General operations
        generalPerAppLimit = 1000 // requests per 20 seconds
        generalTenantLimit = 2000 // requests per 20 seconds

        maxBackoff = 10 * time.Second
        baseDelay  = 2 * time.Second
    )

    for {
        err := fn()
        if err == nil {
            return nil
        }

        graphError := errors.GraphError(ctx, err)
        if graphError.StatusCode != 429 {
            return err
        }

        // Parse throttle scope and throttle information if the response
        // included headers; guard against a nil header set to avoid a panic.
        var throttleScope ThrottleScope
        var throttleInfo string
        if graphError.Headers != nil {
            if scope := graphError.Headers.Get("x-ms-throttle-scope"); len(scope) > 0 {
                throttleScope = parseThrottleScope(scope[0])
            }
            if info := graphError.Headers.Get("x-ms-throttle-information"); len(info) > 0 {
                throttleInfo = info[0]
            }
        }

        // Use Retry-After if provided, otherwise fall back to exponential
        // backoff. Retry-After arrives as whole seconds, so appending "s"
        // makes it a valid time.ParseDuration input.
        var backoffDelay time.Duration
        if graphError.RetryAfter != "" {
            if seconds, err := time.ParseDuration(graphError.RetryAfter + "s"); err == nil {
                backoffDelay = seconds
            }
        }

        if backoffDelay == 0 {
            shift := attempt
            if shift > 5 {
                shift = 5 // cap the exponent; the delay is clamped to maxBackoff anyway
            }
            backoffDelay = baseDelay * time.Duration(1<<shift)
            if backoffDelay > maxBackoff {
                backoffDelay = maxBackoff
            }
        }

        // Add jitter: randomly between 50-100% of calculated delay
        jitterDelay := backoffDelay/2 + time.Duration(r.Int63n(int64(backoffDelay/2)))
        attempt++

        // Enhanced logging with rate limit context
        logDetails := map[string]interface{}{
            "operation":      operation,
            "attempt":        attempt,
            "delay_seconds":  jitterDelay.Seconds(),
            "status_code":    graphError.StatusCode,
            "operation_type": string(opType),
        }

        if opType == IntuneWrite {
            logDetails["rate_limit_per_app"] = writePerAppLimit
            logDetails["rate_limit_tenant"] = writeTenantLimit
        } else {
            logDetails["rate_limit_per_app"] = generalPerAppLimit
            logDetails["rate_limit_tenant"] = generalTenantLimit
        }
        logDetails["window_seconds"] = 20

        if throttleInfo != "" {
            logDetails["throttle_reason"] = throttleInfo
        }
        if throttleScope != (ThrottleScope{}) {
            logDetails["throttle_scope"] = throttleScope.Scope
            logDetails["throttle_limit"] = throttleScope.Limit
        }

        tflog.Info(ctx, "Intune service rate limit encountered", logDetails)

        // Stop the timer inline rather than with defer; defers would pile up
        // across retry iterations for the lifetime of the call.
        timer := time.NewTimer(jitterDelay)
        select {
        case <-ctx.Done():
            timer.Stop()
            return ctx.Err()
        case <-timer.C:
            continue
        }
    }
}
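
To make the shared backoff schedule concrete: with baseDelay = 2s and maxBackoff = 10s, the pre-jitter delays grow 2s, 4s, 8s, then stay clamped at 10s, and jitter draws each wait uniformly from [delay/2, delay). A standalone sketch reproducing the calculation used by all three helpers:

package main

import (
    "fmt"
    "time"

    "golang.org/x/exp/rand"
)

func main() {
    r := rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
    const (
        baseDelay  = 2 * time.Second
        maxBackoff = 10 * time.Second
    )
    for attempt := 0; attempt < 5; attempt++ {
        // Exponential growth capped at maxBackoff, then 50-100% jitter.
        delay := baseDelay * time.Duration(1<<attempt)
        if delay > maxBackoff {
            delay = maxBackoff
        }
        jittered := delay/2 + time.Duration(r.Int63n(int64(delay/2)))
        fmt.Printf("attempt %d: base %v, jittered %v\n", attempt+1, delay, jittered)
    }
}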
(diffs for the remaining 5 changed files not shown)
