From 1a57963b8692c5025972a39f7b040568d33aaf67 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 16 Dec 2025 23:51:16 +0000 Subject: [PATCH 1/2] feat(prebuilds): add incremental backoff for failed prebuild creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prebuilds reconcile loop can currently spam workspace creation attempts that are always going to fail, such as when required dynamic parameters are missing or unresolved. This change introduces an in-memory per-preset backoff mechanism that tracks consecutive creation failures and delays subsequent creation attempts using linear backoff: - First failure: backoff for 1x interval (default 1 minute) - Second consecutive failure: backoff for 2x interval (2 minutes) - Third consecutive failure: backoff for 3x interval (3 minutes) - And so on... When a creation succeeds, the failure tracking is cleared and any subsequent failure starts backoff from 1x interval again. This complements the existing database-based backoff system by preventing immediate retry spam when creation fails quickly (e.g., due to missing parameters), while still allowing periodic retries and recovery when issues are fixed. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- enterprise/coderd/prebuilds/reconcile.go | 87 +++++++++++++++++++ enterprise/coderd/prebuilds/reconcile_test.go | 84 ++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 17a56d484c9f6..c4f7d2b37f5ec 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -58,6 +58,16 @@ type StoreReconciler struct { metrics *MetricsCollector // Operational metrics reconciliationDuration prometheus.Histogram + + // Per-preset creation failure tracking for incremental backoff + creationFailures map[uuid.UUID]*presetCreationFailure + creationFailuresMutex sync.RWMutex +} + +// presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff. +type presetCreationFailure struct { + consecutiveFailures int + lastFailureAt time.Time } var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} @@ -102,6 +112,7 @@ func NewStoreReconciler(store database.Store, buildUsageChecker: buildUsageChecker, done: make(chan struct{}, 1), provisionNotifyCh: make(chan database.ProvisionerJob, 10), + creationFailures: make(map[uuid.UUID]*presetCreationFailure), } if registerer != nil { @@ -124,6 +135,68 @@ func NewStoreReconciler(store database.Store, return reconciler } +// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count. +func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) { + c.recordCreationFailure(presetID) +} + +// RecordCreationSuccess clears the failure tracking for a preset after a successful creation. +func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) { + c.recordCreationSuccess(presetID) +} + +// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures. +// It returns true and the backoff time if we should delay, false and zero time otherwise. +func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) { + return c.shouldBackoffCreation(presetID) +} + +// recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count. +func (c *StoreReconciler) recordCreationFailure(presetID uuid.UUID) { + c.creationFailuresMutex.Lock() + defer c.creationFailuresMutex.Unlock() + + failure, exists := c.creationFailures[presetID] + if !exists { + failure = &presetCreationFailure{} + c.creationFailures[presetID] = failure + } + + failure.consecutiveFailures++ + failure.lastFailureAt = c.clock.Now() +} + +// recordCreationSuccess clears the failure tracking for a preset after a successful creation. +func (c *StoreReconciler) recordCreationSuccess(presetID uuid.UUID) { + c.creationFailuresMutex.Lock() + defer c.creationFailuresMutex.Unlock() + + delete(c.creationFailures, presetID) +} + +// shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures. +// It returns true and the backoff time if we should delay, false and zero time otherwise. +func (c *StoreReconciler) shouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) { + c.creationFailuresMutex.RLock() + defer c.creationFailuresMutex.RUnlock() + + failure, exists := c.creationFailures[presetID] + if !exists || failure.consecutiveFailures == 0 { + return false, time.Time{} + } + + // Calculate exponential backoff: backoffInterval * consecutiveFailures + // This gives us a linear backoff that increases with each consecutive failure. + backoffDuration := c.cfg.ReconciliationBackoffInterval.Value() * time.Duration(failure.consecutiveFailures) + backoffUntil := failure.lastFailureAt.Add(backoffDuration) + + if c.clock.Now().Before(backoffUntil) { + return true, backoffUntil + } + + return false, time.Time{} +} + func (c *StoreReconciler) Run(ctx context.Context) { reconciliationInterval := c.cfg.ReconciliationInterval.Value() if reconciliationInterval <= 0 { // avoids a panic @@ -643,6 +716,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge return nil case prebuilds.ActionTypeCreate: + // Check if we should backoff on this preset due to recent creation failures + if shouldBackoff, backoffUntil := c.shouldBackoffCreation(ps.Preset.ID); shouldBackoff { + logger.Warn(ctx, "backing off prebuild creation due to recent failures", + slog.F("preset_id", ps.Preset.ID.String()), + slog.F("backoff_until", backoffUntil.Format(time.RFC3339)), + slog.F("backoff_secs", math.Round(backoffUntil.Sub(c.clock.Now()).Seconds())), + ) + return nil + } + // Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes. // See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/. // This is obviously not comprehensive protection against this sort of problem, but this is one essential check. @@ -666,7 +749,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge for range action.Create { if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil { logger.Error(ctx, "failed to create prebuild", slog.Error(err)) + c.recordCreationFailure(ps.Preset.ID) multiErr.Errors = append(multiErr.Errors, err) + } else { + // Only clear failure tracking if we successfully created at least one prebuild + c.recordCreationSuccess(ps.Preset.ID) } } diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 7548faebd7dab..0dd7912e8df6e 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -2972,3 +2972,87 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) { require.NoError(t, err) require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming") } + + +func TestIncrementalBackoffOnCreationFailure(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitLong) + clock := quartz.NewMock(t) + db, ps := dbtestutil.NewDB(t) + backoffInterval := 1 * time.Minute + cfg := codersdk.PrebuildsConfig{ + ReconciliationInterval: serpent.Duration(testutil.WaitLong), + ReconciliationBackoffInterval: serpent.Duration(backoffInterval), + } + logger := slogtest.Make(t, nil) + cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{}) + reconciler := prebuilds.NewStoreReconciler(db, ps, cache, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer(), newNoopUsageCheckerPtr()) + + // Setup a template with a preset + org := dbgen.Organization(t, db, database.Organization{}) + user := dbgen.User(t, db, database.User{}) + template := dbgen.Template(t, db, database.Template{ + CreatedBy: user.ID, + OrganizationID: org.ID, + }) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, user.ID, template.ID) + presetID := setupTestDBPreset(t, db, templateVersionID, 1, "test").ID + + // Test the backoff mechanism directly by simulating failures + // First failure + reconciler.RecordCreationFailure(presetID) + + // Check that backoff is active + shouldBackoff, backoffUntil := reconciler.ShouldBackoffCreation(presetID) + require.True(t, shouldBackoff, "should be in backoff after first failure") + expectedBackoff := clock.Now().Add(backoffInterval) + require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 1x interval after first failure") + + // Advance clock past first backoff + clock.Advance(backoffInterval + time.Second) + + // Should no longer be in backoff + shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID) + require.False(t, shouldBackoff, "should not be in backoff after period expires") + + // Second consecutive failure + reconciler.RecordCreationFailure(presetID) + + // Check that backoff is longer now (2 * interval) + shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID) + require.True(t, shouldBackoff, "should be in backoff after second failure") + expectedBackoff = clock.Now().Add(2 * backoffInterval) + require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 2x interval after second failure") + + // Advance clock by only 1 interval - should still be in backoff + clock.Advance(backoffInterval) + shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID) + require.True(t, shouldBackoff, "should still be in backoff after 1 interval with 2 failures") + + // Advance clock by another interval - backoff should expire + clock.Advance(backoffInterval + time.Second) + shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID) + require.False(t, shouldBackoff, "should not be in backoff after 2 intervals expire") + + // Third consecutive failure + reconciler.RecordCreationFailure(presetID) + + // Check that backoff is even longer now (3 * interval) + shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID) + require.True(t, shouldBackoff, "should be in backoff after third failure") + expectedBackoff = clock.Now().Add(3 * backoffInterval) + require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 3x interval after third failure") + + // Successful creation should clear the backoff + reconciler.RecordCreationSuccess(presetID) + shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID) + require.False(t, shouldBackoff, "should not be in backoff after successful creation") + + // New failure after success should start backoff from 1x interval again + reconciler.RecordCreationFailure(presetID) + shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID) + require.True(t, shouldBackoff, "should be in backoff after failure following success") + expectedBackoff = clock.Now().Add(backoffInterval) + require.Equal(t, expectedBackoff, backoffUntil, "backoff should reset to 1x interval after success") +} From fb63db712110444bcc9f6df5eb458bfec63e5e38 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 17 Dec 2025 00:08:32 +0000 Subject: [PATCH 2/2] refactor: extract failure tracking to presetFailuresTracker struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Encapsulates the failure tracking logic into a dedicated presetFailuresTracker struct with its own methods, improving code organization and separation of concerns. Changes: - Created presetFailuresTracker struct with failures map and mutex - Moved RecordFailure, RecordSuccess, and ShouldBackoff methods to the tracker - Updated StoreReconciler to hold a failureTracker instance - Updated all call sites to use the new tracker methods 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- enterprise/coderd/prebuilds/reconcile.go | 135 ++++++++++-------- enterprise/coderd/prebuilds/reconcile_test.go | 1 - 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index c4f7d2b37f5ec..4e945e4451ec7 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -37,6 +37,72 @@ import ( "github.com/coder/quartz" ) +// presetFailuresTracker tracks creation failures for presets to implement incremental backoff. +type presetFailuresTracker struct { + failures map[uuid.UUID]*presetCreationFailure + mu sync.RWMutex + clock quartz.Clock +} + +// presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff. +type presetCreationFailure struct { + consecutiveFailures int + lastFailureAt time.Time +} + +func newPresetFailuresTracker(clock quartz.Clock) *presetFailuresTracker { + return &presetFailuresTracker{ + failures: make(map[uuid.UUID]*presetCreationFailure), + clock: clock, + } +} + +// RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count. +func (t *presetFailuresTracker) RecordFailure(presetID uuid.UUID) { + t.mu.Lock() + defer t.mu.Unlock() + + failure, exists := t.failures[presetID] + if !exists { + failure = &presetCreationFailure{} + t.failures[presetID] = failure + } + + failure.consecutiveFailures++ + failure.lastFailureAt = t.clock.Now() +} + +// RecordSuccess clears the failure tracking for a preset after a successful creation. +func (t *presetFailuresTracker) RecordSuccess(presetID uuid.UUID) { + t.mu.Lock() + defer t.mu.Unlock() + + delete(t.failures, presetID) +} + +// ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures. +// It returns true and the backoff time if we should delay, false and zero time otherwise. +func (t *presetFailuresTracker) ShouldBackoff(presetID uuid.UUID, backoffInterval time.Duration) (bool, time.Time) { + t.mu.RLock() + defer t.mu.RUnlock() + + failure, exists := t.failures[presetID] + if !exists || failure.consecutiveFailures == 0 { + return false, time.Time{} + } + + // Calculate exponential backoff: backoffInterval * consecutiveFailures + // This gives us a linear backoff that increases with each consecutive failure. + backoffDuration := backoffInterval * time.Duration(failure.consecutiveFailures) + backoffUntil := failure.lastFailureAt.Add(backoffDuration) + + if t.clock.Now().Before(backoffUntil) { + return true, backoffUntil + } + + return false, time.Time{} +} + type StoreReconciler struct { store database.Store cfg codersdk.PrebuildsConfig @@ -60,14 +126,7 @@ type StoreReconciler struct { reconciliationDuration prometheus.Histogram // Per-preset creation failure tracking for incremental backoff - creationFailures map[uuid.UUID]*presetCreationFailure - creationFailuresMutex sync.RWMutex -} - -// presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff. -type presetCreationFailure struct { - consecutiveFailures int - lastFailureAt time.Time + failureTracker *presetFailuresTracker } var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} @@ -112,7 +171,7 @@ func NewStoreReconciler(store database.Store, buildUsageChecker: buildUsageChecker, done: make(chan struct{}, 1), provisionNotifyCh: make(chan database.ProvisionerJob, 10), - creationFailures: make(map[uuid.UUID]*presetCreationFailure), + failureTracker: newPresetFailuresTracker(clock), } if registerer != nil { @@ -137,64 +196,18 @@ func NewStoreReconciler(store database.Store, // RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count. func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) { - c.recordCreationFailure(presetID) + c.failureTracker.RecordFailure(presetID) } // RecordCreationSuccess clears the failure tracking for a preset after a successful creation. func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) { - c.recordCreationSuccess(presetID) + c.failureTracker.RecordSuccess(presetID) } // ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures. // It returns true and the backoff time if we should delay, false and zero time otherwise. func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) { - return c.shouldBackoffCreation(presetID) -} - -// recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count. -func (c *StoreReconciler) recordCreationFailure(presetID uuid.UUID) { - c.creationFailuresMutex.Lock() - defer c.creationFailuresMutex.Unlock() - - failure, exists := c.creationFailures[presetID] - if !exists { - failure = &presetCreationFailure{} - c.creationFailures[presetID] = failure - } - - failure.consecutiveFailures++ - failure.lastFailureAt = c.clock.Now() -} - -// recordCreationSuccess clears the failure tracking for a preset after a successful creation. -func (c *StoreReconciler) recordCreationSuccess(presetID uuid.UUID) { - c.creationFailuresMutex.Lock() - defer c.creationFailuresMutex.Unlock() - - delete(c.creationFailures, presetID) -} - -// shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures. -// It returns true and the backoff time if we should delay, false and zero time otherwise. -func (c *StoreReconciler) shouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) { - c.creationFailuresMutex.RLock() - defer c.creationFailuresMutex.RUnlock() - - failure, exists := c.creationFailures[presetID] - if !exists || failure.consecutiveFailures == 0 { - return false, time.Time{} - } - - // Calculate exponential backoff: backoffInterval * consecutiveFailures - // This gives us a linear backoff that increases with each consecutive failure. - backoffDuration := c.cfg.ReconciliationBackoffInterval.Value() * time.Duration(failure.consecutiveFailures) - backoffUntil := failure.lastFailureAt.Add(backoffDuration) - - if c.clock.Now().Before(backoffUntil) { - return true, backoffUntil - } - - return false, time.Time{} + return c.failureTracker.ShouldBackoff(presetID, c.cfg.ReconciliationBackoffInterval.Value()) } func (c *StoreReconciler) Run(ctx context.Context) { @@ -717,7 +730,7 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge case prebuilds.ActionTypeCreate: // Check if we should backoff on this preset due to recent creation failures - if shouldBackoff, backoffUntil := c.shouldBackoffCreation(ps.Preset.ID); shouldBackoff { + if shouldBackoff, backoffUntil := c.failureTracker.ShouldBackoff(ps.Preset.ID, c.cfg.ReconciliationBackoffInterval.Value()); shouldBackoff { logger.Warn(ctx, "backing off prebuild creation due to recent failures", slog.F("preset_id", ps.Preset.ID.String()), slog.F("backoff_until", backoffUntil.Format(time.RFC3339)), @@ -749,11 +762,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge for range action.Create { if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil { logger.Error(ctx, "failed to create prebuild", slog.Error(err)) - c.recordCreationFailure(ps.Preset.ID) + c.failureTracker.RecordFailure(ps.Preset.ID) multiErr.Errors = append(multiErr.Errors, err) } else { // Only clear failure tracking if we successfully created at least one prebuild - c.recordCreationSuccess(ps.Preset.ID) + c.failureTracker.RecordSuccess(ps.Preset.ID) } } diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 0dd7912e8df6e..40642e091df77 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -2973,7 +2973,6 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) { require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming") } - func TestIncrementalBackoffOnCreationFailure(t *testing.T) { t.Parallel()