diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index 17a56d484c9f6..4e945e4451ec7 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -37,6 +37,72 @@ import (
 	"github.com/coder/quartz"
 )
 
+// presetFailuresTracker tracks creation failures for presets to implement incremental backoff.
+type presetFailuresTracker struct {
+	failures map[uuid.UUID]*presetCreationFailure
+	mu       sync.RWMutex
+	clock    quartz.Clock
+}
+
+// presetCreationFailure holds the consecutive failure count and the time of the most recent creation failure for a preset.
+type presetCreationFailure struct {
+	consecutiveFailures int
+	lastFailureAt       time.Time
+}
+
+func newPresetFailuresTracker(clock quartz.Clock) *presetFailuresTracker {
+	return &presetFailuresTracker{
+		failures: make(map[uuid.UUID]*presetCreationFailure),
+		clock:    clock,
+	}
+}
+
+// RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
+func (t *presetFailuresTracker) RecordFailure(presetID uuid.UUID) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	failure, exists := t.failures[presetID]
+	if !exists {
+		failure = &presetCreationFailure{}
+		t.failures[presetID] = failure
+	}
+
+	failure.consecutiveFailures++
+	failure.lastFailureAt = t.clock.Now()
+}
+
+// RecordSuccess clears the failure tracking for a preset after a successful creation.
+func (t *presetFailuresTracker) RecordSuccess(presetID uuid.UUID) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	delete(t.failures, presetID)
+}
+
+// ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures.
+// It returns true and the backoff deadline if creation should be delayed, or false and a zero time otherwise.
+func (t *presetFailuresTracker) ShouldBackoff(presetID uuid.UUID, backoffInterval time.Duration) (bool, time.Time) {
+	t.mu.RLock()
+	defer t.mu.RUnlock()
+
+	failure, exists := t.failures[presetID]
+	if !exists || failure.consecutiveFailures == 0 {
+		return false, time.Time{}
+	}
+
+	// Back off linearly: backoffInterval * consecutiveFailures.
+	// Each consecutive failure extends the delay by one more interval.
+	backoffDuration := backoffInterval * time.Duration(failure.consecutiveFailures)
+	backoffUntil := failure.lastFailureAt.Add(backoffDuration)
+
+	if t.clock.Now().Before(backoffUntil) {
+		return true, backoffUntil
+	}
+
+	return false, time.Time{}
+}
+
 type StoreReconciler struct {
 	store database.Store
 	cfg   codersdk.PrebuildsConfig
@@ -58,6 +124,9 @@ type StoreReconciler struct {
 	metrics                *MetricsCollector // Operational metrics
 	reconciliationDuration prometheus.Histogram
+
+	// Per-preset creation failure tracking for incremental backoff
+	failureTracker *presetFailuresTracker
 }
 
 var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
@@ -102,6 +171,7 @@ func NewStoreReconciler(store database.Store,
 		buildUsageChecker:     buildUsageChecker,
 		done:                  make(chan struct{}, 1),
 		provisionNotifyCh:     make(chan database.ProvisionerJob, 10),
+		failureTracker:        newPresetFailuresTracker(clock),
 	}
 
 	if registerer != nil {
@@ -124,6 +194,22 @@ func NewStoreReconciler(store database.Store,
 	return reconciler
 }
 
+// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
+func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) {
+	c.failureTracker.RecordFailure(presetID)
+}
+
+// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
+func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) {
+	c.failureTracker.RecordSuccess(presetID)
+}
+
+// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
+// It returns true and the backoff deadline if creation should be delayed, or false and a zero time otherwise.
+func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
+	return c.failureTracker.ShouldBackoff(presetID, c.cfg.ReconciliationBackoffInterval.Value())
+}
+
 func (c *StoreReconciler) Run(ctx context.Context) {
 	reconciliationInterval := c.cfg.ReconciliationInterval.Value()
 	if reconciliationInterval <= 0 { // avoids a panic
@@ -643,6 +729,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
 		return nil
 
 	case prebuilds.ActionTypeCreate:
+		// Check whether we should back off on this preset due to recent creation failures
+		if shouldBackoff, backoffUntil := c.failureTracker.ShouldBackoff(ps.Preset.ID, c.cfg.ReconciliationBackoffInterval.Value()); shouldBackoff {
+			logger.Warn(ctx, "backing off prebuild creation due to recent failures",
+				slog.F("preset_id", ps.Preset.ID.String()),
+				slog.F("backoff_until", backoffUntil.Format(time.RFC3339)),
+				slog.F("backoff_secs", math.Round(backoffUntil.Sub(c.clock.Now()).Seconds())),
+			)
+			return nil
+		}
+
 		// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
 		// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
 		// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
@@ -666,7 +762,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
 	for range action.Create {
 		if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
 			logger.Error(ctx, "failed to create prebuild", slog.Error(err))
+			c.failureTracker.RecordFailure(ps.Preset.ID)
 			multiErr.Errors = append(multiErr.Errors, err)
+		} else {
+			// Only clear failure tracking if we successfully created at least one prebuild
+			c.failureTracker.RecordSuccess(ps.Preset.ID)
 		}
 	}
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index 7548faebd7dab..40642e091df77 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -2972,3 +2972,86 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
 	require.NoError(t, err)
 	require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming")
 }
+
+func TestIncrementalBackoffOnCreationFailure(t *testing.T) {
+	t.Parallel()
+
+	ctx := testutil.Context(t, testutil.WaitLong)
+	clock := quartz.NewMock(t)
+	db, ps := dbtestutil.NewDB(t)
+	backoffInterval := 1 * time.Minute
+	cfg := codersdk.PrebuildsConfig{
+		ReconciliationInterval:        serpent.Duration(testutil.WaitLong),
+		ReconciliationBackoffInterval: serpent.Duration(backoffInterval),
+	}
+	logger := slogtest.Make(t, nil)
+	cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
+	reconciler := prebuilds.NewStoreReconciler(db, ps, cache, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer(), newNoopUsageCheckerPtr())
+
+	// Set up a template with a preset
+	org := dbgen.Organization(t, db, database.Organization{})
+	user := dbgen.User(t, db, database.User{})
+	template := dbgen.Template(t, db, database.Template{
+		CreatedBy:      user.ID,
+		OrganizationID: org.ID,
+	})
+	templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, user.ID, template.ID)
+	presetID := setupTestDBPreset(t, db, templateVersionID, 1, "test").ID
+
+	// Test the backoff mechanism directly by simulating failures
+	// First failure
+	reconciler.RecordCreationFailure(presetID)
+
+	// Check that backoff is active
+	shouldBackoff, backoffUntil := reconciler.ShouldBackoffCreation(presetID)
+	require.True(t, shouldBackoff, "should be in backoff after first failure")
+	expectedBackoff := clock.Now().Add(backoffInterval)
+	require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 1x interval after first failure")
+
+	// Advance clock past first backoff
+	clock.Advance(backoffInterval + time.Second)
+
+	// Should no longer be in backoff
+	shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
+	require.False(t, shouldBackoff, "should not be in backoff after period expires")
+
+	// Second consecutive failure
+	reconciler.RecordCreationFailure(presetID)
+
+	// Check that backoff is longer now (2 * interval)
+	shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
+	require.True(t, shouldBackoff, "should be in backoff after second failure")
+	expectedBackoff = clock.Now().Add(2 * backoffInterval)
+	require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 2x interval after second failure")
+
+	// Advance clock by only 1 interval - should still be in backoff
+	clock.Advance(backoffInterval)
+	shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
+	require.True(t, shouldBackoff, "should still be in backoff after 1 interval with 2 failures")
+
+	// Advance clock by another interval - backoff should expire
+	clock.Advance(backoffInterval + time.Second)
+	shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
+	require.False(t, shouldBackoff, "should not be in backoff after 2 intervals expire")
+
+	// Third consecutive failure
+	reconciler.RecordCreationFailure(presetID)
+
+	// Check that backoff is even longer now (3 * interval)
+	shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
+	require.True(t, shouldBackoff, "should be in backoff after third failure")
+	expectedBackoff = clock.Now().Add(3 * backoffInterval)
+	require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 3x interval after third failure")
+
+	// Successful creation should clear the backoff
+	reconciler.RecordCreationSuccess(presetID)
+	shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
+	require.False(t, shouldBackoff, "should not be in backoff after successful creation")
+
+	// New failure after success should start backoff from 1x interval again
+	reconciler.RecordCreationFailure(presetID)
+	shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
+	require.True(t, shouldBackoff, "should be in backoff after failure following success")
+	expectedBackoff = clock.Now().Add(backoffInterval)
+	require.Equal(t, expectedBackoff, backoffUntil, "backoff should reset to 1x interval after success")
+}