Skip to content

Commit 1a57963

Browse files
Callum Styan and claude committed
feat(prebuilds): add incremental backoff for failed prebuild creation
The prebuilds reconcile loop can currently spam workspace creation attempts that are always going to fail, such as when required dynamic parameters are missing or unresolved.

This change introduces an in-memory per-preset backoff mechanism that tracks consecutive creation failures and delays subsequent creation attempts using linear backoff:

- First failure: backoff for 1x interval (default 1 minute)
- Second consecutive failure: backoff for 2x interval (2 minutes)
- Third consecutive failure: backoff for 3x interval (3 minutes)
- And so on...

When a creation succeeds, the failure tracking is cleared and any subsequent failure starts backoff from 1x interval again.

This complements the existing database-based backoff system by preventing immediate retry spam when creation fails quickly (e.g., due to missing parameters), while still allowing periodic retries and recovery when issues are fixed.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 5092645 commit 1a57963

File tree

2 files changed

+171
-0
lines changed

2 files changed

+171
-0
lines changed

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,16 @@ type StoreReconciler struct {
5858
metrics *MetricsCollector
5959
// Operational metrics
6060
reconciliationDuration prometheus.Histogram
61+
62+
// Per-preset creation failure tracking for incremental backoff
63+
creationFailures map[uuid.UUID]*presetCreationFailure
64+
creationFailuresMutex sync.RWMutex
65+
}
66+
67+
// presetCreationFailure is the per-preset bookkeeping behind the incremental
// creation backoff: how many creation attempts have failed in a row and when
// the most recent of those failures was recorded.
type presetCreationFailure struct {
	// consecutiveFailures is incremented on every recorded failure. The whole
	// entry is removed from the tracking map when a creation succeeds.
	consecutiveFailures int

	// lastFailureAt is the reconciler clock's time at the latest failure; the
	// backoff window is measured from this instant.
	lastFailureAt time.Time
}
6272

6373
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
@@ -102,6 +112,7 @@ func NewStoreReconciler(store database.Store,
102112
buildUsageChecker: buildUsageChecker,
103113
done: make(chan struct{}, 1),
104114
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
115+
creationFailures: make(map[uuid.UUID]*presetCreationFailure),
105116
}
106117

107118
if registerer != nil {
@@ -124,6 +135,68 @@ func NewStoreReconciler(store database.Store,
124135
return reconciler
125136
}
126137

138+
// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
139+
func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) {
140+
c.recordCreationFailure(presetID)
141+
}
142+
143+
// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
144+
func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) {
145+
c.recordCreationSuccess(presetID)
146+
}
147+
148+
// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
149+
// It returns true and the backoff time if we should delay, false and zero time otherwise.
150+
func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
151+
return c.shouldBackoffCreation(presetID)
152+
}
153+
154+
// recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
155+
func (c *StoreReconciler) recordCreationFailure(presetID uuid.UUID) {
156+
c.creationFailuresMutex.Lock()
157+
defer c.creationFailuresMutex.Unlock()
158+
159+
failure, exists := c.creationFailures[presetID]
160+
if !exists {
161+
failure = &presetCreationFailure{}
162+
c.creationFailures[presetID] = failure
163+
}
164+
165+
failure.consecutiveFailures++
166+
failure.lastFailureAt = c.clock.Now()
167+
}
168+
169+
// recordCreationSuccess clears the failure tracking for a preset after a successful creation.
170+
func (c *StoreReconciler) recordCreationSuccess(presetID uuid.UUID) {
171+
c.creationFailuresMutex.Lock()
172+
defer c.creationFailuresMutex.Unlock()
173+
174+
delete(c.creationFailures, presetID)
175+
}
176+
177+
// shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
178+
// It returns true and the backoff time if we should delay, false and zero time otherwise.
179+
func (c *StoreReconciler) shouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
180+
c.creationFailuresMutex.RLock()
181+
defer c.creationFailuresMutex.RUnlock()
182+
183+
failure, exists := c.creationFailures[presetID]
184+
if !exists || failure.consecutiveFailures == 0 {
185+
return false, time.Time{}
186+
}
187+
188+
// Calculate exponential backoff: backoffInterval * consecutiveFailures
189+
// This gives us a linear backoff that increases with each consecutive failure.
190+
backoffDuration := c.cfg.ReconciliationBackoffInterval.Value() * time.Duration(failure.consecutiveFailures)
191+
backoffUntil := failure.lastFailureAt.Add(backoffDuration)
192+
193+
if c.clock.Now().Before(backoffUntil) {
194+
return true, backoffUntil
195+
}
196+
197+
return false, time.Time{}
198+
}
199+
127200
func (c *StoreReconciler) Run(ctx context.Context) {
128201
reconciliationInterval := c.cfg.ReconciliationInterval.Value()
129202
if reconciliationInterval <= 0 { // avoids a panic
@@ -643,6 +716,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
643716
return nil
644717

645718
case prebuilds.ActionTypeCreate:
719+
// Check if we should backoff on this preset due to recent creation failures
720+
if shouldBackoff, backoffUntil := c.shouldBackoffCreation(ps.Preset.ID); shouldBackoff {
721+
logger.Warn(ctx, "backing off prebuild creation due to recent failures",
722+
slog.F("preset_id", ps.Preset.ID.String()),
723+
slog.F("backoff_until", backoffUntil.Format(time.RFC3339)),
724+
slog.F("backoff_secs", math.Round(backoffUntil.Sub(c.clock.Now()).Seconds())),
725+
)
726+
return nil
727+
}
728+
646729
// Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
647730
// See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
648731
// This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
@@ -666,7 +749,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
666749
for range action.Create {
667750
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
668751
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
752+
c.recordCreationFailure(ps.Preset.ID)
669753
multiErr.Errors = append(multiErr.Errors, err)
754+
} else {
755+
// Only clear failure tracking if we successfully created at least one prebuild
756+
c.recordCreationSuccess(ps.Preset.ID)
670757
}
671758
}
672759

enterprise/coderd/prebuilds/reconcile_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2972,3 +2972,87 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
29722972
require.NoError(t, err)
29732973
require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming")
29742974
}
2975+
2976+
2977+
func TestIncrementalBackoffOnCreationFailure(t *testing.T) {
2978+
t.Parallel()
2979+
2980+
ctx := testutil.Context(t, testutil.WaitLong)
2981+
clock := quartz.NewMock(t)
2982+
db, ps := dbtestutil.NewDB(t)
2983+
backoffInterval := 1 * time.Minute
2984+
cfg := codersdk.PrebuildsConfig{
2985+
ReconciliationInterval: serpent.Duration(testutil.WaitLong),
2986+
ReconciliationBackoffInterval: serpent.Duration(backoffInterval),
2987+
}
2988+
logger := slogtest.Make(t, nil)
2989+
cache := files.New(prometheus.NewRegistry(), &coderdtest.FakeAuthorizer{})
2990+
reconciler := prebuilds.NewStoreReconciler(db, ps, cache, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer(), newNoopUsageCheckerPtr())
2991+
2992+
// Setup a template with a preset
2993+
org := dbgen.Organization(t, db, database.Organization{})
2994+
user := dbgen.User(t, db, database.User{})
2995+
template := dbgen.Template(t, db, database.Template{
2996+
CreatedBy: user.ID,
2997+
OrganizationID: org.ID,
2998+
})
2999+
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, user.ID, template.ID)
3000+
presetID := setupTestDBPreset(t, db, templateVersionID, 1, "test").ID
3001+
3002+
// Test the backoff mechanism directly by simulating failures
3003+
// First failure
3004+
reconciler.RecordCreationFailure(presetID)
3005+
3006+
// Check that backoff is active
3007+
shouldBackoff, backoffUntil := reconciler.ShouldBackoffCreation(presetID)
3008+
require.True(t, shouldBackoff, "should be in backoff after first failure")
3009+
expectedBackoff := clock.Now().Add(backoffInterval)
3010+
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 1x interval after first failure")
3011+
3012+
// Advance clock past first backoff
3013+
clock.Advance(backoffInterval + time.Second)
3014+
3015+
// Should no longer be in backoff
3016+
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
3017+
require.False(t, shouldBackoff, "should not be in backoff after period expires")
3018+
3019+
// Second consecutive failure
3020+
reconciler.RecordCreationFailure(presetID)
3021+
3022+
// Check that backoff is longer now (2 * interval)
3023+
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
3024+
require.True(t, shouldBackoff, "should be in backoff after second failure")
3025+
expectedBackoff = clock.Now().Add(2 * backoffInterval)
3026+
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 2x interval after second failure")
3027+
3028+
// Advance clock by only 1 interval - should still be in backoff
3029+
clock.Advance(backoffInterval)
3030+
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
3031+
require.True(t, shouldBackoff, "should still be in backoff after 1 interval with 2 failures")
3032+
3033+
// Advance clock by another interval - backoff should expire
3034+
clock.Advance(backoffInterval + time.Second)
3035+
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
3036+
require.False(t, shouldBackoff, "should not be in backoff after 2 intervals expire")
3037+
3038+
// Third consecutive failure
3039+
reconciler.RecordCreationFailure(presetID)
3040+
3041+
// Check that backoff is even longer now (3 * interval)
3042+
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
3043+
require.True(t, shouldBackoff, "should be in backoff after third failure")
3044+
expectedBackoff = clock.Now().Add(3 * backoffInterval)
3045+
require.Equal(t, expectedBackoff, backoffUntil, "backoff should be 3x interval after third failure")
3046+
3047+
// Successful creation should clear the backoff
3048+
reconciler.RecordCreationSuccess(presetID)
3049+
shouldBackoff, _ = reconciler.ShouldBackoffCreation(presetID)
3050+
require.False(t, shouldBackoff, "should not be in backoff after successful creation")
3051+
3052+
// New failure after success should start backoff from 1x interval again
3053+
reconciler.RecordCreationFailure(presetID)
3054+
shouldBackoff, backoffUntil = reconciler.ShouldBackoffCreation(presetID)
3055+
require.True(t, shouldBackoff, "should be in backoff after failure following success")
3056+
expectedBackoff = clock.Now().Add(backoffInterval)
3057+
require.Equal(t, expectedBackoff, backoffUntil, "backoff should reset to 1x interval after success")
3058+
}

0 commit comments

Comments
 (0)