Skip to content

Commit fb63db7

Browse files
Callum Styanclaude
andcommitted
refactor: extract failure tracking to presetFailuresTracker struct
Encapsulates the failure tracking logic into a dedicated presetFailuresTracker struct with its own methods, improving code organization and separation of concerns. Changes: - Created presetFailuresTracker struct with failures map and mutex - Moved RecordFailure, RecordSuccess, and ShouldBackoff methods to the tracker - Updated StoreReconciler to hold a failureTracker instance - Updated all call sites to use the new tracker methods 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 1a57963 commit fb63db7

File tree

2 files changed

+74
-62
lines changed

2 files changed

+74
-62
lines changed

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 74 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,72 @@ import (
3737
"github.com/coder/quartz"
3838
)
3939

40+
// presetFailuresTracker tracks creation failures for presets to implement incremental backoff.
41+
type presetFailuresTracker struct {
42+
failures map[uuid.UUID]*presetCreationFailure
43+
mu sync.RWMutex
44+
clock quartz.Clock
45+
}
46+
47+
// presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff.
48+
type presetCreationFailure struct {
49+
consecutiveFailures int
50+
lastFailureAt time.Time
51+
}
52+
53+
func newPresetFailuresTracker(clock quartz.Clock) *presetFailuresTracker {
54+
return &presetFailuresTracker{
55+
failures: make(map[uuid.UUID]*presetCreationFailure),
56+
clock: clock,
57+
}
58+
}
59+
60+
// RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
61+
func (t *presetFailuresTracker) RecordFailure(presetID uuid.UUID) {
62+
t.mu.Lock()
63+
defer t.mu.Unlock()
64+
65+
failure, exists := t.failures[presetID]
66+
if !exists {
67+
failure = &presetCreationFailure{}
68+
t.failures[presetID] = failure
69+
}
70+
71+
failure.consecutiveFailures++
72+
failure.lastFailureAt = t.clock.Now()
73+
}
74+
75+
// RecordSuccess clears the failure tracking for a preset after a successful creation.
76+
func (t *presetFailuresTracker) RecordSuccess(presetID uuid.UUID) {
77+
t.mu.Lock()
78+
defer t.mu.Unlock()
79+
80+
delete(t.failures, presetID)
81+
}
82+
83+
// ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures.
84+
// It returns true and the backoff time if we should delay, false and zero time otherwise.
85+
func (t *presetFailuresTracker) ShouldBackoff(presetID uuid.UUID, backoffInterval time.Duration) (bool, time.Time) {
86+
t.mu.RLock()
87+
defer t.mu.RUnlock()
88+
89+
failure, exists := t.failures[presetID]
90+
if !exists || failure.consecutiveFailures == 0 {
91+
return false, time.Time{}
92+
}
93+
94+
// Calculate exponential backoff: backoffInterval * consecutiveFailures
95+
// This gives us a linear backoff that increases with each consecutive failure.
96+
backoffDuration := backoffInterval * time.Duration(failure.consecutiveFailures)
97+
backoffUntil := failure.lastFailureAt.Add(backoffDuration)
98+
99+
if t.clock.Now().Before(backoffUntil) {
100+
return true, backoffUntil
101+
}
102+
103+
return false, time.Time{}
104+
}
105+
40106
type StoreReconciler struct {
41107
store database.Store
42108
cfg codersdk.PrebuildsConfig
@@ -60,14 +126,7 @@ type StoreReconciler struct {
60126
reconciliationDuration prometheus.Histogram
61127

62128
// Per-preset creation failure tracking for incremental backoff
63-
creationFailures map[uuid.UUID]*presetCreationFailure
64-
creationFailuresMutex sync.RWMutex
65-
}
66-
67-
// presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff.
68-
type presetCreationFailure struct {
69-
consecutiveFailures int
70-
lastFailureAt time.Time
129+
failureTracker *presetFailuresTracker
71130
}
72131

73132
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
@@ -112,7 +171,7 @@ func NewStoreReconciler(store database.Store,
112171
buildUsageChecker: buildUsageChecker,
113172
done: make(chan struct{}, 1),
114173
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
115-
creationFailures: make(map[uuid.UUID]*presetCreationFailure),
174+
failureTracker: newPresetFailuresTracker(clock),
116175
}
117176

118177
if registerer != nil {
@@ -137,64 +196,18 @@ func NewStoreReconciler(store database.Store,
137196

138197
// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
139198
func (c *StoreReconciler) RecordCreationFailure(presetID uuid.UUID) {
140-
c.recordCreationFailure(presetID)
199+
c.failureTracker.RecordFailure(presetID)
141200
}
142201

143202
// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
144203
func (c *StoreReconciler) RecordCreationSuccess(presetID uuid.UUID) {
145-
c.recordCreationSuccess(presetID)
204+
c.failureTracker.RecordSuccess(presetID)
146205
}
147206

148207
// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
149208
// It returns true and the backoff time if we should delay, false and zero time otherwise.
150209
func (c *StoreReconciler) ShouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
151-
return c.shouldBackoffCreation(presetID)
152-
}
153-
154-
// recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
155-
func (c *StoreReconciler) recordCreationFailure(presetID uuid.UUID) {
156-
c.creationFailuresMutex.Lock()
157-
defer c.creationFailuresMutex.Unlock()
158-
159-
failure, exists := c.creationFailures[presetID]
160-
if !exists {
161-
failure = &presetCreationFailure{}
162-
c.creationFailures[presetID] = failure
163-
}
164-
165-
failure.consecutiveFailures++
166-
failure.lastFailureAt = c.clock.Now()
167-
}
168-
169-
// recordCreationSuccess clears the failure tracking for a preset after a successful creation.
170-
func (c *StoreReconciler) recordCreationSuccess(presetID uuid.UUID) {
171-
c.creationFailuresMutex.Lock()
172-
defer c.creationFailuresMutex.Unlock()
173-
174-
delete(c.creationFailures, presetID)
175-
}
176-
177-
// shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
178-
// It returns true and the backoff time if we should delay, false and zero time otherwise.
179-
func (c *StoreReconciler) shouldBackoffCreation(presetID uuid.UUID) (bool, time.Time) {
180-
c.creationFailuresMutex.RLock()
181-
defer c.creationFailuresMutex.RUnlock()
182-
183-
failure, exists := c.creationFailures[presetID]
184-
if !exists || failure.consecutiveFailures == 0 {
185-
return false, time.Time{}
186-
}
187-
188-
// Calculate exponential backoff: backoffInterval * consecutiveFailures
189-
// This gives us a linear backoff that increases with each consecutive failure.
190-
backoffDuration := c.cfg.ReconciliationBackoffInterval.Value() * time.Duration(failure.consecutiveFailures)
191-
backoffUntil := failure.lastFailureAt.Add(backoffDuration)
192-
193-
if c.clock.Now().Before(backoffUntil) {
194-
return true, backoffUntil
195-
}
196-
197-
return false, time.Time{}
210+
return c.failureTracker.ShouldBackoff(presetID, c.cfg.ReconciliationBackoffInterval.Value())
198211
}
199212

200213
func (c *StoreReconciler) Run(ctx context.Context) {
@@ -717,7 +730,7 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
717730

718731
case prebuilds.ActionTypeCreate:
719732
// Check if we should backoff on this preset due to recent creation failures
720-
if shouldBackoff, backoffUntil := c.shouldBackoffCreation(ps.Preset.ID); shouldBackoff {
733+
if shouldBackoff, backoffUntil := c.failureTracker.ShouldBackoff(ps.Preset.ID, c.cfg.ReconciliationBackoffInterval.Value()); shouldBackoff {
721734
logger.Warn(ctx, "backing off prebuild creation due to recent failures",
722735
slog.F("preset_id", ps.Preset.ID.String()),
723736
slog.F("backoff_until", backoffUntil.Format(time.RFC3339)),
@@ -749,11 +762,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
749762
for range action.Create {
750763
if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil {
751764
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
752-
c.recordCreationFailure(ps.Preset.ID)
765+
c.failureTracker.RecordFailure(ps.Preset.ID)
753766
multiErr.Errors = append(multiErr.Errors, err)
754767
} else {
755768
// Only clear failure tracking if we successfully created at least one prebuild
756-
c.recordCreationSuccess(ps.Preset.ID)
769+
c.failureTracker.RecordSuccess(ps.Preset.ID)
757770
}
758771
}
759772

enterprise/coderd/prebuilds/reconcile_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2973,7 +2973,6 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
29732973
require.Len(t, workspaces, 2, "should have recreated 2 prebuilds after resuming")
29742974
}
29752975

2976-
29772976
func TestIncrementalBackoffOnCreationFailure(t *testing.T) {
29782977
t.Parallel()
29792978

0 commit comments

Comments
 (0)