@@ -37,6 +37,72 @@ import (
3737 "github.com/coder/quartz"
3838)
3939
40+ // presetFailuresTracker tracks creation failures for presets to implement incremental backoff.
41+ type presetFailuresTracker struct {
42+ failures map [uuid.UUID ]* presetCreationFailure
43+ mu sync.RWMutex
44+ clock quartz.Clock
45+ }
46+
47+ // presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff.
48+ type presetCreationFailure struct {
49+ consecutiveFailures int
50+ lastFailureAt time.Time
51+ }
52+
53+ func newPresetFailuresTracker (clock quartz.Clock ) * presetFailuresTracker {
54+ return & presetFailuresTracker {
55+ failures : make (map [uuid.UUID ]* presetCreationFailure ),
56+ clock : clock ,
57+ }
58+ }
59+
60+ // RecordFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
61+ func (t * presetFailuresTracker ) RecordFailure (presetID uuid.UUID ) {
62+ t .mu .Lock ()
63+ defer t .mu .Unlock ()
64+
65+ failure , exists := t .failures [presetID ]
66+ if ! exists {
67+ failure = & presetCreationFailure {}
68+ t .failures [presetID ] = failure
69+ }
70+
71+ failure .consecutiveFailures ++
72+ failure .lastFailureAt = t .clock .Now ()
73+ }
74+
75+ // RecordSuccess clears the failure tracking for a preset after a successful creation.
76+ func (t * presetFailuresTracker ) RecordSuccess (presetID uuid.UUID ) {
77+ t .mu .Lock ()
78+ defer t .mu .Unlock ()
79+
80+ delete (t .failures , presetID )
81+ }
82+
83+ // ShouldBackoff checks if we should delay creation attempts for a preset based on recent failures.
84+ // It returns true and the backoff time if we should delay, false and zero time otherwise.
85+ func (t * presetFailuresTracker ) ShouldBackoff (presetID uuid.UUID , backoffInterval time.Duration ) (bool , time.Time ) {
86+ t .mu .RLock ()
87+ defer t .mu .RUnlock ()
88+
89+ failure , exists := t .failures [presetID ]
90+ if ! exists || failure .consecutiveFailures == 0 {
91+ return false , time.Time {}
92+ }
93+
94+ // Calculate exponential backoff: backoffInterval * consecutiveFailures
95+ // This gives us a linear backoff that increases with each consecutive failure.
96+ backoffDuration := backoffInterval * time .Duration (failure .consecutiveFailures )
97+ backoffUntil := failure .lastFailureAt .Add (backoffDuration )
98+
99+ if t .clock .Now ().Before (backoffUntil ) {
100+ return true , backoffUntil
101+ }
102+
103+ return false , time.Time {}
104+ }
105+
40106type StoreReconciler struct {
41107 store database.Store
42108 cfg codersdk.PrebuildsConfig
@@ -60,14 +126,7 @@ type StoreReconciler struct {
60126 reconciliationDuration prometheus.Histogram
61127
62128 // Per-preset creation failure tracking for incremental backoff
63- creationFailures map [uuid.UUID ]* presetCreationFailure
64- creationFailuresMutex sync.RWMutex
65- }
66-
67- // presetCreationFailure tracks recent creation failures for a preset to implement incremental backoff.
68- type presetCreationFailure struct {
69- consecutiveFailures int
70- lastFailureAt time.Time
129+ failureTracker * presetFailuresTracker
71130}
72131
73132var _ prebuilds.ReconciliationOrchestrator = & StoreReconciler {}
@@ -112,7 +171,7 @@ func NewStoreReconciler(store database.Store,
112171 buildUsageChecker : buildUsageChecker ,
113172 done : make (chan struct {}, 1 ),
114173 provisionNotifyCh : make (chan database.ProvisionerJob , 10 ),
115- creationFailures : make ( map [uuid. UUID ] * presetCreationFailure ),
174+ failureTracker : newPresetFailuresTracker ( clock ),
116175 }
117176
118177 if registerer != nil {
@@ -137,64 +196,18 @@ func NewStoreReconciler(store database.Store,
137196
138197// RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
139198func (c * StoreReconciler ) RecordCreationFailure (presetID uuid.UUID ) {
140- c .recordCreationFailure (presetID )
199+ c .failureTracker . RecordFailure (presetID )
141200}
142201
143202// RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
144203func (c * StoreReconciler ) RecordCreationSuccess (presetID uuid.UUID ) {
145- c .recordCreationSuccess (presetID )
204+ c .failureTracker . RecordSuccess (presetID )
146205}
147206
148207// ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
149208// It returns true and the backoff time if we should delay, false and zero time otherwise.
150209func (c * StoreReconciler ) ShouldBackoffCreation (presetID uuid.UUID ) (bool , time.Time ) {
151- return c .shouldBackoffCreation (presetID )
152- }
153-
154- // recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
155- func (c * StoreReconciler ) recordCreationFailure (presetID uuid.UUID ) {
156- c .creationFailuresMutex .Lock ()
157- defer c .creationFailuresMutex .Unlock ()
158-
159- failure , exists := c .creationFailures [presetID ]
160- if ! exists {
161- failure = & presetCreationFailure {}
162- c .creationFailures [presetID ] = failure
163- }
164-
165- failure .consecutiveFailures ++
166- failure .lastFailureAt = c .clock .Now ()
167- }
168-
169- // recordCreationSuccess clears the failure tracking for a preset after a successful creation.
170- func (c * StoreReconciler ) recordCreationSuccess (presetID uuid.UUID ) {
171- c .creationFailuresMutex .Lock ()
172- defer c .creationFailuresMutex .Unlock ()
173-
174- delete (c .creationFailures , presetID )
175- }
176-
177- // shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
178- // It returns true and the backoff time if we should delay, false and zero time otherwise.
179- func (c * StoreReconciler ) shouldBackoffCreation (presetID uuid.UUID ) (bool , time.Time ) {
180- c .creationFailuresMutex .RLock ()
181- defer c .creationFailuresMutex .RUnlock ()
182-
183- failure , exists := c .creationFailures [presetID ]
184- if ! exists || failure .consecutiveFailures == 0 {
185- return false , time.Time {}
186- }
187-
188- // Calculate exponential backoff: backoffInterval * consecutiveFailures
189- // This gives us a linear backoff that increases with each consecutive failure.
190- backoffDuration := c .cfg .ReconciliationBackoffInterval .Value () * time .Duration (failure .consecutiveFailures )
191- backoffUntil := failure .lastFailureAt .Add (backoffDuration )
192-
193- if c .clock .Now ().Before (backoffUntil ) {
194- return true , backoffUntil
195- }
196-
197- return false , time.Time {}
210+ return c .failureTracker .ShouldBackoff (presetID , c .cfg .ReconciliationBackoffInterval .Value ())
198211}
199212
200213func (c * StoreReconciler ) Run (ctx context.Context ) {
@@ -717,7 +730,7 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
717730
718731 case prebuilds .ActionTypeCreate :
719732 // Check if we should backoff on this preset due to recent creation failures
720- if shouldBackoff , backoffUntil := c .shouldBackoffCreation (ps .Preset .ID ); shouldBackoff {
733+ if shouldBackoff , backoffUntil := c .failureTracker . ShouldBackoff (ps .Preset .ID , c . cfg . ReconciliationBackoffInterval . Value () ); shouldBackoff {
721734 logger .Warn (ctx , "backing off prebuild creation due to recent failures" ,
722735 slog .F ("preset_id" , ps .Preset .ID .String ()),
723736 slog .F ("backoff_until" , backoffUntil .Format (time .RFC3339 )),
@@ -749,11 +762,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
749762 for range action .Create {
750763 if err := c .createPrebuiltWorkspace (prebuildsCtx , uuid .New (), ps .Preset .TemplateID , ps .Preset .ID ); err != nil {
751764 logger .Error (ctx , "failed to create prebuild" , slog .Error (err ))
752- c .recordCreationFailure (ps .Preset .ID )
765+ c .failureTracker . RecordFailure (ps .Preset .ID )
753766 multiErr .Errors = append (multiErr .Errors , err )
754767 } else {
755768 // Only clear failure tracking if we successfully created at least one prebuild
756- c .recordCreationSuccess (ps .Preset .ID )
769+ c .failureTracker . RecordSuccess (ps .Preset .ID )
757770 }
758771 }
759772
0 commit comments