@@ -58,6 +58,16 @@ type StoreReconciler struct {
5858 metrics * MetricsCollector
5959 // Operational metrics
6060 reconciliationDuration prometheus.Histogram
61+
62+ // Per-preset creation failure tracking for incremental backoff
63+ creationFailures map [uuid.UUID ]* presetCreationFailure
64+ creationFailuresMutex sync.RWMutex
65+ }
66+
// presetCreationFailure is the per-preset bookkeeping used to back off
// prebuild creation after repeated failures.
type presetCreationFailure struct {
	// consecutiveFailures counts creation failures recorded for the preset
	// since its entry was last cleared by a successful creation.
	consecutiveFailures int
	// lastFailureAt is the time at which the most recent failure was recorded.
	lastFailureAt time.Time
}
6272
6373var _ prebuilds.ReconciliationOrchestrator = & StoreReconciler {}
@@ -102,6 +112,7 @@ func NewStoreReconciler(store database.Store,
102112 buildUsageChecker : buildUsageChecker ,
103113 done : make (chan struct {}, 1 ),
104114 provisionNotifyCh : make (chan database.ProvisionerJob , 10 ),
115+ creationFailures : make (map [uuid.UUID ]* presetCreationFailure ),
105116 }
106117
107118 if registerer != nil {
@@ -124,6 +135,68 @@ func NewStoreReconciler(store database.Store,
124135 return reconciler
125136}
126137
138+ // RecordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
139+ func (c * StoreReconciler ) RecordCreationFailure (presetID uuid.UUID ) {
140+ c .recordCreationFailure (presetID )
141+ }
142+
143+ // RecordCreationSuccess clears the failure tracking for a preset after a successful creation.
144+ func (c * StoreReconciler ) RecordCreationSuccess (presetID uuid.UUID ) {
145+ c .recordCreationSuccess (presetID )
146+ }
147+
148+ // ShouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
149+ // It returns true and the backoff time if we should delay, false and zero time otherwise.
150+ func (c * StoreReconciler ) ShouldBackoffCreation (presetID uuid.UUID ) (bool , time.Time ) {
151+ return c .shouldBackoffCreation (presetID )
152+ }
153+
154+ // recordCreationFailure records a prebuild creation failure for a preset and increments the consecutive failure count.
155+ func (c * StoreReconciler ) recordCreationFailure (presetID uuid.UUID ) {
156+ c .creationFailuresMutex .Lock ()
157+ defer c .creationFailuresMutex .Unlock ()
158+
159+ failure , exists := c .creationFailures [presetID ]
160+ if ! exists {
161+ failure = & presetCreationFailure {}
162+ c .creationFailures [presetID ] = failure
163+ }
164+
165+ failure .consecutiveFailures ++
166+ failure .lastFailureAt = c .clock .Now ()
167+ }
168+
169+ // recordCreationSuccess clears the failure tracking for a preset after a successful creation.
170+ func (c * StoreReconciler ) recordCreationSuccess (presetID uuid.UUID ) {
171+ c .creationFailuresMutex .Lock ()
172+ defer c .creationFailuresMutex .Unlock ()
173+
174+ delete (c .creationFailures , presetID )
175+ }
176+
177+ // shouldBackoffCreation checks if we should delay creation attempts for a preset based on recent failures.
178+ // It returns true and the backoff time if we should delay, false and zero time otherwise.
179+ func (c * StoreReconciler ) shouldBackoffCreation (presetID uuid.UUID ) (bool , time.Time ) {
180+ c .creationFailuresMutex .RLock ()
181+ defer c .creationFailuresMutex .RUnlock ()
182+
183+ failure , exists := c .creationFailures [presetID ]
184+ if ! exists || failure .consecutiveFailures == 0 {
185+ return false , time.Time {}
186+ }
187+
188+ // Calculate exponential backoff: backoffInterval * consecutiveFailures
189+ // This gives us a linear backoff that increases with each consecutive failure.
190+ backoffDuration := c .cfg .ReconciliationBackoffInterval .Value () * time .Duration (failure .consecutiveFailures )
191+ backoffUntil := failure .lastFailureAt .Add (backoffDuration )
192+
193+ if c .clock .Now ().Before (backoffUntil ) {
194+ return true , backoffUntil
195+ }
196+
197+ return false , time.Time {}
198+ }
199+
127200func (c * StoreReconciler ) Run (ctx context.Context ) {
128201 reconciliationInterval := c .cfg .ReconciliationInterval .Value ()
129202 if reconciliationInterval <= 0 { // avoids a panic
@@ -643,6 +716,16 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
643716 return nil
644717
645718 case prebuilds .ActionTypeCreate :
719+ // Check if we should backoff on this preset due to recent creation failures
720+ if shouldBackoff , backoffUntil := c .shouldBackoffCreation (ps .Preset .ID ); shouldBackoff {
721+ logger .Warn (ctx , "backing off prebuild creation due to recent failures" ,
722+ slog .F ("preset_id" , ps .Preset .ID .String ()),
723+ slog .F ("backoff_until" , backoffUntil .Format (time .RFC3339 )),
724+ slog .F ("backoff_secs" , math .Round (backoffUntil .Sub (c .clock .Now ()).Seconds ())),
725+ )
726+ return nil
727+ }
728+
646729 // Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes.
647730 // See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/.
648731 // This is obviously not comprehensive protection against this sort of problem, but this is one essential check.
@@ -666,7 +749,11 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
666749 for range action .Create {
667750 if err := c .createPrebuiltWorkspace (prebuildsCtx , uuid .New (), ps .Preset .TemplateID , ps .Preset .ID ); err != nil {
668751 logger .Error (ctx , "failed to create prebuild" , slog .Error (err ))
752+ c .recordCreationFailure (ps .Preset .ID )
669753 multiErr .Errors = append (multiErr .Errors , err )
754+ } else {
755+ // Only clear failure tracking if we successfully created at least one prebuild
756+ c .recordCreationSuccess (ps .Preset .ID )
670757 }
671758 }
672759
0 commit comments