Skip to content

Commit 476fd70

Browse files
committed
fix: stop disconnecting from coderd early and record disconnect correctly
1 parent 96fca01 commit 476fd70

File tree

3 files changed

+175
-122
lines changed

3 files changed

+175
-122
lines changed

agent/agent.go

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ const (
7171
EnvProcOOMScore = "CODER_PROC_OOM_SCORE"
7272
)
7373

74+
var ErrAgentClosing = xerrors.New("agent is closing")
75+
7476
type Options struct {
7577
Filesystem afero.Fs
7678
LogDir string
@@ -401,6 +403,7 @@ func (a *agent) runLoop() {
401403
// need to keep retrying up to the hardCtx so that we can send graceful shutdown-related
402404
// messages.
403405
ctx := a.hardCtx
406+
defer a.logger.Info(ctx, "agent main loop exited")
404407
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
405408
a.logger.Info(ctx, "connecting to coderd")
406409
err := a.run()
@@ -1348,7 +1351,7 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
13481351
a.closeMutex.Unlock()
13491352
if closing {
13501353
_ = network.Close()
1351-
return xerrors.New("agent is closing")
1354+
return ErrAgentClosing
13521355
}
13531356
} else {
13541357
// Update the wireguard IPs if the agent ID changed.
@@ -1471,7 +1474,7 @@ func (a *agent) trackGoroutine(fn func()) error {
14711474
a.closeMutex.Lock()
14721475
defer a.closeMutex.Unlock()
14731476
if a.closing {
1474-
return xerrors.New("track conn goroutine: agent is closing")
1477+
return xerrors.Errorf("track conn goroutine: %w", ErrAgentClosing)
14751478
}
14761479
a.closeWaitGroup.Add(1)
14771480
go func() {
@@ -2152,16 +2155,7 @@ func (a *apiConnRoutineManager) startAgentAPI(
21522155
a.eg.Go(func() error {
21532156
logger.Debug(ctx, "starting agent routine")
21542157
err := f(ctx, a.aAPI)
2155-
if xerrors.Is(err, context.Canceled) && ctx.Err() != nil {
2156-
logger.Debug(ctx, "swallowing context canceled")
2157-
// Don't propagate context canceled errors to the error group, because we don't want the
2158-
// graceful context being canceled to halt the work of routines with
2159-
// gracefulShutdownBehaviorRemain. Note that we check both that the error is
2160-
// context.Canceled and that *our* context is currently canceled, because when Coderd
2161-
// unilaterally closes the API connection (for example if the build is outdated), it can
2162-
// sometimes show up as context.Canceled in our RPC calls.
2163-
return nil
2164-
}
2158+
err = shouldPropagateError(ctx, logger, err)
21652159
logger.Debug(ctx, "routine exited", slog.Error(err))
21662160
if err != nil {
21672161
return xerrors.Errorf("error in routine %s: %w", name, err)
@@ -2189,21 +2183,7 @@ func (a *apiConnRoutineManager) startTailnetAPI(
21892183
a.eg.Go(func() error {
21902184
logger.Debug(ctx, "starting tailnet routine")
21912185
err := f(ctx, a.tAPI)
2192-
if (xerrors.Is(err, context.Canceled) ||
2193-
xerrors.Is(err, io.EOF)) &&
2194-
ctx.Err() != nil {
2195-
logger.Debug(ctx, "swallowing error because context is canceled", slog.Error(err))
2196-
// Don't propagate context canceled errors to the error group, because we don't want the
2197-
// graceful context being canceled to halt the work of routines with
2198-
// gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
2199-
// when context is canceled on an RPC, so canceling the context can also show up as
2200-
// io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
2201-
// build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
2202-
// We can't reliably distinguish between a context cancelation and a legit EOF, so we
2203-
// also check that *our* context is currently canceled. If it is, we can safely ignore
2204-
// the error.
2205-
return nil
2206-
}
2186+
err = shouldPropagateError(ctx, logger, err)
22072187
logger.Debug(ctx, "routine exited", slog.Error(err))
22082188
if err != nil {
22092189
return xerrors.Errorf("error in routine %s: %w", name, err)
@@ -2212,6 +2192,34 @@ func (a *apiConnRoutineManager) startTailnetAPI(
22122192
})
22132193
}
22142194

2195+
// shouldPropagateError decides whether an error from an API connection routine should be propagated to the
2196+
// apiConnRoutineManager. Its purpose is to prevent errors related to shutting down from propagating to the manager's
2197+
// error group, which will tear down the API connection and potentially stop graceful shutdown from succeeding.
2198+
func shouldPropagateError(ctx context.Context, logger slog.Logger, err error) error {
2199+
if (xerrors.Is(err, context.Canceled) ||
2200+
xerrors.Is(err, io.EOF)) &&
2201+
ctx.Err() != nil {
2202+
logger.Debug(ctx, "swallowing error because context is canceled", slog.Error(err))
2203+
// Don't propagate context canceled errors to the error group, because we don't want the
2204+
// graceful context being canceled to halt the work of routines with
2205+
// gracefulShutdownBehaviorRemain. Unfortunately, the dRPC library closes the stream
2206+
// when context is canceled on an RPC, so canceling the context can also show up as
2207+
// io.EOF. Also, when Coderd unilaterally closes the API connection (for example if the
2208+
// build is outdated), it can sometimes show up as context.Canceled in our RPC calls.
2209+
// We can't reliably distinguish between a context cancelation and a legit EOF, so we
2210+
// also check that *our* context is currently canceled. If it is, we can safely ignore
2211+
// the error.
2212+
return nil
2213+
}
2214+
if xerrors.Is(err, ErrAgentClosing) {
2215+
logger.Debug(ctx, "swallowing error because agent is closing")
2216+
// This can only be generated when the agent is closing, so we never want it to propagate to other routines.
2217+
// (They are signaled to exit via canceled contexts.)
2218+
return nil
2219+
}
2220+
return err
2221+
}
2222+
22152223
func (a *apiConnRoutineManager) wait() error {
22162224
return a.eg.Wait()
22172225
}

coderd/workspaceagentsrpc.go

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -372,13 +372,6 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
372372
finalCtx, cancel := context.WithTimeout(dbauthz.AsSystemRestricted(m.apiCtx), m.disconnectTimeout)
373373
defer cancel()
374374

375-
// Only update timestamp if the disconnect is new.
376-
if !m.disconnectedAt.Valid {
377-
m.disconnectedAt = sql.NullTime{
378-
Time: dbtime.Now(),
379-
Valid: true,
380-
}
381-
}
382375
err := m.updateConnectionTimes(finalCtx)
383376
if err != nil {
384377
// This is a bug with unit tests that cancel the app context and
@@ -403,6 +396,10 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
403396
m.logger.Debug(ctx, "agent connection monitor is closing connection",
404397
slog.F("reason", reason))
405398
_ = m.conn.Close(websocket.StatusGoingAway, reason)
399+
m.disconnectedAt = sql.NullTime{
400+
Time: dbtime.Now(),
401+
Valid: true,
402+
}
406403
}()
407404

408405
err := m.updateConnectionTimes(ctx)
@@ -432,8 +429,7 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
432429
m.logger.Warn(ctx, "connection to agent timed out")
433430
return
434431
}
435-
connectionStatusChanged := m.disconnectedAt.Valid
436-
m.disconnectedAt = sql.NullTime{}
432+
437433
m.lastConnectedAt = sql.NullTime{
438434
Time: dbtime.Now(),
439435
Valid: true,
@@ -447,13 +443,9 @@ func (m *agentConnectionMonitor) monitor(ctx context.Context) {
447443
}
448444
return
449445
}
450-
if connectionStatusChanged {
451-
m.updater.publishWorkspaceUpdate(ctx, m.workspace.OwnerID, wspubsub.WorkspaceEvent{
452-
Kind: wspubsub.WorkspaceEventKindAgentConnectionUpdate,
453-
WorkspaceID: m.workspaceBuild.WorkspaceID,
454-
AgentID: &m.workspaceAgent.ID,
455-
})
456-
}
446+
// we don't need to publish a workspace update here because we published an update when the workspace first
447+
// connected. Since all we've done is updated lastConnectedAt, the workspace is still connected and hasn't
448+
// changed status. We don't expect to get updates just for the times changing.
457449

458450
ctx, err := dbauthz.WithWorkspaceRBAC(ctx, m.workspace.RBACObject())
459451
if err != nil {

0 commit comments

Comments
 (0)