From d00ce62092b0e59032a7e3e66eabf75908f1f9c6 Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Wed, 22 Oct 2025 22:29:16 +0000
Subject: [PATCH 1/5] refactor agent resource monitoring API to avoid
 excessive calls to DB for fetching workspaces/workspace agent monitor
 definitions

Signed-off-by: Callum Styan
---
 coderd/agentapi/resources_monitoring.go | 167 +++++++++++++++++-------
 1 file changed, 118 insertions(+), 49 deletions(-)

diff --git a/coderd/agentapi/resources_monitoring.go b/coderd/agentapi/resources_monitoring.go
index e5ee97e681a58..f2a30f2ab466f 100644
--- a/coderd/agentapi/resources_monitoring.go
+++ b/coderd/agentapi/resources_monitoring.go
@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
+	"sync"
 	"time"
 
 	"golang.org/x/xerrors"
@@ -33,45 +34,84 @@ type ResourcesMonitoringAPI struct {
 	Debounce time.Duration
 
 	Config resourcesmonitor.Config
-}
-
-func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
-	memoryMonitor, memoryErr := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if memoryErr != nil && !errors.Is(memoryErr, sql.ErrNoRows) {
-		return nil, xerrors.Errorf("failed to fetch memory resource monitor: %w", memoryErr)
-	}
-
-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		return nil, xerrors.Errorf("failed to fetch volume resource monitors: %w", err)
-	}
+
+	// Cache resource monitors on first call to avoid millions of DB queries per day.
+	memOnce sync.Once
+	volOnce sync.Once
+	memoryMonitor *database.WorkspaceAgentMemoryResourceMonitor
+	volumeMonitors []database.WorkspaceAgentVolumeResourceMonitor
+	monitorsLock sync.RWMutex
+}
 
-	return &proto.GetResourcesMonitoringConfigurationResponse{
-		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
-			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
-			NumDatapoints:             a.Config.NumDatapoints,
-		},
-		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
-			if memoryErr != nil {
-				return nil
-			}
+// fetchMemoryMonitor fetches the memory monitor from the database and caches it.
+// Returns an error if the fetch fails (except for sql.ErrNoRows which is expected).
+func (a *ResourcesMonitoringAPI) fetchMemoryMonitor(ctx context.Context) error {
+    memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
+    if err != nil && !errors.Is(err, sql.ErrNoRows) {
+        return xerrors.Errorf("fetch memory resource monitor: %w", err)
+    }
+    if err == nil {
+        a.memoryMonitor = &memMon
+    }
+    return nil
+}
 
-			return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
-				Enabled: memoryMonitor.Enabled,
-			}
-		}(),
-		Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
-			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(volumeMonitors))
-			for _, monitor := range volumeMonitors {
-				volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
-					Enabled: monitor.Enabled,
-					Path:    monitor.Path,
-				})
-			}
+// fetchVolumeMonitors fetches the volume monitors from the database and caches them.
+func (a *ResourcesMonitoringAPI) fetchVolumeMonitors(ctx context.Context) error {
+    volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
+    if err != nil {
+        return xerrors.Errorf("fetch volume resource monitors: %w", err)
+    }
+    a.volumeMonitors = volMons
+    return nil
+}
 
-			return volumes
-		}(),
-	}, nil
+func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
+    // Load memory monitor once
+    var memoryErr error
+    a.memOnce.Do(func() {
+        memoryErr = a.fetchMemoryMonitor(ctx)
+    })
+    if memoryErr != nil {
+        return nil, memoryErr
+    }
+
+    // Load volume monitors once
+    var volumeErr error
+    a.volOnce.Do(func() {
+        volumeErr = a.fetchVolumeMonitors(ctx)
+    })
+    if volumeErr != nil {
+        return nil, volumeErr
+    }
+
+    a.monitorsLock.RLock()
+    defer a.monitorsLock.RUnlock()
+
+    return &proto.GetResourcesMonitoringConfigurationResponse{
+        Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
+            CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
+            NumDatapoints: a.Config.NumDatapoints,
+        },
+        Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
+            if a.memoryMonitor == nil {
+                return nil
+            }
+            return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
+                Enabled: a.memoryMonitor.Enabled,
+            }
+        }(),
+        Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
+            volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(a.volumeMonitors))
+            for _, monitor := range a.volumeMonitors {
+                volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
+                    Enabled: monitor.Enabled,
+                    Path: monitor.Path,
+                })
+            }
+            return volumes
+        }(),
+    }, nil
 }
 
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
@@ -89,15 +129,22 @@ func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Contex
 }
 
 func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		// It is valid for an agent to not have a memory monitor, so we
-		// do not want to treat it as an error.
-		if errors.Is(err, sql.ErrNoRows) {
-			return nil
-		}
+	// Load monitor once
+	var fetchErr error
+	a.memOnce.Do(func() {
+        fetchErr = a.fetchMemoryMonitor(ctx)
+	})
+	if fetchErr != nil {
+        return fetchErr
+	}
+
+	a.monitorsLock.RLock()
+	monitor := a.memoryMonitor
+	a.monitorsLock.RUnlock()
 
-		return xerrors.Errorf("fetch memory resource monitor: %w", err)
+	// No memory monitor configured
+	if monitor == nil {
+        return nil
 	}
 
 	if !monitor.Enabled {
@@ -109,7 +156,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		usageDatapoints = append(usageDatapoints, datapoint.Memory)
 	}
 
-	usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)
+	usageStates := resourcesmonitor.CalculateMemoryUsageStates(*monitor, usageDatapoints)
 
 	oldState := monitor.State
 	newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
@@ -117,7 +164,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
 
 	//nolint:gocritic // We need to be able to update the resource monitor here.
-	err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
+	err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
 		AgentID:        a.AgentID,
 		State:          newState,
 		UpdatedAt:      dbtime.Time(a.Clock.Now()),
@@ -127,6 +174,13 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		return xerrors.Errorf("update workspace monitor: %w", err)
 	}
 
+	// Update cached state
+	a.monitorsLock.Lock()
+	a.memoryMonitor.State = newState
+	a.memoryMonitor.DebouncedUntil = dbtime.Time(debouncedUntil)
+	a.memoryMonitor.UpdatedAt = dbtime.Time(a.Clock.Now())
+	a.monitorsLock.Unlock()
+
 	if !shouldNotify {
 		return nil
 	}
@@ -169,14 +223,22 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 }
 
 func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
-	if err != nil {
-		return xerrors.Errorf("get or insert volume monitor: %w", err)
+	// Load monitors once
+	var fetchErr error
+	a.volOnce.Do(func() {
+        fetchErr = a.fetchVolumeMonitors(ctx)
+	})
+	if fetchErr != nil {
+        return fetchErr
 	}
 
+	a.monitorsLock.RLock()
+	volumeMonitors := a.volumeMonitors
+	a.monitorsLock.RUnlock()
+
 	outOfDiskVolumes := make([]map[string]any, 0)
 
-	for _, monitor := range volumeMonitors {
+	for i, monitor := range volumeMonitors {
 		if !monitor.Enabled {
 			continue
 		}
@@ -219,6 +281,13 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
 		}); err != nil {
 			return xerrors.Errorf("update workspace monitor: %w", err)
 		}
+
+		// Update cached state
+		a.monitorsLock.Lock()
+		a.volumeMonitors[i].State = newState
+		a.volumeMonitors[i].DebouncedUntil = dbtime.Time(debouncedUntil)
+		a.volumeMonitors[i].UpdatedAt = dbtime.Time(a.Clock.Now())
+		a.monitorsLock.Unlock()
 	}
 
 	if len(outOfDiskVolumes) == 0 {
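A subtlety in this first patch's lazy-caching approach: sync.Once.Do marks the Once as spent even when the function it runs fails, so a transient DB error on the very first RPC is reported once and never retried — later callers silently see an empty cache. The sketch below illustrates that failure mode; it is a minimal illustration with hypothetical lazyCache/fetch names, not code from this series. Patch 3 below sidesteps the problem by fetching eagerly in InitMonitors when the agent connection is established and tearing the connection down on a real error.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// lazyCache mirrors the fetch-on-first-use pattern: the first caller runs
// fetch inside sync.Once; every later caller reuses whatever that produced.
type lazyCache struct {
	once   sync.Once
	cached *string
}

// get runs fetch at most once. If fetch fails, the Once is still spent,
// so the error is reported only to the first caller and never retried.
func (c *lazyCache) get(fetch func() (string, error)) (*string, error) {
	var fetchErr error
	c.once.Do(func() {
		v, err := fetch()
		if err != nil {
			fetchErr = err
			return
		}
		c.cached = &v
	})
	return c.cached, fetchErr
}

func main() {
	c := &lazyCache{}
	calls := 0
	fetch := func() (string, error) {
		calls++
		if calls == 1 {
			return "", errors.New("transient DB error")
		}
		return "monitor row", nil
	}

	_, err := c.get(fetch)
	fmt.Println("first call error:", err) // first call error: transient DB error

	// The second call does not re-run fetch: the cache stays empty and no
	// error is surfaced, i.e. the failure is sticky but silent.
	v, err := c.get(fetch)
	fmt.Println("second call:", v, err, "fetch calls:", calls) // second call: <nil> <nil> fetch calls: 1
}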
From 82a6fd0c0ab46de5d92aa1b041a4ea815a0fcba7 Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Thu, 23 Oct 2025 02:11:02 +0000
Subject: [PATCH 2/5] fix linting

Signed-off-by: Callum Styan
---
 coderd/agentapi/resources_monitoring.go | 136 ++++++++++++------------
 1 file changed, 68 insertions(+), 68 deletions(-)

diff --git a/coderd/agentapi/resources_monitoring.go b/coderd/agentapi/resources_monitoring.go
index f2a30f2ab466f..17e6207a4e298 100644
--- a/coderd/agentapi/resources_monitoring.go
+++ b/coderd/agentapi/resources_monitoring.go
@@ -36,82 +36,82 @@ type ResourcesMonitoringAPI struct {
 	Config resourcesmonitor.Config
 
 	// Cache resource monitors on first call to avoid millions of DB queries per day.
-	memOnce sync.Once
-	volOnce sync.Once
-	memoryMonitor *database.WorkspaceAgentMemoryResourceMonitor
+	memOnce        sync.Once
+	volOnce        sync.Once
+	memoryMonitor  *database.WorkspaceAgentMemoryResourceMonitor
 	volumeMonitors []database.WorkspaceAgentVolumeResourceMonitor
-	monitorsLock sync.RWMutex
+	monitorsLock   sync.RWMutex
 }
 
 // fetchMemoryMonitor fetches the memory monitor from the database and caches it.
 // Returns an error if the fetch fails (except for sql.ErrNoRows which is expected).
 func (a *ResourcesMonitoringAPI) fetchMemoryMonitor(ctx context.Context) error {
-    memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
-    if err != nil && !errors.Is(err, sql.ErrNoRows) {
-        return xerrors.Errorf("fetch memory resource monitor: %w", err)
-    }
-    if err == nil {
-        a.memoryMonitor = &memMon
-    }
-    return nil
+	memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil && !errors.Is(err, sql.ErrNoRows) {
+		return xerrors.Errorf("fetch memory resource monitor: %w", err)
+	}
+	if err == nil {
+		a.memoryMonitor = &memMon
+	}
+	return nil
 }
 
 // fetchVolumeMonitors fetches the volume monitors from the database and caches them.
 func (a *ResourcesMonitoringAPI) fetchVolumeMonitors(ctx context.Context) error {
-    volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
-    if err != nil {
-        return xerrors.Errorf("fetch volume resource monitors: %w", err)
-    }
-    a.volumeMonitors = volMons
-    return nil
+	volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
+	if err != nil {
+		return xerrors.Errorf("fetch volume resource monitors: %w", err)
+	}
+	a.volumeMonitors = volMons
+	return nil
 }
 
 func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
-    // Load memory monitor once
-    var memoryErr error
-    a.memOnce.Do(func() {
-        memoryErr = a.fetchMemoryMonitor(ctx)
-    })
-    if memoryErr != nil {
-        return nil, memoryErr
-    }
-
-    // Load volume monitors once
-    var volumeErr error
-    a.volOnce.Do(func() {
-        volumeErr = a.fetchVolumeMonitors(ctx)
-    })
-    if volumeErr != nil {
-        return nil, volumeErr
-    }
-
-    a.monitorsLock.RLock()
-    defer a.monitorsLock.RUnlock()
-
-    return &proto.GetResourcesMonitoringConfigurationResponse{
-        Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
-            CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
-            NumDatapoints: a.Config.NumDatapoints,
-        },
-        Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
-            if a.memoryMonitor == nil {
-                return nil
-            }
-            return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
-                Enabled: a.memoryMonitor.Enabled,
-            }
-        }(),
-        Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
-            volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(a.volumeMonitors))
-            for _, monitor := range a.volumeMonitors {
-                volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
-                    Enabled: monitor.Enabled,
-                    Path: monitor.Path,
-                })
-            }
-            return volumes
-        }(),
-    }, nil
+	// Load memory monitor once
+	var memoryErr error
+	a.memOnce.Do(func() {
+		memoryErr = a.fetchMemoryMonitor(ctx)
+	})
+	if memoryErr != nil {
+		return nil, memoryErr
+	}
+
+	// Load volume monitors once
+	var volumeErr error
+	a.volOnce.Do(func() {
+		volumeErr = a.fetchVolumeMonitors(ctx)
+	})
+	if volumeErr != nil {
+		return nil, volumeErr
+	}
+
+	a.monitorsLock.RLock()
+	defer a.monitorsLock.RUnlock()
+
+	return &proto.GetResourcesMonitoringConfigurationResponse{
+		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
+			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
+			NumDatapoints:             a.Config.NumDatapoints,
+		},
+		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
+			if a.memoryMonitor == nil {
+				return nil
+			}
+			return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
+				Enabled: a.memoryMonitor.Enabled,
+			}
+		}(),
+		Volumes: func() []*proto.GetResourcesMonitoringConfigurationResponse_Volume {
+			volumes := make([]*proto.GetResourcesMonitoringConfigurationResponse_Volume, 0, len(a.volumeMonitors))
+			for _, monitor := range a.volumeMonitors {
+				volumes = append(volumes, &proto.GetResourcesMonitoringConfigurationResponse_Volume{
+					Enabled: monitor.Enabled,
+					Path:    monitor.Path,
+				})
+			}
+			return volumes
+		}(),
+	}, nil
 }
 
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
@@ -132,10 +132,10 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 	// Load monitor once
 	var fetchErr error
 	a.memOnce.Do(func() {
-        fetchErr = a.fetchMemoryMonitor(ctx)
+		fetchErr = a.fetchMemoryMonitor(ctx)
 	})
 	if fetchErr != nil {
-        return fetchErr
+		return fetchErr
 	}
 
 	a.monitorsLock.RLock()
@@ -144,7 +144,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 
 	// No memory monitor configured
 	if monitor == nil {
-        return nil
+		return nil
 	}
 
 	if !monitor.Enabled {
@@ -226,10 +226,10 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
 	// Load monitors once
 	var fetchErr error
 	a.volOnce.Do(func() {
-        fetchErr = a.fetchVolumeMonitors(ctx)
+		fetchErr = a.fetchVolumeMonitors(ctx)
 	})
 	if fetchErr != nil {
-        return fetchErr
+		return fetchErr
 	}
 
 	a.monitorsLock.RLock()
From ffcb43b4a92ea941691cdc82108bc87c40ba1134 Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Sat, 25 Oct 2025 01:39:48 +0000
Subject: [PATCH 3/5] address review comments

Signed-off-by: Callum Styan
---
 coderd/agentapi/api.go                       |  4 +
 coderd/agentapi/resources_monitoring.go      | 96 +++++---------------
 coderd/agentapi/resources_monitoring_test.go | 29 +++++-
 3 files changed, 54 insertions(+), 75 deletions(-)

diff --git a/coderd/agentapi/api.go b/coderd/agentapi/api.go
index dbcb8ea024914..f8f72a5d0d6ea 100644
--- a/coderd/agentapi/api.go
+++ b/coderd/agentapi/api.go
@@ -239,6 +239,10 @@ func (a *API) Serve(ctx context.Context, l net.Listener) error {
 		return xerrors.Errorf("create agent API server: %w", err)
 	}
 
+	if err := a.ResourcesMonitoringAPI.InitMonitors(ctx); err != nil {
+		return xerrors.Errorf("initialize resource monitoring: %w", err)
+	}
+
 	return server.Serve(ctx, l)
 }
 
diff --git a/coderd/agentapi/resources_monitoring.go b/coderd/agentapi/resources_monitoring.go
index 17e6207a4e298..668b18cd4dede 100644
--- a/coderd/agentapi/resources_monitoring.go
+++ b/coderd/agentapi/resources_monitoring.go
@@ -36,65 +36,44 @@ type ResourcesMonitoringAPI struct {
 	Config resourcesmonitor.Config
 
 	// Cache resource monitors on first call to avoid millions of DB queries per day.
-	memOnce        sync.Once
-	volOnce        sync.Once
-	memoryMonitor  *database.WorkspaceAgentMemoryResourceMonitor
+	memoryMonitor  database.WorkspaceAgentMemoryResourceMonitor
 	volumeMonitors []database.WorkspaceAgentVolumeResourceMonitor
 	monitorsLock   sync.RWMutex
 }
 
-// fetchMemoryMonitor fetches the memory monitor from the database and caches it.
-// Returns an error if the fetch fails (except for sql.ErrNoRows which is expected).
-func (a *ResourcesMonitoringAPI) fetchMemoryMonitor(ctx context.Context) error {
+// InitMonitors fetches resource monitors from the database and caches them.
+// It must be called once after creating a ResourcesMonitoringAPI; the context should be
+// the agent's per-RPC connection context. If fetching fails with a real error (not
+// sql.ErrNoRows), the connection should be torn down.
+func (a *ResourcesMonitoringAPI) InitMonitors(ctx context.Context) error {
 	memMon, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
 	if err != nil && !errors.Is(err, sql.ErrNoRows) {
 		return xerrors.Errorf("fetch memory resource monitor: %w", err)
 	}
+	// On sql.ErrNoRows, memoryMonitor keeps its zero value (CreatedAt.IsZero() == true).
+	// Otherwise, store the fetched monitor.
 	if err == nil {
-		a.memoryMonitor = &memMon
+		a.memoryMonitor = memMon
 	}
-	return nil
-}
 
-// fetchVolumeMonitors fetches the volume monitors from the database and caches them.
-func (a *ResourcesMonitoringAPI) fetchVolumeMonitors(ctx context.Context) error {
 	volMons, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
 	if err != nil {
 		return xerrors.Errorf("fetch volume resource monitors: %w", err)
 	}
+	// A zero-length slice is valid and simply means no volume monitors are configured,
+	// since an agent can have any number of them.
 	a.volumeMonitors = volMons
+
 	return nil
 }
 
 func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
-	// Load memory monitor once
-	var memoryErr error
-	a.memOnce.Do(func() {
-		memoryErr = a.fetchMemoryMonitor(ctx)
-	})
-	if memoryErr != nil {
-		return nil, memoryErr
-	}
-
-	// Load volume monitors once
-	var volumeErr error
-	a.volOnce.Do(func() {
-		volumeErr = a.fetchVolumeMonitors(ctx)
-	})
-	if volumeErr != nil {
-		return nil, volumeErr
-	}
-
-	a.monitorsLock.RLock()
-	defer a.monitorsLock.RUnlock()
-
 	return &proto.GetResourcesMonitoringConfigurationResponse{
 		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
 			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
 			NumDatapoints:             a.Config.NumDatapoints,
 		},
 		Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
-			if a.memoryMonitor == nil {
+			if a.memoryMonitor.CreatedAt.IsZero() {
 				return nil
 			}
 			return &proto.GetResourcesMonitoringConfigurationResponse_Memory{
@@ -117,6 +96,10 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
 
 func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
 	var err error
 
+	// Lock for the entire push operation since calls are sequential from the agent
+	a.monitorsLock.Lock()
+	defer a.monitorsLock.Unlock()
+
 	if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
 		err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
 	}
@@ -129,25 +112,7 @@ func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Contex
 }
 
 func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	// Load monitor once
-	var fetchErr error
-	a.memOnce.Do(func() {
-		fetchErr = a.fetchMemoryMonitor(ctx)
-	})
-	if fetchErr != nil {
-		return fetchErr
-	}
-
-	a.monitorsLock.RLock()
-	monitor := a.memoryMonitor
-	a.monitorsLock.RUnlock()
-
-	// No memory monitor configured
-	if monitor == nil {
-		return nil
-	}
-
-	if !monitor.Enabled {
+	if !a.memoryMonitor.Enabled || a.memoryMonitor.CreatedAt.IsZero() {
 		return nil
 	}
 
@@ -156,12 +121,12 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		usageDatapoints = append(usageDatapoints, datapoint.Memory)
 	}
 
-	usageStates := resourcesmonitor.CalculateMemoryUsageStates(*monitor, usageDatapoints)
+	usageStates := resourcesmonitor.CalculateMemoryUsageStates(a.memoryMonitor, usageDatapoints)
 
-	oldState := monitor.State
+	oldState := a.memoryMonitor.State
 	newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
 
-	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
+	debouncedUntil, shouldNotify := a.memoryMonitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
 
 	//nolint:gocritic // We need to be able to update the resource monitor here.
 	err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
@@ -175,11 +140,9 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		return xerrors.Errorf("update workspace monitor: %w", err)
 	}
 
 	// Update cached state
-	a.monitorsLock.Lock()
 	a.memoryMonitor.State = newState
 	a.memoryMonitor.DebouncedUntil = dbtime.Time(debouncedUntil)
 	a.memoryMonitor.UpdatedAt = dbtime.Time(a.Clock.Now())
-	a.monitorsLock.Unlock()
 
 	if !shouldNotify {
 		return nil
@@ -197,7 +160,7 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 		notifications.TemplateWorkspaceOutOfMemory,
 		map[string]string{
 			"workspace": workspace.Name,
-			"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
+			"threshold": fmt.Sprintf("%d%%", a.memoryMonitor.Threshold),
 		},
 		map[string]any{
 			// NOTE(DanielleMaywood):
@@ -223,22 +186,9 @@ func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints [
 }
 
 func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	// Load monitors once
-	var fetchErr error
-	a.volOnce.Do(func() {
-		fetchErr = a.fetchVolumeMonitors(ctx)
-	})
-	if fetchErr != nil {
-		return fetchErr
-	}
-
-	a.monitorsLock.RLock()
-	volumeMonitors := a.volumeMonitors
-	a.monitorsLock.RUnlock()
-
 	outOfDiskVolumes := make([]map[string]any, 0)
 
-	for i, monitor := range volumeMonitors {
+	for i, monitor := range a.volumeMonitors {
 		if !monitor.Enabled {
 			continue
 		}
@@ -283,11 +233,9 @@ func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints
 		}); err != nil {
 			return xerrors.Errorf("update workspace monitor: %w", err)
 		}
 
 		// Update cached state
-		a.monitorsLock.Lock()
 		a.volumeMonitors[i].State = newState
 		a.volumeMonitors[i].DebouncedUntil = dbtime.Time(debouncedUntil)
 		a.volumeMonitors[i].UpdatedAt = dbtime.Time(a.Clock.Now())
-		a.monitorsLock.Unlock()
 	}
 
 	if len(outOfDiskVolumes) == 0 {
diff --git a/coderd/agentapi/resources_monitoring_test.go b/coderd/agentapi/resources_monitoring_test.go
index c491d3789355b..debc6c0cce36e 100644
--- a/coderd/agentapi/resources_monitoring_test.go
+++ b/coderd/agentapi/resources_monitoring_test.go
@@ -101,6 +101,9 @@ func TestMemoryResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})
 
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: The monitor is given a state that will trigger NOK
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -304,6 +307,9 @@ func TestMemoryResourceMonitor(t *testing.T) {
 				Threshold: 80,
 			})
 
+			// Initialize API to fetch and cache the monitors
+			require.NoError(t, api.InitMonitors(context.Background()))
+
 			clock.Set(collectedAt)
 			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 				Datapoints: datapoints,
@@ -337,6 +343,8 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 			State:     database.WorkspaceAgentMonitorStateOK,
 			Threshold: 80,
 		})
+		// Initialize API to fetch and cache the monitors
+		require.NoError(t, api.InitMonitors(context.Background()))
 
 		// When: A datapoint is missing, surrounded by two NOK datapoints.
 		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
@@ -387,6 +395,9 @@ func TestMemoryResourceMonitorMissingData(t *testing.T) {
 			Threshold: 80,
 		})
 
+		// Initialize API to fetch and cache the monitors
+		require.NoError(t, api.InitMonitors(context.Background()))
+
 		// When: A datapoint is missing, surrounded by two OK datapoints.
 		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -466,6 +477,9 @@ func TestVolumeResourceMonitorDebounce(t *testing.T) {
 		Threshold: 80,
 	})
 
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When:
 	// - First monitor is in a NOK state
 	// - Second monitor is in an OK state
@@ -482,6 +496,7 @@ func TestVolumeResourceMonitorDebounce(t *testing.T) {
 	})
 	require.NoError(t, err)
 
+
 	// Then:
 	// - We expect a notification from only the first monitor
 	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
@@ -742,6 +757,9 @@ func TestVolumeResourceMonitor(t *testing.T) {
 				Threshold: tt.thresholdPercent,
 			})
 
+			// Initialize API to fetch and cache the monitors
+			require.NoError(t, api.InitMonitors(context.Background()))
+
 			clock.Set(collectedAt)
 			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 				Datapoints: datapoints,
@@ -780,6 +798,9 @@ func TestVolumeResourceMonitorMultiple(t *testing.T) {
 		Threshold: 80,
 	})
 
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: both of them move to a NOK state
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -800,7 +821,7 @@ func TestVolumeResourceMonitorMultiple(t *testing.T) {
 			},
 		},
 	})
-	require.NoError(t, err)
+    require.NoError(t, err)
 
 	// Then: We expect a notification to alert with information about both
 	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
@@ -832,6 +853,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})
 
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two NOK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
@@ -891,6 +915,9 @@ func TestVolumeResourceMonitorMissingData(t *testing.T) {
 		Threshold: 80,
 	})
 
+	// Initialize API to fetch and cache the monitors
+	require.NoError(t, api.InitMonitors(context.Background()))
+
 	// When: A datapoint is missing, surrounded by two OK datapoints.
 	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
 		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{

From 8743e8e6a94bc89448b9a55cc196dc0940a66950 Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Sat, 25 Oct 2025 02:24:55 +0000
Subject: [PATCH 4/5] ctx is no longer used in this function

Signed-off-by: Callum Styan
---
 coderd/agentapi/resources_monitoring.go      | 2 +-
 coderd/agentapi/resources_monitoring_test.go | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/coderd/agentapi/resources_monitoring.go b/coderd/agentapi/resources_monitoring.go
index 668b18cd4dede..d13e82621407b 100644
--- a/coderd/agentapi/resources_monitoring.go
+++ b/coderd/agentapi/resources_monitoring.go
@@ -66,7 +66,7 @@ func (a *ResourcesMonitoringAPI) InitMonitors(ctx context.Context) error {
 	return nil
 }
 
-func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
+func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(_ context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
 	return &proto.GetResourcesMonitoringConfigurationResponse{
 		Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
 			CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
diff --git a/coderd/agentapi/resources_monitoring_test.go b/coderd/agentapi/resources_monitoring_test.go
index debc6c0cce36e..7b457dd45331a 100644
--- a/coderd/agentapi/resources_monitoring_test.go
+++ b/coderd/agentapi/resources_monitoring_test.go
@@ -496,7 +496,6 @@ func TestVolumeResourceMonitorDebounce(t *testing.T) {
 	})
 	require.NoError(t, err)
 
-
 	// Then:
 	// - We expect a notification from only the first monitor
 	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
@@ -821,7 +820,7 @@ func TestVolumeResourceMonitorMultiple(t *testing.T) {
 			},
 		},
 	})
-    require.NoError(t, err)
+	require.NoError(t, err)
 
 	// Then: We expect a notification to alert with information about both
 	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))

From 87934c33c9e1bbb4426db5432ce9c32fac7486dc Mon Sep 17 00:00:00 2001
From: Callum Styan
Date: Tue, 28 Oct 2025 20:14:44 +0000
Subject: [PATCH 5/5] remove unnecessary OR condition in monitorMemory

Signed-off-by: Callum Styan
---
 coderd/agentapi/resources_monitoring.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/coderd/agentapi/resources_monitoring.go b/coderd/agentapi/resources_monitoring.go
index d13e82621407b..db0d523192280 100644
--- a/coderd/agentapi/resources_monitoring.go
+++ b/coderd/agentapi/resources_monitoring.go
@@ -112,7 +112,7 @@ func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Contex
 }
 
 func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
-	if !a.memoryMonitor.Enabled || a.memoryMonitor.CreatedAt.IsZero() {
+	if !a.memoryMonitor.Enabled {
 		return nil
 	}
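Taken together, the series lands on eager initialization plus coarse locking: InitMonitors fetches both monitor types once per agent connection (failing the connection on a real error), GetResourcesMonitoringConfiguration reads the cache without touching the DB, and PushResourcesMonitoringUsage holds the mutex for an entire push, which is acceptable because the agent sends pushes sequentially. Below is a minimal sketch of that final shape, with hypothetical monitorAPI/monitorState names rather than the PR's actual types:

package main

import (
	"context"
	"fmt"
	"sync"
)

// monitorState stands in for the cached monitor row the API keeps after Init.
type monitorState struct {
	Enabled bool
	State   string
}

// monitorAPI mirrors the final design: Init fills the cache once when the
// agent connection is established; Push mutates it under a single lock,
// relying on the agent sending pushes one at a time.
type monitorAPI struct {
	mu     sync.RWMutex
	memory monitorState
}

func (a *monitorAPI) Init(ctx context.Context, fetch func(context.Context) (monitorState, error)) error {
	m, err := fetch(ctx)
	if err != nil {
		return err // caller tears down the connection, mirroring Serve()
	}
	a.memory = m
	return nil
}

func (a *monitorAPI) Push(newState string) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if !a.memory.Enabled {
		return
	}
	a.memory.State = newState // update the cache alongside the (omitted) DB write
}

func main() {
	api := &monitorAPI{}
	_ = api.Init(context.Background(), func(context.Context) (monitorState, error) {
		return monitorState{Enabled: true, State: "OK"}, nil
	})
	api.Push("NOK")
	api.mu.RLock()
	fmt.Println(api.memory.State) // NOK
	api.mu.RUnlock()
}

The coarse lock trades granularity for simplicity; since each ResourcesMonitoringAPI instance serves a single agent connection, contention is not a practical concern.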