Skip to content

Commit 00793cc

Browse files
authored
feat: add prometheus observability metrics for dbpurge (#21074)
Related to [`internal#1139`](coder/internal#1139) This implements some prometheus metrics for records being removed from the database. Currently we're tracking the following fields being removed from the DB by this. They're viewable in the `/api/v2/debug/metrics` endpoint. * `expired_api_keys` * `aibridge_records` * `connection_logs` * `duration` ``` # HELP coderd_dbpurge_iteration_duration_seconds Duration of each dbpurge iteration in seconds. # TYPE coderd_dbpurge_iteration_duration_seconds histogram coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="1"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="5"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="10"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="30"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="60"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="300"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="600"} 1 coderd_dbpurge_iteration_duration_seconds_bucket{success="true",le="+Inf"} 1 coderd_dbpurge_iteration_duration_seconds_sum{success="true"} 0.014787814 coderd_dbpurge_iteration_duration_seconds_count{success="true"} 1 # HELP coderd_dbpurge_records_purged_total Total number of records purged by type. # TYPE coderd_dbpurge_records_purged_total counter coderd_dbpurge_records_purged_total{record_type="aibridge_records"} 0 coderd_dbpurge_records_purged_total{record_type="audit_logs"} 0 coderd_dbpurge_records_purged_total{record_type="connection_logs"} 0 coderd_dbpurge_records_purged_total{record_type="expired_api_keys"} 0 coderd_dbpurge_records_purged_total{record_type="workspace_agent_logs"} 0 ``` | Position | Pull-request | | -------- | ------------ | | ✅ | [feat: add prometheus observability metrics for `dbpurge`](#21074) | | | [feat: add rbac specificity for `dbpurge`](#21088) |
1 parent 5b3c24c commit 00793cc

File tree

3 files changed

+154
-15
lines changed

3 files changed

+154
-15
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1038,7 +1038,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10381038
defer shutdownConns()
10391039

10401040
// Ensures that old database entries are cleaned up over time!
1041-
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database, options.DeploymentValues, quartz.NewReal())
1041+
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database, options.DeploymentValues, quartz.NewReal(), options.PrometheusRegistry)
10421042
defer purger.Close()
10431043

10441044
// Updates workspace usage

coderd/database/dbpurge/dbpurge.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77

88
"golang.org/x/xerrors"
99

10+
"github.com/prometheus/client_golang/prometheus"
11+
1012
"cdr.dev/slog"
1113

1214
"github.com/coder/coder/v2/coderd/database"
@@ -40,13 +42,30 @@ const (
4042
// It is the caller's responsibility to call Close on the returned instance.
4143
//
4244
// This is for cleaning up old, unused resources from the database that take up space.
43-
func New(ctx context.Context, logger slog.Logger, db database.Store, vals *codersdk.DeploymentValues, clk quartz.Clock) io.Closer {
45+
func New(ctx context.Context, logger slog.Logger, db database.Store, vals *codersdk.DeploymentValues, clk quartz.Clock, reg prometheus.Registerer) io.Closer {
4446
closed := make(chan struct{})
4547

4648
ctx, cancelFunc := context.WithCancel(ctx)
4749
//nolint:gocritic // The system purges old db records without user input.
4850
ctx = dbauthz.AsSystemRestricted(ctx)
4951

52+
iterationDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
53+
Namespace: "coderd",
54+
Subsystem: "dbpurge",
55+
Name: "iteration_duration_seconds",
56+
Help: "Duration of each dbpurge iteration in seconds.",
57+
Buckets: []float64{1, 5, 10, 30, 60, 300, 600}, // 1s to 10min
58+
}, []string{"success"})
59+
reg.MustRegister(iterationDuration)
60+
61+
recordsPurged := prometheus.NewCounterVec(prometheus.CounterOpts{
62+
Namespace: "coderd",
63+
Subsystem: "dbpurge",
64+
Name: "records_purged_total",
65+
Help: "Total number of records purged by type.",
66+
}, []string{"record_type"})
67+
reg.MustRegister(recordsPurged)
68+
5069
// Start the ticker with the initial delay.
5170
ticker := clk.NewTicker(delay)
5271
doTick := func(ctx context.Context, start time.Time) {
@@ -164,9 +183,22 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, vals *coder
164183
slog.F("duration", clk.Since(start)),
165184
)
166185

186+
duration := clk.Since(start)
187+
iterationDuration.WithLabelValues("true").Observe(duration.Seconds())
188+
recordsPurged.WithLabelValues("workspace_agent_logs").Add(float64(purgedWorkspaceAgentLogs))
189+
recordsPurged.WithLabelValues("expired_api_keys").Add(float64(expiredAPIKeys))
190+
recordsPurged.WithLabelValues("aibridge_records").Add(float64(purgedAIBridgeRecords))
191+
recordsPurged.WithLabelValues("connection_logs").Add(float64(purgedConnectionLogs))
192+
recordsPurged.WithLabelValues("audit_logs").Add(float64(purgedAuditLogs))
193+
167194
return nil
168195
}, database.DefaultTXOptions().WithID("db_purge")); err != nil {
169196
logger.Error(ctx, "failed to purge old database entries", slog.Error(err))
197+
198+
// Record metrics for failed purge iteration.
199+
duration := clk.Since(start)
200+
iterationDuration.WithLabelValues("false").Observe(duration.Seconds())
201+
170202
return
171203
}
172204
}

coderd/database/dbpurge/dbpurge_test.go

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,17 @@ import (
1212
"time"
1313

1414
"github.com/google/uuid"
15+
"github.com/prometheus/client_golang/prometheus"
1516
"github.com/stretchr/testify/assert"
1617
"github.com/stretchr/testify/require"
1718
"go.uber.org/goleak"
1819
"go.uber.org/mock/gomock"
20+
"golang.org/x/xerrors"
1921

2022
"cdr.dev/slog"
2123
"cdr.dev/slog/sloggers/slogtest"
2224

25+
"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
2326
"github.com/coder/coder/v2/coderd/database"
2427
"github.com/coder/coder/v2/coderd/database/dbgen"
2528
"github.com/coder/coder/v2/coderd/database/dbmock"
@@ -52,11 +55,114 @@ func TestPurge(t *testing.T) {
5255
done := awaitDoTick(ctx, t, clk)
5356
mDB := dbmock.NewMockStore(gomock.NewController(t))
5457
mDB.EXPECT().InTx(gomock.Any(), database.DefaultTXOptions().WithID("db_purge")).Return(nil).Times(2)
55-
purger := dbpurge.New(context.Background(), testutil.Logger(t), mDB, &codersdk.DeploymentValues{}, clk)
58+
purger := dbpurge.New(context.Background(), testutil.Logger(t), mDB, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
5659
<-done // wait for doTick() to run.
5760
require.NoError(t, purger.Close())
5861
}
5962

63+
//nolint:paralleltest // It uses LockIDDBPurge.
64+
func TestMetrics(t *testing.T) {
65+
t.Run("SuccessfulIteration", func(t *testing.T) {
66+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
67+
defer cancel()
68+
69+
reg := prometheus.NewRegistry()
70+
clk := quartz.NewMock(t)
71+
now := time.Date(2025, 1, 15, 7, 30, 0, 0, time.UTC)
72+
clk.Set(now).MustWait(ctx)
73+
74+
db, _ := dbtestutil.NewDB(t)
75+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
76+
user := dbgen.User(t, db, database.User{})
77+
78+
oldExpiredKey, _ := dbgen.APIKey(t, db, database.APIKey{
79+
UserID: user.ID,
80+
ExpiresAt: now.Add(-8 * 24 * time.Hour), // Expired 8 days ago
81+
TokenName: "old-expired-key",
82+
})
83+
84+
_, err := db.GetAPIKeyByID(ctx, oldExpiredKey.ID)
85+
require.NoError(t, err, "key should exist before purge")
86+
87+
done := awaitDoTick(ctx, t, clk)
88+
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
89+
Retention: codersdk.RetentionConfig{
90+
APIKeys: serpent.Duration(7 * 24 * time.Hour), // 7 days retention
91+
},
92+
}, clk, reg)
93+
defer closer.Close()
94+
testutil.TryReceive(ctx, t, done)
95+
96+
hist := promhelp.HistogramValue(t, reg, "coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
97+
"success": "true",
98+
})
99+
require.NotNil(t, hist)
100+
require.Greater(t, hist.GetSampleCount(), uint64(0), "should have at least one sample")
101+
102+
expiredAPIKeys := promhelp.CounterValue(t, reg, "coderd_dbpurge_records_purged_total", prometheus.Labels{
103+
"record_type": "expired_api_keys",
104+
})
105+
require.Greater(t, expiredAPIKeys, 0, "should have deleted at least one expired API key")
106+
107+
_, err = db.GetAPIKeyByID(ctx, oldExpiredKey.ID)
108+
require.Error(t, err, "key should be deleted after purge")
109+
110+
workspaceAgentLogs := promhelp.CounterValue(t, reg, "coderd_dbpurge_records_purged_total", prometheus.Labels{
111+
"record_type": "workspace_agent_logs",
112+
})
113+
require.GreaterOrEqual(t, workspaceAgentLogs, 0)
114+
115+
aibridgeRecords := promhelp.CounterValue(t, reg, "coderd_dbpurge_records_purged_total", prometheus.Labels{
116+
"record_type": "aibridge_records",
117+
})
118+
require.GreaterOrEqual(t, aibridgeRecords, 0)
119+
120+
connectionLogs := promhelp.CounterValue(t, reg, "coderd_dbpurge_records_purged_total", prometheus.Labels{
121+
"record_type": "connection_logs",
122+
})
123+
require.GreaterOrEqual(t, connectionLogs, 0)
124+
125+
auditLogs := promhelp.CounterValue(t, reg, "coderd_dbpurge_records_purged_total", prometheus.Labels{
126+
"record_type": "audit_logs",
127+
})
128+
require.GreaterOrEqual(t, auditLogs, 0)
129+
})
130+
131+
t.Run("FailedIteration", func(t *testing.T) {
132+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
133+
defer cancel()
134+
135+
reg := prometheus.NewRegistry()
136+
clk := quartz.NewMock(t)
137+
now := clk.Now()
138+
clk.Set(now).MustWait(ctx)
139+
140+
ctrl := gomock.NewController(t)
141+
mDB := dbmock.NewMockStore(ctrl)
142+
mDB.EXPECT().InTx(gomock.Any(), database.DefaultTXOptions().WithID("db_purge")).
143+
Return(xerrors.New("simulated database error")).
144+
MinTimes(1)
145+
146+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
147+
148+
done := awaitDoTick(ctx, t, clk)
149+
closer := dbpurge.New(ctx, logger, mDB, &codersdk.DeploymentValues{}, clk, reg)
150+
defer closer.Close()
151+
testutil.TryReceive(ctx, t, done)
152+
153+
hist := promhelp.HistogramValue(t, reg, "coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
154+
"success": "false",
155+
})
156+
require.NotNil(t, hist)
157+
require.Greater(t, hist.GetSampleCount(), uint64(0), "should have at least one sample")
158+
159+
successHist := promhelp.MetricValue(t, reg, "coderd_dbpurge_iteration_duration_seconds", prometheus.Labels{
160+
"success": "true",
161+
})
162+
require.Nil(t, successHist, "should not have success=true metric on failure")
163+
})
164+
}
165+
60166
//nolint:paralleltest // It uses LockIDDBPurge.
61167
func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
62168
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
@@ -130,7 +236,7 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
130236
})
131237

132238
// when
133-
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk)
239+
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
134240
defer closer.Close()
135241

136242
// then
@@ -155,7 +261,7 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
155261

156262
// Start a new purger to immediately trigger delete after rollup.
157263
_ = closer.Close()
158-
closer = dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk)
264+
closer = dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
159265
defer closer.Close()
160266

161267
// then
@@ -250,7 +356,8 @@ func TestDeleteOldWorkspaceAgentLogs(t *testing.T) {
250356
Retention: codersdk.RetentionConfig{
251357
WorkspaceAgentLogs: serpent.Duration(7 * 24 * time.Hour),
252358
},
253-
}, clk)
359+
}, clk, prometheus.NewRegistry())
360+
254361
defer closer.Close()
255362
<-done // doTick() has now run.
256363

@@ -464,7 +571,7 @@ func TestDeleteOldWorkspaceAgentLogsRetention(t *testing.T) {
464571
done := awaitDoTick(ctx, t, clk)
465572
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
466573
Retention: tc.retentionConfig,
467-
}, clk)
574+
}, clk, prometheus.NewRegistry())
468575
defer closer.Close()
469576
testutil.TryReceive(ctx, t, done)
470577

@@ -555,7 +662,7 @@ func TestDeleteOldProvisionerDaemons(t *testing.T) {
555662
require.NoError(t, err)
556663

557664
// when
558-
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk)
665+
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
559666
defer closer.Close()
560667

561668
// then
@@ -659,7 +766,7 @@ func TestDeleteOldAuditLogConnectionEvents(t *testing.T) {
659766

660767
// Run the purge
661768
done := awaitDoTick(ctx, t, clk)
662-
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk)
769+
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
663770
defer closer.Close()
664771
// Wait for tick
665772
testutil.TryReceive(ctx, t, done)
@@ -822,7 +929,7 @@ func TestDeleteOldTelemetryHeartbeats(t *testing.T) {
822929
require.NoError(t, err)
823930

824931
done := awaitDoTick(ctx, t, clk)
825-
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk)
932+
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{}, clk, prometheus.NewRegistry())
826933
defer closer.Close()
827934
<-done // doTick() has now run.
828935

@@ -941,7 +1048,7 @@ func TestDeleteOldConnectionLogs(t *testing.T) {
9411048
done := awaitDoTick(ctx, t, clk)
9421049
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
9431050
Retention: tc.retentionConfig,
944-
}, clk)
1051+
}, clk, prometheus.NewRegistry())
9451052
defer closer.Close()
9461053
testutil.TryReceive(ctx, t, done)
9471054

@@ -1197,7 +1304,7 @@ func TestDeleteOldAIBridgeRecords(t *testing.T) {
11971304
Retention: serpent.Duration(tc.retention),
11981305
},
11991306
},
1200-
}, clk)
1307+
}, clk, prometheus.NewRegistry())
12011308
defer closer.Close()
12021309
testutil.TryReceive(ctx, t, done)
12031310

@@ -1284,7 +1391,7 @@ func TestDeleteOldAuditLogs(t *testing.T) {
12841391
done := awaitDoTick(ctx, t, clk)
12851392
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
12861393
Retention: tc.retentionConfig,
1287-
}, clk)
1394+
}, clk, prometheus.NewRegistry())
12881395
defer closer.Close()
12891396
testutil.TryReceive(ctx, t, done)
12901397

@@ -1374,7 +1481,7 @@ func TestDeleteOldAuditLogs(t *testing.T) {
13741481
Retention: codersdk.RetentionConfig{
13751482
AuditLogs: serpent.Duration(retentionPeriod),
13761483
},
1377-
}, clk)
1484+
}, clk, prometheus.NewRegistry())
13781485
defer closer.Close()
13791486
testutil.TryReceive(ctx, t, done)
13801487

@@ -1494,7 +1601,7 @@ func TestDeleteExpiredAPIKeys(t *testing.T) {
14941601
done := awaitDoTick(ctx, t, clk)
14951602
closer := dbpurge.New(ctx, logger, db, &codersdk.DeploymentValues{
14961603
Retention: tc.retentionConfig,
1497-
}, clk)
1604+
}, clk, prometheus.NewRegistry())
14981605
defer closer.Close()
14991606
testutil.TryReceive(ctx, t, done)
15001607

0 commit comments

Comments
 (0)