Skip to content

Commit 5f05feb

Browse files
committed
feat: add boundary log forwarding to coderd
Add a feature that transmits boundary audit logs from workspaces to coderd via the agent API, then re-emits them to stderr in a structured format. Architecture: - Boundary process connects to Unix socket - Boundary batches logs and sends TLV prefixed protobuf ReportBoundaryLogsRequest messages to Agent - Agent proxies messages to coderd via DRPC - coderd re-emits to stderr Log format example: [API] 2025-12-08 20:58:46.093 [warn] boundary: workspace.id=... decision=deny http.method="GET" http.url="..." time="..."
1 parent 547e53f commit 5f05feb

File tree

15 files changed

+1599
-280
lines changed

15 files changed

+1599
-280
lines changed

agent/agent.go

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import (
4343
"github.com/coder/coder/v2/agent/agentscripts"
4444
"github.com/coder/coder/v2/agent/agentsocket"
4545
"github.com/coder/coder/v2/agent/agentssh"
46+
"github.com/coder/coder/v2/agent/boundarylogproxy"
4647
"github.com/coder/coder/v2/agent/proto"
4748
"github.com/coder/coder/v2/agent/proto/resourcesmonitor"
4849
"github.com/coder/coder/v2/agent/reconnectingpty"
@@ -105,8 +106,8 @@ type Options struct {
105106
}
106107

107108
type Client interface {
108-
ConnectRPC26(ctx context.Context) (
109-
proto.DRPCAgentClient26, tailnetproto.DRPCTailnetClient26, error,
109+
ConnectRPC27(ctx context.Context) (
110+
proto.DRPCAgentClient27, tailnetproto.DRPCTailnetClient26, error,
110111
)
111112
tailnet.DERPMapRewriter
112113
agentsdk.RefreshableSessionTokenProvider
@@ -277,6 +278,11 @@ type agent struct {
277278

278279
logSender *agentsdk.LogSender
279280

281+
// boundaryLogProxy is a socket server that forwards boundary audit logs to coderd.
282+
// It may be nil if boundary audit logs are not enabled or there is a problem starting
283+
// the server.
284+
boundaryLogProxy *boundarylogproxy.Server
285+
280286
prometheusRegistry *prometheus.Registry
281287
// metrics are prometheus registered metrics that will be collected and
282288
// labeled in Coder with the agent + workspace.
@@ -371,6 +377,7 @@ func (a *agent) init() {
371377
)
372378

373379
a.initSocketServer()
380+
a.startBoundaryLogProxyServer()
374381

375382
go a.runLoop()
376383
}
@@ -395,6 +402,28 @@ func (a *agent) initSocketServer() {
395402
a.logger.Debug(a.hardCtx, "socket server started", slog.F("path", a.socketPath))
396403
}
397404

405+
// startBoundaryLogProxyServer starts the boundary log proxy socket server.
406+
// The socket is always created at the well-known path so boundary can connect.
407+
func (a *agent) startBoundaryLogProxyServer() {
408+
// Boundary connects to this socket to send audit logs to the agent.
409+
const boundaryAuditSocketPath = "/tmp/boundary-audit.sock"
410+
411+
proxy := boundarylogproxy.NewServer(a.logger, boundaryAuditSocketPath)
412+
if err := proxy.Start(a.hardCtx); err != nil {
413+
a.logger.Warn(a.hardCtx, "failed to start boundary log proxy", slog.Error(err))
414+
return
415+
}
416+
417+
a.boundaryLogProxy = proxy
418+
a.logger.Info(a.hardCtx, "boundary log proxy server started",
419+
slog.F("socket_path", boundaryAuditSocketPath))
420+
}
421+
422+
// forwardBoundaryLogs forwards buffered boundary audit logs to coderd.
423+
func (a *agent) forwardBoundaryLogs(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
424+
return a.boundaryLogProxy.RunForwarder(ctx, aAPI)
425+
}
426+
398427
// runLoop attempts to start the agent in a retry loop.
399428
// Coder may be offline temporarily, a connection issue
400429
// may be happening, but regardless after the intermittent
@@ -506,7 +535,7 @@ func (t *trySingleflight) Do(key string, fn func()) {
506535
fn()
507536
}
508537

509-
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
538+
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
510539
tickerDone := make(chan struct{})
511540
collectDone := make(chan struct{})
512541
ctx, cancel := context.WithCancel(ctx)
@@ -721,7 +750,7 @@ func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient26
721750

722751
// reportLifecycle reports the current lifecycle state once. All state
723752
// changes are reported in order.
724-
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
753+
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
725754
for {
726755
select {
727756
case <-a.lifecycleUpdate:
@@ -801,7 +830,7 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
801830
}
802831

803832
// reportConnectionsLoop reports connections to the agent for auditing.
804-
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
833+
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
805834
for {
806835
select {
807836
case <-a.reportConnectionsUpdate:
@@ -932,7 +961,7 @@ func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_T
932961
// fetchServiceBannerLoop fetches the service banner on an interval. It will
933962
// not be fetched immediately; the expectation is that it is primed elsewhere
934963
// (and must be done before the session actually starts).
935-
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
964+
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
936965
ticker := time.NewTicker(a.announcementBannersRefreshInterval)
937966
defer ticker.Stop()
938967
for {
@@ -967,7 +996,7 @@ func (a *agent) run() (retErr error) {
967996
}
968997

969998
// ConnectRPC returns the dRPC connection we use for the Agent and Tailnet v2+ APIs
970-
aAPI, tAPI, err := a.client.ConnectRPC26(a.hardCtx)
999+
aAPI, tAPI, err := a.client.ConnectRPC27(a.hardCtx)
9711000
if err != nil {
9721001
return err
9731002
}
@@ -984,7 +1013,7 @@ func (a *agent) run() (retErr error) {
9841013
connMan := newAPIConnRoutineManager(a.gracefulCtx, a.hardCtx, a.logger, aAPI, tAPI)
9851014

9861015
connMan.startAgentAPI("init notification banners", gracefulShutdownBehaviorStop,
987-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1016+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
9881017
bannersProto, err := aAPI.GetAnnouncementBanners(ctx, &proto.GetAnnouncementBannersRequest{})
9891018
if err != nil {
9901019
return xerrors.Errorf("fetch service banner: %w", err)
@@ -1001,7 +1030,7 @@ func (a *agent) run() (retErr error) {
10011030
// sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by
10021031
// shutdown scripts.
10031032
connMan.startAgentAPI("send logs", gracefulShutdownBehaviorRemain,
1004-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1033+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10051034
err := a.logSender.SendLoop(ctx, aAPI)
10061035
if xerrors.Is(err, agentsdk.ErrLogLimitExceeded) {
10071036
// we don't want this error to tear down the API connection and propagate to the
@@ -1012,6 +1041,12 @@ func (a *agent) run() (retErr error) {
10121041
return err
10131042
})
10141043

1044+
// Forward boundary audit logs to coderd if boundary log forwarding is enabled.
1045+
// These are audit logs so they should continue during graceful shutdown.
1046+
if a.boundaryLogProxy != nil {
1047+
connMan.startAgentAPI("boundary log proxy", gracefulShutdownBehaviorRemain, a.forwardBoundaryLogs)
1048+
}
1049+
10151050
// part of graceful shut down is reporting the final lifecycle states, e.g "ShuttingDown" so the
10161051
// lifecycle reporting has to be via gracefulShutdownBehaviorRemain
10171052
connMan.startAgentAPI("report lifecycle", gracefulShutdownBehaviorRemain, a.reportLifecycle)
@@ -1020,7 +1055,7 @@ func (a *agent) run() (retErr error) {
10201055
connMan.startAgentAPI("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata)
10211056

10221057
// resources monitor can cease as soon as we start gracefully shutting down.
1023-
connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1058+
connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10241059
logger := a.logger.Named("resources_monitor")
10251060
clk := quartz.NewReal()
10261061
config, err := aAPI.GetResourcesMonitoringConfiguration(ctx, &proto.GetResourcesMonitoringConfigurationRequest{})
@@ -1067,7 +1102,7 @@ func (a *agent) run() (retErr error) {
10671102
connMan.startAgentAPI("handle manifest", gracefulShutdownBehaviorStop, a.handleManifest(manifestOK))
10681103

10691104
connMan.startAgentAPI("app health reporter", gracefulShutdownBehaviorStop,
1070-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1105+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10711106
if err := manifestOK.wait(ctx); err != nil {
10721107
return xerrors.Errorf("no manifest: %w", err)
10731108
}
@@ -1100,7 +1135,7 @@ func (a *agent) run() (retErr error) {
11001135

11011136
connMan.startAgentAPI("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop)
11021137

1103-
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1138+
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
11041139
if err := networkOK.wait(ctx); err != nil {
11051140
return xerrors.Errorf("no network: %w", err)
11061141
}
@@ -1115,8 +1150,8 @@ func (a *agent) run() (retErr error) {
11151150
}
11161151

11171152
// handleManifest returns a function that fetches and processes the manifest
1118-
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1119-
return func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1153+
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
1154+
return func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
11201155
var (
11211156
sentResult = false
11221157
err error
@@ -1279,7 +1314,7 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
12791314

12801315
func (a *agent) createDevcontainer(
12811316
ctx context.Context,
1282-
aAPI proto.DRPCAgentClient26,
1317+
aAPI proto.DRPCAgentClient27,
12831318
dc codersdk.WorkspaceAgentDevcontainer,
12841319
script codersdk.WorkspaceAgentScript,
12851320
) (err error) {
@@ -1311,8 +1346,8 @@ func (a *agent) createDevcontainer(
13111346

13121347
// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates
13131348
// the tailnet using the information in the manifest
1314-
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient26) error {
1315-
return func(ctx context.Context, aAPI proto.DRPCAgentClient26) (retErr error) {
1349+
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient27) error {
1350+
return func(ctx context.Context, aAPI proto.DRPCAgentClient27) (retErr error) {
13161351
if err := manifestOK.wait(ctx); err != nil {
13171352
return xerrors.Errorf("no manifest: %w", err)
13181353
}
@@ -1401,6 +1436,7 @@ func (a *agent) updateCommandEnv(current []string) (updated []string, err error)
14011436
"CODER_WORKSPACE_NAME": manifest.WorkspaceName,
14021437
"CODER_WORKSPACE_AGENT_NAME": manifest.AgentName,
14031438
"CODER_WORKSPACE_OWNER_NAME": manifest.OwnerName,
1439+
"CODER_WORKSPACE_ID": manifest.WorkspaceID.String(),
14041440

14051441
// Specific Coder subcommands require the agent token exposed!
14061442
"CODER_AGENT_TOKEN": a.client.GetSessionToken(),
@@ -2098,7 +2134,7 @@ const (
20982134

20992135
type apiConnRoutineManager struct {
21002136
logger slog.Logger
2101-
aAPI proto.DRPCAgentClient26
2137+
aAPI proto.DRPCAgentClient27
21022138
tAPI tailnetproto.DRPCTailnetClient24
21032139
eg *errgroup.Group
21042140
stopCtx context.Context
@@ -2107,7 +2143,7 @@ type apiConnRoutineManager struct {
21072143

21082144
func newAPIConnRoutineManager(
21092145
gracefulCtx, hardCtx context.Context, logger slog.Logger,
2110-
aAPI proto.DRPCAgentClient26, tAPI tailnetproto.DRPCTailnetClient24,
2146+
aAPI proto.DRPCAgentClient27, tAPI tailnetproto.DRPCTailnetClient24,
21112147
) *apiConnRoutineManager {
21122148
// routines that remain in operation during graceful shutdown use the remainCtx. They'll still
21132149
// exit if the errgroup hits an error, which usually means a problem with the conn.
@@ -2140,7 +2176,7 @@ func newAPIConnRoutineManager(
21402176
// but for Tailnet.
21412177
func (a *apiConnRoutineManager) startAgentAPI(
21422178
name string, behavior gracefulShutdownBehavior,
2143-
f func(context.Context, proto.DRPCAgentClient26) error,
2179+
f func(context.Context, proto.DRPCAgentClient27) error,
21442180
) {
21452181
logger := a.logger.With(slog.F("name", name))
21462182
var ctx context.Context

agent/agentcontainers/subagent.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,12 @@ type SubAgentClient interface {
147147
// agent API client.
148148
type subAgentAPIClient struct {
149149
logger slog.Logger
150-
api agentproto.DRPCAgentClient26
150+
api agentproto.DRPCAgentClient27
151151
}
152152

153153
var _ SubAgentClient = (*subAgentAPIClient)(nil)
154154

155-
func NewSubAgentClientFromAPI(logger slog.Logger, agentAPI agentproto.DRPCAgentClient26) SubAgentClient {
155+
func NewSubAgentClientFromAPI(logger slog.Logger, agentAPI agentproto.DRPCAgentClient27) SubAgentClient {
156156
if agentAPI == nil {
157157
panic("developer error: agentAPI cannot be nil")
158158
}

agent/agentcontainers/subagent_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func TestSubAgentClient_CreateWithDisplayApps(t *testing.T) {
8181

8282
agentAPI := agenttest.NewClient(t, logger, uuid.New(), agentsdk.Manifest{}, statsCh, tailnet.NewCoordinator(logger))
8383

84-
agentClient, _, err := agentAPI.ConnectRPC26(ctx)
84+
agentClient, _, err := agentAPI.ConnectRPC27(ctx)
8585
require.NoError(t, err)
8686

8787
subAgentClient := agentcontainers.NewSubAgentClientFromAPI(logger, agentClient)
@@ -245,7 +245,7 @@ func TestSubAgentClient_CreateWithDisplayApps(t *testing.T) {
245245

246246
agentAPI := agenttest.NewClient(t, logger, uuid.New(), agentsdk.Manifest{}, statsCh, tailnet.NewCoordinator(logger))
247247

248-
agentClient, _, err := agentAPI.ConnectRPC26(ctx)
248+
agentClient, _, err := agentAPI.ConnectRPC27(ctx)
249249
require.NoError(t, err)
250250

251251
subAgentClient := agentcontainers.NewSubAgentClientFromAPI(logger, agentClient)

agent/agenttest/client.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ func (c *Client) Close() {
124124
c.derpMapOnce.Do(func() { close(c.derpMapUpdates) })
125125
}
126126

127-
func (c *Client) ConnectRPC26(ctx context.Context) (
128-
agentproto.DRPCAgentClient26, proto.DRPCTailnetClient26, error,
127+
func (c *Client) ConnectRPC27(ctx context.Context) (
128+
agentproto.DRPCAgentClient27, proto.DRPCTailnetClient26, error,
129129
) {
130130
conn, lis := drpcsdk.MemTransportPipe()
131131
c.LastWorkspaceAgent = func() {
@@ -405,6 +405,9 @@ func (f *FakeAgentAPI) ReportConnection(_ context.Context, req *agentproto.Repor
405405
return &emptypb.Empty{}, nil
406406
}
407407

408+
func (f *FakeAgentAPI) ReportBoundaryLogs(_ context.Context, _ *agentproto.ReportBoundaryLogsRequest) (*agentproto.ReportBoundaryLogsResponse, error) {
409+
return &agentproto.ReportBoundaryLogsResponse{}, nil
410+
}
408411
func (f *FakeAgentAPI) GetConnectionReports() []*agentproto.ReportConnectionRequest {
409412
f.Lock()
410413
defer f.Unlock()

0 commit comments

Comments
 (0)