diff --git a/cli/exp_scaletest.go b/cli/exp_scaletest.go index 419b1955477b9..cf79ec7ebcaaa 100644 --- a/cli/exp_scaletest.go +++ b/cli/exp_scaletest.go @@ -67,6 +67,8 @@ func (r *RootCmd) scaletestCmd() *serpent.Command { r.scaletestTaskStatus(), r.scaletestSMTP(), r.scaletestPrebuilds(), + r.scaletestBridge(), + r.scaletestLLMMock(), }, } diff --git a/cli/exp_scaletest_bridge.go b/cli/exp_scaletest_bridge.go new file mode 100644 index 0000000000000..b45da22cc1432 --- /dev/null +++ b/cli/exp_scaletest_bridge.go @@ -0,0 +1,278 @@ +//go:build !slim + +package cli + +import ( + "fmt" + "net/http" + "os/signal" + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/codersdk" + "github.com/coder/coder/v2/scaletest/bridge" + "github.com/coder/coder/v2/scaletest/createusers" + "github.com/coder/coder/v2/scaletest/harness" + "github.com/coder/serpent" +) + +func (r *RootCmd) scaletestBridge() *serpent.Command { + var ( + userCount int64 + noCleanup bool + mode string + upstreamURL string + directToken string + provider string + requestCount int64 + model string + stream bool + requestPayloadSize int64 + + timeoutStrategy = &timeoutFlags{} + cleanupStrategy = newScaletestCleanupStrategy() + output = &scaletestOutputFlags{} + ) + + cmd := &serpent.Command{ + Use: "bridge", + Short: "Generate load on the AI Bridge service.", + Long: `Generate load on the AI Bridge service by making requests to OpenAI or Anthropic APIs. + +Examples: + # Test OpenAI API through bridge + coder scaletest bridge --mode bridge --provider openai --user-count 10 --request-count 5 + + # Test Anthropic API through bridge + coder scaletest bridge --mode bridge --provider anthropic --user-count 10 --request-count 5 + + # Test directly against mock server + coder scaletest bridge --mode direct --provider openai --upstream-url http://localhost:8080/v1/chat/completions + +The load generator builds conversation history over time, with each request including +all previous messages in the conversation.`, + Handler: func(inv *serpent.Invocation) error { + ctx := inv.Context() + client, err := r.InitClient(inv) + if err != nil { + return err + } + + notifyCtx, stop := signal.NotifyContext(ctx, StopSignals...) 
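+ // notifyCtx is canceled when an interrupt signal arrives; the handler checks it after the run so an interrupted test skips writing stats instead of reporting partial results.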
+ defer stop() + ctx = notifyCtx + + if mode != "bridge" && mode != "direct" { + return xerrors.Errorf("--mode must be either 'bridge' or 'direct', got %q", mode) + } + + var me codersdk.User + if mode == "bridge" { + var err error + me, err = requireAdmin(ctx, client) + if err != nil { + return err + } + } else if upstreamURL == "" { + return xerrors.Errorf("--upstream-url must be set when using --mode direct") + } + + client.HTTPClient = &http.Client{ + Transport: &codersdk.HeaderTransport{ + Transport: http.DefaultTransport, + Header: map[string][]string{ + codersdk.BypassRatelimitHeader: {"true"}, + }, + }, + } + + if userCount <= 0 { + return xerrors.Errorf("--user-count must be greater than 0") + } + + if requestCount <= 0 { + requestCount = 1 + } + if provider == "" { + provider = "openai" + } + if model == "" { + if provider == "anthropic" { + model = "claude-3-opus-20240229" + } else { + model = "gpt-4" + } + } + + runnerCount := userCount + + outputs, err := output.parse() + if err != nil { + return xerrors.Errorf("could not parse --output flags") + } + + reg := prometheus.NewRegistry() + metrics := bridge.NewMetrics(reg) + + if mode == "bridge" { + _, _ = fmt.Fprintln(inv.Stderr, "Bridge mode: creating users and making requests through AI Bridge...") + } else { + _, _ = fmt.Fprintf(inv.Stderr, "Direct mode: making requests directly to %s\n", upstreamURL) + } + + configs := make([]bridge.Config, 0, runnerCount) + for range runnerCount { + config := bridge.Config{ + Mode: bridge.RequestMode(mode), + Metrics: metrics, + Provider: provider, + RequestCount: int(requestCount), + Model: model, + Stream: stream, + RequestPayloadSize: int(requestPayloadSize), + } + + if mode == "direct" { + config.UpstreamURL = upstreamURL + config.DirectToken = directToken + } else { + if len(me.OrganizationIDs) == 0 { + return xerrors.Errorf("admin user must have at least one organization") + } + config.User = createusers.Config{ + OrganizationID: me.OrganizationIDs[0], + } + } + + if err := config.Validate(); err != nil { + return xerrors.Errorf("validate config: %w", err) + } + configs = append(configs, config) + } + + th := harness.NewTestHarness(timeoutStrategy.wrapStrategy(harness.ConcurrentExecutionStrategy{}), cleanupStrategy.toStrategy()) + + for i, config := range configs { + id := strconv.Itoa(i) + name := fmt.Sprintf("bridge-%s", id) + var runner harness.Runnable = bridge.NewRunner(client, config) + th.AddRun(name, id, runner) + } + + _, _ = fmt.Fprintln(inv.Stderr, "Running bridge scaletest...") + testCtx, testCancel := timeoutStrategy.toContext(ctx) + defer testCancel() + err = th.Run(testCtx) + if err != nil { + return xerrors.Errorf("run test harness (harness failure, not a test failure): %w", err) + } + + // If the command was interrupted, skip stats. 
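+ // Results from a partially-executed harness run are not representative, so no outputs are written.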
+ if notifyCtx.Err() != nil { + return notifyCtx.Err() + } + + res := th.Results() + + for _, o := range outputs { + err = o.write(res, inv.Stdout) + if err != nil { + return xerrors.Errorf("write output %q to %q: %w", o.format, o.path, err) + } + } + + if !noCleanup { + _, _ = fmt.Fprintln(inv.Stderr, "\nCleaning up...") + cleanupCtx, cleanupCancel := cleanupStrategy.toContext(ctx) + defer cleanupCancel() + err = th.Cleanup(cleanupCtx) + if err != nil { + return xerrors.Errorf("cleanup tests: %w", err) + } + } + + if res.TotalFail > 0 { + return xerrors.New("load test failed, see above for more details") + } + + return nil + }, + } + + cmd.Options = serpent.OptionSet{ + { + Flag: "user-count", + FlagShorthand: "c", + Env: "CODER_SCALETEST_BRIDGE_USER_COUNT", + Description: "Required: Number of concurrent runners (in bridge mode, each creates a coder user).", + Value: serpent.Int64Of(&userCount), + Required: true, + }, + { + Flag: "mode", + Env: "CODER_SCALETEST_BRIDGE_MODE", + Default: "direct", + Description: "Request mode: 'bridge' (create users and use AI Bridge) or 'direct' (make requests directly to upstream-url).", + Value: serpent.StringOf(&mode), + }, + { + Flag: "upstream-url", + Env: "CODER_SCALETEST_BRIDGE_UPSTREAM_URL", + Description: "URL to make requests to directly (required in direct mode, e.g., http://localhost:8080/v1/chat/completions).", + Value: serpent.StringOf(&upstreamURL), + }, + { + Flag: "direct-token", + Env: "CODER_SCALETEST_BRIDGE_DIRECT_TOKEN", + Description: "Bearer token for direct mode (optional, uses client token if not set).", + Value: serpent.StringOf(&directToken), + }, + { + Flag: "provider", + Env: "CODER_SCALETEST_BRIDGE_PROVIDER", + Default: "openai", + Description: "API provider to use: 'openai' or 'anthropic'.", + Value: serpent.StringOf(&provider), + }, + { + Flag: "request-count", + Env: "CODER_SCALETEST_BRIDGE_REQUEST_COUNT", + Default: "1", + Description: "Number of sequential requests to make per runner.", + Value: serpent.Int64Of(&requestCount), + }, + { + Flag: "model", + Env: "CODER_SCALETEST_BRIDGE_MODEL", + Default: "gpt-4", + Description: "Model to use for requests.", + Value: serpent.StringOf(&model), + }, + { + Flag: "stream", + Env: "CODER_SCALETEST_BRIDGE_STREAM", + Description: "Enable streaming requests.", + Value: serpent.BoolOf(&stream), + }, + { + Flag: "request-payload-size", + Env: "CODER_SCALETEST_BRIDGE_REQUEST_PAYLOAD_SIZE", + Default: "0", + Description: "Size in bytes of the request payload (user message content). 
If 0, uses default message content.", + Value: serpent.Int64Of(&requestPayloadSize), + }, + { + Flag: "no-cleanup", + Env: "CODER_SCALETEST_NO_CLEANUP", + Description: "Do not clean up resources after the test completes.", + Value: serpent.BoolOf(&noCleanup), + }, + } + + timeoutStrategy.attach(&cmd.Options) + cleanupStrategy.attach(&cmd.Options) + output.attach(&cmd.Options) + return cmd +} diff --git a/cli/exp_scaletest_llmmock.go b/cli/exp_scaletest_llmmock.go new file mode 100644 index 0000000000000..2d03a08938ebb --- /dev/null +++ b/cli/exp_scaletest_llmmock.go @@ -0,0 +1,120 @@ +//go:build !slim + +package cli + +import ( + "fmt" + "net/http" + "os/signal" + "time" + + "golang.org/x/xerrors" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/v2/scaletest/llmmock" + "github.com/coder/serpent" +) + +func (*RootCmd) scaletestLLMMock() *serpent.Command { + var ( + address string + artificialLatency time.Duration + responsePayloadSize int64 + + pprofEnable bool + pprofAddress string + + traceEnable bool + ) + cmd := &serpent.Command{ + Use: "llm-mock", + Short: "Start a mock LLM API server for testing", + Long: `Start a mock LLM API server that simulates OpenAI and Anthropic APIs`, + Handler: func(inv *serpent.Invocation) error { + ctx, stop := signal.NotifyContext(inv.Context(), StopSignals...) + defer stop() + + logger := slog.Make(sloghuman.Sink(inv.Stderr)).Leveled(slog.LevelInfo) + + if pprofEnable { + _ = http.DefaultServeMux + closePprof := ServeHandler(ctx, logger, nil, pprofAddress, "pprof") + defer closePprof() + logger.Info(ctx, "pprof server started", slog.F("address", pprofAddress)) + } + + config := llmmock.Config{ + Address: address, + Logger: logger, + ArtificialLatency: artificialLatency, + ResponsePayloadSize: int(responsePayloadSize), + PprofEnable: pprofEnable, + PprofAddress: pprofAddress, + TraceEnable: traceEnable, + } + srv := new(llmmock.Server) + + if err := srv.Start(ctx, config); err != nil { + return xerrors.Errorf("start mock LLM server: %w", err) + } + defer func() { + _ = srv.Stop() + }() + + _, _ = fmt.Fprintf(inv.Stdout, "Mock LLM API server started on %s\n", srv.APIAddress()) + _, _ = fmt.Fprintf(inv.Stdout, " OpenAI endpoint: %s/v1/chat/completions\n", srv.APIAddress()) + _, _ = fmt.Fprintf(inv.Stdout, " Anthropic endpoint: %s/v1/messages\n", srv.APIAddress()) + + <-ctx.Done() + return nil + }, + } + + cmd.Options = []serpent.Option{ + { + Flag: "address", + Env: "CODER_SCALETEST_LLM_MOCK_ADDRESS", + Default: "localhost", + Description: "Address to bind the mock LLM API server. Can include a port (e.g., 'localhost:8080' or ':8080'). Uses a random port if no port is specified.", + Value: serpent.StringOf(&address), + }, + { + Flag: "artificial-latency", + Env: "CODER_SCALETEST_LLM_MOCK_ARTIFICIAL_LATENCY", + Default: "0s", + Description: "Artificial latency to add to each response (e.g., 100ms, 1s). Simulates slow upstream processing.", + Value: serpent.DurationOf(&artificialLatency), + }, + { + Flag: "response-payload-size", + Env: "CODER_SCALETEST_LLM_MOCK_RESPONSE_PAYLOAD_SIZE", + Default: "0", + Description: "Size in bytes of the response payload. 
If 0, uses default context-aware responses.", + Value: serpent.Int64Of(&responsePayloadSize), + }, + { + Flag: "pprof-enable", + Env: "CODER_SCALETEST_LLM_MOCK_PPROF_ENABLE", + Default: "false", + Description: "Serve pprof metrics on the address defined by pprof-address.", + Value: serpent.BoolOf(&pprofEnable), + }, + { + Flag: "pprof-address", + Env: "CODER_SCALETEST_LLM_MOCK_PPROF_ADDRESS", + Default: "127.0.0.1:6060", + Description: "The bind address to serve pprof.", + Value: serpent.StringOf(&pprofAddress), + }, + { + Flag: "trace-enable", + Env: "CODER_SCALETEST_LLM_MOCK_TRACE_ENABLE", + Default: "false", + Description: "Whether application tracing data is collected. It exports to a backend configured by environment variables. See: https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md.", + Value: serpent.BoolOf(&traceEnable), + }, + } + + return cmd +} diff --git a/go.mod b/go.mod index 17fb110747803..62f0b5353c5c5 100644 --- a/go.mod +++ b/go.mod @@ -440,7 +440,7 @@ require ( go.opentelemetry.io/collector/pdata/pprofile v0.121.0 // indirect go.opentelemetry.io/collector/semconv v0.123.0 // indirect go.opentelemetry.io/contrib v1.19.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 go.opentelemetry.io/otel/metric v1.38.0 // indirect go.opentelemetry.io/proto/otlp v1.7.0 // indirect go.uber.org/multierr v1.11.0 // indirect diff --git a/scaletest/bridge/config.go b/scaletest/bridge/config.go new file mode 100644 index 0000000000000..ac7f7460a3aea --- /dev/null +++ b/scaletest/bridge/config.go @@ -0,0 +1,95 @@ +package bridge + +import ( + "golang.org/x/xerrors" + + "github.com/google/uuid" + + "github.com/coder/coder/v2/scaletest/createusers" +) + +type RequestMode string + +const ( + RequestModeBridge RequestMode = "bridge" + RequestModeDirect RequestMode = "direct" +) + +type Config struct { + // Mode determines how requests are made. + // "bridge": Create users in Coder and use their session tokens to make requests through AI Bridge. + // "direct": Make requests directly to UpstreamURL without user creation. + Mode RequestMode `json:"mode"` + + // User is the configuration for the user to create. + // Required in bridge mode. + User createusers.Config `json:"user"` + + // UpstreamURL is the URL to make requests to directly. + // Only used in direct mode. + UpstreamURL string `json:"upstream_url"` + + // DirectToken is the Bearer token for direct mode. + // If not set in direct mode, uses the client's token. + DirectToken string `json:"direct_token"` + + // Provider is the API provider to use: "openai" or "anthropic". + Provider string `json:"provider"` + + // RequestCount is the number of requests to make per runner. + RequestCount int `json:"request_count"` + + // Model is the model to use for requests. + Model string `json:"model"` + + // Stream indicates whether to use streaming requests. + Stream bool `json:"stream"` + + // RequestPayloadSize is the size in bytes of the request payload (user message content). + // If 0, uses default message content. 
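+ // This mirrors the --request-payload-size flag of the bridge scaletest command.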
+ RequestPayloadSize int `json:"request_payload_size"` + + Metrics *Metrics `json:"-"` +} + +func (c Config) Validate() error { + if c.Metrics == nil { + return xerrors.New("metrics must be set") + } + + // Validate mode + if c.Mode != RequestModeBridge && c.Mode != RequestModeDirect { + return xerrors.New("mode must be either 'bridge' or 'direct'") + } + + if c.RequestCount <= 0 { + return xerrors.New("request_count must be greater than 0") + } + if c.Model == "" { + return xerrors.New("model must be set") + } + + // Validate provider + if c.Provider != "openai" && c.Provider != "anthropic" { + return xerrors.New("provider must be either 'openai' or 'anthropic'") + } + + if c.Mode == RequestModeDirect { + // In direct mode, UpstreamURL must be set. + if c.UpstreamURL == "" { + return xerrors.New("upstream_url must be set in direct mode") + } + return nil + } + + // In bridge mode, User config is required. + if c.User.OrganizationID == uuid.Nil { + return xerrors.New("user organization_id must be set in bridge mode") + } + + if err := c.User.Validate(); err != nil { + return xerrors.Errorf("user config: %w", err) + } + + return nil +} diff --git a/scaletest/bridge/local-observability/alloy/config.alloy b/scaletest/bridge/local-observability/alloy/config.alloy new file mode 100644 index 0000000000000..ed40a15d67e9b --- /dev/null +++ b/scaletest/bridge/local-observability/alloy/config.alloy @@ -0,0 +1,91 @@ +// Grafana Alloy configuration to scrape pprof from develop.sh and forward to Pyroscope +// The develop.sh server exposes pprof at /api/v2/debug/pprof/ instead of /debug/pprof/ + +pyroscope.scrape "coderd" { + targets = [ + { + "__address__" = "host.docker.internal:3000", + "service_name" = "coderd", + }, + ] + + authorization { + credentials = "" + type = "Bearer" + } + + forward_to = [pyroscope.write.local.receiver] + + profiling_config { + profile.process_cpu { + enabled = true + delta = true + path = "/api/v2/debug/pprof/profile" + } + profile.memory { + enabled = true + path = "/api/v2/debug/pprof/allocs" + } + profile.goroutine { + enabled = true + path = "/api/v2/debug/pprof/goroutine" + } + profile.block { + enabled = false + path = "/api/v2/debug/pprof/block" + } + profile.mutex { + enabled = false + path = "/api/v2/debug/pprof/mutex" + } + } + + delta_profiling_duration="2s" + scrape_interval = "3s" + scrape_timeout = "10s" +} + +pyroscope.scrape "llmmock" { + targets = [ + { + "__address__" = "host.docker.internal:6061", + "service_name" = "llmmock", + }, + ] + + forward_to = [pyroscope.write.local.receiver] + + profiling_config { + profile.process_cpu { + enabled = true + delta = true + path = "/debug/pprof/profile" + } + profile.memory { + enabled = true + path = "/debug/pprof/allocs" + } + profile.goroutine { + enabled = true + path = "/debug/pprof/goroutine" + } + profile.block { + enabled = false + path = "/debug/pprof/block" + } + profile.mutex { + enabled = false + path = "/debug/pprof/mutex" + } + } + + delta_profiling_duration="2s" + scrape_interval = "3s" + scrape_timeout = "10s" +} + +pyroscope.write "local" { + endpoint { + url = "http://pyroscope:4040/" + } +} diff --git a/scaletest/bridge/local-observability/docker-compose.yml b/scaletest/bridge/local-observability/docker-compose.yml new file mode 100644 index 0000000000000..6e02cc9db768c --- /dev/null +++ b/scaletest/bridge/local-observability/docker-compose.yml @@ -0,0 +1,132 @@ +version: '3.8' + +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "9090:9090" + 
volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--storage.tsdb.retention.time=30d' + extra_hosts: + - "host.docker.internal:host-gateway" + networks: + - observability + restart: unless-stopped + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3100:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + - GF_INSTALL_PLUGINS=pyroscope-datasource,pyroscope-panel + networks: + - observability + depends_on: + - prometheus + - pyroscope + - tempo + - loki + restart: unless-stopped + + pyroscope: + image: grafana/pyroscope:latest + container_name: pyroscope + ports: + - "4040:4040" + volumes: + - ./pyroscope/pyroscope.yml:/etc/pyroscope/pyroscope.yml + - pyroscope-data:/var/lib/pyroscope + command: + - server + - --config=/etc/pyroscope/pyroscope.yml + networks: + - observability + restart: unless-stopped + + grafana-alloy: + image: grafana/alloy:latest + container_name: grafana-alloy + volumes: + - ./alloy/config.alloy:/etc/alloy/config.alloy:ro + command: + - run + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + - /etc/alloy/config.alloy + extra_hosts: + - "host.docker.internal:host-gateway" + networks: + - observability + depends_on: + - pyroscope + restart: unless-stopped + + tempo: + image: grafana/tempo:latest + container_name: tempo + ports: + - "3200:3200" # Tempo HTTP + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + volumes: + - ./tempo/tempo.yml:/etc/tempo/tempo.yml + command: + - -config.file=/etc/tempo/tempo.yml + - -target=all + networks: + - observability + restart: unless-stopped + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "3101:3100" + volumes: + - ./loki/loki.yml:/etc/loki/local-config.yaml + - loki-data:/loki + command: + - -config.file=/etc/loki/local-config.yaml + networks: + - observability + restart: unless-stopped + + promtail: + image: grafana/promtail:latest + container_name: promtail + volumes: + - ./promtail/promtail.yml:/etc/promtail/config.yml:ro + - ./logs:/var/log/coder:ro + command: + - -config.file=/etc/promtail/config.yml + networks: + - observability + depends_on: + - loki + restart: unless-stopped + + +volumes: + prometheus-data: + grafana-data: + pyroscope-data: + loki-data: + +networks: + observability: + driver: bridge diff --git a/scaletest/bridge/local-observability/grafana/provisioning/dashboards/bridge.json b/scaletest/bridge/local-observability/grafana/provisioning/dashboards/bridge.json new file mode 100644 index 0000000000000..ecc40c4fdf1dd --- /dev/null +++ b/scaletest/bridge/local-observability/grafana/provisioning/dashboards/bridge.json @@ -0,0 +1,1991 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + 
"x": 0, + "y": 0 + }, + "id": 6, + "panels": [ + { + "datasource": { + "uid": "prometheus" + }, + "description": "Total user and system CPU time spent in seconds.\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(process_cpu_seconds_total{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "process_cpu_seconds_total-sum(rate)" + } + ], + "title": "process_cpu_seconds_total", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Resident memory size in bytes.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 1, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(process_resident_memory_bytes{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "process_resident_memory_bytes-avg" + } + ], + "title": "process_resident_memory_bytes", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of bytes transmitted by the 
process over the network.\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 5, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(process_network_transmit_bytes_total{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "range": true, + "refId": "process_network_receive_bytes_total-sum(rate)" + } + ], + "title": "process_network_transmit_bytes_total", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of bytes received by the process over the network.\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 4, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(process_network_receive_bytes_total{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "process_network_receive_bytes_total-sum(rate)" + } + ], + "title": "process_network_receive_bytes_total", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of open 
file descriptors.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "avg" + }, + "properties": [] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 3, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(process_open_fds{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "process_open_fds-avg" + } + ], + "title": "process_open_fds", + "type": "timeseries" + } + ], + "title": "Process", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 7, + "panels": [ + { + "datasource": { + "uid": "prometheus" + }, + "description": "The number of idle connections.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "id": 8, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_sql_idle_connections{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_sql_idle_connections-avg" + } + ], + "title": "go_sql_idle_connections", + "type": "timeseries" + }, 
+ { + "datasource": { + "uid": "prometheus" + }, + "description": "The number of connections currently in use.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "id": 9, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_sql_in_use_connections{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_sql_in_use_connections-avg" + } + ], + "title": "go_sql_in_use_connections", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "The total time blocked waiting for a new connection.\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 10, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(go_sql_wait_duration_seconds_total{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "go_sql_wait_duration_seconds_total-sum(rate)" + } + ], + "title": "go_sql_wait_duration_seconds_total", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 11, + "maxDataPoints": 500, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 32 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "selectionMode": "x", + "showValue": "auto", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum by (le) (rate(coderd_db_tx_duration_seconds_bucket{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "format": "heatmap", + "fromExploreMetrics": false, + "refId": "coderd_db_tx_duration_seconds_bucket-heatmap" + } + ], + "title": "coderd_db_tx_duration_seconds_bucket", + "type": "heatmap" + } + ], + "title": "SQL", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 12, + "panels": [ + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of heap bytes allocated and currently in use, same as go_memstats_alloc_bytes. Equals to /memory/classes/heap/objects:bytes.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 16, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_memstats_heap_alloc_bytes{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_memstats_heap_alloc_bytes-avg" + } + ], + "title": "go_memstats_heap_alloc_bytes", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of heap bytes that are in use. 
Equals to /memory/classes/heap/objects:bytes + /memory/classes/heap/unused:bytes\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 17, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_memstats_heap_inuse_bytes{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_memstats_heap_inuse_bytes-avg" + } + ], + "title": "go_memstats_heap_inuse_bytes", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Number of heap bytes waiting to be used. Equals to /memory/classes/heap/released:bytes + /memory/classes/heap/free:bytes.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 18, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_memstats_heap_idle_bytes{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_memstats_heap_idle_bytes-avg" + } + ], + "title": "go_memstats_heap_idle_bytes", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Count of events that have been observed for the base 
metric (A summary of the wall-time pause (stop-the-world) duration in garbage collection cycles.)\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 14, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(go_gc_duration_seconds_count{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "go_gc_duration_seconds_count-sum(rate)" + } + ], + "title": "go_gc_duration_seconds_count", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "A summary of the wall-time pause (stop-the-world) duration in garbage collection cycles.\n\n**Type:** *summary*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 13, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(go_gc_duration_seconds{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "go_gc_duration_seconds-avg" + } + ], + "title": "go_gc_duration_seconds", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Total sum of all observed values for the base metric 
(A summary of the wall-time pause (stop-the-world) duration in garbage collection cycles.)\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 15, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(go_gc_duration_seconds_sum{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "go_gc_duration_seconds_sum-sum(rate)" + } + ], + "title": "go_gc_duration_seconds_sum", + "type": "timeseries" + } + ], + "title": "Garbage Collector", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 19, + "panels": [ + { + "datasource": { + "uid": "prometheus" + }, + "description": "The number of concurrent API requests.\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 20, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(coderd_api_concurrent_requests{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "coderd_api_concurrent_requests-avg" + } + ], + "title": "coderd_api_concurrent_requests", + "type": 
"timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "Cumulative counters for the observation buckets (Latency distribution of requests in seconds.)\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 21, + "maxDataPoints": 500, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 32 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "selectionMode": "x", + "showValue": "auto", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum by (le) (rate(coderd_api_request_latencies_seconds_bucket{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "format": "heatmap", + "fromExploreMetrics": false, + "refId": "coderd_api_request_latencies_seconds_bucket-heatmap" + } + ], + "title": "coderd_api_request_latencies_seconds_bucket", + "type": "heatmap" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "The total number of processed API requests\n\n**Type:** *counter*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 22, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "sum(rate(coderd_api_requests_processed_total{__ignore_usage__=\"\", service=\"coderd\"}[$__rate_interval]))", + "fromExploreMetrics": false, + "legendFormat": "sum(rate)", + "refId": "coderd_api_requests_processed_total-sum(rate)" + } + ], + "title": "coderd_api_requests_processed_total", + "type": "timeseries" + } + ], + "title": "API", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 23, + "panels": [], + "title": "PubSub", + "type": "row" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "The 
time taken to receive a message from a pubsub event channel\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 24, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(coder_pubsub_receive_latency_seconds{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "coder_pubsub_receive_latency_seconds-avg" + } + ], + "title": "coder_pubsub_receive_latency_seconds", + "type": "timeseries" + }, + { + "datasource": { + "uid": "prometheus" + }, + "description": "The time taken to send a message into a pubsub event channel\n\n**Type:** *gauge*\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 9, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 25, + "maxDataPoints": 500, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.0", + "targets": [ + { + "expr": "avg(coder_pubsub_send_latency_seconds{__ignore_usage__=\"\", service=\"coderd\"})", + "fromExploreMetrics": false, + "legendFormat": "avg", + "refId": "coder_pubsub_send_latency_seconds-avg" + } + ], + "title": "coder_pubsub_send_latency_seconds", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "2025-12-11T18:09:40.000Z", + "to": 
"2025-12-11T18:15:04.000Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Bridge Load Testing Metrics", + "uid": "adjfnrq", + "version": 25 +} diff --git a/scaletest/bridge/local-observability/grafana/provisioning/dashboards/dashboard.yml b/scaletest/bridge/local-observability/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 0000000000000..9141a4532b95a --- /dev/null +++ b/scaletest/bridge/local-observability/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards + foldersFromFilesStructure: true diff --git a/scaletest/bridge/local-observability/grafana/provisioning/datasources/prometheus.yml b/scaletest/bridge/local-observability/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000000000..1d15bafa90c9b --- /dev/null +++ b/scaletest/bridge/local-observability/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,31 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: "15s" + + - name: Pyroscope + type: pyroscope-datasource + uid: pyroscope + access: proxy + url: http://pyroscope:4040 + editable: true + jsonData: + keepNestedSets: true + minStep: "15s" + + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki:3100 + editable: true + jsonData: + maxLines: 1000 diff --git a/scaletest/bridge/local-observability/grafana/provisioning/datasources/tempo.yml b/scaletest/bridge/local-observability/grafana/provisioning/datasources/tempo.yml new file mode 100644 index 0000000000000..27856ee6530ef --- /dev/null +++ b/scaletest/bridge/local-observability/grafana/provisioning/datasources/tempo.yml @@ -0,0 +1,38 @@ +apiVersion: 1 + +datasources: + - name: Tempo + uid: tempo + type: tempo + access: proxy + url: http://tempo:3200 + editable: true + jsonData: + httpMethod: GET + + nodeGraph: + enabled: true + serviceMap: + datasourceUid: prometheus + + tracesToMetrics: + datasourceUid: prometheus + tags: + - key: service.name + value: service + - key: job + queries: + - name: P90 latency (spanmetrics) + query: sum(rate(tempo_spanmetrics_latency_bucket[$__interval])) by (le, service) + + # Optional: traces->logs (if you have Loki wired up) + # tracesToLogs: + # datasourceUid: loki + # tags: + # - job + # - instance + # mappedTags: + # - key: service.name + # value: service + # spanStartTimeShift: '1h' + # spanEndTimeShift: '1h' diff --git a/scaletest/bridge/local-observability/logs/.gitignore b/scaletest/bridge/local-observability/logs/.gitignore new file mode 100644 index 0000000000000..397b4a7624e35 --- /dev/null +++ b/scaletest/bridge/local-observability/logs/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/scaletest/bridge/local-observability/loki/loki.yml b/scaletest/bridge/local-observability/loki/loki.yml new file mode 100644 index 0000000000000..023e7337f2a2e --- /dev/null +++ b/scaletest/bridge/local-observability/loki/loki.yml @@ -0,0 +1,26 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + 
+schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + period: 24h diff --git a/scaletest/bridge/local-observability/prometheus/prometheus.yml b/scaletest/bridge/local-observability/prometheus/prometheus.yml new file mode 100644 index 0000000000000..8829c66d01808 --- /dev/null +++ b/scaletest/bridge/local-observability/prometheus/prometheus.yml @@ -0,0 +1,40 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'local-observability' + environment: 'development' + +# Alertmanager configuration (optional, can be added later) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: [] + +# Load rules once and periodically evaluate them (optional) +# rule_files: +# - "alert_rules.yml" + +scrape_configs: + # Scrape Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Coder server from develop.sh + # This scrapes metrics from a running ./scripts/develop.sh server + # Requires: Start develop.sh with --prometheus-enable flag or set CODER_PROMETHEUS_ENABLE=true + - job_name: 'coderd-develop' + static_configs: + - targets: ['host.docker.internal:2118'] + labels: + service: 'coderd' + environment: 'development' + instance: 'develop-sh' + scrape_interval: 8s + scrape_timeout: 5s + metrics_path: '/api/v2/metrics' + + - job_name: 'tempo-metrics-generator' + static_configs: + - targets: ['tempo:3200'] diff --git a/scaletest/bridge/local-observability/promtail/promtail.yml b/scaletest/bridge/local-observability/promtail/promtail.yml new file mode 100644 index 0000000000000..51b9c21b25d4a --- /dev/null +++ b/scaletest/bridge/local-observability/promtail/promtail.yml @@ -0,0 +1,36 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: coder-logs + static_configs: + - targets: + - localhost + labels: + job: coder-logs + __path__: /var/log/coder/* + pipeline_stages: + - regex: + expression: '^(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d+Z|\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})' + - labels: + filename: + - timestamp: + source: timestamp + format: RFC3339Nano + fallback_formats: + - "2006-01-02 15:04:05" + - output: + source: message + relabel_configs: + - source_labels: + - __path__ + target_label: filename + regex: '.*/([^/]+)$' + replacement: '${1}' diff --git a/scaletest/bridge/local-observability/pyroscope/agent.yml b/scaletest/bridge/local-observability/pyroscope/agent.yml new file mode 100644 index 0000000000000..b019c958050fa --- /dev/null +++ b/scaletest/bridge/local-observability/pyroscope/agent.yml @@ -0,0 +1,12 @@ +# Pyroscope agent configuration +# This configures the agent to scrape pprof endpoints + +# Server address to forward profiles to +server-address: http://pyroscope:4040 + +# Scrape configuration +# Note: The Pyroscope agent may need to be configured differently +# depending on the version. This is a basic configuration. 
+ +# Logging +log-level: info diff --git a/scaletest/bridge/local-observability/pyroscope/pyroscope.yml b/scaletest/bridge/local-observability/pyroscope/pyroscope.yml new file mode 100644 index 0000000000000..db141dd01c73b --- /dev/null +++ b/scaletest/bridge/local-observability/pyroscope/pyroscope.yml @@ -0,0 +1,31 @@ +# Pyroscope server configuration +# This configures Pyroscope to collect profiling data from Go processes + +# Storage configuration +storage: + # Path where Pyroscope will store data + path: /var/lib/pyroscope + # Retention period (30 days = 720 hours) + retention: 720h + +# Server configuration +server: + # HTTP API address + api-bind-address: :4040 + # Base URL for the UI (adjust if behind a proxy) + base-url: http://localhost:4040 + +# Ingest configuration +# Pyroscope can ingest from: +# 1. HTTP endpoint (pprof format) - configure scrape targets below +# 2. Direct push from Go applications using pyroscope client +ingestion: + # Maximum number of samples per second + max_ingestion_rate: 10000 + +# Logging +log-level: info + +# Note: Pyroscope server mode doesn't natively support scraping pprof endpoints. +# Grafana Alloy is used to scrape pprof endpoints and forward to Pyroscope. +# See README.md for configuration details. diff --git a/scaletest/bridge/local-observability/tempo/tempo.yml b/scaletest/bridge/local-observability/tempo/tempo.yml new file mode 100644 index 0000000000000..e720ed9e3f3a3 --- /dev/null +++ b/scaletest/bridge/local-observability/tempo/tempo.yml @@ -0,0 +1,69 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + http: + endpoint: "0.0.0.0:4318" + grpc: + endpoint: "0.0.0.0:4317" + +ingester: + # how long a trace can be idle before it's flushed to a block (optional, but nice) + trace_idle_period: 10s + + # you already used this before; fine to keep + max_block_duration: 5m + + lifecycler: + ring: + kvstore: + store: memberlist # use in-memory memberlist ring (good for single-binary/docker) + replication_factor: 1 # single node, so 1 is fine + heartbeat_period: 5s # 👈 this must be > 0 + +metrics_generator: + # WAL for *metrics* generated from traces + storage: + path: /tmp/tempo/generator/wal + + # WAL for *traces* used by local-blocks (needed for TraceQL metrics) + # See MetricSummary/local-blocks notes. + traces_storage: + path: /tmp/tempo/generator/traces + + processor: + # Prometheus span metrics (RED style metrics) + span_metrics: {} + + # Service graph metrics (for service map / node graph) + service_graphs: {} + + # Local blocks enable TraceQL metrics API (/api/metrics/...) + local_blocks: + # Persist blocks so you can query a longer window than just in-memory + flush_to_storage: true + +compactor: + compaction: + # Totally fine to tweak; this is just a sane default for local dev + block_retention: 24h + +storage: + trace: + backend: local + + local: + path: /tmp/tempo/traces + +overrides: + defaults: + # Enable metrics-generator processors for the (default) tenant + # Note: dashes here, underscores in the config block. 
+ metrics_generator: + processors: + - span-metrics + - service-graphs + - local-blocks diff --git a/scaletest/bridge/metrics.go b/scaletest/bridge/metrics.go new file mode 100644 index 0000000000000..25a35f3e52bb4 --- /dev/null +++ b/scaletest/bridge/metrics.go @@ -0,0 +1,72 @@ +package bridge + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +type Metrics struct { + bridgeErrors *prometheus.CounterVec + bridgeRequests *prometheus.CounterVec + bridgeDuration prometheus.Histogram + bridgeTokensTotal *prometheus.CounterVec +} + +func NewMetrics(reg prometheus.Registerer) *Metrics { + if reg == nil { + reg = prometheus.DefaultRegisterer + } + + errors := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: "scaletest", + Name: "bridge_errors_total", + Help: "Total number of bridge errors", + }, []string{"action"}) + + requests := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: "scaletest", + Name: "bridge_requests_total", + Help: "Total number of bridge requests", + }, []string{"status"}) + + duration := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "scaletest", + Name: "bridge_request_duration_seconds", + Help: "Duration of bridge requests in seconds", + Buckets: prometheus.DefBuckets, + }) + + tokens := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: "scaletest", + Name: "bridge_response_tokens_total", + Help: "Total number of tokens in bridge responses", + }, []string{"type"}) + + reg.MustRegister(errors, requests, duration, tokens) + + return &Metrics{ + bridgeErrors: errors, + bridgeRequests: requests, + bridgeDuration: duration, + bridgeTokensTotal: tokens, + } +} + +func (m *Metrics) AddError(action string) { + m.bridgeErrors.WithLabelValues(action).Inc() +} + +func (m *Metrics) AddRequest(status string) { + m.bridgeRequests.WithLabelValues(status).Inc() +} + +func (m *Metrics) ObserveDuration(duration float64) { + m.bridgeDuration.Observe(duration) +} + +func (m *Metrics) AddTokens(tokenType string, count int64) { + m.bridgeTokensTotal.WithLabelValues(tokenType).Add(float64(count)) +} diff --git a/scaletest/bridge/run.go b/scaletest/bridge/run.go new file mode 100644 index 0000000000000..2a264d6c3654a --- /dev/null +++ b/scaletest/bridge/run.go @@ -0,0 +1,482 @@ +package bridge + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" + + "golang.org/x/xerrors" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "go.opentelemetry.io/otel/attribute" + semconv "go.opentelemetry.io/otel/semconv/v1.14.0" + "go.opentelemetry.io/otel/semconv/v1.14.0/httpconv" + "go.opentelemetry.io/otel/trace" + + "github.com/coder/coder/v2/coderd/tracing" + "github.com/coder/coder/v2/codersdk" + "github.com/coder/coder/v2/scaletest/createusers" + "github.com/coder/coder/v2/scaletest/harness" + "github.com/coder/coder/v2/scaletest/loadtestutil" + "github.com/coder/quartz" +) + +type ( + tracingContextKey struct{} + tracingContext struct { + provider string + model string + stream bool + requestNum int + mode RequestMode + } +) + +type tracingTransport struct { + cfg Config + underlying http.RoundTripper +} + +func newTracingTransport(cfg Config, underlying http.RoundTripper) *tracingTransport { + if underlying == nil { + underlying = http.DefaultTransport + } + return &tracingTransport{ + cfg: cfg, + underlying: 
otelhttp.NewTransport(underlying), + } +} + +func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error) { + aibridgeCtx, hasAIBridgeCtx := req.Context().Value(tracingContextKey{}).(tracingContext) + + resp, err := t.underlying.RoundTrip(req) + + if hasAIBridgeCtx { + ctx := req.Context() + if resp != nil && resp.Request != nil { + ctx = resp.Request.Context() + } + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes( + attribute.String("aibridge.provider", aibridgeCtx.provider), + attribute.String("aibridge.model", aibridgeCtx.model), + attribute.Bool("aibridge.stream", aibridgeCtx.stream), + attribute.Int("aibridge.request_num", aibridgeCtx.requestNum), + attribute.String("aibridge.mode", string(aibridgeCtx.mode)), + ) + } + } + + return resp, err +} + +type Runner struct { + client *codersdk.Client + cfg Config + + createUserRunner *createusers.Runner + + clock quartz.Clock + httpClient *http.Client + + requestCount int64 + successCount int64 + failureCount int64 + totalDuration time.Duration + totalTokens int64 +} + +func NewRunner(client *codersdk.Client, cfg Config) *Runner { + return &Runner{ + client: client, + cfg: cfg, + clock: quartz.NewReal(), + httpClient: &http.Client{ + Timeout: 30 * time.Second, + Transport: newTracingTransport(cfg, http.DefaultTransport), + }, + } +} + +func (r *Runner) WithClock(clock quartz.Clock) *Runner { + r.clock = clock + return r +} + +var ( + _ harness.Runnable = &Runner{} + _ harness.Cleanable = &Runner{} + _ harness.Collectable = &Runner{} +) + +func (r *Runner) Run(ctx context.Context, id string, logs io.Writer) error { + ctx, span := tracing.StartSpan(ctx) + defer span.End() + + logs = loadtestutil.NewSyncWriter(logs) + logger := slog.Make(sloghuman.Sink(logs)).Leveled(slog.LevelDebug) + + var token string + var requestURL string + + if r.cfg.Mode == RequestModeDirect { + // Direct mode: skip user creation, use upstream URL directly + requestURL = r.cfg.UpstreamURL + if r.cfg.DirectToken != "" { + token = r.cfg.DirectToken + } else if r.client.SessionToken() != "" { + token = r.client.SessionToken() + } + logger.Info(ctx, "bridge runner in direct mode", slog.F("url", requestURL)) + } else { + // Bridge mode: create user and use AI Bridge endpoint + r.client.SetLogger(logger) + r.client.SetLogBodies(true) + + r.createUserRunner = createusers.NewRunner(r.client, r.cfg.User) + newUserAndToken, err := r.createUserRunner.RunReturningUser(ctx, id, logs) + if err != nil { + r.cfg.Metrics.AddError("create_user") + return xerrors.Errorf("create user: %w", err) + } + newUser := newUserAndToken.User + token = newUserAndToken.SessionToken + + logger.Info(ctx, "runner user created", slog.F("username", newUser.Username), slog.F("user_id", newUser.ID.String())) + + // Construct AI Bridge URL based on provider + if r.cfg.Provider == "anthropic" { + requestURL = fmt.Sprintf("%s/api/v2/aibridge/anthropic/v1/messages", r.client.URL) + } else { + requestURL = fmt.Sprintf("%s/api/v2/aibridge/openai/v1/chat/completions", r.client.URL) + } + logger.Info(ctx, "bridge runner in bridge mode", slog.F("url", requestURL), slog.F("provider", r.cfg.Provider)) + } + + requestCount := r.cfg.RequestCount + if requestCount <= 0 { + requestCount = 1 + } + model := r.cfg.Model + if model == "" { + model = "gpt-4" + } + + logger.Info(ctx, "bridge runner is ready", + slog.F("request_count", requestCount), + slog.F("model", model), + slog.F("stream", r.cfg.Stream), + ) + + for i := 0; i < requestCount; i++ { + if err := 
r.makeRequest(ctx, logger, requestURL, token, model, i); err != nil { + logger.Warn(ctx, "bridge request failed", + slog.F("request_num", i+1), + slog.F("error_type", "request_failed"), + slog.Error(err), + ) + r.cfg.Metrics.AddError("request") + r.cfg.Metrics.AddRequest("failure") + r.failureCount++ + + // Continue making requests even if one fails + continue + } + r.successCount++ + r.cfg.Metrics.AddRequest("success") + r.requestCount++ + } + + logger.Info(ctx, "bridge runner completed", + slog.F("total_requests", r.requestCount), + slog.F("success", r.successCount), + slog.F("failure", r.failureCount), + ) + + // Fail the run if any request failed + if r.failureCount > 0 { + return xerrors.Errorf("bridge runner failed: %d out of %d requests failed", r.failureCount, requestCount) + } + + return nil +} + +func (r *Runner) makeRequest(ctx context.Context, logger slog.Logger, url, token, model string, requestNum int) error { + start := r.clock.Now() + + ctx = context.WithValue(ctx, tracingContextKey{}, tracingContext{ + provider: r.cfg.Provider, + model: model, + stream: r.cfg.Stream, + requestNum: requestNum + 1, + mode: r.cfg.Mode, + }) + + var content string + if r.cfg.RequestPayloadSize > 0 { + pattern := "x" + repeated := strings.Repeat(pattern, r.cfg.RequestPayloadSize) + content = repeated[:r.cfg.RequestPayloadSize] + } else { + content = fmt.Sprintf("Hello, this is test request #%d from the bridge load generator.", requestNum+1) + } + + newUserMessage := map[string]string{ + "role": "user", + "content": content, + } + messages := make([]map[string]string, 0) + messages = append(messages, newUserMessage) + + var reqBody map[string]interface{} + if r.cfg.Provider == "anthropic" { + anthropicMessages := make([]map[string]interface{}, 0, len(messages)) + for _, msg := range messages { + anthropicMessages = append(anthropicMessages, map[string]interface{}{ + "role": msg["role"], + "content": []map[string]string{ + { + "type": "text", + "text": msg["content"], + }, + }, + }) + } + reqBody = map[string]interface{}{ + "model": model, + "messages": anthropicMessages, + "max_tokens": 1024, + "stream": r.cfg.Stream, + } + } else { + reqBody = map[string]interface{}{ + "model": model, + "messages": messages, + "stream": r.cfg.Stream, + } + } + + bodyBytes, err := json.Marshal(reqBody) + if err != nil { + return xerrors.Errorf("marshal request body: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(bodyBytes)) + if err != nil { + return xerrors.Errorf("create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + if token != "" { + req.Header.Set("Authorization", "Bearer "+token) + } + + logger.Debug(ctx, "making bridge request", + slog.F("url", url), + slog.F("request_num", requestNum+1), + slog.F("model", model), + ) + + resp, err := r.httpClient.Do(req) + if err != nil { + span := trace.SpanFromContext(req.Context()) + if span.IsRecording() { + span.RecordError(err) + } + logger.Warn(ctx, "request failed during execution", + slog.F("request_num", requestNum+1), + slog.Error(err), + ) + return xerrors.Errorf("execute request: %w", err) + } + defer resp.Body.Close() + + span := trace.SpanFromContext(req.Context()) + if span.IsRecording() { + span.SetAttributes(semconv.HTTPStatusCodeKey.Int(resp.StatusCode)) + span.SetStatus(httpconv.ClientStatus(resp.StatusCode)) + } + + duration := r.clock.Since(start) + r.totalDuration += duration + r.cfg.Metrics.ObserveDuration(duration.Seconds()) + + if resp.StatusCode != 
http.StatusOK { + body, _ := io.ReadAll(resp.Body) + err := xerrors.Errorf("request failed with status %d: %s", resp.StatusCode, string(body)) + span.RecordError(err) + return err + } + + if r.cfg.Stream { + err := r.handleStreamingResponse(ctx, logger, resp) + if err != nil { + span.RecordError(err) + return err + } + return nil + } + + return r.handleNonStreamingResponse(ctx, logger, resp, requestNum) +} + +func (r *Runner) handleNonStreamingResponse(ctx context.Context, logger slog.Logger, resp *http.Response, requestNum int) error { + if r.cfg.Provider == "anthropic" { + return r.handleAnthropicResponse(ctx, logger, resp, requestNum) + } + return r.handleOpenAIResponse(ctx, logger, resp, requestNum) +} + +func (r *Runner) handleOpenAIResponse(ctx context.Context, logger slog.Logger, resp *http.Response, _ int) error { + var response struct { + ID string `json:"id"` + Model string `json:"model"` + Choices []struct { + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` + Usage struct { + PromptTokens int `json:"prompt_tokens"` + CompletionTokens int `json:"completion_tokens"` + TotalTokens int `json:"total_tokens"` + } `json:"usage"` + } + + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + return xerrors.Errorf("decode response: %w", err) + } + + var assistantContent string + if len(response.Choices) > 0 { + assistantContent = response.Choices[0].Message.Content + logger.Debug(ctx, "received response", + slog.F("response_id", response.ID), + slog.F("content_length", len(assistantContent)), + ) + } + + if response.Usage.TotalTokens > 0 { + r.totalTokens += int64(response.Usage.TotalTokens) + r.cfg.Metrics.AddTokens("input", int64(response.Usage.PromptTokens)) + r.cfg.Metrics.AddTokens("output", int64(response.Usage.CompletionTokens)) + } + + return nil +} + +func (r *Runner) handleAnthropicResponse(ctx context.Context, logger slog.Logger, resp *http.Response, _ int) error { + var response struct { + ID string `json:"id"` + Model string `json:"model"` + Content []struct { + Type string `json:"type"` + Text string `json:"text"` + } `json:"content"` + Usage struct { + InputTokens int `json:"input_tokens"` + OutputTokens int `json:"output_tokens"` + } `json:"usage"` + } + + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + return xerrors.Errorf("decode response: %w", err) + } + + var assistantContent string + if len(response.Content) > 0 { + assistantContent = response.Content[0].Text + logger.Debug(ctx, "received response", + slog.F("response_id", response.ID), + slog.F("content_length", len(assistantContent)), + ) + } + + totalTokens := response.Usage.InputTokens + response.Usage.OutputTokens + if totalTokens > 0 { + r.totalTokens += int64(totalTokens) + r.cfg.Metrics.AddTokens("input", int64(response.Usage.InputTokens)) + r.cfg.Metrics.AddTokens("output", int64(response.Usage.OutputTokens)) + } + + return nil +} + +func (*Runner) handleStreamingResponse(ctx context.Context, logger slog.Logger, resp *http.Response) error { + buf := make([]byte, 4096) + totalRead := 0 + for { + // Check for context cancellation before each read + if ctx.Err() != nil { + logger.Warn(ctx, "streaming response canceled", + slog.F("bytes_read", totalRead), + slog.Error(ctx.Err()), + ) + return xerrors.Errorf("stream canceled: %w", ctx.Err()) + } + + n, err := resp.Body.Read(buf) + if n > 0 { + totalRead += n + } + if err == io.EOF { + break + } + if err != nil { + // Check if error is due to context cancellation + if 
xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) { + logger.Warn(ctx, "streaming response read canceled", + slog.F("bytes_read", totalRead), + slog.Error(err), + ) + return xerrors.Errorf("stream read canceled: %w", err) + } + logger.Warn(ctx, "streaming response read error", + slog.F("bytes_read", totalRead), + slog.Error(err), + ) + return xerrors.Errorf("read stream: %w", err) + } + } + + logger.Debug(ctx, "received streaming response", slog.F("bytes_read", totalRead)) + return nil +} + +func (r *Runner) Cleanup(ctx context.Context, id string, logs io.Writer) error { + // Only cleanup user in bridge mode + if r.cfg.Mode == RequestModeBridge && r.createUserRunner != nil { + _, _ = fmt.Fprintln(logs, "Cleaning up user...") + if err := r.createUserRunner.Cleanup(ctx, id, logs); err != nil { + return xerrors.Errorf("cleanup user: %w", err) + } + } + + return nil +} + +func (r *Runner) GetMetrics() map[string]any { + avgDuration := time.Duration(0) + if r.requestCount > 0 { + avgDuration = r.totalDuration / time.Duration(r.requestCount) + } + + return map[string]any{ + "request_count": r.requestCount, + "success_count": r.successCount, + "failure_count": r.failureCount, + "total_duration": r.totalDuration.String(), + "avg_duration": avgDuration.String(), + "total_tokens": r.totalTokens, + } +} diff --git a/scaletest/llmmock/server.go b/scaletest/llmmock/server.go new file mode 100644 index 0000000000000..2238ec7fd6ba9 --- /dev/null +++ b/scaletest/llmmock/server.go @@ -0,0 +1,529 @@ +package llmmock + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net" + "net/http" + "strings" + "time" + + "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + semconv "go.opentelemetry.io/otel/semconv/v1.14.0" + "go.opentelemetry.io/otel/semconv/v1.14.0/httpconv" + "go.opentelemetry.io/otel/semconv/v1.14.0/netconv" + "go.opentelemetry.io/otel/trace" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/v2/coderd/pproflabel" + "github.com/coder/coder/v2/coderd/tracing" +) + +// Server wraps the LLM mock server and provides an HTTP API to retrieve requests. 
+type Server struct { + httpServer *http.Server + httpListener net.Listener + logger slog.Logger + + address string + artificialLatency time.Duration + responsePayloadSize int + + tracerProvider trace.TracerProvider + closeTracing func(context.Context) error +} + +type Config struct { + Address string + Logger slog.Logger + ArtificialLatency time.Duration + ResponsePayloadSize int + + PprofEnable bool + PprofAddress string + + TraceEnable bool +} + +type llmRequest struct { + Model string `json:"model"` + Stream bool `json:"stream,omitempty"` +} + +type openAIMessage struct { + Role string `json:"role"` + Content string `json:"content"` +} + +type openAIResponse struct { + ID string `json:"id"` + Object string `json:"object"` + Created int64 `json:"created"` + Model string `json:"model"` + Choices []struct { + Index int `json:"index"` + Message openAIMessage `json:"message"` + FinishReason string `json:"finish_reason"` + } `json:"choices"` + Usage struct { + PromptTokens int `json:"prompt_tokens"` + CompletionTokens int `json:"completion_tokens"` + TotalTokens int `json:"total_tokens"` + } `json:"usage"` +} + +type anthropicResponse struct { + ID string `json:"id"` + Type string `json:"type"` + Role string `json:"role"` + Content []struct { + Type string `json:"type"` + Text string `json:"text"` + } `json:"content"` + Model string `json:"model"` + StopReason string `json:"stop_reason"` + StopSequence *string `json:"stop_sequence"` + Usage struct { + InputTokens int `json:"input_tokens"` + OutputTokens int `json:"output_tokens"` + } `json:"usage"` +} + +func (s *Server) Start(ctx context.Context, cfg Config) error { + s.address = cfg.Address + s.logger = cfg.Logger + s.artificialLatency = cfg.ArtificialLatency + s.responsePayloadSize = cfg.ResponsePayloadSize + + if cfg.TraceEnable { + otel.SetTextMapPropagator( + propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + ), + ) + + tracerProvider, closeTracing, err := tracing.TracerProvider(ctx, "llm-mock", tracing.TracerOpts{ + Default: cfg.TraceEnable, + }) + if err != nil { + s.logger.Warn(ctx, "failed to initialize tracing", slog.Error(err)) + } else { + s.tracerProvider = tracerProvider + s.closeTracing = closeTracing + } + } + + if err := s.startAPIServer(ctx); err != nil { + return xerrors.Errorf("start API server: %w", err) + } + + return nil +} + +func (s *Server) Stop() error { + if s.httpServer != nil { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := s.httpServer.Shutdown(shutdownCtx); err != nil { + return xerrors.Errorf("shutdown HTTP server: %w", err) + } + } + if s.closeTracing != nil { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := s.closeTracing(shutdownCtx); err != nil { + s.logger.Warn(shutdownCtx, "failed to close tracing", slog.Error(err)) + } + } + return nil +} + +func (s *Server) APIAddress() string { + return fmt.Sprintf("http://%s", s.address) +} + +func (s *Server) startAPIServer(ctx context.Context) error { + mux := http.NewServeMux() + + mux.HandleFunc("POST /v1/chat/completions", s.handleOpenAI) + mux.HandleFunc("POST /v1/messages", s.handleAnthropic) + + var handler http.Handler = mux + if s.tracerProvider != nil { + handler = s.tracingMiddleware(handler) + } + + s.httpServer = &http.Server{ + Handler: handler, + ReadHeaderTimeout: 10 * time.Second, + } + + listener, err := net.Listen("tcp", s.address) + if err != nil { + return 
xerrors.Errorf("listen on %s: %w", s.address, err) + } + s.httpListener = listener + + pproflabel.Go(ctx, pproflabel.Service("llm-mock"), func(ctx context.Context) { + if err := s.httpServer.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) { + s.logger.Error(ctx, "http API server error", slog.Error(err)) + } + }) + + return nil +} + +func (s *Server) handleOpenAI(w http.ResponseWriter, r *http.Request) { + pproflabel.Do(r.Context(), pproflabel.Service("llm-mock"), func(ctx context.Context) { + s.handleOpenAIWithLabels(w, r.WithContext(ctx)) + }) +} + +func (s *Server) handleOpenAIWithLabels(w http.ResponseWriter, r *http.Request) { + s.logger.Debug(r.Context(), "handling OpenAI request") + defer s.logger.Debug(r.Context(), "handled OpenAI request") + + ctx := r.Context() + requestID := uuid.New() + now := time.Now() + + var req llmRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + s.logger.Error(ctx, "failed to parse OpenAI request", slog.Error(err)) + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + if s.artificialLatency > 0 { + time.Sleep(s.artificialLatency) + } + + var resp openAIResponse + resp.ID = fmt.Sprintf("chatcmpl-%s", requestID.String()[:8]) + resp.Object = "chat.completion" + resp.Created = now.Unix() + resp.Model = req.Model + + var responseContent string + if s.responsePayloadSize > 0 { + pattern := "x" + repeated := strings.Repeat(pattern, s.responsePayloadSize) + responseContent = repeated[:s.responsePayloadSize] + } else { + responseContent = "This is a mock response from OpenAI." + } + + resp.Choices = []struct { + Index int `json:"index"` + Message openAIMessage `json:"message"` + FinishReason string `json:"finish_reason"` + }{ + { + Index: 0, + Message: openAIMessage{ + Role: "assistant", + Content: responseContent, + }, + FinishReason: "stop", + }, + } + + resp.Usage.PromptTokens = 10 + resp.Usage.CompletionTokens = 5 + resp.Usage.TotalTokens = 15 + + responseBody, _ := json.Marshal(resp) + + if req.Stream { + s.sendOpenAIStream(ctx, w, resp) + } else { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if _, err := w.Write(responseBody); err != nil { + s.logger.Error(ctx, "failed to write OpenAI response", + slog.F("request_id", requestID), + slog.Error(err), + slog.F("error_type", "write_error"), + slog.F("likely_cause", "network_error"), + ) + } + } +} + +func (s *Server) handleAnthropic(w http.ResponseWriter, r *http.Request) { + pproflabel.Do(r.Context(), pproflabel.Service("llm-mock"), func(ctx context.Context) { + s.handleAnthropicWithLabels(w, r.WithContext(ctx)) + }) +} + +func (s *Server) handleAnthropicWithLabels(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + requestID := uuid.New() + + var req llmRequest + + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + s.logger.Error(ctx, "failed to parse LLM request", slog.Error(err)) + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + if s.artificialLatency > 0 { + time.Sleep(s.artificialLatency) + } + + var resp anthropicResponse + resp.ID = fmt.Sprintf("msg_%s", requestID.String()[:8]) + resp.Type = "message" + resp.Role = "assistant" + + var responseText string + if s.responsePayloadSize > 0 { + pattern := "x" + repeated := strings.Repeat(pattern, s.responsePayloadSize) + responseText = repeated[:s.responsePayloadSize] + } else { + responseText = "This is a mock response from Anthropic." 
+ } + + resp.Content = []struct { + Type string `json:"type"` + Text string `json:"text"` + }{ + { + Type: "text", + Text: responseText, + }, + } + resp.Model = req.Model + resp.StopReason = "end_turn" + resp.Usage.InputTokens = 10 + resp.Usage.OutputTokens = 5 + + responseBody, _ := json.Marshal(resp) + + if req.Stream { + s.sendAnthropicStream(ctx, w, resp) + } else { + w.Header().Set("Content-Type", "application/json") + w.Header().Set("anthropic-version", "2023-06-01") + w.WriteHeader(http.StatusOK) + if _, err := w.Write(responseBody); err != nil { + s.logger.Error(ctx, "failed to write Anthropic response", + slog.F("request_id", requestID), + slog.Error(err), + slog.F("error_type", "write_error"), + slog.F("likely_cause", "network_error"), + ) + } + } +} + +func (s *Server) sendOpenAIStream(ctx context.Context, w http.ResponseWriter, resp openAIResponse) { + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.Header().Set("Connection", "keep-alive") + w.WriteHeader(http.StatusOK) + + // Helper function to write with error checking + writeChunk := func(data string) bool { + if _, err := fmt.Fprintf(w, "%s", data); err != nil { + s.logger.Error(ctx, "failed to write OpenAI stream chunk", + slog.F("response_id", resp.ID), + slog.Error(err), + slog.F("error_type", "write_error"), + slog.F("likely_cause", "network_error"), + ) + return false + } + return true + } + + // Send initial chunk + chunk := map[string]interface{}{ + "id": resp.ID, + "object": "chat.completion.chunk", + "created": resp.Created, + "model": resp.Model, + "choices": []map[string]interface{}{ + { + "index": 0, + "delta": map[string]interface{}{ + "role": "assistant", + "content": resp.Choices[0].Message.Content, + }, + "finish_reason": nil, + }, + }, + } + chunkBytes, _ := json.Marshal(chunk) + if !writeChunk(fmt.Sprintf("data: %s\n\n", chunkBytes)) { + return + } + + // Send final chunk + finalChunk := map[string]interface{}{ + "id": resp.ID, + "object": "chat.completion.chunk", + "created": resp.Created, + "model": resp.Model, + "choices": []map[string]interface{}{ + { + "index": 0, + "delta": map[string]interface{}{}, + "finish_reason": resp.Choices[0].FinishReason, + }, + }, + } + finalChunkBytes, _ := json.Marshal(finalChunk) + if !writeChunk(fmt.Sprintf("data: %s\n\n", finalChunkBytes)) { + return + } + writeChunk("data: [DONE]\n\n") +} + +func (s *Server) sendAnthropicStream(ctx context.Context, w http.ResponseWriter, resp anthropicResponse) { + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.Header().Set("Connection", "keep-alive") + w.Header().Set("anthropic-version", "2023-06-01") + w.WriteHeader(http.StatusOK) + + writeChunk := func(data string) bool { + if _, err := fmt.Fprintf(w, "%s", data); err != nil { + s.logger.Error(ctx, "failed to write Anthropic stream chunk", + slog.F("response_id", resp.ID), + slog.Error(err), + slog.F("error_type", "write_error"), + slog.F("likely_cause", "network_error"), + ) + return false + } + return true + } + + startEvent := map[string]interface{}{ + "type": "message_start", + "message": map[string]interface{}{ + "id": resp.ID, + "type": resp.Type, + "role": resp.Role, + "model": resp.Model, + }, + } + startBytes, _ := json.Marshal(startEvent) + if !writeChunk(fmt.Sprintf("data: %s\n\n", startBytes)) { + return + } + + // Send content_block_start event + contentStartEvent := map[string]interface{}{ + "type": "content_block_start", + "index": 0, + "content_block": 
map[string]interface{}{ + "type": "text", + "text": resp.Content[0].Text, + }, + } + contentStartBytes, _ := json.Marshal(contentStartEvent) + if !writeChunk(fmt.Sprintf("data: %s\n\n", contentStartBytes)) { + return + } + + // Send content_block_delta event + deltaEvent := map[string]interface{}{ + "type": "content_block_delta", + "index": 0, + "delta": map[string]interface{}{ + "type": "text_delta", + "text": resp.Content[0].Text, + }, + } + deltaBytes, _ := json.Marshal(deltaEvent) + if !writeChunk(fmt.Sprintf("data: %s\n\n", deltaBytes)) { + return + } + + // Send content_block_stop event + contentStopEvent := map[string]interface{}{ + "type": "content_block_stop", + "index": 0, + } + contentStopBytes, _ := json.Marshal(contentStopEvent) + if !writeChunk(fmt.Sprintf("data: %s\n\n", contentStopBytes)) { + return + } + + // Send message_delta event + deltaMsgEvent := map[string]interface{}{ + "type": "message_delta", + "delta": map[string]interface{}{ + "stop_reason": resp.StopReason, + "stop_sequence": resp.StopSequence, + }, + "usage": resp.Usage, + } + deltaMsgBytes, _ := json.Marshal(deltaMsgEvent) + if !writeChunk(fmt.Sprintf("data: %s\n\n", deltaMsgBytes)) { + return + } + + // Send message_stop event + stopEvent := map[string]interface{}{ + "type": "message_stop", + } + stopBytes, _ := json.Marshal(stopEvent) + writeChunk(fmt.Sprintf("data: %s\n\n", stopBytes)) +} + +func (s *Server) tracingMiddleware(next http.Handler) http.Handler { + tracer := s.tracerProvider.Tracer("llm-mock") + + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + // Wrap response writer with StatusWriter for tracing + sw := &tracing.StatusWriter{ResponseWriter: rw} + + // Extract trace context from headers + propagator := otel.GetTextMapPropagator() + hc := propagation.HeaderCarrier(r.Header) + ctx := propagator.Extract(r.Context(), hc) + + // Start span with initial name (will be updated after handler) + ctx, span := tracer.Start(ctx, fmt.Sprintf("%s %s", r.Method, r.RequestURI)) + defer span.End() + r = r.WithContext(ctx) + + // Inject trace context into response headers + if span.SpanContext().HasTraceID() && span.SpanContext().HasSpanID() { + rw.Header().Set("X-Trace-ID", span.SpanContext().TraceID().String()) + rw.Header().Set("X-Span-ID", span.SpanContext().SpanID().String()) + + hc := propagation.HeaderCarrier(rw.Header()) + propagator.Inject(ctx, hc) + } + + // Execute the handler + next.ServeHTTP(sw, r) + + // Update span with final route and response information + route := r.URL.Path + span.SetName(fmt.Sprintf("%s %s", r.Method, route)) + span.SetAttributes(netconv.Transport("tcp")) + span.SetAttributes(httpconv.ServerRequest("llm-mock", r)...) + span.SetAttributes(semconv.HTTPRouteKey.String(route)) + + status := sw.Status + if status == 0 { + status = http.StatusOK + } + span.SetAttributes(semconv.HTTPStatusCodeKey.Int(status)) + span.SetStatus(httpconv.ServerStatus(status)) + }) +} diff --git a/scaletest/llmmock/types.go b/scaletest/llmmock/types.go new file mode 100644 index 0000000000000..f1f4e1772153a --- /dev/null +++ b/scaletest/llmmock/types.go @@ -0,0 +1,47 @@ +package llmmock + +import ( + "time" + + "github.com/google/uuid" +) + +// Provider represents the LLM provider type. +type Provider string + +const ( + ProviderOpenAI Provider = "openai" + ProviderAnthropic Provider = "anthropic" +) + +// RequestSummary contains metadata about an intercepted LLM API request. 
+type RequestSummary struct { + ID uuid.UUID `json:"id"` + Timestamp time.Time `json:"timestamp"` + Provider Provider `json:"provider"` + Model string `json:"model"` + UserID string `json:"user_id,omitempty"` + Stream bool `json:"stream"` + // Request body as JSON string for reference + RequestBody string `json:"request_body,omitempty"` +} + +// ResponseSummary contains metadata about an LLM API response. +type ResponseSummary struct { + RequestID uuid.UUID `json:"request_id"` + Timestamp time.Time `json:"timestamp"` + Status int `json:"status"` + Stream bool `json:"stream"` + FinishReason string `json:"finish_reason,omitempty"` // OpenAI: finish_reason, Anthropic: stop_reason + PromptTokens int `json:"prompt_tokens,omitempty"` + OutputTokens int `json:"output_tokens,omitempty"` // OpenAI: completion_tokens, Anthropic: output_tokens + TotalTokens int `json:"total_tokens,omitempty"` + // Response body as JSON string for reference (non-streaming) or first chunk (streaming) + ResponseBody string `json:"response_body,omitempty"` +} + +// RequestRecord combines request and response information. +type RequestRecord struct { + Request RequestSummary `json:"request"` + Response *ResponseSummary `json:"response,omitempty"` +}