diff --git a/go.mod b/go.mod index 41b3801..951ad13 100644 --- a/go.mod +++ b/go.mod @@ -3,3 +3,5 @@ module github.com/step-security/dev-machine-guard go 1.24 require golang.org/x/sys v0.33.0 + +require github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 9e6ab4c..d70ecf3 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= diff --git a/internal/model/model.go b/internal/model/model.go index 55bd260..6cdd5d9 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -20,13 +20,13 @@ type ScanResult struct { PythonPkgManagers []PkgManager `json:"python_package_managers"` PythonPackages []PythonPackage `json:"python_packages"` PythonProjects []ProjectInfo `json:"python_projects"` - SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"` - SystemPackages []SystemPackage `json:"system_packages"` - SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"` - SnapPackages []SystemPackage `json:"snap_packages"` - FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"` - FlatpakPackages []SystemPackage `json:"flatpak_packages"` - Summary Summary `json:"summary"` + SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"` + SystemPackages []SystemPackage `json:"system_packages"` + SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"` + SnapPackages []SystemPackage `json:"snap_packages"` + FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"` + FlatpakPackages []SystemPackage `json:"flatpak_packages"` + Summary Summary `json:"summary"` } type Device struct { @@ -153,7 +153,7 @@ type PythonPackage struct { // Unlike BrewScanResult (which sends raw base64), this sends pre-parsed packages // since syspkg.go already handles the format-specific parsing edge cases. type SystemPackageScanResult struct { - ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak" + ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak" PackageManager *PkgManager `json:"package_manager,omitempty"` Packages []SystemPackage `json:"packages"` PackagesCount int `json:"packages_count"` diff --git a/internal/telemetry/run_status.go b/internal/telemetry/run_status.go new file mode 100644 index 0000000..dce7962 --- /dev/null +++ b/internal/telemetry/run_status.go @@ -0,0 +1,136 @@ +package telemetry + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "runtime" + "time" + + "github.com/step-security/dev-machine-guard/internal/buildinfo" + "github.com/step-security/dev-machine-guard/internal/config" + "github.com/step-security/dev-machine-guard/internal/progress" +) + +const ( + runStatusStarted = "started" + runStatusFailed = "failed" + runStatusCancelled = "cancelled by user" + runStatusMaxErrorChars = 2000 + runStatusHTTPTimeout = 3 * time.Second + + // Retry counts per status. "started" is load-bearing for attempt + // visibility — we retry harder so a single transient network blip + // does not lose the signal that the run was attempted. "failed" + // fires during shutdown, so one retry covers the common case. 
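+	// Worst case, with every attempt timing out, the "started" path costs
+	// about 3 × 3s (per-attempt timeout) + 2 × 500ms (backoff) ≈ 10s.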
+ runStatusStartedAttempts = 3 + runStatusFailedAttempts = 2 + runStatusRetryBackoff = 500 * time.Millisecond +) + +// reportRunStatus POSTs a lifecycle transition to the backend with a small +// retry budget. Never returns an error: running the scan is the priority. +// +// status must be "started" or "failed". Passing "succeeded" (or any other +// value) is a defensive no-op — success is written by the backend worker +// after it persists the uploaded telemetry. +func reportRunStatus(ctx context.Context, log *progress.Logger, + executionID, deviceID, status, errMsg string) { + + if !config.IsEnterpriseMode() { + return + } + if status != runStatusStarted && status != runStatusFailed { + return + } + if executionID == "" { + return + } + + payload := map[string]string{ + "execution_id": executionID, + "device_id": deviceID, + "status": status, + "agent_version": buildinfo.Version, + "platform": runtime.GOOS, + } + if status == runStatusFailed { + if errMsg == "" { + // Backend rejects a "failed" report with no error_message. + errMsg = "unspecified failure" + } + if len(errMsg) > runStatusMaxErrorChars { + errMsg = errMsg[:runStatusMaxErrorChars] + } + payload["error_message"] = errMsg + } + + body, err := json.Marshal(payload) + if err != nil { + log.Progress("run-status: marshal error: %v", err) + return + } + + endpoint := fmt.Sprintf("%s/v1/%s/developer-mdm-agent/telemetry/run-status", + config.APIEndpoint, config.CustomerID) + + attempts := runStatusFailedAttempts + if status == runStatusStarted { + attempts = runStatusStartedAttempts + } + + for i := 1; i <= attempts; i++ { + if i > 1 { + // Fixed short backoff. Keeps the total time budget bounded so + // retries don't visibly delay the scan start. + select { + case <-time.After(runStatusRetryBackoff): + case <-ctx.Done(): + log.Progress("run-status: parent context done, abandoning retries") + return + } + } + if postRunStatusOnce(ctx, log, endpoint, body, status, i, attempts) { + return + } + } +} + +// postRunStatusOnce performs a single HTTP attempt. Returns true on a 2xx +// or 4xx (terminal — retrying a bad request will not help). Returns false +// on transport errors or 5xx so the caller can retry. 
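+//
+// The per-attempt bound is applied twice below — as a request context
+// deadline and as client.Timeout set to the same value. Either alone would
+// bound the attempt; the pairing is redundant by design, not load-bearing.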
+func postRunStatusOnce(ctx context.Context, log *progress.Logger,
+	endpoint string, body []byte, status string, attempt, maxAttempts int) bool {
+
+	cctx, cancel := context.WithTimeout(ctx, runStatusHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(cctx, http.MethodPost, endpoint, bytes.NewReader(body))
+	if err != nil {
+		log.Progress("run-status[%s %d/%d]: request error: %v", status, attempt, maxAttempts, err)
+		return false
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+config.APIKey)
+	req.Header.Set("X-Agent-Version", buildinfo.Version)
+
+	client := &http.Client{Timeout: runStatusHTTPTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		log.Progress("run-status[%s %d/%d]: POST error: %v", status, attempt, maxAttempts, err)
+		return false
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	// 2xx only — informational/redirect codes should not count as delivered.
+	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
+		return true
+	}
+	if resp.StatusCode >= 400 && resp.StatusCode < 500 {
+		log.Progress("run-status[%s]: HTTP %d (terminal, no retry)", status, resp.StatusCode)
+		return true
+	}
+	log.Progress("run-status[%s %d/%d]: HTTP %d from backend", status, attempt, maxAttempts, resp.StatusCode)
+	return false
+}
diff --git a/internal/telemetry/run_status_test.go b/internal/telemetry/run_status_test.go
new file mode 100644
index 0000000..8349074
--- /dev/null
+++ b/internal/telemetry/run_status_test.go
@@ -0,0 +1,215 @@
+package telemetry
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/step-security/dev-machine-guard/internal/config"
+	"github.com/step-security/dev-machine-guard/internal/progress"
+)
+
+// withEnterpriseConfig temporarily patches config to look enterprise-enabled
+// and points APIEndpoint at the given test server. Restores on return.
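+//
+// Tests that use this helper must not call t.Parallel(): it mutates
+// package-level config shared by every test in the binary.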
+func withEnterpriseConfig(t *testing.T, endpoint string) func() {
+	t.Helper()
+	savedKey, savedCustomer, savedEndpoint := config.APIKey, config.CustomerID, config.APIEndpoint
+	config.APIKey = "sk-test-123"
+	config.CustomerID = "test-customer"
+	config.APIEndpoint = endpoint
+	return func() {
+		config.APIKey, config.CustomerID, config.APIEndpoint = savedKey, savedCustomer, savedEndpoint
+	}
+}
+
+func TestReportRunStatus_StartedRetriesOn5xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != int32(runStatusStartedAttempts) {
+		t.Fatalf("expected %d attempts on 5xx, got %d", runStatusStartedAttempts, got)
+	}
+}
+
+func TestReportRunStatus_StartedStopsAfter2xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 1 {
+		t.Fatalf("expected exactly 1 call on 2xx, got %d", got)
+	}
+}
+
+func TestReportRunStatus_DoesNotRetryOn4xx(t *testing.T) {
+	// 4xx is terminal: validation or auth rejection; retrying cannot help.
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusBadRequest)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 1 {
+		t.Fatalf("expected 1 call for 4xx (no retry), got %d", got)
+	}
+}
+
+func TestReportRunStatus_FailedRetriesOn5xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "boom")
+
+	if got := atomic.LoadInt32(&calls); got != int32(runStatusFailedAttempts) {
+		t.Fatalf("expected %d attempts on 5xx for failed, got %d", runStatusFailedAttempts, got)
+	}
+}
+
+func TestReportRunStatus_FailedIncludesErrorMessage(t *testing.T) {
+	var gotBody map[string]string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		_ = json.Unmarshal(body, &gotBody)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "context deadline exceeded")
+
+	if gotBody["status"] != runStatusFailed {
+		t.Errorf("status = %q, want %q", gotBody["status"], runStatusFailed)
+	}
+	if gotBody["error_message"] != "context deadline exceeded" {
+		t.Errorf("error_message = %q, want %q", gotBody["error_message"], "context deadline exceeded")
+	}
+	if gotBody["execution_id"] == "" {
+		t.Errorf("execution_id missing from body: %+v", gotBody)
+	}
+}
+
+func TestReportRunStatus_SkipsSucceededAndUnknownStatus(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "succeeded", "")
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "cancelled", "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero HTTP calls for non-agent statuses, got %d", got)
+	}
+}
+
+func TestReportRunStatus_SkipsWhenNotEnterprise(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+
+	// Restore config after test. Default config.APIKey is the placeholder, which
+	// makes IsEnterpriseMode return false.
+	savedKey := config.APIKey
+	config.APIKey = "{{API_KEY}}"
+	defer func() { config.APIKey = savedKey }()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero calls when not in enterprise mode, got %d", got)
+	}
+}
+
+func TestReportRunStatus_SkipsEmptyExecutionID(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero calls when execution_id is empty, got %d", got)
+	}
+}
+
+func TestReportRunStatus_AbortsRetriesOnCtxCancel(t *testing.T) {
+	// Server hangs — every attempt will hit the per-attempt timeout, but we
+	// cancel the parent context mid-run to confirm retries stop.
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(runStatusHTTPTimeout + 2*time.Second)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Cancel after the first attempt completes (~runStatusHTTPTimeout) so we
+	// land in the backoff select where ctx.Done wins.
+	time.AfterFunc(runStatusHTTPTimeout+100*time.Millisecond, cancel)
+
+	log := progress.NewLogger(progress.LevelInfo)
+	done := make(chan struct{})
+	start := time.Now()
+	go func() {
+		reportRunStatus(ctx, log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		elapsed := time.Since(start)
+		// Should be close to the first-attempt timeout, well under the full
+		// retry budget (~runStatusHTTPTimeout * runStatusStartedAttempts).
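+		// With the 3s per-attempt timeout, the first attempt errors at ~3s and
+		// cancel fires at ~3.1s, so 2×timeout + 500ms leaves scheduling slack.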
+		budget := runStatusHTTPTimeout*2 + 500*time.Millisecond
+		if elapsed > budget {
+			t.Fatalf("reportRunStatus took %s, expected ≤ %s once ctx is cancelled", elapsed, budget)
+		}
+	case <-time.After(runStatusStartedAttempts*runStatusHTTPTimeout + 5*time.Second):
+		t.Fatal("reportRunStatus did not return")
+	}
+}
diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go
index cfe6c81..aca395b 100644
--- a/internal/telemetry/telemetry.go
+++ b/internal/telemetry/telemetry.go
@@ -9,6 +9,9 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"os/signal"
+	"sync/atomic"
+	"syscall"
 	"time"
 
 	"github.com/step-security/dev-machine-guard/internal/buildinfo"
@@ -35,19 +38,19 @@ type Payload struct {
 	CollectedAt    int64 `json:"collected_at"`
 	NoUserLoggedIn bool  `json:"no_user_logged_in"`
 
-	IDEExtensions []model.Extension `json:"ide_extensions"`
-	IDEInstallations []model.IDE `json:"ide_installations"`
-	NodePkgManagers []model.PkgManager `json:"node_package_managers"`
-	NodeGlobalPackages []model.NodeScanResult `json:"node_global_packages"`
-	NodeProjects []model.NodeScanResult `json:"node_projects"`
-	BrewPkgManager *model.PkgManager `json:"brew_package_manager,omitempty"`
-	BrewScans []model.BrewScanResult `json:"brew_scans"`
-	PythonPkgManagers []model.PkgManager `json:"python_package_managers"`
-	PythonGlobalPackages []model.PythonScanResult `json:"python_global_packages"`
-	PythonProjects []model.ProjectInfo `json:"python_projects"`
-	SystemPackageScans []model.SystemPackageScanResult `json:"system_package_scans"`
-	AIAgents []model.AITool `json:"ai_agents"`
-	MCPConfigs []model.MCPConfigEnterprise `json:"mcp_configs"`
+	IDEExtensions        []model.Extension               `json:"ide_extensions"`
+	IDEInstallations     []model.IDE                     `json:"ide_installations"`
+	NodePkgManagers      []model.PkgManager              `json:"node_package_managers"`
+	NodeGlobalPackages   []model.NodeScanResult          `json:"node_global_packages"`
+	NodeProjects         []model.NodeScanResult          `json:"node_projects"`
+	BrewPkgManager       *model.PkgManager               `json:"brew_package_manager,omitempty"`
+	BrewScans            []model.BrewScanResult          `json:"brew_scans"`
+	PythonPkgManagers    []model.PkgManager              `json:"python_package_managers"`
+	PythonGlobalPackages []model.PythonScanResult        `json:"python_global_packages"`
+	PythonProjects       []model.ProjectInfo             `json:"python_projects"`
+	SystemPackageScans   []model.SystemPackageScanResult `json:"system_package_scans"`
+	AIAgents             []model.AITool                  `json:"ai_agents"`
+	MCPConfigs           []model.MCPConfigEnterprise     `json:"mcp_configs"`
 
 	ExecutionLogs      *ExecutionLogs      `json:"execution_logs,omitempty"`
 	PerformanceMetrics *PerformanceMetrics `json:"performance_metrics,omitempty"`
@@ -82,10 +85,76 @@ type PerformanceMetrics struct {
 // [scanning] Lock acquired (PID: 32560)
 // [scanning] Device ID (Serial): ...
 // ...
-func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) error {
+func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err error) {
 	ctx := context.Background()
 	startTime := time.Now()
 
+	// Generate a per-run execution ID up front so failures before device.Gather
+	// can still be attributed. Fall back to a timestamp-derived ID if crypto/rand
+	// errors (vanishingly unlikely) — reporting is best-effort and must never
+	// block the scan itself.
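+	// A side effect of the "nouuid-" prefix is that fallback IDs stay easy
+	// to distinguish from real UUIDs wherever executions are listed.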
+	executionID, idErr := newExecutionID()
+	if idErr != nil {
+		executionID = fmt.Sprintf("nouuid-%d", time.Now().UnixNano())
+		fmt.Fprintf(os.Stderr, "[warn] failed to generate execution id, using fallback: %v\n", idErr)
+	}
+
+	// deviceID is populated once device.Gather completes; the closure below
+	// captures it by reference so the deferred failure report uses whatever is
+	// known at the point of failure (empty is tolerated by the backend).
+	var deviceID string
+
+	// Ensures exactly one "failed" report lands per run. The signal handler
+	// goroutine and the deferred recovery can both fire in quick succession
+	// during cancellation — only the first one through should post.
+	var reportedFailed atomic.Bool
+	reportFailedOnce := func(errMsg string) {
+		if reportedFailed.CompareAndSwap(false, true) {
+			reportRunStatus(context.Background(), log, executionID, deviceID, runStatusFailed, errMsg)
+		}
+	}
+
+	// Catch SIGINT / SIGTERM so cancellation (Ctrl+C, launchd stop, kill)
+	// still records a failure row and fires the Slack alert before exit.
+	// Go's default signal disposition terminates the process without running
+	// defers, which would silently drop the signal — we intercept it here.
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
+	sigHandlerDone := make(chan struct{})
+	go func() {
+		select {
+		case sig := <-sigCh:
+			fmt.Fprintf(os.Stderr, "\n[cancel] received %s, reporting failure before exit\n", sig)
+			reportFailedOnce(fmt.Sprintf("%s: %s", runStatusCancelled, sig))
+			// No lock cleanup here: a subsequent run recovers from a stale
+			// lock file on its own via lock.Acquire.
+			os.Exit(130) // 128+SIGINT; reused for SIGTERM for simplicity
+		case <-sigHandlerDone:
+			return
+		}
+	}()
+
+	// Global recovery + failure report. Runs on panic and on any non-nil error
+	// return. Uses context.Background() because the original ctx may be the
+	// source of the failure (e.g., context deadline exceeded). Success is
+	// reported by the backend worker after it finishes processing the uploaded
+	// telemetry — not here.
+	defer func() {
+		// Stop the signal goroutine so it doesn't leak between test runs /
+		// subsequent invocations in long-running processes.
+		signal.Stop(sigCh)
+		close(sigHandlerDone)
+
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic in telemetry.Run: %v", r)
+			reportFailedOnce(err.Error())
+			return
+		}
+		if err != nil {
+			reportFailedOnce(err.Error())
+		}
+	}()
+
 	// Start capturing all stderr output for execution_logs.
 	// Defer Finalize immediately to ensure stderr is always restored,
 	// even on early returns (e.g., lock failure).
@@ -113,6 +182,7 @@
 	// Device info
 	log.Progress("Gathering device information...")
 	dev := device.Gather(ctx, exec)
+	deviceID = dev.SerialNumber
 	log.Progress("Device ID (Serial): %s", dev.SerialNumber)
 	log.Progress("OS Version: %s", dev.OSVersion)
 	log.Progress("Developer: %s", dev.UserIdentity)
@@ -124,6 +194,9 @@
 		log.Warn("user identity could not be determined — telemetry will be marked no_user_logged_in")
 	}
 
+	// Report "started" now that we have a device_id. Fire-and-forget.
+	reportRunStatus(ctx, log, executionID, deviceID, runStatusStarted, "")
+
 	// Detect logged-in user for running commands as the real user when root.
// Skip "root" — if LoggedInUser() fell back to CurrentUser(), delegating // via sudo -H -u root is pointless and changes PATH/env behavior. @@ -485,7 +558,7 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) error { // Upload to S3 log.Progress("Requesting upload URL from backend...") - if err := uploadToS3(ctx, log, payload); err != nil { + if err := uploadToS3(ctx, log, payload, executionID); err != nil { return fmt.Errorf("uploading telemetry: %w", err) } @@ -520,7 +593,7 @@ func totalSystemPackagesCount(scans []model.SystemPackageScanResult) int { return total } -func uploadToS3(ctx context.Context, log *progress.Logger, payload *Payload) error { +func uploadToS3(ctx context.Context, log *progress.Logger, payload *Payload, executionID string) error { payloadJSON, err := json.Marshal(payload) if err != nil { return fmt.Errorf("marshaling payload: %w", err) @@ -633,8 +706,9 @@ func uploadToS3(ctx context.Context, log *progress.Logger, payload *Payload) err // Notify backend log.Progress("Notifying backend of upload...") notifyBody, _ := json.Marshal(map[string]string{ - "s3_key": urlResp.S3Key, - "device_id": payload.DeviceID, + "s3_key": urlResp.S3Key, + "device_id": payload.DeviceID, + "execution_id": executionID, }) notifyEndpoint := fmt.Sprintf("%s/v1/%s/developer-mdm-agent/telemetry/process-uploaded", diff --git a/internal/telemetry/telemetry_test.go b/internal/telemetry/telemetry_test.go index c98435b..4c7f8fd 100644 --- a/internal/telemetry/telemetry_test.go +++ b/internal/telemetry/telemetry_test.go @@ -98,7 +98,8 @@ func TestUploadToS3_SendsCompressedBodyAndIsCompressedFlag(t *testing.T) { DeviceID: "dev-1", } - if err := uploadToS3(context.Background(), progress.NewLogger(progress.LevelInfo), payload); err != nil { + const testExecutionID = "11111111-2222-4333-8444-555555555555" + if err := uploadToS3(context.Background(), progress.NewLogger(progress.LevelInfo), payload, testExecutionID); err != nil { t.Fatalf("uploadToS3 failed: %v", err) } @@ -151,4 +152,7 @@ func TestUploadToS3_SendsCompressedBodyAndIsCompressedFlag(t *testing.T) { if !strings.HasSuffix(notify["s3_key"], ".json.gz") { t.Errorf("expected s3_key with .json.gz suffix, got %q", notify["s3_key"]) } + if notify["execution_id"] != testExecutionID { + t.Errorf("expected execution_id=%q in notify body, got %q", testExecutionID, notify["execution_id"]) + } } diff --git a/internal/telemetry/uuid.go b/internal/telemetry/uuid.go new file mode 100644 index 0000000..e8aab8e --- /dev/null +++ b/internal/telemetry/uuid.go @@ -0,0 +1,16 @@ +package telemetry + +import ( + "fmt" + + "github.com/google/uuid" +) + +// newExecutionID returns a UUID v4 string (RFC 4122). 
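+// uuid.NewRandom is used rather than uuid.New because uuid.New wraps it in
+// uuid.Must and panics if crypto/rand fails; returning the error instead
+// lets Run fall back to its timestamp-derived ID.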
+func newExecutionID() (string, error) { + u, err := uuid.NewRandom() + if err != nil { + return "", fmt.Errorf("generating execution id: %w", err) + } + return u.String(), nil +} diff --git a/internal/telemetry/uuid_test.go b/internal/telemetry/uuid_test.go new file mode 100644 index 0000000..97d8e65 --- /dev/null +++ b/internal/telemetry/uuid_test.go @@ -0,0 +1,25 @@ +package telemetry + +import ( + "regexp" + "testing" +) + +var uuidRegex = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$`) + +func TestNewExecutionID_FormatAndUniqueness(t *testing.T) { + seen := make(map[string]bool, 1000) + for i := 0; i < 1000; i++ { + id, err := newExecutionID() + if err != nil { + t.Fatalf("newExecutionID returned error: %v", err) + } + if !uuidRegex.MatchString(id) { + t.Fatalf("generated id %q does not match UUID v4 pattern", id) + } + if seen[id] { + t.Fatalf("duplicate id generated: %q", id) + } + seen[id] = true + } +} diff --git a/tests/test_smoke_go.sh b/tests/test_smoke_go.sh index b3bd50c..63e9742 100755 --- a/tests/test_smoke_go.sh +++ b/tests/test_smoke_go.sh @@ -45,7 +45,11 @@ assert_eq() { assert_contains() { local label="$1" haystack="$2" needle="$3" - if echo "$haystack" | grep -q "$needle"; then + # Here-string instead of `echo "$h" | grep`: under `set -o pipefail`, a + # large haystack triggers SIGPIPE on echo when grep -q exits early after + # the first match, which propagates as exit 141 and falsely fails the + # check. + if grep -q "$needle" <<<"$haystack"; then pass "$label" else fail "$label" "output does not contain: $needle"