step-security · ashishkurmi · Apr 28, 2026 · Apr 23, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/go.mod b/go.mod
@@ -3,3 +3,5 @@ module github.com/step-security/dev-machine-guard
 go 1.24
 
 require golang.org/x/sys v0.33.0
+
+require github.com/google/uuid v1.6.0
diff --git a/go.sum b/go.sum
@@ -1,2 +1,4 @@
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
 golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
diff --git a/internal/model/model.go b/internal/model/model.go
@@ -20,13 +20,13 @@ type ScanResult struct {
 	PythonPkgManagers []PkgManager    `json:"python_package_managers"`
 	PythonPackages    []PythonPackage `json:"python_packages"`
 	PythonProjects    []ProjectInfo   `json:"python_projects"`
-	SystemPkgManager    *PkgManager     `json:"system_package_manager,omitempty"`
-	SystemPackages      []SystemPackage `json:"system_packages"`
-	SnapPkgManager      *PkgManager     `json:"snap_package_manager,omitempty"`
-	SnapPackages        []SystemPackage `json:"snap_packages"`
-	FlatpakPkgManager   *PkgManager     `json:"flatpak_package_manager,omitempty"`
-	FlatpakPackages     []SystemPackage `json:"flatpak_packages"`
-	Summary             Summary         `json:"summary"`
+	SystemPkgManager  *PkgManager     `json:"system_package_manager,omitempty"`
+	SystemPackages    []SystemPackage `json:"system_packages"`
+	SnapPkgManager    *PkgManager     `json:"snap_package_manager,omitempty"`
+	SnapPackages      []SystemPackage `json:"snap_packages"`
+	FlatpakPkgManager *PkgManager     `json:"flatpak_package_manager,omitempty"`
+	FlatpakPackages   []SystemPackage `json:"flatpak_packages"`
+	Summary           Summary         `json:"summary"`
 }
 
 type Device struct {
@@ -153,7 +153,7 @@ type PythonPackage struct {
 // Unlike BrewScanResult (which sends raw base64), this sends pre-parsed packages
 // since syspkg.go already handles the format-specific parsing edge cases.
 type SystemPackageScanResult struct {
-	ScanType       string          `json:"scan_type"`                    // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
+	ScanType       string          `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
 	PackageManager *PkgManager     `json:"package_manager,omitempty"`
 	Packages       []SystemPackage `json:"packages"`
 	PackagesCount  int             `json:"packages_count"`

diff --git a/internal/telemetry/run_status.go b/internal/telemetry/run_status.go
@@ -0,0 +1,136 @@
+package telemetry
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"runtime"
+	"time"
+
+	"github.com/step-security/dev-machine-guard/internal/buildinfo"
+	"github.com/step-security/dev-machine-guard/internal/config"
+	"github.com/step-security/dev-machine-guard/internal/progress"
+)
+
+const (
+	runStatusStarted       = "started"
+	runStatusFailed        = "failed"
+	runStatusCancelled     = "cancelled by user"
+	runStatusMaxErrorChars = 2000
+	runStatusHTTPTimeout   = 3 * time.Second
+
+	// Retry counts per status. "started" is load-bearing for attempt
+	// visibility — we retry harder so a single transient network blip
+	// does not lose the signal that the run was attempted. "failed"
+	// fires during shutdown, so one retry covers the common case.
+	runStatusStartedAttempts = 3
+	runStatusFailedAttempts  = 2
+	runStatusRetryBackoff    = 500 * time.Millisecond
+)
+
+// reportRunStatus POSTs a lifecycle transition to the backend with a small
+// retry budget. Never returns an error: running the scan is the priority.
+//
+// status must be "started" or "failed". Passing "succeeded" (or any other
+// value) is a defensive no-op — success is written by the backend worker
+// after it persists the uploaded telemetry.
+func reportRunStatus(ctx context.Context, log *progress.Logger,
+	executionID, deviceID, status, errMsg string) {
+
+	if !config.IsEnterpriseMode() {
+		return
+	}
+	if status != runStatusStarted && status != runStatusFailed {
+		return
+	}
+	if executionID == "" {
+		return
+	}
+
+	payload := map[string]string{
+		"execution_id":  executionID,
+		"device_id":     deviceID,
+		"status":        status,
+		"agent_version": buildinfo.Version,
+		"platform":      runtime.GOOS,
+	}
+	if status == runStatusFailed {
+		if errMsg == "" {
+			// Backend rejects a "failed" report with no error_message.
+			errMsg = "unspecified failure"
+		}
+		if len(errMsg) > runStatusMaxErrorChars {
+			errMsg = errMsg[:runStatusMaxErrorChars]
+		}
+		payload["error_message"] = errMsg
+	}
+
+	body, err := json.Marshal(payload)
+	if err != nil {
+		log.Progress("run-status: marshal error: %v", err)
+		return
+	}
+
+	endpoint := fmt.Sprintf("%s/v1/%s/developer-mdm-agent/telemetry/run-status",
+		config.APIEndpoint, config.CustomerID)
+
+	attempts := runStatusFailedAttempts
+	if status == runStatusStarted {
+		attempts = runStatusStartedAttempts
+	}
+
+	for i := 1; i <= attempts; i++ {
+		if i > 1 {
+			// Fixed short backoff. Keeps the total time budget bounded so
+			// retries don't visibly delay the scan start.
+			select {
+			case <-time.After(runStatusRetryBackoff):
+			case <-ctx.Done():
+				log.Progress("run-status: parent context done, abandoning retries")
+				return
+			}
+		}
+		if postRunStatusOnce(ctx, log, endpoint, body, status, i, attempts) {
+			return
+		}
+	}
+}
+
+// postRunStatusOnce performs a single HTTP attempt. Returns true on a 2xx
+// or 4xx (terminal — retrying a bad request will not help). Returns false
+// on transport errors or 5xx so the caller can retry.
+func postRunStatusOnce(ctx context.Context, log *progress.Logger,
+	endpoint string, body []byte, status string, attempt, maxAttempts int) bool {
+
+	cctx, cancel := context.WithTimeout(ctx, runStatusHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(cctx, http.MethodPost, endpoint, bytes.NewReader(body))
+	if err != nil {
+		log.Progress("run-status[%s %d/%d]: request error: %v", status, attempt, maxAttempts, err)
+		return false
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+config.APIKey)
+	req.Header.Set("X-Agent-Version", buildinfo.Version)
+
+	client := &http.Client{Timeout: runStatusHTTPTimeout}
+	resp, err := client.Do(req)
+	if err != nil {
+		log.Progress("run-status[%s %d/%d]: POST error: %v", status, attempt, maxAttempts, err)
+		return false
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode < 300 {
+		return true
+	}
+	if resp.StatusCode >= 400 && resp.StatusCode < 500 {
+		log.Progress("run-status[%s]: HTTP %d (terminal, no retry)", status, resp.StatusCode)
+		return true
+	}
+	log.Progress("run-status[%s %d/%d]: HTTP %d from backend", status, attempt, maxAttempts, resp.StatusCode)
+	return false
+}
diff --git a/internal/telemetry/run_status_test.go b/internal/telemetry/run_status_test.go
@@ -0,0 +1,215 @@
+package telemetry
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/step-security/dev-machine-guard/internal/config"
+	"github.com/step-security/dev-machine-guard/internal/progress"
+)
+
+// withEnterpriseConfig temporarily patches config to look enterprise-enabled
+// and points APIEndpoint at the given test server. Restores on return.
+func withEnterpriseConfig(t *testing.T, endpoint string) func() {
+	t.Helper()
+	savedKey, savedCustomer, savedEndpoint := config.APIKey, config.CustomerID, config.APIEndpoint
+	config.APIKey = "sk-test-123"
+	config.CustomerID = "test-customer"
+	config.APIEndpoint = endpoint
+	return func() {
+		config.APIKey, config.CustomerID, config.APIEndpoint = savedKey, savedCustomer, savedEndpoint
+	}
+}
+
+func TestReportRunStatus_StartedRetriesOn5xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != int32(runStatusStartedAttempts) {
+		t.Fatalf("expected %d retries on 5xx, got %d", runStatusStartedAttempts, got)
+	}
+}
+
+func TestReportRunStatus_StartedStopsAfter2xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 1 {
+		t.Fatalf("expected exactly 1 call on 2xx, got %d", got)
+	}
+}
+
+func TestReportRunStatus_DoesNotRetryOn4xx(t *testing.T) {
+	// 4xx is terminal: validation or auth rejection; retrying cannot help.
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusBadRequest)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 1 {
+		t.Fatalf("expected 1 call for 4xx (no retry), got %d", got)
+	}
+}
+
+func TestReportRunStatus_FailedRetriesOn5xx(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "boom")
+
+	if got := atomic.LoadInt32(&calls); got != int32(runStatusFailedAttempts) {
+		t.Fatalf("expected %d retries on 5xx for failed, got %d", runStatusFailedAttempts, got)
+	}
+}
+
+func TestReportRunStatus_FailedIncludesErrorMessage(t *testing.T) {
+	var gotBody map[string]string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		_ = json.Unmarshal(body, &gotBody)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "context deadline exceeded")
+
+	if gotBody["status"] != runStatusFailed {
+		t.Errorf("status = %q, want %q", gotBody["status"], runStatusFailed)
+	}
+	if gotBody["error_message"] != "context deadline exceeded" {
+		t.Errorf("error_message = %q, want %q", gotBody["error_message"], "context deadline exceeded")
+	}
+	if gotBody["execution_id"] == "" {
+		t.Errorf("execution_id missing from body: %+v", gotBody)
+	}
+}
+
+func TestReportRunStatus_SkipsSucceededAndUnknownStatus(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "succeeded", "")
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "cancelled", "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero HTTP calls for non-agent statuses, got %d", got)
+	}
+}
+
+func TestReportRunStatus_SkipsWhenNotEnterprise(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+
+	// Restore config after test. Default config.APIKey is the placeholder, which
+	// makes IsEnterpriseMode return false.
+	savedKey := config.APIKey
+	config.APIKey = "{{API_KEY}}"
+	defer func() { config.APIKey = savedKey }()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero calls when not in enterprise mode, got %d", got)
+	}
+}
+
+func TestReportRunStatus_SkipsEmptyExecutionID(t *testing.T) {
+	var calls int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&calls, 1)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	log := progress.NewLogger(progress.LevelInfo)
+	reportRunStatus(context.Background(), log, "", "dev-1", runStatusStarted, "")
+
+	if got := atomic.LoadInt32(&calls); got != 0 {
+		t.Fatalf("expected zero calls when execution_id is empty, got %d", got)
+	}
+}
+
+func TestReportRunStatus_AbortsRetriesOnCtxCancel(t *testing.T) {
+	// Server hangs — every attempt will hit the per-attempt timeout, but we
+	// cancel the parent context mid-run to confirm retries stop.
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(runStatusHTTPTimeout + 2*time.Second)
+	}))
+	defer srv.Close()
+	defer withEnterpriseConfig(t, srv.URL)()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Cancel after the first attempt completes (~runStatusHTTPTimeout) so we
+	// land in the backoff select where ctx.Done wins.
+	time.AfterFunc(runStatusHTTPTimeout+100*time.Millisecond, cancel)
+
+	log := progress.NewLogger(progress.LevelInfo)
+	done := make(chan struct{})
+	start := time.Now()
+	go func() {
+		reportRunStatus(ctx, log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		elapsed := time.Since(start)
+		// Should be close to the first-attempt timeout, well under the full
+		// retry budget (~runStatusHTTPTimeout * runStatusStartedAttempts).
+		budget := runStatusHTTPTimeout*2 + 500*time.Millisecond
+		if elapsed > budget {
+			t.Fatalf("reportRunStatus took %s, expected ≤ %s once ctx is cancelled", elapsed, budget)
+		}
+	case <-time.After(runStatusHTTPTimeout*int64Attempts() + 5*time.Second):
+		t.Fatal("reportRunStatus did not return")
+	}
+}
+
+func int64Attempts() time.Duration {
+	return time.Duration(runStatusStartedAttempts)
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,3 +3,5 @@ module github.com/step-security/dev-machine-guard
		go 1.24

		require golang.org/x/sys v0.33.0

		require github.com/google/uuid v1.6.0