Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ module github.com/step-security/dev-machine-guard
go 1.24

require golang.org/x/sys v0.33.0

require github.com/google/uuid v1.6.0
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
16 changes: 8 additions & 8 deletions internal/model/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ type ScanResult struct {
PythonPkgManagers []PkgManager `json:"python_package_managers"`
PythonPackages []PythonPackage `json:"python_packages"`
PythonProjects []ProjectInfo `json:"python_projects"`
SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"`
SystemPackages []SystemPackage `json:"system_packages"`
SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"`
SnapPackages []SystemPackage `json:"snap_packages"`
FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"`
FlatpakPackages []SystemPackage `json:"flatpak_packages"`
Summary Summary `json:"summary"`
SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"`
SystemPackages []SystemPackage `json:"system_packages"`
SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"`
SnapPackages []SystemPackage `json:"snap_packages"`
FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"`
FlatpakPackages []SystemPackage `json:"flatpak_packages"`
Summary Summary `json:"summary"`
}

type Device struct {
Expand Down Expand Up @@ -153,7 +153,7 @@ type PythonPackage struct {
// Unlike BrewScanResult (which sends raw base64), this sends pre-parsed packages
// since syspkg.go already handles the format-specific parsing edge cases.
type SystemPackageScanResult struct {
ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
PackageManager *PkgManager `json:"package_manager,omitempty"`
Packages []SystemPackage `json:"packages"`
PackagesCount int `json:"packages_count"`
Expand Down
136 changes: 136 additions & 0 deletions internal/telemetry/run_status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package telemetry

import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"runtime"
"time"

"github.com/step-security/dev-machine-guard/internal/buildinfo"
"github.com/step-security/dev-machine-guard/internal/config"
"github.com/step-security/dev-machine-guard/internal/progress"
)

const (
runStatusStarted = "started"
runStatusFailed = "failed"
runStatusCancelled = "cancelled by user"
runStatusMaxErrorChars = 2000
runStatusHTTPTimeout = 3 * time.Second

// Retry counts per status. "started" is load-bearing for attempt
// visibility — we retry harder so a single transient network blip
// does not lose the signal that the run was attempted. "failed"
// fires during shutdown, so one retry covers the common case.
runStatusStartedAttempts = 3
runStatusFailedAttempts = 2
runStatusRetryBackoff = 500 * time.Millisecond
)

// reportRunStatus POSTs a lifecycle transition to the backend with a small
// retry budget. Never returns an error: running the scan is the priority.
//
// status must be "started" or "failed". Passing "succeeded" (or any other
// value) is a defensive no-op — success is written by the backend worker
// after it persists the uploaded telemetry.
func reportRunStatus(ctx context.Context, log *progress.Logger,
executionID, deviceID, status, errMsg string) {

if !config.IsEnterpriseMode() {
return
}
if status != runStatusStarted && status != runStatusFailed {
return
}
if executionID == "" {
return
}

payload := map[string]string{
"execution_id": executionID,
"device_id": deviceID,
"status": status,
"agent_version": buildinfo.Version,
"platform": runtime.GOOS,
}
if status == runStatusFailed {
if errMsg == "" {
// Backend rejects a "failed" report with no error_message.
errMsg = "unspecified failure"
}
if len(errMsg) > runStatusMaxErrorChars {
errMsg = errMsg[:runStatusMaxErrorChars]
}
payload["error_message"] = errMsg
}

body, err := json.Marshal(payload)
if err != nil {
log.Progress("run-status: marshal error: %v", err)
return
}

endpoint := fmt.Sprintf("%s/v1/%s/developer-mdm-agent/telemetry/run-status",
config.APIEndpoint, config.CustomerID)

attempts := runStatusFailedAttempts
if status == runStatusStarted {
attempts = runStatusStartedAttempts
}

for i := 1; i <= attempts; i++ {
if i > 1 {
// Fixed short backoff. Keeps the total time budget bounded so
// retries don't visibly delay the scan start.
select {
case <-time.After(runStatusRetryBackoff):
case <-ctx.Done():
log.Progress("run-status: parent context done, abandoning retries")
return
}
}
if postRunStatusOnce(ctx, log, endpoint, body, status, i, attempts) {
return
}
}
}

// postRunStatusOnce performs a single HTTP attempt. Returns true on a 2xx
// or 4xx (terminal — retrying a bad request will not help). Returns false
// on transport errors or 5xx so the caller can retry.
func postRunStatusOnce(ctx context.Context, log *progress.Logger,
endpoint string, body []byte, status string, attempt, maxAttempts int) bool {

cctx, cancel := context.WithTimeout(ctx, runStatusHTTPTimeout)
defer cancel()

req, err := http.NewRequestWithContext(cctx, http.MethodPost, endpoint, bytes.NewReader(body))
if err != nil {
log.Progress("run-status[%s %d/%d]: request error: %v", status, attempt, maxAttempts, err)
return false
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+config.APIKey)
req.Header.Set("X-Agent-Version", buildinfo.Version)

client := &http.Client{Timeout: runStatusHTTPTimeout}
resp, err := client.Do(req)
if err != nil {
log.Progress("run-status[%s %d/%d]: POST error: %v", status, attempt, maxAttempts, err)
return false
}
defer func() { _ = resp.Body.Close() }()

if resp.StatusCode < 300 {
return true
}
if resp.StatusCode >= 400 && resp.StatusCode < 500 {
log.Progress("run-status[%s]: HTTP %d (terminal, no retry)", status, resp.StatusCode)
return true
}
log.Progress("run-status[%s %d/%d]: HTTP %d from backend", status, attempt, maxAttempts, resp.StatusCode)
return false
}
215 changes: 215 additions & 0 deletions internal/telemetry/run_status_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
package telemetry

import (
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"

"github.com/step-security/dev-machine-guard/internal/config"
"github.com/step-security/dev-machine-guard/internal/progress"
)

// withEnterpriseConfig temporarily patches config to look enterprise-enabled
// and points APIEndpoint at the given test server. Restores on return.
func withEnterpriseConfig(t *testing.T, endpoint string) func() {
t.Helper()
savedKey, savedCustomer, savedEndpoint := config.APIKey, config.CustomerID, config.APIEndpoint
config.APIKey = "sk-test-123"
config.CustomerID = "test-customer"
config.APIEndpoint = endpoint
return func() {
config.APIKey, config.CustomerID, config.APIEndpoint = savedKey, savedCustomer, savedEndpoint
}
}

func TestReportRunStatus_StartedRetriesOn5xx(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")

if got := atomic.LoadInt32(&calls); got != int32(runStatusStartedAttempts) {
t.Fatalf("expected %d retries on 5xx, got %d", runStatusStartedAttempts, got)
}
}

func TestReportRunStatus_StartedStopsAfter2xx(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.WriteHeader(http.StatusOK)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")

if got := atomic.LoadInt32(&calls); got != 1 {
t.Fatalf("expected exactly 1 call on 2xx, got %d", got)
}
}

func TestReportRunStatus_DoesNotRetryOn4xx(t *testing.T) {
// 4xx is terminal: validation or auth rejection; retrying cannot help.
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.WriteHeader(http.StatusBadRequest)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")

if got := atomic.LoadInt32(&calls); got != 1 {
t.Fatalf("expected 1 call for 4xx (no retry), got %d", got)
}
}

func TestReportRunStatus_FailedRetriesOn5xx(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "boom")

if got := atomic.LoadInt32(&calls); got != int32(runStatusFailedAttempts) {
t.Fatalf("expected %d retries on 5xx for failed, got %d", runStatusFailedAttempts, got)
}
}

func TestReportRunStatus_FailedIncludesErrorMessage(t *testing.T) {
var gotBody map[string]string
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
body, _ := io.ReadAll(r.Body)
_ = json.Unmarshal(body, &gotBody)
w.WriteHeader(http.StatusOK)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "context deadline exceeded")

if gotBody["status"] != runStatusFailed {
t.Errorf("status = %q, want %q", gotBody["status"], runStatusFailed)
}
if gotBody["error_message"] != "context deadline exceeded" {
t.Errorf("error_message = %q, want %q", gotBody["error_message"], "context deadline exceeded")
}
if gotBody["execution_id"] == "" {
t.Errorf("execution_id missing from body: %+v", gotBody)
}
}

func TestReportRunStatus_SkipsSucceededAndUnknownStatus(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "succeeded", "")
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "cancelled", "")

if got := atomic.LoadInt32(&calls); got != 0 {
t.Fatalf("expected zero HTTP calls for non-agent statuses, got %d", got)
}
}

func TestReportRunStatus_SkipsWhenNotEnterprise(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
}))
defer srv.Close()

// Restore config after test. Default config.APIKey is the placeholder, which
// makes IsEnterpriseMode return false.
savedKey := config.APIKey
config.APIKey = "{{API_KEY}}"
defer func() { config.APIKey = savedKey }()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")

if got := atomic.LoadInt32(&calls); got != 0 {
t.Fatalf("expected zero calls when not in enterprise mode, got %d", got)
}
}

func TestReportRunStatus_SkipsEmptyExecutionID(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

log := progress.NewLogger(progress.LevelInfo)
reportRunStatus(context.Background(), log, "", "dev-1", runStatusStarted, "")

if got := atomic.LoadInt32(&calls); got != 0 {
t.Fatalf("expected zero calls when execution_id is empty, got %d", got)
}
}

func TestReportRunStatus_AbortsRetriesOnCtxCancel(t *testing.T) {
// Server hangs — every attempt will hit the per-attempt timeout, but we
// cancel the parent context mid-run to confirm retries stop.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(runStatusHTTPTimeout + 2*time.Second)
}))
defer srv.Close()
defer withEnterpriseConfig(t, srv.URL)()

ctx, cancel := context.WithCancel(context.Background())
// Cancel after the first attempt completes (~runStatusHTTPTimeout) so we
// land in the backoff select where ctx.Done wins.
time.AfterFunc(runStatusHTTPTimeout+100*time.Millisecond, cancel)

log := progress.NewLogger(progress.LevelInfo)
done := make(chan struct{})
start := time.Now()
go func() {
reportRunStatus(ctx, log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
close(done)
}()

select {
case <-done:
elapsed := time.Since(start)
// Should be close to the first-attempt timeout, well under the full
// retry budget (~runStatusHTTPTimeout * runStatusStartedAttempts).
budget := runStatusHTTPTimeout*2 + 500*time.Millisecond
if elapsed > budget {
t.Fatalf("reportRunStatus took %s, expected ≤ %s once ctx is cancelled", elapsed, budget)
}
case <-time.After(runStatusHTTPTimeout*int64Attempts() + 5*time.Second):
t.Fatal("reportRunStatus did not return")
}
}

func int64Attempts() time.Duration {
return time.Duration(runStatusStartedAttempts)
}
Loading
Loading