From 9fc7d3954083c2a71ef79248f8cd7473c30aebd5 Mon Sep 17 00:00:00 2001 From: Harsh Rawat Date: Mon, 18 May 2026 11:55:59 +0530 Subject: [PATCH] [shimV2] Tolerate guest/VM disappearance during teardown - Once a utility VM exits or its in-guest agent dies, container and network cleanup has nothing left to act on, yet the old code kept returning errors and looping on resources that are already gone. - Export the closed-bridge sentinel and wrap it around the underlying transport failure when the bridge is killed, and add a shared helper that classifies the host-side conditions meaning the compute system is no longer available for modification. Container resource release (combined layers, every category of guest mount and device, and the explicit state delete) and both legs of network endpoint and namespace removal now treat these two signals as a successful release and move on. The VM exit watcher also collapses the bridge on termination so in-flight waits unblock instead of hanging until context cancellation, and task deletion is allowed once the VM has reached the terminated state so containerd can drain stale tasks. - Along the way, migrate the shim's task service registration to the v3 task API, and stop logging the noisy JSON parse error emitted when GCS writes a non-JSON trailer (such as a runtime panic dump) on shutdown, since the raw payload is already surfaced by the terminated-output path. Signed-off-by: Harsh Rawat --- .../service/service.go | 4 +- .../service/service_task.go | 4 +- .../service/service_task_internal.go | 13 +- .../controller/linuxcontainer/container.go | 26 ++-- .../linuxcontainer/container_test.go | 142 ++++++++++++------ internal/controller/network/network_lcow.go | 7 +- .../controller/network/network_lcow_test.go | 65 ++++++++ internal/controller/network/network_wcow.go | 12 +- .../controller/network/network_wcow_test.go | 68 +++++++++ internal/controller/process/process.go | 2 +- internal/controller/vm/vm.go | 10 ++ internal/gcs/bridge.go | 14 +- internal/gcs/bridge_test.go | 30 +++- internal/vm/vmutils/gcs_logs.go | 8 +- internal/vm/vmutils/utils.go | 11 ++ 15 files changed, 330 insertions(+), 86 deletions(-) diff --git a/cmd/containerd-shim-lcow-v2/service/service.go b/cmd/containerd-shim-lcow-v2/service/service.go index c44b4590db..72b685dc36 100644 --- a/cmd/containerd-shim-lcow-v2/service/service.go +++ b/cmd/containerd-shim-lcow-v2/service/service.go @@ -14,7 +14,7 @@ import ( "github.com/Microsoft/hcsshim/internal/shimdiag" sandboxsvc "github.com/containerd/containerd/api/runtime/sandbox/v1" - tasksvc "github.com/containerd/containerd/api/runtime/task/v2" + tasksvc "github.com/containerd/containerd/api/runtime/task/v3" "github.com/containerd/containerd/v2/core/runtime" "github.com/containerd/containerd/v2/pkg/namespaces" "github.com/containerd/containerd/v2/pkg/shutdown" @@ -91,7 +91,7 @@ func NewService(ctx context.Context, eventsPublisher shim.Publisher, sd shutdown // RegisterTTRPC registers the Task, Sandbox, and ShimDiag TTRPC services on // the provided server so that containerd can call into the shim over TTRPC. func (s *Service) RegisterTTRPC(server *ttrpc.Server) error { - tasksvc.RegisterTaskService(server, s) + tasksvc.RegisterTTRPCTaskService(server, s) sandboxsvc.RegisterTTRPCSandboxService(server, s) shimdiag.RegisterShimDiagService(server, s) return nil diff --git a/cmd/containerd-shim-lcow-v2/service/service_task.go b/cmd/containerd-shim-lcow-v2/service/service_task.go index 98ed3491d6..b1158082ae 100644 --- a/cmd/containerd-shim-lcow-v2/service/service_task.go +++ b/cmd/containerd-shim-lcow-v2/service/service_task.go @@ -9,14 +9,14 @@ import ( "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/oc" - "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/api/runtime/task/v3" "github.com/containerd/errdefs/pkg/errgrpc" "go.opencensus.io/trace" "google.golang.org/protobuf/types/known/emptypb" ) // Ensure Service implements the TTRPCTaskService interface at compile time. -var _ task.TaskService = &Service{} +var _ task.TTRPCTaskService = &Service{} // State returns the current state of a task or process. // This method is part of the instrumentation layer and business logic is included in stateInternal. diff --git a/cmd/containerd-shim-lcow-v2/service/service_task_internal.go b/cmd/containerd-shim-lcow-v2/service/service_task_internal.go index 9e417ba741..677b52f4c5 100644 --- a/cmd/containerd-shim-lcow-v2/service/service_task_internal.go +++ b/cmd/containerd-shim-lcow-v2/service/service_task_internal.go @@ -12,6 +12,7 @@ import ( container "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer" "github.com/Microsoft/hcsshim/internal/controller/pod" "github.com/Microsoft/hcsshim/internal/controller/process" + "github.com/Microsoft/hcsshim/internal/controller/vm" "github.com/Microsoft/hcsshim/internal/hcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -23,7 +24,7 @@ import ( "github.com/Microsoft/hcsshim/pkg/ctrdtaskapi" eventstypes "github.com/containerd/containerd/api/events" - "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/api/runtime/task/v3" "github.com/containerd/errdefs" "github.com/containerd/typeurl/v2" "github.com/opencontainers/runtime-spec/specs-go" @@ -69,10 +70,6 @@ func (s *Service) getPodController(podID string) (*pod.Controller, bool) { // stateInternal returns the current status of a process within a container. func (s *Service) stateInternal(_ context.Context, request *task.StateRequest) (*task.StateResponse, error) { - if err := s.ensureVMRunning(); err != nil { - return nil, err - } - // Look up the container controller for the requested container. ctrCtrl, err := s.getContainerController(request.ID) if err != nil { @@ -277,10 +274,6 @@ func (s *Service) startInternal(ctx context.Context, request *task.StartRequest) // deleteInternal deletes a process, container, or pod sandbox depending on the request. func (s *Service) deleteInternal(ctx context.Context, request *task.DeleteRequest) (*task.DeleteResponse, error) { - if err := s.ensureVMRunning(); err != nil { - return nil, err - } - // Look up the container controller for the target ID. ctrCtrl, err := s.getContainerController(request.ID) if err != nil { @@ -328,7 +321,7 @@ func (s *Service) deleteInternal(ctx context.Context, request *task.DeleteReques // left should be the sandbox container itself (request.ID). remaining := podCtrl.ListContainers() delete(remaining, request.ID) // exclude the sandbox container itself - if len(remaining) > 0 { + if len(remaining) > 0 && s.vmController.State() == vm.StateRunning { return nil, fmt.Errorf("cannot delete sandbox container %s: %d workload container(s) still exist in the pod: %w", request.ID, len(remaining), errdefs.ErrFailedPrecondition) } diff --git a/internal/controller/linuxcontainer/container.go b/internal/controller/linuxcontainer/container.go index 72cbb16df5..03320485f6 100644 --- a/internal/controller/linuxcontainer/container.go +++ b/internal/controller/linuxcontainer/container.go @@ -4,6 +4,7 @@ package linuxcontainer import ( "context" + "errors" "fmt" "sync" "time" @@ -12,7 +13,6 @@ import ( "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats" "github.com/Microsoft/hcsshim/internal/controller/process" "github.com/Microsoft/hcsshim/internal/gcs" - "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -25,7 +25,7 @@ import ( "github.com/Microsoft/go-winio/pkg/guid" eventstypes "github.com/containerd/containerd/api/events" - "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/api/runtime/task/v3" containerdtypes "github.com/containerd/containerd/api/types/task" "github.com/containerd/errdefs" "github.com/containerd/typeurl/v2" @@ -220,6 +220,14 @@ func (c *Controller) closeContainer() { } } +// isResourceAlreadyReleased reports whether a teardown failure indicates the +// resource is already gone — either the in-guest agent died (so guest-side +// state went with it) or the VM itself has exited (so host-side state went +// with it). Either way, teardown can move on instead of retrying. +func isResourceAlreadyReleased(err error) bool { + return errors.Is(err, gcs.ErrBridgeClosed) || vmutils.IsVMNotAvailableError(err) +} + // releaseResources undoes each allocation in reverse order. // It is idempotent — subsequent calls after the first are no-ops. func (c *Controller) releaseResources(ctx context.Context) error { @@ -236,7 +244,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { ContainerRootPath: c.layers.rootfsPath, Layers: hcsLayers, ScratchPath: c.layers.scratch.guestPath, - }); err != nil { + }); err != nil && !isResourceAlreadyReleased(err) { return fmt.Errorf("remove combined layers from guest: %w", err) } @@ -248,7 +256,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { // unmapped on a prior call. var zeroGUID guid.GUID if c.layers != nil && c.layers.scratch.id != zeroGUID { - if err := c.scsi.UnmapFromGuest(ctx, c.layers.scratch.id); err != nil { + if err := c.scsi.UnmapFromGuest(ctx, c.layers.scratch.id); err != nil && !isResourceAlreadyReleased(err) { return fmt.Errorf("unmap scratch layer: %w", err) } c.layers.scratch = scsiReservation{} @@ -258,7 +266,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { // resumes from the first failure. if c.layers != nil { for i, layer := range c.layers.roLayers { - if err := c.scsi.UnmapFromGuest(ctx, layer.id); err != nil { + if err := c.scsi.UnmapFromGuest(ctx, layer.id); err != nil && !isResourceAlreadyReleased(err) { c.layers.roLayers = c.layers.roLayers[i:] return fmt.Errorf("unmap ro layer: %w", err) } @@ -267,7 +275,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { // Unmap additional SCSI mounts. for i, id := range c.scsiResources { - if err := c.scsi.UnmapFromGuest(ctx, id); err != nil { + if err := c.scsi.UnmapFromGuest(ctx, id); err != nil && !isResourceAlreadyReleased(err) { c.scsiResources = c.scsiResources[i:] return fmt.Errorf("unmap scsi resource: %w", err) } @@ -275,7 +283,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { // Unmap Plan9 shares. for i, id := range c.plan9Resources { - if err := c.plan9.UnmapFromGuest(ctx, id); err != nil { + if err := c.plan9.UnmapFromGuest(ctx, id); err != nil && !isResourceAlreadyReleased(err) { c.plan9Resources = c.plan9Resources[i:] return fmt.Errorf("unmap plan9 share: %w", err) } @@ -283,7 +291,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { // Remove VPCI devices. for i, id := range c.devices { - if err := c.vpci.RemoveFromVM(ctx, id); err != nil { + if err := c.vpci.RemoveFromVM(ctx, id); err != nil && !isResourceAlreadyReleased(err) { c.devices = c.devices[i:] return fmt.Errorf("remove vpci device: %w", err) } @@ -295,7 +303,7 @@ func (c *Controller) releaseResources(ctx context.Context) error { if !c.isContainerStateDeleted && c.guest.Capabilities().IsDeleteContainerStateSupported() { // GCS bridge evicts the container from its host-state map even if the inner Delete fails, // so retries will always return not-found. - if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil && !hcs.IsNotExist(err) { + if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil && !isResourceAlreadyReleased(err) { return fmt.Errorf("delete container state: %w", err) } diff --git a/internal/controller/linuxcontainer/container_test.go b/internal/controller/linuxcontainer/container_test.go index 6f9ed6ece7..1eb20d2269 100644 --- a/internal/controller/linuxcontainer/container_test.go +++ b/internal/controller/linuxcontainer/container_test.go @@ -606,38 +606,71 @@ func TestReleaseResources_Idempotent(t *testing.T) { } } -// TestReleaseResources_StopsOnFirstError verifies that releaseResources -// returns the first error encountered and does not proceed to subsequent -// resource categories. +// TestReleaseResources_StopsOnFirstError verifies the chain's response to an +// error at the SCSI-extras unmap leg. A real failure short-circuits the chain +// (plan9/vpci skipped, slice retained for retry). Any "already-gone" error — +// the GCS bridge being closed or the VM no longer being available — is +// tolerated and the chain proceeds through the remaining categories. func TestReleaseResources_StopsOnFirstError(t *testing.T) { t.Parallel() - c, scsiCtrl, _, _, _ := newContainerTestController(t) - scsiGUID, _ := guid.NewV4() - plan9GUID, _ := guid.NewV4() - deviceGUID, _ := guid.NewV4() + cases := []struct { + name string + scsiErr error + wantStops bool + }{ + {name: "RealError_StopsChain", scsiErr: errUnmapSCSI, wantStops: true}, + {name: "BridgeClosed_Continues", scsiErr: fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)}, + {name: "ComputeSystemDoesNotExist_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrComputeSystemDoesNotExist)}, + {name: "VmcomputeAlreadyStopped_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeAlreadyStopped)}, + {name: "VmcomputeOperationInvalidState_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeOperationInvalidState)}, + {name: "AlreadyClosed_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrAlreadyClosed)}, + } - c.scsiResources = []guid.GUID{scsiGUID} - c.plan9Resources = []guid.GUID{plan9GUID} - c.devices = []guid.GUID{deviceGUID} + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + c, scsiCtrl, plan9Ctrl, vpciCtrl, guestCtrl := newContainerTestController(t) - // SCSI unmap fails; plan9/vpci should not be invoked. - scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), scsiGUID).Return(errUnmapSCSI) + scsiGUID, _ := guid.NewV4() + plan9GUID, _ := guid.NewV4() + deviceGUID, _ := guid.NewV4() - err := c.releaseResources(t.Context()) - if !errors.Is(err, errUnmapSCSI) { - t.Fatalf("releaseResources error = %v, want %v", err, errUnmapSCSI) - } + c.scsiResources = []guid.GUID{scsiGUID} + c.plan9Resources = []guid.GUID{plan9GUID} + c.devices = []guid.GUID{deviceGUID} - // Failed scsi entry is retained for retry; plan9 and devices unchanged. - if len(c.scsiResources) != 1 { - t.Errorf("scsiResources len = %d, want 1 (retained for retry)", len(c.scsiResources)) - } - if len(c.plan9Resources) != 1 { - t.Errorf("plan9Resources len = %d, want 1 (untouched)", len(c.plan9Resources)) - } - if len(c.devices) != 1 { - t.Errorf("devices len = %d, want 1 (untouched)", len(c.devices)) + scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), scsiGUID).Return(tc.scsiErr) + + if !tc.wantStops { + // Tolerated error: the chain must proceed through plan9, vpci, + // and the DeleteContainerState capability gate. + plan9Ctrl.EXPECT().UnmapFromGuest(gomock.Any(), plan9GUID).Return(nil) + vpciCtrl.EXPECT().RemoveFromVM(gomock.Any(), deviceGUID).Return(nil) + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) + } + + err := c.releaseResources(t.Context()) + if tc.wantStops { + if !errors.Is(err, tc.scsiErr) { + t.Fatalf("releaseResources error = %v, want %v", err, tc.scsiErr) + } + // Failed scsi entry retained; plan9 and devices untouched. + if len(c.scsiResources) != 1 { + t.Errorf("scsiResources len = %d, want 1 (retained for retry)", len(c.scsiResources)) + } + if len(c.plan9Resources) != 1 { + t.Errorf("plan9Resources len = %d, want 1 (untouched)", len(c.plan9Resources)) + } + if len(c.devices) != 1 { + t.Errorf("devices len = %d, want 1 (untouched)", len(c.devices)) + } + return + } + if err != nil { + t.Fatalf("releaseResources error = %v, want nil for tolerated error", err) + } + }) } } @@ -860,32 +893,47 @@ func TestReleaseResources_DeleteContainerStateFails(t *testing.T) { } } -// TestReleaseResources_DeleteContainerStateNotFoundIsSuccess verifies that a -// "compute system does not exist" error from DeleteContainerState is treated -// as success. This matters because the GCS bridge removes the container from -// its host-state map even when its inner Delete() fails, so any retry of the -// RPC will always come back with "not found" — looping forever would otherwise -// be the consequence. -func TestReleaseResources_DeleteContainerStateNotFoundIsSuccess(t *testing.T) { +// TestReleaseResources_DeleteContainerState_ToleratesAlreadyGone verifies +// that any error indicating the resource is already gone is treated as a +// successful delete: the GCS bridge being closed (so the in-guest agent +// cannot answer) and any of the HCS conditions that surface when the VM has +// already exited or is in an invalid state for further modifications. In all +// cases the chain must complete and isContainerStateDeleted must flip so a +// retry does not re-issue the RPC. +func TestReleaseResources_DeleteContainerState_ToleratesAlreadyGone(t *testing.T) { t.Parallel() - c, _, _, _, guestCtrl := newContainerTestController(t) - caps := &gcs.LCOWGuestDefinedCapabilities{} - caps.DeleteContainerStateSupported = true + cases := []struct { + name string + err error + }{ + {name: "BridgeClosed", err: fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)}, + {name: "ComputeSystemDoesNotExist", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrComputeSystemDoesNotExist)}, + {name: "VmcomputeAlreadyStopped", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrVmcomputeAlreadyStopped)}, + {name: "VmcomputeOperationInvalidState", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrVmcomputeOperationInvalidState)}, + {name: "AlreadyClosed", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrAlreadyClosed)}, + } - guestCtrl.EXPECT().Capabilities().Return(caps) - // The bridge's response Result code is wrapped through windows.Errno - // (= syscall.Errno) so errors.Is against hcs.ErrComputeSystemDoesNotExist - // matches. - guestCtrl.EXPECT(). - DeleteContainerState(gomock.Any(), c.gcsContainerID). - Return(fmt.Errorf("guest RPC failure: %w", hcs.ErrComputeSystemDoesNotExist)) + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) - if err := c.releaseResources(t.Context()); err != nil { - t.Fatalf("releaseResources returned error for not-found: %v", err) - } - if !c.isContainerStateDeleted { - t.Fatal("isContainerStateDeleted should be true even when the guest reports the container is gone") + caps := &gcs.LCOWGuestDefinedCapabilities{} + caps.DeleteContainerStateSupported = true + + guestCtrl.EXPECT().Capabilities().Return(caps) + guestCtrl.EXPECT(). + DeleteContainerState(gomock.Any(), c.gcsContainerID). + Return(tc.err) + + if err := c.releaseResources(t.Context()); err != nil { + t.Fatalf("releaseResources returned error: %v", err) + } + if !c.isContainerStateDeleted { + t.Fatal("isContainerStateDeleted should be true when the guest reports the container is gone") + } + }) } } diff --git a/internal/controller/network/network_lcow.go b/internal/controller/network/network_lcow.go index 1ec3fde0eb..4452f7539a 100644 --- a/internal/controller/network/network_lcow.go +++ b/internal/controller/network/network_lcow.go @@ -4,12 +4,15 @@ package network import ( "context" + "errors" "fmt" "github.com/Microsoft/hcsshim/hcn" + "github.com/Microsoft/hcsshim/internal/gcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" + "github.com/Microsoft/hcsshim/internal/vm/vmutils" ) // guestNetwork exposes linux guest network operations. @@ -81,7 +84,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID if err := c.guestNetwork.RemoveNetworkInterface(ctx, &guestresource.LCOWNetworkAdapter{ NamespaceID: c.namespaceID, ID: nicID, - }); err != nil { + }); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { return fmt.Errorf("remove NIC %s from guest: %w", nicID, err) } @@ -92,7 +95,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID if err := c.vmNetwork.RemoveNIC(ctx, nicID, &hcsschema.NetworkAdapter{ EndpointId: endpoint.Id, MacAddress: endpoint.MacAddress, - }); err != nil { + }); err != nil && !vmutils.IsVMNotAvailableError(err) { return fmt.Errorf("remove NIC %s from host (endpoint %s): %w", nicID, endpoint.Id, err) } diff --git a/internal/controller/network/network_lcow_test.go b/internal/controller/network/network_lcow_test.go index 819e94448b..0ae57eaf42 100644 --- a/internal/controller/network/network_lcow_test.go +++ b/internal/controller/network/network_lcow_test.go @@ -5,6 +5,7 @@ package network import ( "context" "errors" + "fmt" "testing" "go.uber.org/mock/gomock" @@ -13,6 +14,7 @@ import ( "github.com/Microsoft/hcsshim/internal/controller/network/mocks" "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guest/prot" + "github.com/Microsoft/hcsshim/internal/hcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" ) @@ -191,6 +193,30 @@ func TestLCOW_RemoveEndpoint_GuestFails_HostNotCalled(t *testing.T) { } } +// TestLCOW_RemoveEndpoint_BridgeClosed_HostStillCalled verifies that when the +// guest-side removal fails because the bridge is closed (the GCS is gone and +// its state with it), the controller still hot-removes the NIC from the host +// so cleanup completes instead of stalling on a doomed retry. +func TestLCOW_RemoveEndpoint_BridgeClosed_HostStillCalled(t *testing.T) { + ctrl := gomock.NewController(t) + c, vm, guest := newLCOWController(t, ctrl, true) + + ep := newLCOWEndpoint("eth0") + + gomock.InOrder( + guest.EXPECT().RemoveNetworkInterface(gomock.Any(), gomock.Any()). + Return(fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)), + vm.EXPECT().RemoveNIC(gomock.Any(), "nic-1", &hcsschema.NetworkAdapter{ + EndpointId: ep.Id, + MacAddress: ep.MacAddress, + }).Return(nil), + ) + + if err := c.removeEndpointFromGuestNamespace(context.Background(), "nic-1", ep); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + // TestLCOW_RemoveEndpoint_NoNamespaceSupport_HostOnly verifies that when the // guest never received the namespace, the controller skips the guest-side // removal and only hot-removes the NIC from the host. @@ -211,6 +237,45 @@ func TestLCOW_RemoveEndpoint_NoNamespaceSupport_HostOnly(t *testing.T) { } } +// TestLCOW_RemoveEndpoint_HostFails_VMGone_Tolerated verifies that when the +// host-side RemoveNIC fails because the UVM has already exited (HCS reports +// the system as gone / already stopped / invalid state / handle closed), the +// controller treats the failure as success. The NIC is destroyed alongside +// the VM, so propagating the error would only leak the cached endpoint +// mapping and block teardown — symmetric with the bridge-closed tolerance on +// the guest side. +func TestLCOW_RemoveEndpoint_HostFails_VMGone_Tolerated(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + err error + }{ + {"ComputeSystemDoesNotExist", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrComputeSystemDoesNotExist)}, + {"VmcomputeAlreadyStopped", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeAlreadyStopped)}, + {"VmcomputeOperationInvalidState", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeOperationInvalidState)}, + {"AlreadyClosed", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrAlreadyClosed)}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + c, vm, guest := newLCOWController(t, ctrl, true) + + ep := newLCOWEndpoint("eth0") + + gomock.InOrder( + guest.EXPECT().RemoveNetworkInterface(gomock.Any(), gomock.Any()).Return(nil), + vm.EXPECT().RemoveNIC(gomock.Any(), "nic-1", gomock.Any()).Return(tc.err), + ) + + if err := c.removeEndpointFromGuestNamespace(context.Background(), "nic-1", ep); err != nil { + t.Fatalf("expected VM-gone error from host RemoveNIC to be tolerated, got: %v", err) + } + }) + } +} + // ───────────────────────────────────────────────────────────────────────────── // Teardown tests (LCOW: removeNetNSInsideGuest is a no-op so full Teardown // is exercisable end-to-end without HNS) diff --git a/internal/controller/network/network_wcow.go b/internal/controller/network/network_wcow.go index c9911351d2..55a9a8ef69 100644 --- a/internal/controller/network/network_wcow.go +++ b/internal/controller/network/network_wcow.go @@ -4,13 +4,16 @@ package network import ( "context" + "errors" "fmt" "github.com/Microsoft/hcsshim/hcn" + "github.com/Microsoft/hcsshim/internal/gcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" + "github.com/Microsoft/hcsshim/internal/vm/vmutils" "github.com/sirupsen/logrus" ) @@ -55,7 +58,10 @@ func (c *Controller) removeNetNSInsideGuest(ctx context.Context, namespaceID str return fmt.Errorf("get network namespace %s: %w", namespaceID, err) } - if err := c.guestNetwork.RemoveNetworkNamespace(ctx, hcnNamespace); err != nil { + // If the GCS bridge is already closed (e.g. the guest agent crashed), the + // namespace will be torn down with the VM, so treat that as success and let + // teardown continue. + if err = c.guestNetwork.RemoveNetworkNamespace(ctx, hcnNamespace); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { return fmt.Errorf("remove network namespace %s from guest: %w", namespaceID, err) } } @@ -119,7 +125,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID nicID, guestrequest.RequestTypeRemove, nil, - ); err != nil { + ); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { return fmt.Errorf("remove NIC %s from guest (endpoint %s): %w", nicID, endpoint.Id, err) } @@ -129,7 +135,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID if err := c.vmNetwork.RemoveNIC(ctx, nicID, &hcsschema.NetworkAdapter{ EndpointId: endpoint.Id, MacAddress: endpoint.MacAddress, - }); err != nil { + }); err != nil && !vmutils.IsVMNotAvailableError(err) { return fmt.Errorf("remove NIC %s from host (endpoint %s): %w", nicID, endpoint.Id, err) } diff --git a/internal/controller/network/network_wcow_test.go b/internal/controller/network/network_wcow_test.go index a68e0a8dbe..5eb77e31e7 100644 --- a/internal/controller/network/network_wcow_test.go +++ b/internal/controller/network/network_wcow_test.go @@ -5,6 +5,7 @@ package network import ( "context" "errors" + "fmt" "testing" "go.uber.org/mock/gomock" @@ -12,6 +13,7 @@ import ( "github.com/Microsoft/hcsshim/hcn" "github.com/Microsoft/hcsshim/internal/controller/network/mocks" "github.com/Microsoft/hcsshim/internal/gcs" + "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" @@ -196,6 +198,72 @@ func TestWCOW_RemoveEndpoint_GuestFails_HostNotCalled(t *testing.T) { } } +// TestWCOW_RemoveEndpoint_BridgeClosed_HostStillCalled verifies that when the +// guest-side removal fails because the GCS bridge is closed (the guest agent +// is gone and its state with it), the controller still hot-removes the NIC +// from the host so cleanup completes instead of stalling on a doomed retry. +// Symmetric with the VM-gone tolerance on the host side. +func TestWCOW_RemoveEndpoint_BridgeClosed_HostStillCalled(t *testing.T) { + ctrl := gomock.NewController(t) + c, vm, guest := newWCOWController(t, ctrl, true) + + ep := newWCOWEndpoint("eth0") + + gomock.InOrder( + guest.EXPECT().RemoveNetworkInterface( + gomock.Any(), "nic-1", guestrequest.RequestTypeRemove, gomock.Nil(), + ).Return(fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)), + vm.EXPECT().RemoveNIC(gomock.Any(), "nic-1", &hcsschema.NetworkAdapter{ + EndpointId: ep.Id, + MacAddress: ep.MacAddress, + }).Return(nil), + ) + + if err := c.removeEndpointFromGuestNamespace(context.Background(), "nic-1", ep); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +// TestWCOW_RemoveEndpoint_HostFails_VMGone_Tolerated verifies that when the +// host-side RemoveNIC fails because the UVM has already exited (HCS reports +// the system as gone / already stopped / invalid state / handle closed), the +// controller treats the failure as success. The NIC is destroyed alongside +// the VM, so propagating the error would only leak the cached endpoint +// mapping and block teardown. +func TestWCOW_RemoveEndpoint_HostFails_VMGone_Tolerated(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + err error + }{ + {"ComputeSystemDoesNotExist", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrComputeSystemDoesNotExist)}, + {"VmcomputeAlreadyStopped", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeAlreadyStopped)}, + {"VmcomputeOperationInvalidState", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeOperationInvalidState)}, + {"AlreadyClosed", fmt.Errorf("hcs::System::Modify: %w", hcs.ErrAlreadyClosed)}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + c, vm, guest := newWCOWController(t, ctrl, true) + + ep := newWCOWEndpoint("eth0") + + gomock.InOrder( + guest.EXPECT().RemoveNetworkInterface( + gomock.Any(), "nic-1", guestrequest.RequestTypeRemove, gomock.Nil(), + ).Return(nil), + vm.EXPECT().RemoveNIC(gomock.Any(), "nic-1", gomock.Any()).Return(tc.err), + ) + + if err := c.removeEndpointFromGuestNamespace(context.Background(), "nic-1", ep); err != nil { + t.Fatalf("expected VM-gone error from host RemoveNIC to be tolerated, got: %v", err) + } + }) + } +} + // ───────────────────────────────────────────────────────────────────────────── // Teardown tests // diff --git a/internal/controller/process/process.go b/internal/controller/process/process.go index b35038762f..43ff044d33 100644 --- a/internal/controller/process/process.go +++ b/internal/controller/process/process.go @@ -15,7 +15,7 @@ import ( "github.com/Microsoft/hcsshim/internal/logfields" eventstypes "github.com/containerd/containerd/api/events" - "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/api/runtime/task/v3" "github.com/containerd/errdefs" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" diff --git a/internal/controller/vm/vm.go b/internal/controller/vm/vm.go index d623ef2d7f..68ac56dcb4 100644 --- a/internal/controller/vm/vm.go +++ b/internal/controller/vm/vm.go @@ -342,6 +342,7 @@ func (c *Controller) waitForVMExit(ctx context.Context) { // copy the original to prevent it affecting the background wait go routine ctx = context.WithoutCancel(ctx) _ = c.uvm.Wait(ctx) + // Once the VM has exited, attempt to transition to Terminated. // This may be a no-op if TerminateVM already ran concurrently and // transitioned the state first — log the discarded error so that @@ -350,6 +351,15 @@ func (c *Controller) waitForVMExit(ctx context.Context) { if c.vmState != StateTerminated { c.vmState = StateTerminated } + + // Force the bridge to collapse so in-flight container/process waits are + // released. CloseConnection is idempotent. + if c.guest != nil { + if err := c.guest.CloseConnection(); err != nil { + log.G(ctx).WithError(err).Warn("close guest connection after vm exit failed") + } + } + c.mu.Unlock() } diff --git a/internal/gcs/bridge.go b/internal/gcs/bridge.go index b03b555662..5b611f262e 100644 --- a/internal/gcs/bridge.go +++ b/internal/gcs/bridge.go @@ -60,7 +60,7 @@ type bridge struct { waitCh chan struct{} } -var errBridgeClosed = fmt.Errorf("bridge closed: %w", net.ErrClosed) +var ErrBridgeClosed = fmt.Errorf("bridge closed: %w", net.ErrClosed) const ( // bridgeFailureTimeout is the default value for bridge.Timeout @@ -103,9 +103,11 @@ func (brdg *bridge) kill(err error) { } brdg.closed = true brdg.mu.Unlock() - brdg.brdgErr = err if err != nil { - brdg.log.WithError(err).Error("bridge forcibly terminating") + // Wrap the cause so callers can recognize this as a closed-bridge + // failure and stop retrying. + brdg.brdgErr = fmt.Errorf("%w: %w", ErrBridgeClosed, err) + brdg.log.WithError(brdg.brdgErr).Error("bridge forcibly terminating") } else { brdg.log.Debug("bridge terminating") } @@ -147,7 +149,7 @@ func (brdg *bridge) AsyncRPC(ctx context.Context, proc prot.RPCProc, req request case <-brdg.waitCh: err := brdg.brdgErr if err == nil { - err = errBridgeClosed + err = ErrBridgeClosed } return nil, err case <-ctx.Done(): @@ -244,7 +246,7 @@ func (brdg *bridge) recvLoopRoutine() { brdg.rpcs = nil brdg.mu.Unlock() for _, call := range rpcs { - call.complete(errBridgeClosed) + call.complete(ErrBridgeClosed) } } @@ -422,7 +424,7 @@ func (brdg *bridge) sendRPC(buf *bytes.Buffer, enc *json.Encoder, call *rpc) err brdg.mu.Lock() if brdg.rpcs == nil { brdg.mu.Unlock() - call.complete(errBridgeClosed) + call.complete(ErrBridgeClosed) return nil } id := brdg.nextID diff --git a/internal/gcs/bridge_test.go b/internal/gcs/bridge_test.go index c1b3c1ab55..61faea9256 100644 --- a/internal/gcs/bridge_test.go +++ b/internal/gcs/bridge_test.go @@ -147,8 +147,34 @@ func TestBridgeRPCBridgeClosed(t *testing.T) { eerr := errors.New("forcibly terminated") b.kill(eerr) err := b.RPC(context.Background(), prot.RPCCreate, nil, nil, false) - if err != eerr { //nolint:errorlint - t.Fatal("unexpected: ", err) + // Must wrap both the transport error and ErrBridgeClosed. + if !errors.Is(err, eerr) { + t.Fatalf("expected err to wrap eerr; got: %v", err) + } + if !errors.Is(err, ErrBridgeClosed) { + t.Fatalf("expected err to wrap ErrBridgeClosed; got: %v", err) + } +} + +// TestBridgeKillWrapsSentinel verifies that once the bridge has been killed, +// any subsequent or in-flight operation surfaces an error recognizable as a +// closed bridge, so cleanup paths can safely give up after the guest dies +// instead of retrying forever. +func TestBridgeKillWrapsSentinel(t *testing.T) { + b := startReflectedBridge(t, 0) + transportErr := errors.New("bridge read failed: header read: EOF") + b.kill(transportErr) + + if err := b.Wait(); !errors.Is(err, ErrBridgeClosed) || !errors.Is(err, transportErr) { + t.Fatalf("Wait(): expected err to wrap both ErrBridgeClosed and transportErr; got: %v", err) + } + + err := b.RPC(context.Background(), prot.RPCCreate, &testReq{}, &testResp{}, false) + if !errors.Is(err, ErrBridgeClosed) { + t.Fatalf("post-kill RPC: expected errors.Is(err, ErrBridgeClosed); got: %v", err) + } + if !errors.Is(err, transportErr) { + t.Fatalf("post-kill RPC: expected errors.Is(err, transportErr); got: %v", err) } } diff --git a/internal/vm/vmutils/gcs_logs.go b/internal/vm/vmutils/gcs_logs.go index c680dfcd76..62ce3c8cb3 100644 --- a/internal/vm/vmutils/gcs_logs.go +++ b/internal/vm/vmutils/gcs_logs.go @@ -63,8 +63,12 @@ func ParseGCSLogrus(vmID string) OutputHandler { if err != nil { // Handle decoding errors, EOF, or disconnections // Log the error unless it's an expected EOF or network disconnect - // (WSAECONNABORTED or WSAECONNRESET indicate expected shutdown/disconnect) - if !errors.Is(err, io.EOF) && !hcs.IsAny(err, windows.WSAECONNABORTED, windows.WSAECONNRESET) { + // (WSAECONNABORTED or WSAECONNRESET indicate expected shutdown/disconnect). + // A *json.SyntaxError means GCS wrote a non-JSON line (e.g. a Go + // runtime panic dump). The trailing-bytes read below logs the + // raw payload as "gcs terminated", so skip the noisy parse error. + var syntaxErr *json.SyntaxError + if !errors.Is(err, io.EOF) && !hcs.IsAny(err, windows.WSAECONNABORTED, windows.WSAECONNRESET) && !errors.As(err, &syntaxErr) { logrus.WithFields(logrus.Fields{ logfields.UVMID: vmID, logrus.ErrorKey: err, diff --git a/internal/vm/vmutils/utils.go b/internal/vm/vmutils/utils.go index f609f975ff..e5b1a51748 100644 --- a/internal/vm/vmutils/utils.go +++ b/internal/vm/vmutils/utils.go @@ -10,6 +10,7 @@ import ( "path/filepath" runhcsoptions "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/log" "github.com/containerd/typeurl/v2" @@ -61,3 +62,13 @@ func UnmarshalRuntimeOptions(ctx context.Context, options *anypb.Any) (*runhcsop return shimOpts, nil } + +// IsVMNotAvailableError reports whether err indicates the underlying compute system is +// no longer available for modification — either it has stopped, no longer +// exists, or is in an invalid state for further modifications.. +func IsVMNotAvailableError(err error) bool { + return hcs.IsNotExist(err) || + hcs.IsAlreadyStopped(err) || + hcs.IsAlreadyClosed(err) || + hcs.IsOperationInvalidState(err) +}