From 662a317756bacc68b77c79776249c58f149645ed Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Mon, 28 Jul 2025 03:45:55 +0000 Subject: [PATCH 1/8] add healthcheck orchestration logic Signed-off-by: Arjun Raja Yogidas --- cmd/nerdctl/container/container_create.go | 4 - ...o => container_health_check_linux_test.go} | 253 +++++++++++++++--- cmd/nerdctl/container/container_run.go | 10 +- cmd/nerdctl/container/container_run_test.go | 6 + cmd/nerdctl/helpers/flagutil.go | 6 +- go.mod | 2 +- pkg/api/types/container_types.go | 13 +- pkg/cmd/container/create.go | 3 - pkg/cmd/container/health_check.go | 1 - pkg/cmd/container/kill.go | 6 + pkg/cmd/container/remove.go | 6 + pkg/cmd/container/stop.go | 4 + pkg/containerutil/containerutil.go | 28 ++ pkg/healthcheck/executor.go | 40 ++- pkg/healthcheck/health.go | 13 +- pkg/healthcheck/healthcheck_manager_darwin.go | 43 +++ .../healthcheck_manager_freebsd.go | 43 +++ pkg/healthcheck/healthcheck_manager_linux.go | 214 +++++++++++++++ .../healthcheck_manager_windows.go | 43 +++ pkg/ocihook/ocihook.go | 4 + 20 files changed, 663 insertions(+), 79 deletions(-) rename cmd/nerdctl/container/{container_health_check_test.go => container_health_check_linux_test.go} (74%) create mode 100644 pkg/healthcheck/healthcheck_manager_darwin.go create mode 100644 pkg/healthcheck/healthcheck_manager_freebsd.go create mode 100644 pkg/healthcheck/healthcheck_manager_linux.go create mode 100644 pkg/healthcheck/healthcheck_manager_windows.go diff --git a/cmd/nerdctl/container/container_create.go b/cmd/nerdctl/container/container_create.go index fc5cbdd97c8..e30d529481a 100644 --- a/cmd/nerdctl/container/container_create.go +++ b/cmd/nerdctl/container/container_create.go @@ -279,10 +279,6 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) { if err != nil { return opt, err } - opt.HealthStartInterval, err = cmd.Flags().GetDuration("health-start-interval") - if err != nil { - return opt, err - } opt.NoHealthcheck, err = 
cmd.Flags().GetBool("no-healthcheck") if err != nil { return opt, err diff --git a/cmd/nerdctl/container/container_health_check_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go similarity index 74% rename from cmd/nerdctl/container/container_health_check_test.go rename to cmd/nerdctl/container/container_health_check_linux_test.go index aa2d1603313..48c84618fcb 100644 --- a/cmd/nerdctl/container/container_health_check_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -19,7 +19,6 @@ package container import ( "encoding/json" "errors" - "fmt" "strings" "testing" "time" @@ -32,11 +31,16 @@ import ( "github.com/containerd/nerdctl/mod/tigron/tig" "github.com/containerd/nerdctl/v2/pkg/healthcheck" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" "github.com/containerd/nerdctl/v2/pkg/testutil" "github.com/containerd/nerdctl/v2/pkg/testutil/nerdtest" ) func TestContainerHealthCheckBasic(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -134,6 +138,10 @@ func TestContainerHealthCheckBasic(t *testing.T) { } func TestContainerHealthCheckAdvance(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. 
@@ -391,43 +399,6 @@ func TestContainerHealthCheckAdvance(t *testing.T) { } }, }, - { - Description: "Healthcheck emits large output repeatedly", - Setup: func(data test.Data, helpers test.Helpers) { - helpers.Ensure("run", "-d", "--name", data.Identifier(), - "--health-cmd", "yes X | head -c 60000", - "--health-interval", "1s", "--health-timeout", "2s", - testutil.CommonImage, "sleep", nerdtest.Infinity) - nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - }, - Cleanup: func(data test.Data, helpers test.Helpers) { - helpers.Anyhow("rm", "-f", data.Identifier()) - }, - Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { - for i := 0; i < 3; i++ { - helpers.Ensure("container", "healthcheck", data.Identifier()) - time.Sleep(2 * time.Second) - } - return helpers.Command("inspect", data.Identifier()) - }, - Expected: func(data test.Data, helpers test.Helpers) *test.Expected { - return &test.Expected{ - ExitCode: 0, - Output: expect.All(func(_ string, t tig.T) { - inspect := nerdtest.InspectContainer(helpers, data.Identifier()) - h := inspect.State.Health - debug, _ := json.MarshalIndent(h, "", " ") - t.Log(string(debug)) - assert.Assert(t, h != nil, "expected health state") - assert.Equal(t, h.Status, healthcheck.Healthy) - assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries") - for _, log := range h.Log { - assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output)) - } - }), - } - }, - }, { Description: "Health log in inspect keeps only the latest 5 entries", Setup: func(data test.Data, helpers test.Helpers) { @@ -602,3 +573,209 @@ func TestContainerHealthCheckAdvance(t *testing.T) { testCase.Run(t) } + +func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { + testCase := nerdtest.Setup() + testCase.Require = require.Not(nerdtest.Docker) + + testCase.SubTests = []*test.Case{ + { + Description: "Basic healthy container with systemd-triggered 
healthcheck", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "2s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + // Wait for a healthcheck to execute + time.Sleep(2 * time.Second) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + // Ensure proper cleanup of systemd units + helpers.Anyhow("stop", data.Identifier()) + time.Sleep(500 * time.Millisecond) // Allow systemd cleanup + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + assert.Assert(t, h != nil, "expected health state to be present") + assert.Equal(t, h.Status, "healthy") + assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") + }), + } + }, + }, + { + Description: "Kill stops healthcheck execution", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "1s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(2 * time.Second) // Wait for at least one health check to execute + helpers.Ensure("kill", data.Identifier()) // Kill the container + time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + // Container is already killed, just remove it + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + 
inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + assert.Assert(t, h != nil, "expected health state to be present") + assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") + + // Get container FinishedAt timestamp + containerEnd, err := time.Parse(time.RFC3339Nano, inspect.State.FinishedAt) + assert.NilError(t, err, "parsing container FinishedAt") + + // Assert all healthcheck log start times are before container finished + for _, entry := range h.Log { + assert.NilError(t, err, "parsing healthcheck Start time") + assert.Assert(t, entry.Start.Before(containerEnd), "healthcheck ran after container was killed") + } + }), + } + }, + }, + } + testCase.Run(t) +} + +func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("systemd healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() + testCase.Require = require.Not(nerdtest.Docker) + + testCase.SubTests = []*test.Case{ + { + // Tests that CreateTimer() successfully creates systemd timer units and + // RemoveTransientHealthCheckFiles() properly cleans up units when container stops. 
+ Description: "Systemd timer unit creation and cleanup", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "1s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + // Wait longer for systemd timer creation and first healthcheck execution + time.Sleep(3 * time.Second) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("inspect", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + // Get container ID and check systemd timer + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Check systemd timer + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + // Verify that a timer exists for this specific container + assert.Assert(t, strings.Contains(stdout, containerID), + "expected to find nerdctl healthcheck timer containing container ID: %s", containerID) + }, + }) + // Stop container and verify cleanup + helpers.Ensure("stop", data.Identifier()) + time.Sleep(500 * time.Millisecond) // Allow cleanup to complete + + // Check that timer is gone + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID) + + }, + }) + }), + } + 
}, + }, + { + Description: "Container restart recreates systemd timer", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo restart-test", + "--health-interval", "2s", + testutil.CommonImage, "sleep", "60") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(3 * time.Second) // Wait for initial timer creation + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + // Get container ID for verification + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Step 1: Verify timer exists initially + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to exist initially", containerID) + }, + }) + + // Step 2: Stop container + helpers.Ensure("stop", data.Identifier()) + time.Sleep(1 * time.Second) // Allow cleanup + + // Step 3: Verify timer is removed after stop + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected timer for container %s to be removed after stop", containerID) + }, + }) + + // Step 4: Restart container + helpers.Ensure("start", data.Identifier()) + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(3 * time.Second) // Wait for timer recreation + + // Step 5: Verify timer is recreated after restart - this is our final verification + return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + }, + 
Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to be recreated after restart", containerID) + }, + } + }, + }, + } + testCase.Run(t) +} diff --git a/cmd/nerdctl/container/container_run.go b/cmd/nerdctl/container/container_run.go index 67b797bdce9..70222cff71a 100644 --- a/cmd/nerdctl/container/container_run.go +++ b/cmd/nerdctl/container/container_run.go @@ -37,6 +37,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/defaults" "github.com/containerd/nerdctl/v2/pkg/errutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/logging" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -240,7 +241,6 @@ func setCreateFlags(cmd *cobra.Command) { cmd.Flags().Duration("health-timeout", 0, "Maximum time to allow one check to run (default: 30s)") cmd.Flags().Int("health-retries", 0, "Consecutive failures needed to report unhealthy (default: 3)") cmd.Flags().Duration("health-start-period", 0, "Start period for the container to initialize before starting health-retries countdown") - cmd.Flags().Duration("health-start-interval", 0, "Time between running the checks during the start period") cmd.Flags().Bool("no-healthcheck", false, "Disable any container-specified HEALTHCHECK") // #region env flags @@ -445,6 +445,14 @@ func runAction(cmd *cobra.Command, args []string) error { return err } + // Setup container healthchecks. 
+ if err := healthcheck.CreateTimer(ctx, c); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, c); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if createOpt.Detach { fmt.Fprintln(createOpt.Stdout, id) return nil diff --git a/cmd/nerdctl/container/container_run_test.go b/cmd/nerdctl/container/container_run_test.go index 64692a3ead7..e3d50940f8b 100644 --- a/cmd/nerdctl/container/container_run_test.go +++ b/cmd/nerdctl/container/container_run_test.go @@ -841,6 +841,9 @@ func TestRunDomainname(t *testing.T) { } func TestRunHealthcheckFlags(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } testCase := nerdtest.Setup() testCases := []struct { @@ -990,6 +993,9 @@ func TestRunHealthcheckFlags(t *testing.T) { } func TestRunHealthcheckFromImage(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } nerdtest.Setup() dockerfile := fmt.Sprintf(`FROM %s diff --git a/cmd/nerdctl/helpers/flagutil.go b/cmd/nerdctl/helpers/flagutil.go index 32217ae95c6..22fc1acb1bf 100644 --- a/cmd/nerdctl/helpers/flagutil.go +++ b/cmd/nerdctl/helpers/flagutil.go @@ -52,8 +52,7 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { options.HealthInterval != 0 || options.HealthTimeout != 0 || options.HealthRetries != 0 || - options.HealthStartPeriod != 0 || - options.HealthStartInterval != 0 + options.HealthStartPeriod != 0 if options.NoHealthcheck { if options.HealthCmd != "" || healthFlagsSet { @@ -74,9 +73,6 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { if options.HealthStartPeriod < 0 { return fmt.Errorf("--health-start-period cannot be negative") } - if options.HealthStartInterval < 0 { - return fmt.Errorf("--health-start-interval cannot be negative") - } return nil } diff --git a/go.mod b/go.mod 
index 8d10651f6d0..f7c0d1f1ead 100644 --- a/go.mod +++ b/go.mod @@ -120,7 +120,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect github.com/sasha-s/go-deadlock v0.3.5 // indirect //gomodjail:unconfined - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.3 github.com/smallstep/pkcs7 v0.1.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect diff --git a/pkg/api/types/container_types.go b/pkg/api/types/container_types.go index 3e157bb303d..a19fb5aea1f 100644 --- a/pkg/api/types/container_types.go +++ b/pkg/api/types/container_types.go @@ -292,13 +292,12 @@ type ContainerCreateOptions struct { ImagePullOpt ImagePullOptions // Healthcheck related fields - HealthCmd string - HealthInterval time.Duration - HealthTimeout time.Duration - HealthRetries int - HealthStartPeriod time.Duration - HealthStartInterval time.Duration - NoHealthcheck bool + HealthCmd string + HealthInterval time.Duration + HealthTimeout time.Duration + HealthRetries int + HealthStartPeriod time.Duration + NoHealthcheck bool // UserNS name for user namespace mapping of container UserNS string diff --git a/pkg/cmd/container/create.go b/pkg/cmd/container/create.go index a0c8fc2bf9b..232d8a27b77 100644 --- a/pkg/cmd/container/create.go +++ b/pkg/cmd/container/create.go @@ -891,9 +891,6 @@ func withHealthcheck(options types.ContainerCreateOptions, ensuredImage *imgutil if options.HealthStartPeriod != 0 { hc.StartPeriod = options.HealthStartPeriod } - if options.HealthStartInterval != 0 { - hc.StartInterval = options.HealthStartInterval - } // If no healthcheck config is set (via CLI or image), return empty string so we skip adding to container config. 
if reflect.DeepEqual(hc, &healthcheck.Healthcheck{}) { diff --git a/pkg/cmd/container/health_check.go b/pkg/cmd/container/health_check.go index 6fe31c8ebc3..e2646497c3f 100644 --- a/pkg/cmd/container/health_check.go +++ b/pkg/cmd/container/health_check.go @@ -59,7 +59,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta hcConfig.Interval = timeoutWithDefault(hcConfig.Interval, healthcheck.DefaultProbeInterval) hcConfig.Timeout = timeoutWithDefault(hcConfig.Timeout, healthcheck.DefaultProbeTimeout) hcConfig.StartPeriod = timeoutWithDefault(hcConfig.StartPeriod, healthcheck.DefaultStartPeriod) - hcConfig.StartInterval = timeoutWithDefault(hcConfig.StartInterval, healthcheck.DefaultStartInterval) if hcConfig.Retries == 0 { hcConfig.Retries = healthcheck.DefaultProbeRetries } diff --git a/pkg/cmd/container/kill.go b/pkg/cmd/container/kill.go index 080336d9f87..d42a7cd8c82 100644 --- a/pkg/cmd/container/kill.go +++ b/pkg/cmd/container/kill.go @@ -35,6 +35,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -111,6 +112,11 @@ func killContainer(ctx context.Context, container containerd.Container, signal s return err } + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err) + } + // signal will be sent once resume is finished if paused { if err := task.Resume(ctx); err != nil { diff --git a/pkg/cmd/container/remove.go b/pkg/cmd/container/remove.go index 28048a2f6a1..b9df2b2acaf 100644 --- a/pkg/cmd/container/remove.go +++ b/pkg/cmd/container/remove.go @@ 
-34,6 +34,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/dnsutil/hostsstore" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" @@ -179,6 +180,11 @@ func RemoveContainer(ctx context.Context, c containerd.Container, globalOptions // Otherwise, nil the error so that we do not write the error label on the container retErr = nil + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil { + log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id) + } + // Now, delete the actual container var delOpts []containerd.DeleteOpts if _, err := c.Image(ctx); err == nil { diff --git a/pkg/cmd/container/stop.go b/pkg/cmd/container/stop.go index e1f347b6b96..755686e4bd8 100644 --- a/pkg/cmd/container/stop.go +++ b/pkg/cmd/container/stop.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" ) @@ -39,6 +40,9 @@ func Stop(ctx context.Context, client *containerd.Client, reqs []string, opt typ if err := cleanupNetwork(ctx, found.Container, opt.GOptions); err != nil { return fmt.Errorf("unable to cleanup network for container: %s", found.Req) } + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, found.Container); err != nil { + return fmt.Errorf("unable to cleanup healthcheck timer for container: %s: %w", found.Req, err) + } if err := containerutil.Stop(ctx, found.Container, opt.Timeout, opt.Signal); err != nil { if errdefs.IsNotFound(err) { fmt.Fprintf(opt.Stderr, "No such container: %s\n", found.Req) diff --git 
a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index 31ded1a3b93..c07016f5b80 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -48,6 +48,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/consoleutil" "github.com/containerd/nerdctl/v2/pkg/errutil" "github.com/containerd/nerdctl/v2/pkg/formatter" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/labels/k8slabels" @@ -286,6 +287,15 @@ func Start(ctx context.Context, container containerd.Container, isAttach bool, i if err := task.Start(ctx); err != nil { return err } + + // If container has health checks configured, create and start systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if !isAttach { return nil } @@ -349,6 +359,11 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur } }() + // Clean up healthcheck units if configured. + // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + if timeout == nil { t, ok := l[labels.StopTimeout] if !ok { @@ -490,6 +505,11 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Clean up healthcheck units if configured. 
+ // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + switch status.Status { case containerd.Paused: return fmt.Errorf("container %s is already paused", id) @@ -517,6 +537,14 @@ func Unpause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Recreate healthcheck related systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + switch status.Status { case containerd.Paused: return task.Resume(ctx) diff --git a/pkg/healthcheck/executor.go b/pkg/healthcheck/executor.go index f5c4216b302..d362d988cf8 100644 --- a/pkg/healthcheck/executor.go +++ b/pkg/healthcheck/executor.go @@ -128,26 +128,42 @@ func updateHealthStatus(ctx context.Context, container containerd.Container, hcC currentHealth = &HealthState{ Status: Starting, FailingStreak: 0, + StartPeriod: hcConfig.StartPeriod > 0, } } - // Check if still within start period - startPeriod := hcConfig.StartPeriod + // Get container info for start period check info, err := container.Info(ctx) if err != nil { return fmt.Errorf("failed to get container info: %w", err) } containerCreated := info.CreatedAt - stillInStartPeriod := hcResult.Start.Sub(containerCreated) < startPeriod - - // Update health status based on exit code - if hcResult.ExitCode == 0 { - currentHealth.Status = Healthy - currentHealth.FailingStreak = 0 - } else if !stillInStartPeriod { - currentHealth.FailingStreak++ - if currentHealth.FailingStreak >= hcConfig.Retries { - currentHealth.Status = Unhealthy + + // Check if we're in start period workflow + inStartPeriodTime := hcResult.Start.Sub(containerCreated) < hcConfig.StartPeriod + 
inStartPeriodState := currentHealth.StartPeriod + + if inStartPeriodTime && inStartPeriodState { + // Start Period Workflow + if hcResult.ExitCode == 0 { + // First healthy result transitions us out of start period + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + currentHealth.StartPeriod = false + } + // Ignore unhealthy results during start period + } else { + // Health Interval Workflow + if hcResult.ExitCode == 0 { + if currentHealth.Status != Healthy { + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + } + } else { + currentHealth.FailingStreak++ + if currentHealth.FailingStreak >= hcConfig.Retries && currentHealth.Status != Unhealthy { + currentHealth.Status = Unhealthy + } } } diff --git a/pkg/healthcheck/health.go b/pkg/healthcheck/health.go index 8e0301b492a..b38072e8562 100644 --- a/pkg/healthcheck/health.go +++ b/pkg/healthcheck/health.go @@ -43,7 +43,6 @@ const ( DefaultProbeInterval = 30 * time.Second // Default interval between probe runs. Also applies before the first probe. DefaultProbeTimeout = 30 * time.Second // Max duration a single probe run may take before it's considered failed. DefaultStartPeriod = 0 * time.Second // Grace period for container startup before health checks count as failures. - DefaultStartInterval = 5 * time.Second // Interval between checks during the start period. DefaultProbeRetries = 3 // Number of consecutive failures before marking container as unhealthy. MaxLogEntries = 5 // Maximum number of health check log entries to keep. MaxOutputLenForInspect = 4096 // Max output length (in bytes) stored in health check logs during inspect. Longer outputs are truncated. 
@@ -70,18 +69,18 @@ type HealthcheckResult struct { // Healthcheck represents the health check configuration type Healthcheck struct { - Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy - Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks - Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung - Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy - StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts - StartInterval time.Duration `json:"StartInterval,omitempty"` // StartInterval is the time between health checks during the start period + Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy + Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks + Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung + Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy + StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts } // HealthState stores the current health state of a container type HealthState struct { Status HealthStatus // Status is one of [Starting], [Healthy] or [Unhealthy] FailingStreak int // FailingStreak is the number of consecutive failures + StartPeriod bool // StartPeriod indicates if we're in the start period workflow } // ToJSONString serializes HealthState to a JSON string for label storage diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go 
new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_freebsd.go b/pkg/healthcheck/healthcheck_manager_freebsd.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_freebsd.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go new file mode 100644 index 00000000000..5bfd576b126 --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -0,0 +1,214 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + "fmt" + "math/rand" + "os" + "os/exec" + "strings" + + "github.com/coreos/go-systemd/v22/dbus" + "github.com/sirupsen/logrus" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/log" + + "github.com/containerd/nerdctl/v2/pkg/defaults" + "github.com/containerd/nerdctl/v2/pkg/labels" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + containerID := container.ID() + hcName := hcUnitName(containerID, true) + logrus.Debugf("Creating healthcheck timer unit: %s", hcName) + + cmd := []string{} + if rootlessutil.IsRootless() { + cmd = append(cmd, "--user") + } + if path := os.Getenv("PATH"); path != "" { + cmd = append(cmd, "--setenv=PATH="+path) + } + + // Always use health-interval for timer frequency + cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s") + + cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID) + if logrus.IsLevelEnabled(logrus.DebugLevel) { + cmd = append(cmd, "--debug") + } + + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + logrus.Debugf("creating healthcheck timer 
with: systemd-run %s", strings.Join(cmd, " ")) + run := exec.Command("systemd-run", cmd...) + if out, err := run.CombinedOutput(); err != nil { + return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out))) + } + + return nil +} + +// StartTimer starts the healthcheck timer unit. +// TODO if we persist hcName to container state, pass that to this function. +func StartTimer(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + hcName := hcUnitName(container.ID(), true) + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + startChan := make(chan string) + unit := hcName + ".service" + if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil { + return err + } + if msg := <-startChan; msg != "done" { + return fmt.Errorf("unexpected systemd restart result: %s", msg) + } + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + return RemoveTransientHealthCheckFilesByID(ctx, container.ID()) +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. 
+func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + // Don't proceed if systemd is unavailable or disabled + if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return nil + } + + // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues + if rootlessutil.IsRootless() { + return nil + } + + logrus.Debugf("Removing healthcheck timer unit: %s", containerID) + + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + unitName := hcUnitName(containerID, true) + timer := unitName + ".timer" + service := unitName + ".service" + + // Stop timer + tChan := make(chan string) + if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil { + if msg := <-tChan; msg != "done" { + logrus.Warnf("timer stop message: %s", msg) + } + } + + // Stop service + sChan := make(chan string) + if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil { + if msg := <-sChan; msg != "done" { + logrus.Warnf("service stop message: %s", msg) + } + } + + // Reset failed units + _ = conn.ResetFailedUnitContext(context.Background(), service) + return nil +} + +// hcUnitName returns a systemd unit name for a container healthcheck. 
+func hcUnitName(containerID string, bare bool) string { + unit := containerID + if !bare { + unit += fmt.Sprintf("-%x", rand.Int()) + } + return unit +} + +func extractHealthcheck(ctx context.Context, container containerd.Container) *Healthcheck { + l, err := container.Labels(ctx) + if err != nil { + log.G(ctx).WithError(err).Debugf("could not get labels for container %s", container.ID()) + return nil + } + hcStr, ok := l[labels.HealthCheck] + if !ok || hcStr == "" { + return nil + } + hc, err := HealthCheckFromJSON(hcStr) + if err != nil { + log.G(ctx).WithError(err).Debugf("invalid healthcheck config on container %s", container.ID()) + return nil + } + return hc +} + +// shouldSkipHealthCheckSystemd determines if healthcheck timers should be skipped. +func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool { + // Don't proceed if systemd is unavailable or disabled + if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return true + } + + // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues + if rootlessutil.IsRootless() { + return true + } + + // Don't proceed if health check is nil, empty, explicitly NONE or interval is 0. + if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 { + return true + } + return false +} diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. 
+func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index 89b6c6b1410..86d4a826f7b 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -560,8 +560,11 @@ func onCreateRuntime(opts *handlerOpts) error { } func onPostStop(opts *handlerOpts) error { + log.L.Debugf("onPostStop hook triggered for container %s (namespace: %s)", opts.state.ID, opts.state.Annotations[labels.Namespace]) + lf, err := state.New(opts.state.Annotations[labels.StateDir]) if err != nil { + log.L.WithError(err).Errorf("failed to create state store for container %s", opts.state.ID) return err } @@ -585,6 +588,7 @@ func onPostStop(opts *handlerOpts) error { ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] + if opts.cni != nil { var err error b4nnEnabled, b4nnBindEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations) From 83c402f817c43ebcfb85d792dddc7f1aa4c8a9a6 Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Wed, 27 Aug 2025 18:17:58 +0000 Subject: [PATCH 2/8] Update documentation Signed-off-by: Arjun Raja Yogidas --- .../container_health_check_linux_test.go | 50 +++++++++--- docs/healthchecks.md | 79 ++++++++++++++----- go.mod | 2 +- pkg/containerutil/containerutil.go | 10 --- pkg/healthcheck/healthcheck_manager_linux.go | 27 ++----- 5 files changed, 106 insertions(+), 62 deletions(-) diff --git a/cmd/nerdctl/container/container_health_check_linux_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go index 48c84618fcb..8c71fb31c51 100644 --- a/cmd/nerdctl/container/container_health_check_linux_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -19,6 +19,7 @@ package container import ( "encoding/json" "errors" + "fmt" "strings" "testing" "time" @@ -37,9 +38,6 @@ import ( ) func TestContainerHealthCheckBasic(t *testing.T) { - if rootlessutil.IsRootless() { - 
t.Skip("healthcheck tests are skipped in rootless environment") - } testCase := nerdtest.Setup() @@ -138,10 +136,6 @@ func TestContainerHealthCheckBasic(t *testing.T) { } func TestContainerHealthCheckAdvance(t *testing.T) { - if rootlessutil.IsRootless() { - t.Skip("healthcheck tests are skipped in rootless environment") - } - testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -399,6 +393,43 @@ func TestContainerHealthCheckAdvance(t *testing.T) { } }, }, + { + Description: "Healthcheck emits large output repeatedly", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "yes X | head -c 60000", + "--health-interval", "1s", "--health-timeout", "2s", + testutil.CommonImage, "sleep", nerdtest.Infinity) + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + for i := 0; i < 3; i++ { + helpers.Ensure("container", "healthcheck", data.Identifier()) + time.Sleep(2 * time.Second) + } + return helpers.Command("inspect", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(_ string, t tig.T) { + inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + debug, _ := json.MarshalIndent(h, "", " ") + t.Log(string(debug)) + assert.Assert(t, h != nil, "expected health state") + assert.Equal(t, h.Status, healthcheck.Healthy) + assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries") + for _, log := range h.Log { + assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output)) + } + }), + } + }, + }, { Description: "Health log in inspect keeps only the 
latest 5 entries", Setup: func(data test.Data, helpers test.Helpers) { @@ -587,13 +618,10 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { "--health-interval", "2s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - // Wait for a healthcheck to execute - time.Sleep(2 * time.Second) }, Cleanup: func(data test.Data, helpers test.Helpers) { // Ensure proper cleanup of systemd units helpers.Anyhow("stop", data.Identifier()) - time.Sleep(500 * time.Millisecond) // Allow systemd cleanup helpers.Anyhow("rm", "-f", data.Identifier()) }, Expected: func(data test.Data, helpers test.Helpers) *test.Expected { @@ -617,9 +645,7 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { "--health-interval", "1s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(2 * time.Second) // Wait for at least one health check to execute helpers.Ensure("kill", data.Identifier()) // Kill the container - time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen) }, Cleanup: func(data test.Data, helpers test.Helpers) { // Container is already killed, just remove it diff --git a/docs/healthchecks.md b/docs/healthchecks.md index b47c92748b5..3bf78810b8d 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -2,25 +2,29 @@ `nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. -Currently, health checks can be triggered manually using the nerdctl container healthcheck command. Automatic orchestration (e.g., periodic checks) will be added in a future update. +## Configuration Options Health checks can be configured in multiple ways: -1. At container creation time using nerdctl run or nerdctl create with `--health-*` flags +1. 
At container creation time using `nerdctl run` or `nerdctl create` with these flags: + - `--health-cmd`: Command to run to check health + - `--health-interval`: Time between running the check (default: 30s) + - `--health-timeout`: Maximum time to allow one check to run (default: 30s) + - `--health-retries`: Consecutive failures needed to report unhealthy (default: 3) + - `--health-start-period`: Start period for the container to initialize before starting health-retries countdown + - `--no-healthcheck`: Disable any container-specified HEALTHCHECK + 2. At image build time using HEALTHCHECK in a Dockerfile -3. In docker-compose.yaml files, if using nerdctl compose -When a container is created, nerdctl determines the health check configuration based on the following priority: +**Note:** The `--health-start-interval` option is currently not supported by nerdctl. -1. **CLI flags** take highest precedence (e.g., `--health-cmd`, etc.) -2. If no CLI flags are set, nerdctl will use any health check defined in the image. -3. If neither is present, no health check will be configured +## Configuration Priority -Example: +When a container is created, nerdctl determines the health check configuration based on this priority: -```bash -nerdctl run --name web --health-cmd="curl -f http://localhost || exit 1" --health-interval=30s --health-timeout=5s --health-retries=3 nginx -``` +1. CLI flags take highest precedence (e.g., `--health-cmd`, etc.) +2. If no CLI flags are set, nerdctl will use any health check defined in the image +3. If neither is present, no health check will be configured ### Disabling Health Checks @@ -37,15 +41,54 @@ configured health check inside the container and reports the result. It serves a health checks, especially in scenarios where external scheduling is used. 
Example: - ``` nerdctl container healthcheck ``` -### Future Work (WIP) +## Automatic Health Checks with systemd + +On Linux systems with systemd, nerdctl automatically creates and manages systemd timer units to execute health checks at the configured intervals. This provides reliable scheduling and execution of health checks without requiring a persistent daemon. + +### Requirements for Automatic Health Checks + +- systemd must be available on the system +- Container must not be running in rootless mode +- Environment variable `DISABLE_HC_SYSTEMD` must not be set to "true" + +### How It Works -Since nerdctl is daemonless and does not have a persistent background process, we rely on systemd(or external schedulers) -to invoke nerdctl container healthcheck at configured intervals. This allows periodic health checks for containers in a -systemd-based environment. We are actively working on automating health checks, where we will listen to container lifecycle -events and generate appropriate systemd service and timer units. This will enable nerdctl to support automated, -Docker-compatible health checks by leveraging systemd for scheduling and lifecycle integration. \ No newline at end of file +1. When a container with health checks is created, nerdctl: + - Creates a systemd timer unit for the container + - Configures the timer according to the health check interval + - Starts monitoring the container's health status + +2. The health check status can be one of: + - `starting`: During container initialization + - `healthy`: When health checks are passing + - `unhealthy`: After specified number of consecutive failures +## Examples + +1. Basic health check that verifies a web server: +```bash +nerdctl run -d --name web \ + --health-cmd="curl -f http://localhost/ || exit 1" \ + --health-interval=5s \ + --health-retries=3 \ + nginx +``` + +2. 
Health check with initialization period: +```bash +nerdctl run -d --name app \ + --health-cmd="./health-check.sh" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + --health-start-period=60s \ + myapp +``` + +3. Disable health checks: +```bash +nerdctl run --no-healthcheck myapp +``` diff --git a/go.mod b/go.mod index f7c0d1f1ead..8d10651f6d0 100644 --- a/go.mod +++ b/go.mod @@ -120,7 +120,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect github.com/sasha-s/go-deadlock v0.3.5 // indirect //gomodjail:unconfined - github.com/sirupsen/logrus v1.9.3 + github.com/sirupsen/logrus v1.9.3 // indirect github.com/smallstep/pkcs7 v0.1.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect diff --git a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index c07016f5b80..5022962d8c1 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -359,11 +359,6 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur } }() - // Clean up healthcheck units if configured. - // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { - // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) - // } - if timeout == nil { t, ok := l[labels.StopTimeout] if !ok { @@ -505,11 +500,6 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error { return err } - // Clean up healthcheck units if configured. 
- // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { - // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) - // } - switch status.Status { case containerd.Paused: return fmt.Errorf("container %s is already paused", id) diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index 5bfd576b126..ba0a7dbbf11 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -25,7 +25,6 @@ import ( "strings" "github.com/coreos/go-systemd/v22/dbus" - "github.com/sirupsen/logrus" containerd "github.com/containerd/containerd/v2/client" "github.com/containerd/log" @@ -47,12 +46,9 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { containerID := container.ID() hcName := hcUnitName(containerID, true) - logrus.Debugf("Creating healthcheck timer unit: %s", hcName) + log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName) cmd := []string{} - if rootlessutil.IsRootless() { - cmd = append(cmd, "--user") - } if path := os.Getenv("PATH"); path != "" { cmd = append(cmd, "--setenv=PATH="+path) } @@ -61,7 +57,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s") cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID) - if logrus.IsLevelEnabled(logrus.DebugLevel) { + if log.G(ctx).Logger.IsLevelEnabled(log.DebugLevel) { cmd = append(cmd, "--debug") } @@ -71,7 +67,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { } defer conn.Close() - logrus.Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) + log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) run := exec.Command("systemd-run", cmd...) 
if out, err := run.CombinedOutput(); err != nil { return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out))) @@ -81,7 +77,6 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { } // StartTimer starts the healthcheck timer unit. -// TODO if we persist hcName to container state, pass that to this function. func StartTimer(ctx context.Context, container containerd.Container) error { hc := extractHealthcheck(ctx, container) if hc == nil { @@ -124,17 +119,7 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { - // Don't proceed if systemd is unavailable or disabled - if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { - return nil - } - - // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues - if rootlessutil.IsRootless() { - return nil - } - - logrus.Debugf("Removing healthcheck timer unit: %s", containerID) + log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID) conn, err := dbus.NewSystemConnectionContext(context.Background()) if err != nil { @@ -150,7 +135,7 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string tChan := make(chan string) if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil { if msg := <-tChan; msg != "done" { - logrus.Warnf("timer stop message: %s", msg) + log.G(ctx).Warnf("timer stop message: %s", msg) } } @@ -158,7 +143,7 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string sChan := make(chan string) if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil { if msg := <-sChan; msg != "done" { - 
logrus.Warnf("service stop message: %s", msg)
+		log.G(ctx).Warnf("service stop message: %s", msg)
 	}
 }
 

From 506c58d1f8192bdf9bc226d446d341366bd5277e Mon Sep 17 00:00:00 2001
From: Arjun Raja Yogidas
Date: Wed, 27 Aug 2025 18:17:58 +0000
Subject: [PATCH 3/8] Update documentation

Signed-off-by: Arjun Raja Yogidas
---
 docs/healthchecks.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/healthchecks.md b/docs/healthchecks.md
index 3bf78810b8d..0da55d355c5 100644
--- a/docs/healthchecks.md
+++ b/docs/healthchecks.md
@@ -1,6 +1,6 @@
 # Health Check Support in nerdctl
 
-`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command.
+`nerdctl` supports Docker-compatible health checks for containers, allowing you to monitor container health through user-defined commands.
 
 ## Configuration Options
 
@@ -31,7 +31,9 @@ When a container is created, nerdctl determines the health check configuration b
 You can disable health checks using the following flag during container create/run:
 
 ```bash
---no-healthcheck
+nerdctl run -d --name app \
+  --no-healthcheck \
+  myapp
 ```
 
 ### Running Health Checks Manually

From 58c56b00298200b7ff1392eaea103d0bf3e2713e Mon Sep 17 00:00:00 2001
From: Arjun Raja Yogidas
Date: Mon, 1 Sep 2025 06:14:59 +0000
Subject: [PATCH 4/8] Address PR comments

Signed-off-by: Arjun Raja Yogidas
---
 docs/healthchecks.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/healthchecks.md b/docs/healthchecks.md
index 0da55d355c5..933116e3941 100644
--- a/docs/healthchecks.md
+++ b/docs/healthchecks.md
@@ -1,6 +1,6 @@
 # Health Check Support in nerdctl
 
-`nerdctl` supports Docker-compatible health checks for containers, allowing you to monitor container health through user-defined commands. 
+`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. ## Configuration Options From fe80b26825e20e04fa2129b082d16b19ded4777e Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Thu, 4 Sep 2025 06:41:25 +0000 Subject: [PATCH 5/8] add non blocking systemd removal Signed-off-by: Arjun Raja Yogidas --- .../container_health_check_linux_test.go | 6 - pkg/healthcheck/healthcheck_manager_darwin.go | 10 ++ pkg/healthcheck/healthcheck_manager_linux.go | 122 +++++++++++++++++- .../healthcheck_manager_windows.go | 10 ++ 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/cmd/nerdctl/container/container_health_check_linux_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go index 8c71fb31c51..f6dc9678b66 100644 --- a/cmd/nerdctl/container/container_health_check_linux_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -695,8 +695,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { "--health-interval", "1s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - // Wait longer for systemd timer creation and first healthcheck execution - time.Sleep(3 * time.Second) }, Cleanup: func(data test.Data, helpers test.Helpers) { helpers.Anyhow("rm", "-f", data.Identifier()) @@ -724,7 +722,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { }) // Stop container and verify cleanup helpers.Ensure("stop", data.Identifier()) - time.Sleep(500 * time.Millisecond) // Allow cleanup to complete // Check that timer is gone result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") @@ -748,7 +745,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { "--health-interval", "2s", testutil.CommonImage, "sleep", "60") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(3 * time.Second) // Wait for initial timer creation }, Cleanup: 
func(data test.Data, helpers test.Helpers) { helpers.Anyhow("rm", "-f", data.Identifier()) @@ -770,7 +766,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { // Step 2: Stop container helpers.Ensure("stop", data.Identifier()) - time.Sleep(1 * time.Second) // Allow cleanup // Step 3: Verify timer is removed after stop result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") @@ -785,7 +780,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { // Step 4: Restart container helpers.Ensure("start", data.Identifier()) nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(3 * time.Second) // Wait for timer recreation // Step 5: Verify timer is recreated after restart - this is our final verification return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go index 63085d4feae..46858c4966e 100644 --- a/pkg/healthcheck/healthcheck_manager_darwin.go +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Darwin, this is a no-op since systemd is not available. 
+func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index ba0a7dbbf11..71024e11a13 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -23,6 +23,7 @@ import ( "os" "os/exec" "strings" + "time" "github.com/coreos/go-systemd/v22/dbus" @@ -110,14 +111,13 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C if hc == nil { return nil } - if shouldSkipHealthCheckSystemd(hc) { - return nil - } - return RemoveTransientHealthCheckFilesByID(ctx, container.ID()) + return ForceRemoveTransientHealthCheckFiles(ctx, container.ID()) } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID) @@ -151,6 +151,120 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string _ = conn.ResetFailedUnitContext(context.Background(), service) return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. It logs errors as warnings but continues cleanup attempts. 
+func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + log.G(ctx).Debugf("Force removing healthcheck timer unit: %s", containerID) + + // Create a timeout context for systemd operations (5 seconds default) + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + unitName := hcUnitName(containerID, true) + timer := unitName + ".timer" + service := unitName + ".service" + + // Channel to collect any critical errors (though we'll continue cleanup regardless) + errChan := make(chan error, 3) + + // Goroutine for DBUS connection and cleanup operations + go func() { + defer close(errChan) + + conn, err := dbus.NewSystemConnectionContext(timeoutCtx) + if err != nil { + log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err) + errChan <- fmt.Errorf("systemd DBUS connect error: %w", err) + return + } + defer conn.Close() + + // Stop timer with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer) + return + default: + tChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil { + select { + case msg := <-tChan: + if msg != "done" { + log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer) + } + } else { + log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err) + } + } + }() + + // Stop service with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service) + return + default: + sChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil { + select { + case msg := <-sChan: + if msg != "done" { + log.G(ctx).Warnf("service stop message during force 
cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service) + } + } else { + log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err) + } + } + }() + + // Reset failed units (best effort, non-blocking) + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service) + return + default: + if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil { + log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err) + } + } + }() + + // Wait a short time for operations to complete, but don't block indefinitely + select { + case <-time.After(3 * time.Second): + log.G(ctx).Debugf("force cleanup operations completed for container %s", containerID) + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s", containerID) + } + }() + + // Wait for the cleanup goroutine to finish or timeout + select { + case err := <-errChan: + if err != nil { + log.G(ctx).Warnf("force cleanup encountered errors but continuing: %v", err) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s, but cleanup may continue in background", containerID) + } + + // Always return nil - this function should never block the caller + // even if systemd operations fail or timeout + log.G(ctx).Debugf("force cleanup completed (non-blocking) for container %s", containerID) + return nil +} // hcUnitName returns a systemd unit name for a container healthcheck. 
func hcUnitName(containerID string, bare bool) string { diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go index 63085d4feae..d18c3bce72b 100644 --- a/pkg/healthcheck/healthcheck_manager_windows.go +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Windows, this is a no-op since systemd is not available. 
+func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +} From 2984d077e9e2b7ae757c5ea7624072f071857fa8 Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Thu, 4 Sep 2025 06:44:31 +0000 Subject: [PATCH 6/8] separate tests Signed-off-by: Arjun Raja Yogidas --- hack/test-integration.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hack/test-integration.sh b/hack/test-integration.sh index cdbeb61957f..7d54456b5a6 100755 --- a/hack/test-integration.sh +++ b/hack/test-integration.sh @@ -50,7 +50,7 @@ for arg in "$@"; do done if [ "$needsudo" == "true" ] || [ "$needsudo" == "yes" ] || [ "$needsudo" == "1" ]; then - gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -args -test.allow-kill-daemon "$@" + gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Advanced -args -test.allow-kill-daemon ./cmd/nerdctl/container/ else - gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -args -test.allow-kill-daemon "$@" + gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestContainerHealthCheckAdvance -args -test.allow-kill-daemon ./cmd/nerdctl/container/ fi From ee86e77c2056ba4f4cf7581af89b1ad4bada3b1b Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Sun, 7 Sep 2025 14:03:50 +0000 Subject: [PATCH 7/8] support rootless healthchecks Signed-off-by: Arjun Raja Yogidas --- .github/workflows/job-test-in-container.yml | 21 +++++ Dockerfile.d/test-integration-rootless.sh | 40 +++++++++ hack/test-integration.sh | 4 +- pkg/healthcheck/healthcheck_manager_linux.go | 95 ++++++++++---------- test-dbus-debug.md | 86 ++++++++++++++++++ 5 files changed, 195 insertions(+), 51 deletions(-) create mode 100644 test-dbus-debug.md diff --git a/.github/workflows/job-test-in-container.yml b/.github/workflows/job-test-in-container.yml index be742f8ab00..6b69d978028 100644 --- a/.github/workflows/job-test-in-container.yml +++ 
b/.github/workflows/job-test-in-container.yml @@ -151,6 +151,27 @@ jobs: sudo mkdir -p /etc/docker echo '{"ipv6": true, "fixed-cidr-v6": "2001:db8:1::/64", "experimental": true, "ip6tables": true}' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker + - name: "Debug: DBUS availability in container" + run: | + echo "=== DBUS Debugging in Container ===" + [ "${{ inputs.target }}" == "rootful" ] \ + && args=(test-integration) \ + || args=(test-integration-${{ inputs.target }}) + docker run -t --rm --privileged "${args[@]}" bash -c ' + echo "--- DBUS Tools Availability ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- DBUS Binaries Location ---" + ls -la /usr/bin/dbus* 2>/dev/null || echo "No DBUS tools in /usr/bin" + ls -la /bin/dbus* 2>/dev/null || echo "No DBUS tools in /bin" + echo "--- Systemd Tools ---" + which systemctl systemd-run || true + echo "--- Environment ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "--- Package Info ---" + dpkg -l | grep -E "(dbus|systemd)" || true + ' - name: "Run: integration tests" run: | . 
./hack/github/action-helpers.sh diff --git a/Dockerfile.d/test-integration-rootless.sh b/Dockerfile.d/test-integration-rootless.sh index f6e243f32b5..29127a6267a 100755 --- a/Dockerfile.d/test-integration-rootless.sh +++ b/Dockerfile.d/test-integration-rootless.sh @@ -30,10 +30,40 @@ if [[ "$(id -u)" = "0" ]]; then touch /workaround-issue-622 fi + echo "=== DBUS Debugging as ROOT ===" + echo "--- DBUS Tools Availability (root) ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- Environment (root) ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status (root) ---" + systemctl --user status 2>&1 || echo "systemctl --user failed as root" + echo "--- DBUS Launch Test (root) ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as root" + # Switch to the rootless user via SSH systemctl start ssh exec ssh -o StrictHostKeyChecking=no rootless@localhost "$0" "$@" else + echo "=== DBUS Debugging as ROOTLESS USER ===" + echo "--- DBUS Tools Availability (rootless) ---" + which dbus-launch dbus-daemon dbus-send dbus-monitor systemd-run || true + echo "--- Environment (rootless) ---" + echo "PATH: $PATH" + echo "USER: $(whoami)" + echo "UID: $(id -u)" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status (rootless) ---" + systemctl --user status 2>&1 || echo "systemctl --user failed as rootless" + echo "--- DBUS Launch Test (rootless) ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch failed as rootless" + echo "--- Systemd User Environment ---" + systemctl --user show-environment 2>&1 || echo "systemctl --user show-environment failed" + containerd-rootless-setuptool.sh install if grep -q "options use-vc" /etc/resolv.conf; then containerd-rootless-setuptool.sh 
nsenter -- sh -euc 'echo "options use-vc" >>/etc/resolv.conf' @@ -60,6 +90,16 @@ EOF systemctl --user restart stargz-snapshotter.service export IPFS_PATH="/home/rootless/.local/share/ipfs" containerd-rootless-setuptool.sh install-bypass4netnsd + + echo "=== DBUS Debugging AFTER containerd-rootless-setuptool.sh ===" + echo "--- Environment After Setup ---" + echo "DBUS_SESSION_BUS_ADDRESS: ${DBUS_SESSION_BUS_ADDRESS:-unset}" + echo "XDG_RUNTIME_DIR: ${XDG_RUNTIME_DIR:-unset}" + echo "--- Systemd Status After Setup ---" + systemctl --user status 2>&1 || echo "systemctl --user still failed after setup" + echo "--- DBUS Launch Test After Setup ---" + dbus-launch --sh-syntax 2>&1 || echo "dbus-launch still failed after setup" + # Once ssh-ed, we lost the Dockerfile working dir, so, get back in the nerdctl checkout cd /go/src/github.com/containerd/nerdctl # We also lose the PATH (and SendEnv=PATH would require sshd config changes) diff --git a/hack/test-integration.sh b/hack/test-integration.sh index 7d54456b5a6..26727987b3c 100755 --- a/hack/test-integration.sh +++ b/hack/test-integration.sh @@ -50,7 +50,7 @@ for arg in "$@"; do done if [ "$needsudo" == "true" ] || [ "$needsudo" == "yes" ] || [ "$needsudo" == "1" ]; then - gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Advanced -args -test.allow-kill-daemon ./cmd/nerdctl/container/ + gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -exec sudo -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/ else - gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestContainerHealthCheckAdvance -args -test.allow-kill-daemon ./cmd/nerdctl/container/ + gotestsum "${args[@]}" -- -timeout="$timeout" -p 1 -v -run TestHealthCheck_SystemdIntegration_Basic -args -test.allow-kill-daemon ./cmd/nerdctl/container/ fi diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index 
71024e11a13..ad6fd84af8c 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -50,6 +50,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName) cmd := []string{} + if rootlessutil.IsRootless() { + cmd = append(cmd, "--user") + cmd = append(cmd, fmt.Sprintf("--uid=%d", rootlessutil.ParentEUID())) + } + if path := os.Getenv("PATH"); path != "" { cmd = append(cmd, "--setenv=PATH="+path) } @@ -62,11 +67,11 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { cmd = append(cmd, "--debug") } - conn, err := dbus.NewSystemConnectionContext(context.Background()) - if err != nil { - return fmt.Errorf("systemd DBUS connect error: %w", err) - } - defer conn.Close() + // conn, err := dbus.NewSystemConnectionContext(context.Background()) + // if err != nil { + // return fmt.Errorf("systemd DBUS connect error: %w", err) + // } + // defer conn.Close() log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) run := exec.Command("systemd-run", cmd...) @@ -79,29 +84,53 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { // StartTimer starts the healthcheck timer unit. 
func StartTimer(ctx context.Context, container containerd.Container) error { + log.G(ctx).Infof("DEBUG: StartTimer called for container %s", container.ID()) + hc := extractHealthcheck(ctx, container) if hc == nil { + log.G(ctx).Infof("DEBUG: No healthcheck found, skipping StartTimer") return nil } if shouldSkipHealthCheckSystemd(hc) { + log.G(ctx).Infof("DEBUG: Skipping healthcheck systemd, shouldSkip=true") return nil } hcName := hcUnitName(container.ID(), true) - conn, err := dbus.NewSystemConnectionContext(context.Background()) + log.G(ctx).Infof("DEBUG: Starting timer for unit: %s, rootless=%v", hcName, rootlessutil.IsRootless()) + + var conn *dbus.Conn + var err error + if rootlessutil.IsRootless() { + log.G(ctx).Infof("DEBUG: Attempting user DBUS connection...") + conn, err = dbus.NewUserConnectionContext(ctx) + } else { + log.G(ctx).Infof("DEBUG: Attempting system DBUS connection...") + conn, err = dbus.NewSystemConnectionContext(ctx) + } if err != nil { + log.G(ctx).Errorf("DEBUG: DBUS connection failed: %v", err) return fmt.Errorf("systemd DBUS connect error: %w", err) } defer conn.Close() + log.G(ctx).Infof("DEBUG: DBUS connection successful") startChan := make(chan string) unit := hcName + ".service" + log.G(ctx).Infof("DEBUG: About to restart unit: %s", unit) + if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil { + log.G(ctx).Errorf("DEBUG: RestartUnitContext failed: %v", err) return err } + + log.G(ctx).Infof("DEBUG: Waiting for restart confirmation...") if msg := <-startChan; msg != "done" { + log.G(ctx).Errorf("DEBUG: Unexpected restart result: %s", msg) return fmt.Errorf("unexpected systemd restart result: %s", msg) } + + log.G(ctx).Infof("DEBUG: StartTimer completed successfully") return nil } @@ -115,44 +144,6 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C return ForceRemoveTransientHealthCheckFiles(ctx, container.ID()) } -// RemoveTransientHealthCheckFilesByID stops 
and cleans up the transient timer and service using just the container ID. -// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. -/* -func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { - log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID) - - conn, err := dbus.NewSystemConnectionContext(context.Background()) - if err != nil { - return fmt.Errorf("systemd DBUS connect error: %w", err) - } - defer conn.Close() - - unitName := hcUnitName(containerID, true) - timer := unitName + ".timer" - service := unitName + ".service" - - // Stop timer - tChan := make(chan string) - if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil { - if msg := <-tChan; msg != "done" { - log.G(ctx).Warnf("timer stop message: %s", msg) - } - } - - // Stop service - sChan := make(chan string) - if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil { - if msg := <-sChan; msg != "done" { - log.G(ctx).Warnf("service stop message: %s", msg) - } - } - - // Reset failed units - _ = conn.ResetFailedUnitContext(context.Background(), service) - return nil -} -*/ - // ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service // using just the container ID. This function is non-blocking and uses timeouts to prevent hanging // on systemd operations. It logs errors as warnings but continues cleanup attempts. 
@@ -174,7 +165,13 @@ func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID strin go func() { defer close(errChan) - conn, err := dbus.NewSystemConnectionContext(timeoutCtx) + var conn *dbus.Conn + var err error + if rootlessutil.IsRootless() { + conn, err = dbus.NewUserConnectionContext(ctx) + } else { + conn, err = dbus.NewSystemConnectionContext(ctx) + } if err != nil { log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err) errChan <- fmt.Errorf("systemd DBUS connect error: %w", err) @@ -300,10 +297,10 @@ func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool { return true } - // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues - if rootlessutil.IsRootless() { - return true - } + // Skip healthchecks in environments without dbus-launch to avoid permission issues + // if _, err := exec.LookPath("dbus-launch"); err != nil { + // return true + // } // Don't proceed if health check is nil, empty, explicitly NONE or interval is 0. if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 { diff --git a/test-dbus-debug.md b/test-dbus-debug.md new file mode 100644 index 00000000000..d179cd827bb --- /dev/null +++ b/test-dbus-debug.md @@ -0,0 +1,86 @@ +# DBUS Debugging Test Plan + +## What We Added + +### 1. GitHub Workflow Debugging (.github/workflows/job-test-in-container.yml) +- Added a new step "Debug: DBUS availability in container" before running integration tests +- Checks DBUS tool availability, binary locations, systemd tools, environment variables, and package info +- Runs in the same container that will execute the tests + +### 2. 
Rootless Test Script Debugging (Dockerfile.d/test-integration-rootless.sh) +- Added debugging as ROOT user before switching to rootless +- Added debugging as ROOTLESS user after SSH switch +- Added debugging AFTER containerd-rootless-setuptool.sh setup +- Checks DBUS tools, environment variables, systemd status, and DBUS connectivity + +## Expected Output + +When the CI runs, we should see: + +### Container Level (from GitHub workflow): +``` +=== DBUS Debugging in Container === +--- DBUS Tools Availability --- +/usr/bin/dbus-launch +/usr/bin/dbus-daemon +/usr/bin/dbus-send +/usr/bin/dbus-monitor +/usr/bin/systemd-run +--- DBUS Binaries Location --- +-rwxr-xr-x 1 root root ... /usr/bin/dbus-launch +-rwxr-xr-x 1 root root ... /usr/bin/dbus-daemon +... +--- Package Info --- +dbus 1.14.10-4ubuntu4.1 +systemd 255.4-1ubuntu8.4 +... +``` + +### Root User Level: +``` +=== DBUS Debugging as ROOT === +--- DBUS Tools Availability (root) --- +/usr/bin/dbus-launch +/usr/bin/systemd-run +... +--- Environment (root) --- +PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +USER: root +UID: 0 +DBUS_SESSION_BUS_ADDRESS: unset +XDG_RUNTIME_DIR: unset +``` + +### Rootless User Level: +``` +=== DBUS Debugging as ROOTLESS USER === +--- DBUS Tools Availability (rootless) --- +/usr/bin/dbus-launch +/usr/bin/systemd-run +... +--- Environment (rootless) --- +PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +USER: rootless +UID: 1000 +DBUS_SESSION_BUS_ADDRESS: unset +XDG_RUNTIME_DIR: /run/user/1000 +``` + +## What This Will Tell Us + +1. **Are DBUS tools installed?** - We'll see if dbus-launch, dbus-daemon, etc. are available +2. **Are they in PATH?** - We'll see the exact paths and verify accessibility +3. **Environment differences** - Compare root vs rootless environment setup +4. **Systemd user session status** - See if systemd --user works properly +5. **DBUS connectivity** - Test if dbus-launch can establish connections +6. 
**Setup impact** - See how containerd-rootless-setuptool.sh affects DBUS environment + +## Next Steps After Getting Debug Output + +Based on the debug output, we can: +1. **If DBUS tools are missing**: Add them to the Dockerfile +2. **If DBUS tools exist but fail**: Implement graceful fallback in healthcheck code +3. **If environment is wrong**: Fix environment setup in test script +4. **If systemd user session fails**: Implement proper session initialization + +This debugging will give us the exact information needed to solve the healthcheck timer failures in CI. From 0619a18021e12c4c131a3e9cc7352d7d2d6da2cc Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Mon, 8 Sep 2025 01:29:17 +0000 Subject: [PATCH 8/8] add more debugging Signed-off-by: Arjun Raja Yogidas --- pkg/healthcheck/healthcheck_manager_linux.go | 51 +++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index ad6fd84af8c..3593ac0d09b 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -35,13 +35,50 @@ import ( "github.com/containerd/nerdctl/v2/pkg/rootlessutil" ) +// IsDbusFunctional tests if DBUS connections actually work in the current environment. +// This is more reliable than just checking if DBUS tools are installed. 
+func IsDbusFunctional(ctx context.Context) bool { + // Create timeout context to prevent hanging on DBUS connection attempts + timeoutCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + + var conn *dbus.Conn + var err error + + if rootlessutil.IsRootless() { + // Test user DBUS connection for rootless environments + conn, err = dbus.NewUserConnectionContext(timeoutCtx) + } else { + // Test system DBUS connection for rootful environments + conn, err = dbus.NewSystemConnectionContext(timeoutCtx) + } + + if err != nil { + return false + } + defer conn.Close() + return true +} + // CreateTimer sets up the transient systemd timer and service for healthchecks. func CreateTimer(ctx context.Context, container containerd.Container) error { hc := extractHealthcheck(ctx, container) if hc == nil { + log.G(ctx).Debugf("No healthcheck configuration found for container %s", container.ID()) return nil } - if shouldSkipHealthCheckSystemd(hc) { + + if shouldSkipHealthCheckSystemd(ctx, hc) { + // Log specific reason for skipping to help with troubleshooting + if !defaults.IsSystemdAvailable() { + log.G(ctx).Infof("Skipping healthcheck timer for container %s: systemd not available", container.ID()) + } else if !IsDbusFunctional(ctx) { + log.G(ctx).Infof("Skipping healthcheck timer for container %s: DBUS connection unavailable (likely containerized environment)", container.ID()) + } else if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + log.G(ctx).Infof("Skipping healthcheck timer for container %s: disabled by DISABLE_HC_SYSTEMD", container.ID()) + } else if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 { + log.G(ctx).Debugf("Skipping healthcheck timer for container %s: invalid healthcheck configuration", container.ID()) + } return nil } @@ -91,7 +128,7 @@ func StartTimer(ctx context.Context, container containerd.Container) error { log.G(ctx).Infof("DEBUG: No healthcheck found, skipping StartTimer") return nil } - if 
shouldSkipHealthCheckSystemd(hc) { + if shouldSkipHealthCheckSystemd(ctx, hc) { log.G(ctx).Infof("DEBUG: Skipping healthcheck systemd, shouldSkip=true") return nil } @@ -291,16 +328,16 @@ func extractHealthcheck(ctx context.Context, container containerd.Container) *He } // shouldSkipHealthCheckSystemd determines if healthcheck timers should be skipped. -func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool { +func shouldSkipHealthCheckSystemd(ctx context.Context, hc *Healthcheck) bool { // Don't proceed if systemd is unavailable or disabled if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { return true } - // Skip healthchecks in environments without dbus-launch to avoid permission issues - // if _, err := exec.LookPath("dbus-launch"); err != nil { - // return true - // } + // Test actual DBUS connectivity - this is more reliable than checking for tools + if !IsDbusFunctional(ctx) { + return true + } // Don't proceed if health check is nil, empty, explicitly NONE or interval is 0. if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 {