From f3ea62f9278a44cacb7406fa6a9c4e146e1a3223 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Fri, 22 Aug 2025 23:43:48 -0700 Subject: [PATCH 01/16] feat: controller sharding --- examples/sharded-namespace/Dockerfile | 21 ++ examples/sharded-namespace/README.md | 139 +++++++++ examples/sharded-namespace/main.go | 122 ++++++++ .../manifests/kustomization.yaml | 6 + .../sharded-namespace/manifests/rbac.yaml | 56 ++++ .../manifests/sample-data.yaml | 42 +++ .../manifests/statefulset.yaml | 22 ++ pkg/controller/controller.go | 51 +-- pkg/manager/leaseguard.go | 158 ++++++++++ pkg/manager/manager.go | 231 +++++++++++++- pkg/manager/peers/registry.go | 212 +++++++++++++ pkg/manager/sharder/hrw.go | 31 ++ pkg/manager/sharder/sharder.go | 10 + pkg/manager/sharding.go | 47 +++ pkg/source/kind.go | 291 +++++++++++++++--- 15 files changed, 1365 insertions(+), 74 deletions(-) create mode 100644 examples/sharded-namespace/Dockerfile create mode 100644 examples/sharded-namespace/README.md create mode 100644 examples/sharded-namespace/main.go create mode 100644 examples/sharded-namespace/manifests/kustomization.yaml create mode 100644 examples/sharded-namespace/manifests/rbac.yaml create mode 100644 examples/sharded-namespace/manifests/sample-data.yaml create mode 100644 examples/sharded-namespace/manifests/statefulset.yaml create mode 100644 pkg/manager/leaseguard.go create mode 100644 pkg/manager/peers/registry.go create mode 100644 pkg/manager/sharder/hrw.go create mode 100644 pkg/manager/sharder/sharder.go create mode 100644 pkg/manager/sharding.go diff --git a/examples/sharded-namespace/Dockerfile b/examples/sharded-namespace/Dockerfile new file mode 100644 index 00000000..63b80f97 --- /dev/null +++ b/examples/sharded-namespace/Dockerfile @@ -0,0 +1,21 @@ +# examples/sharded-namespace/Dockerfile +# Multi-stage to keep image slim +FROM golang:1.24 as build + +WORKDIR /src +# Copy go.mod/sum from repo root so deps resolve; adjust paths if needed. +COPY go.mod go.sum ./ +RUN go mod download + +# Copy the whole repo to compile the example against local packages +COPY . . + +# Build the example +WORKDIR /src/examples/sharded-namespace +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/sharded-example main.go + +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=build /out/sharded-example /sharded-example +USER 65532:65532 +ENTRYPOINT ["/sharded-example"] diff --git a/examples/sharded-namespace/README.md b/examples/sharded-namespace/README.md new file mode 100644 index 00000000..ebab4203 --- /dev/null +++ b/examples/sharded-namespace/README.md @@ -0,0 +1,139 @@ +# Sharded Namespace Example + +This demo runs two replicas of a multicluster manager that **split ownership** of downstream "clusters" discovered by the *namespace provider* (each Kubernetes Namespace == one cluster). Ownership is decided by HRW hashing across peers, then serialized with per-cluster fencing Leases so exactly one peer reconciles a given cluster at a time. + +**Key Components:** +- **Peer membership (presence)**: `coordination.k8s.io/Lease` with prefix `mcr-peer-*` +- **Per-cluster fencing (ownership)**: `coordination.k8s.io/Lease` with prefix `mcr-shard-` + +Controllers attach per-cluster watches when ownership starts, and cleanly detach & re-attach when ownership transfers. + +## Build the image + +From repo root: + +```bash +docker build -t mcr-namespace:dev -f examples/sharded-namespace/Dockerfile . 
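+# Note: the tag must match the StatefulSet's `image: mcr-namespace:dev` field
+# (see manifests/statefulset.yaml), or update both together.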
+``` + +If using KinD: +```bash +kind create cluster --name mcr-demo +kind load docker-image mcr-namespace:dev --name mcr-demo +``` + +## Deploy + +```bash +kubectl apply -k examples/sharded-namespace/manifests +``` + +## Observe + +Tail logs from pods: + +```bash +kubectl -n mcr-demo get pods +kubectl -n mcr-demo logs statefulset/sharded-namespace -f +``` + +You should see lines like: +```bash +"ownership start {"cluster": "zoo", "peer": "sharded-namespace-0"}" +"ownership start {"cluster": "jungle", "peer": "sharded-namespace-1"}" + +Reconciling ConfigMap {"controller": "multicluster-configmaps", "controllerGroup": "", "controllerKind": "ConfigMap", "reconcileID": "4f1116b3-b5 │ +│ 4e-4e6a-b84f-670ca5cfc9ce", "cluster": "zoo", "ns": "default", "name": "elephant"} + +Reconciling ConfigMap {"controller": "multicluster-configmaps", "controllerGroup": "", "controllerKind": "ConfigMap", "reconcileID": "688b8467-f5 │ +│ d3-491b-989e-87bc8aad780e", "cluster": "jungle", "ns": "default", "name": "monkey"} +``` + +Check Leases: +```bash +# Peer membership (one per pod) +kubectl -n kube-system get lease | grep '^mcr-peer-' + +# Per-cluster fencing (one per namespace/"cluster") +kubectl -n kube-system get lease | grep '^mcr-shard-' +``` + +Who owns a given cluster? +```bash +C=zoo +kubectl -n kube-system get lease mcr-shard-$C \ + -o custom-columns=HOLDER:.spec.holderIdentity,RENEW:.spec.renewTime +``` + + +## Test Ownership + +Scale down to 1 replica and watch ownership consolidate: +```bash +# Scale down +kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=1 + + +# Watch leases lose their holders as pods terminate +watch 'kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-"' + +# Wait for all clusters to be owned by the single remaining pod (~30s) +kubectl -n kube-system wait --for=jsonpath='{.spec.holderIdentity}'=sharded-namespace-0 \ + lease/mcr-shard-zoo lease/mcr-shard-jungle lease/mcr-shard-island --timeout=60s + +``` +Create/patch a ConfigMap and confirm the single owner reconciles it: +```bash +# Pick a cluster and create a test ConfigMap +C=island +kubectl -n "$C" create cm test-$(date +%s) --from-literal=ts=$(date +%s) --dry-run=client -oyaml | kubectl apply -f - + +# Verify only pod-q reconciles it (since it owns everything now) +kubectl -n mcr-demo logs pod/sharded-namespace-0 --since=100s | grep "Reconciling ConfigMap.*$C" +``` + +Scale up to 3 replicas and watch ownership rebalance: +```bash +# Scale up +kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=3 +# Watch leases regain holders as pods start +watch 'kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-"' + +# Create a cm in the default ns which belongs to sharded-namespace-2 +C=default +kubectl -n "$C" create cm test-$(date +%s) --from-literal=ts=$(date +%s) --dry-run=client -oyaml | kubectl apply -f - +# Verify only pod-2 reconciles it (since it owns the default ns now) +kubectl -n mcr-demo logs pod/sharded-namespace-2 --since=100s | grep "Reconciling ConfigMap.*$C" +``` + +## Tuning +In your example app (e.g., examples/sharded-namespace/main.go), configure fencing and timings: + +```go +mcmanager.Configure(mgr, + // Per-cluster fencing Leases live here as mcr-shard- + mcmanager.WithShardLease("kube-system", "mcr-shard"), + mcmanager.WithPerClusterLease(true), // enabled by default + + // Optional: tune fencing timings (duration, renew, throttle): + // 
mcmanager.WithLeaseTimings(30*time.Second, 10*time.Second, 750*time.Millisecond), + + // Optional: peer weight for HRW: + // mcmanager.WithPeerWeight(1), +) +``` + +The peer registry uses mcr-peer-* automatically and derives the peer ID from the pod hostname (StatefulSet ordinal). + +## Cleanup + +```bash +kubectl delete -k examples/sharded-namespace/manifests +kind delete cluster --name mcr-demo + +``` + +## Notes + +- This example assumes the `peers` and `sharder` code we wrote is integrated and `mcmanager.Configure` hooks it up (defaults: HRW + Lease registry; hostname as peer ID). +- If your repo uses a different module path, the Dockerfile build context may need minor tweaking—otherwise it compiles against your local packages. diff --git a/examples/sharded-namespace/main.go b/examples/sharded-namespace/main.go new file mode 100644 index 00000000..4049efa9 --- /dev/null +++ b/examples/sharded-namespace/main.go @@ -0,0 +1,122 @@ +// examples/sharded-namespace/main.go +package main + +import ( + "context" + "errors" + "fmt" + "os" + + "github.com/go-logr/logr" + "golang.org/x/sync/errgroup" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + + mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" + "sigs.k8s.io/multicluster-runtime/providers/namespace" +) + +func main() { + klog.Background() // ensure klog initialized + ctrl.SetLogger(zap.New(zap.UseDevMode(true))) + log := ctrl.Log.WithName("sharded-example") + + ctx := ctrl.SetupSignalHandler() + + if err := run(ctx, log); err != nil { + log.Error(err, "exiting") + os.Exit(1) + } +} + +func run(ctx context.Context, log logr.Logger) error { + // use in-cluster config; fall back to default loading rules for local runs + cfg, err := ctrl.GetConfig() + if err != nil { + return fmt.Errorf("get kubeconfig: %w", err) + } + + // Provider: treats namespaces in the host cluster as “downstream clusters”. + host, err := cluster.New(cfg) + if err != nil { + return fmt.Errorf("create host cluster: %w", err) + } + provider := namespace.New(host) + + // Multicluster manager (no peer ID passed; pod hostname becomes peer ID). + mgr, err := mcmanager.New(cfg, provider, manager.Options{}) + if err != nil { + return fmt.Errorf("create mc manager: %w", err) + } + + // Configure sharding: + // - fencing prefix: "mcr-shard" (per-cluster Lease names become mcr-shard-) + // - peer membership still uses "mcr-peer" internally (set in WithMultiCluster) + // Peer ID defaults to os.Hostname(). + mcmanager.Configure(mgr, + mcmanager.WithShardLease("kube-system", "mcr-shard"), + // optional but explicit (your manager already defaults this to true) + mcmanager.WithPerClusterLease(true), + ) + + // A simple controller that logs ConfigMaps per owned “cluster” (namespace). + if err := mcbuilder.ControllerManagedBy(mgr). + Named("multicluster-configmaps"). + For(&corev1.ConfigMap{}). 
+ Complete(mcreconcile.Func(func(ctx context.Context, req mcreconcile.Request) (ctrl.Result, error) { + // attach cluster once; don't repeat it in Info() + l := ctrl.LoggerFrom(ctx).WithValues("cluster", req.ClusterName) + + // get the right cluster client + cl, err := mgr.GetCluster(ctx, req.ClusterName) + if err != nil { + return ctrl.Result{}, err + } + + // fetch the object, then log from the object (truth) + cm := &corev1.ConfigMap{} + if err := cl.GetClient().Get(ctx, req.Request.NamespacedName, cm); err != nil { + if apierrors.IsNotFound(err) { + // object vanished — nothing to do + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // now cm.Namespace is accurate (e.g., "zoo", "kube-system", etc.) + l.Info("Reconciling ConfigMap", + "ns", cm.GetNamespace(), + "name", cm.GetName(), + ) + + // show which peer handled it (pod hostname) + if host, _ := os.Hostname(); host != "" { + l.Info("Handled by peer", "peer", host) + } + return ctrl.Result{}, nil + })); err != nil { + return fmt.Errorf("build controller: %w", err) + } + + // Start everything + g, ctx := errgroup.WithContext(ctx) + g.Go(func() error { return ignoreCanceled(provider.Run(ctx, mgr)) }) + g.Go(func() error { return ignoreCanceled(host.Start(ctx)) }) + g.Go(func() error { return ignoreCanceled(mgr.Start(ctx)) }) + return g.Wait() +} + +func ignoreCanceled(err error) error { + if errors.Is(err, context.Canceled) { + return nil + } + return err +} diff --git a/examples/sharded-namespace/manifests/kustomization.yaml b/examples/sharded-namespace/manifests/kustomization.yaml new file mode 100644 index 00000000..5b6504be --- /dev/null +++ b/examples/sharded-namespace/manifests/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - rbac.yaml + - statefulset.yaml + - sample-data.yaml diff --git a/examples/sharded-namespace/manifests/rbac.yaml b/examples/sharded-namespace/manifests/rbac.yaml new file mode 100644 index 00000000..71572946 --- /dev/null +++ b/examples/sharded-namespace/manifests/rbac.yaml @@ -0,0 +1,56 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: mcr-demo +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: mcr-demo + namespace: mcr-demo +--- +# Allow reading/writing peer Leases in kube-system +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mcr-lease +rules: +- apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get","list","watch","create","update","patch","delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: mcr-lease +subjects: +- kind: ServiceAccount + name: mcr-demo + namespace: mcr-demo +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: mcr-lease +--- +# Allow the example to read namespaces/configmaps (namespace provider + demo controller) +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mcr-example +rules: +- apiGroups: [""] + resources: ["namespaces","configmaps"] + verbs: ["get","list","watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: mcr-example +subjects: +- kind: ServiceAccount + name: mcr-demo + namespace: mcr-demo +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: mcr-example diff --git a/examples/sharded-namespace/manifests/sample-data.yaml b/examples/sharded-namespace/manifests/sample-data.yaml new file mode 100644 index 00000000..6c542bbf --- /dev/null +++ 
b/examples/sharded-namespace/manifests/sample-data.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: zoo +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: elephant + namespace: zoo +data: { a: "1" } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: lion + namespace: zoo +data: { a: "1" } +--- +apiVersion: v1 +kind: Namespace +metadata: + name: jungle +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: monkey + namespace: jungle +data: { a: "1" } +--- +apiVersion: v1 +kind: Namespace +metadata: + name: island +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: bird + namespace: island +data: { a: "1" } diff --git a/examples/sharded-namespace/manifests/statefulset.yaml b/examples/sharded-namespace/manifests/statefulset.yaml new file mode 100644 index 00000000..09c6a3dc --- /dev/null +++ b/examples/sharded-namespace/manifests/statefulset.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: sharded-namespace + namespace: mcr-demo +spec: + serviceName: sharded-namespace + replicas: 2 + selector: + matchLabels: + app: sharded-namespace + template: + metadata: + labels: + app: sharded-namespace + spec: + serviceAccountName: mcr-demo + containers: + - name: app + image: mcr-namespace:dev # change to your image name + imagePullPolicy: IfNotPresent + # No env needed; hostname == pod name → unique peer ID automatically diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index a8ce31a4..ec412e63 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -91,7 +91,7 @@ func NewTypedUnmanaged[request mcreconcile.ClusterAware[request]](name string, m } return &mcController[request]{ TypedController: c, - clusters: make(map[string]engagedCluster), + clusters: make(map[string]*engagedCluster), }, nil } @@ -101,28 +101,41 @@ type mcController[request mcreconcile.ClusterAware[request]] struct { controller.TypedController[request] lock sync.Mutex - clusters map[string]engagedCluster + clusters map[string]*engagedCluster sources []mcsource.TypedSource[client.Object, request] } type engagedCluster struct { name string cluster cluster.Cluster + ctx context.Context + cancel context.CancelFunc } func (c *mcController[request]) Engage(ctx context.Context, name string, cl cluster.Cluster) error { c.lock.Lock() defer c.lock.Unlock() - if old, ok := c.clusters[name]; ok && old.cluster == cl { - return nil - } - - ctx, cancel := context.WithCancel(ctx) //nolint:govet // cancel is called in the error case only. 
+ // Check if we already have this cluster engaged with the SAME context + if old, ok := c.clusters[name]; ok { + // If the context is different or cancelled, we need to re-engage + select { + case <-old.ctx.Done(): + // Old context is cancelled, need to re-engage + delete(c.clusters, name) + default: + if old.cluster == cl { + // Same cluster, same active context, nothing to do + return nil + } + } + } + + engCtx, cancel := context.WithCancel(ctx) // pass through in case the controller itself is cluster aware if ctrl, ok := c.TypedController.(multicluster.Aware); ok { - if err := ctrl.Engage(ctx, name, cl); err != nil { + if err := ctrl.Engage(engCtx, name, cl); err != nil { cancel() return err } @@ -135,24 +148,28 @@ func (c *mcController[request]) Engage(ctx context.Context, name string, cl clus cancel() return fmt.Errorf("failed to engage for cluster %q: %w", name, err) } - if err := c.TypedController.Watch(startWithinContext[request](ctx, src)); err != nil { + if err := c.TypedController.Watch(startWithinContext[request](engCtx, src)); err != nil { cancel() return fmt.Errorf("failed to watch for cluster %q: %w", name, err) } } - ec := engagedCluster{ + ec := &engagedCluster{ name: name, cluster: cl, + ctx: engCtx, + cancel: cancel, } c.clusters[name] = ec - go func() { + go func(ctx context.Context, key string, token *engagedCluster) { + <-ctx.Done() c.lock.Lock() defer c.lock.Unlock() - if c.clusters[name] == ec { - delete(c.clusters, name) + if cur, ok := c.clusters[key]; ok && cur == token { + delete(c.clusters, key) } - }() + // note: cancel() is driven by parent; no need to call here + }(engCtx, name, ec) return nil //nolint:govet // cancel is called in the error case only. } @@ -161,16 +178,12 @@ func (c *mcController[request]) MultiClusterWatch(src mcsource.TypedSource[clien c.lock.Lock() defer c.lock.Unlock() - ctx, cancel := context.WithCancel(context.Background()) //nolint:govet // cancel is called in the error case only. - for name, eng := range c.clusters { src, err := src.ForCluster(name, eng.cluster) if err != nil { - cancel() return fmt.Errorf("failed to engage for cluster %q: %w", name, err) } - if err := c.TypedController.Watch(startWithinContext[request](ctx, src)); err != nil { - cancel() + if err := c.TypedController.Watch(startWithinContext[request](eng.ctx, src)); err != nil { return fmt.Errorf("failed to watch for cluster %q: %w", name, err) } } diff --git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go new file mode 100644 index 00000000..a40d5a9e --- /dev/null +++ b/pkg/manager/leaseguard.go @@ -0,0 +1,158 @@ +package manager + +import ( + "context" + "time" + + coordinationv1 "k8s.io/api/coordination/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type leaseGuard struct { + c client.Client + ns, nm string + id string + ldur time.Duration // lease duration + renew time.Duration // renew period + onLost func() // callback when we lose the lease + + held bool + cancel context.CancelFunc +} + +func newLeaseGuard(c client.Client, ns, name, id string, ldur, renew time.Duration, onLost func()) *leaseGuard { + return &leaseGuard{c: c, ns: ns, nm: name, id: id, ldur: ldur, renew: renew, onLost: onLost} +} + +// TryAcquire attempts to acquire the lease. Returns true if we own it (or already owned). 
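+// A false return means another, non-expired holder owns the Lease or an API
+// call failed; callers are expected to back off and retry on a later
+// ownership tick rather than treating the failure as fatal.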
+func (g *leaseGuard) TryAcquire(ctx context.Context) bool { + if g.held { + return true + } + + key := types.NamespacedName{Namespace: g.ns, Name: g.nm} + now := metav1.NowMicro() + + ldurSec := int32(g.ldur / time.Second) + + var ls coordinationv1.Lease + err := g.c.Get(ctx, key, &ls) + switch { + case apierrors.IsNotFound(err): + ls = coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{Namespace: g.ns, Name: g.nm}, + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &g.id, + LeaseDurationSeconds: &ldurSec, + AcquireTime: &now, + RenewTime: &now, + }, + } + if err := g.c.Create(ctx, &ls); err != nil { + return false + } + case err != nil: + return false + default: + // adopt if free/expired/ours + ho := "" + if ls.Spec.HolderIdentity != nil { + ho = *ls.Spec.HolderIdentity + } + if ho != "" && ho != g.id { + if !expired(&ls, now) { + return false + } + } + ls.Spec.HolderIdentity = &g.id + ls.Spec.LeaseDurationSeconds = &ldurSec + // keep first AcquireTime if already ours, otherwise set it + if ho != g.id || ls.Spec.AcquireTime == nil { + ls.Spec.AcquireTime = &now + } + ls.Spec.RenewTime = &now + if err := g.c.Update(ctx, &ls); err != nil { + return false + } + } + + // we own it; start renewer + rctx, cancel := context.WithCancel(context.Background()) + g.cancel = cancel + g.held = true + go g.renewLoop(rctx, key) + return true +} + +func (g *leaseGuard) renewLoop(ctx context.Context, key types.NamespacedName) { + t := time.NewTicker(g.renew) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if ok := g.renewOnce(ctx, key); !ok { + // best-effort notify once, then release + if g.onLost != nil { + g.onLost() + } + g.Release(context.Background()) + return + } + } + } +} + +func (g *leaseGuard) renewOnce(ctx context.Context, key types.NamespacedName) bool { + now := metav1.NowMicro() + var ls coordinationv1.Lease + if err := g.c.Get(ctx, key, &ls); err != nil { + return false + } + // another holder? + if ls.Spec.HolderIdentity != nil && *ls.Spec.HolderIdentity != g.id && !expired(&ls, now) { + return false + } + // update + ldurSec := int32(g.ldur / time.Second) + ls.Spec.HolderIdentity = &g.id + ls.Spec.LeaseDurationSeconds = &ldurSec + ls.Spec.RenewTime = &now + if err := g.c.Update(ctx, &ls); err != nil { + return false + } + return true +} + +// Release stops renewing; best-effort clear if we still own it. 
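+// The Lease object is left in place with an empty HolderIdentity so the next
+// owner can adopt it immediately instead of waiting for it to expire.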
+func (g *leaseGuard) Release(ctx context.Context) { + if !g.held { + return + } + if g.cancel != nil { + g.cancel() + } + g.held = false + + key := types.NamespacedName{Namespace: g.ns, Name: g.nm} + var ls coordinationv1.Lease + if err := g.c.Get(ctx, key, &ls); err == nil { + if ls.Spec.HolderIdentity != nil && *ls.Spec.HolderIdentity == g.id { + empty := "" + ls.Spec.HolderIdentity = &empty + // keep RenewTime/AcquireTime; just clear holder + _ = g.c.Update(ctx, &ls) // ignore errors + } + } +} + +func expired(ls *coordinationv1.Lease, now metav1.MicroTime) bool { + if ls.Spec.RenewTime == nil || ls.Spec.LeaseDurationSeconds == nil { + return true + } + return now.Time.After(ls.Spec.RenewTime.Time.Add(time.Duration(*ls.Spec.LeaseDurationSeconds) * time.Second)) +} diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 7e4abe1e..4a7d8a6f 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -20,6 +20,8 @@ import ( "context" "fmt" "net/http" + "sync" + "time" "github.com/go-logr/logr" @@ -33,6 +35,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" mccontext "sigs.k8s.io/multicluster-runtime/pkg/context" + "sigs.k8s.io/multicluster-runtime/pkg/manager/peers" + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" ) @@ -129,6 +133,48 @@ type mcManager struct { provider multicluster.Provider mcRunnables []multicluster.Aware + + peers peerRegistry + sharder sharder.Sharder + self sharder.PeerInfo + + engMu sync.Mutex + engaged map[string]*engagement + + shardLeaseNS, shardLeaseName string + perClusterLease bool + + leaseDuration time.Duration + leaseRenew time.Duration + fenceThrottle time.Duration + + peerLeasePrefix string + peerWeight uint32 + ownershipProbe, ownershipRehash time.Duration +} + +type engagement struct { + name string + cl cluster.Cluster + ctx context.Context + cancel context.CancelFunc + started bool + + fence *leaseGuard + nextTry time.Time +} + +func (m *mcManager) fenceName(cluster string) string { + if m.perClusterLease { + return fmt.Sprintf("%s-%s", m.shardLeaseName, cluster) + } + return m.shardLeaseName +} + +type peerRegistry interface { + Self() sharder.PeerInfo + Snapshot() []sharder.PeerInfo + Run(ctx context.Context) error } // New returns a new Manager for creating Controllers. The provider is used to @@ -144,10 +190,144 @@ func New(config *rest.Config, provider multicluster.Provider, opts manager.Optio // WithMultiCluster wraps a host manager to run multi-cluster controllers. func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Manager, error) { - return &mcManager{ + m := &mcManager{ Manager: mgr, provider: provider, - }, nil + sharder: sharder.NewHRW(), + engaged: map[string]*engagement{}, + + shardLeaseNS: "kube-system", + shardLeaseName: "mcr-shard", + + peerLeasePrefix: "mcr-peer", + perClusterLease: true, + + leaseDuration: 20 * time.Second, + leaseRenew: 10 * time.Second, + fenceThrottle: 750 * time.Millisecond, + + peerWeight: 1, + ownershipProbe: 5 * time.Second, + ownershipRehash: 15 * time.Second, + } + + m.peers = peers.NewLeaseRegistry(mgr.GetClient(), m.shardLeaseNS, m.peerLeasePrefix, "", m.peerWeight) + m.self = m.peers.Self() + + // Start ownership loop as a manager Runnable. 
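+	// The runnable publishes this peer's presence Lease, tracks the peer set and
+	// periodically recomputes which clusters this instance should own.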
+ if err := mgr.Add(m.newOwnershipRunnable()); err != nil { + return nil, err + } + return m, nil +} + +func (m *mcManager) newOwnershipRunnable() manager.Runnable { + return manager.RunnableFunc(func(ctx context.Context) error { + m.GetLogger().Info("ownership runnable starting", "peer", m.self.ID) + errCh := make(chan error, 1) + go func() { errCh <- m.peers.Run(ctx) }() + + m.recomputeOwnership(ctx) + t := time.NewTicker(m.ownershipProbe) + defer t.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case err := <-errCh: + if err != nil { + return fmt.Errorf("peer registry stopped: %w", err) + } + return nil + case <-t.C: + m.GetLogger().V(1).Info("ownership tick", "peers", len(m.peers.Snapshot())) + m.recomputeOwnership(ctx) + } + } + }) +} + +func (m *mcManager) recomputeOwnership(parent context.Context) { + peers := m.peers.Snapshot() + self := m.self + + type toStart struct { + name string + cl cluster.Cluster + ctx context.Context + } + var starts []toStart + var stops []context.CancelFunc + + now := time.Now() + + m.engMu.Lock() + for name, e := range m.engaged { + should := m.sharder.ShouldOwn(name, peers, self) + + switch { + case should && !e.started: + // ensure fence exists + if e.fence == nil { + onLost := func(cluster string) func() { + return func() { + // best-effort stop if we lose the lease mid-flight + m.GetLogger().Info("lease lost; stopping", "cluster", cluster, "peer", self.ID) + m.engMu.Lock() + if ee := m.engaged[cluster]; ee != nil && ee.started && ee.cancel != nil { + ee.cancel() + ee.started = false + } + m.engMu.Unlock() + } + }(name) + e.fence = newLeaseGuard( + m.Manager.GetClient(), + m.shardLeaseNS, m.fenceName(name), m.self.ID, + m.leaseDuration, m.leaseRenew, onLost, + ) + } + + // throttle attempts + if now.Before(e.nextTry) { + continue + } + + // try to take the fence; if we fail, set nextTry and retry on next tick + if !e.fence.TryAcquire(parent) { + e.nextTry = now.Add(m.fenceThrottle) + continue + } + + // acquired fence; start the cluster + ctx, cancel := context.WithCancel(parent) + e.ctx, e.cancel, e.started = ctx, cancel, true + starts = append(starts, toStart{name: e.name, cl: e.cl, ctx: ctx}) + m.GetLogger().Info("ownership start", "cluster", name, "peer", self.ID) + + case !should && e.started: + // stop + release fence + if e.cancel != nil { + stops = append(stops, e.cancel) + } + if e.fence != nil { + go e.fence.Release(parent) + } + e.cancel = nil + e.started = false + e.nextTry = time.Time{} // reset throttle so new owner can grab quickly + m.GetLogger().Info("ownership stop", "cluster", name, "peer", self.ID) + } + } + m.engMu.Unlock() + + for _, c := range stops { + c() + } + for _, s := range starts { + go m.startForCluster(s.ctx, s.name, s.cl) + } } // GetCluster returns a cluster for the given identifying cluster name. Get @@ -198,15 +378,52 @@ func (m *mcManager) Add(r Runnable) (err error) { // Engage gets called when the component should start operations for the given // Cluster. ctx is cancelled when the cluster is disengaged. -func (m *mcManager) Engage(ctx context.Context, name string, cl cluster.Cluster) error { - ctx, cancel := context.WithCancel(ctx) //nolint:govet // cancel is called in the error case only. 
+func (m *mcManager) Engage(parent context.Context, name string, cl cluster.Cluster) error { + m.engMu.Lock() + e, ok := m.engaged[name] + if !ok { + e = &engagement{name: name, cl: cl} + m.engaged[name] = e + + // cleanup when provider disengages the cluster + go func(pctx context.Context, key string) { + <-pctx.Done() + m.engMu.Lock() + if ee, ok := m.engaged[key]; ok { + if ee.started && ee.cancel != nil { + ee.cancel() + } + if ee.fence != nil { + // best-effort release fence so a new owner can acquire immediately + go ee.fence.Release(context.Background()) + } + delete(m.engaged, key) + } + m.engMu.Unlock() + }(parent, name) + } else { + // provider may hand a new cluster impl for the same name + e.cl = cl + } + m.engMu.Unlock() + // No starting here; recomputeOwnership() will start/stop safely. + return nil +} + +func (m *mcManager) startForCluster(ctx context.Context, name string, cl cluster.Cluster) { for _, r := range m.mcRunnables { if err := r.Engage(ctx, name, cl); err != nil { - cancel() - return fmt.Errorf("failed to engage cluster %q: %w", name, err) + m.GetLogger().Error(err, "failed to engage", "cluster", name) + // best-effort: cancel + mark stopped so next tick can retry + m.engMu.Lock() + if e := m.engaged[name]; e != nil && e.cancel != nil { + e.cancel() + e.started = false + } + m.engMu.Unlock() + return } } - return nil //nolint:govet // cancel is called in the error case only. } func (m *mcManager) GetManager(ctx context.Context, clusterName string) (manager.Manager, error) { diff --git a/pkg/manager/peers/registry.go b/pkg/manager/peers/registry.go new file mode 100644 index 00000000..911a41ff --- /dev/null +++ b/pkg/manager/peers/registry.go @@ -0,0 +1,212 @@ +package peers + +import ( + "context" + "fmt" + "os" + "strconv" + "sync" + "time" + + coordv1 "k8s.io/api/coordination/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + crclient "sigs.k8s.io/controller-runtime/pkg/client" + + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" +) + +const ( + labelPartOf = "app.kubernetes.io/part-of" + labelPeer = "mcr.sigs.k8s.io/peer" + labelPrefix = "mcr.sigs.k8s.io/prefix" + annotWeight = "mcr.sigs.k8s.io/weight" + partOfValue = "multicluster-runtime" + defaultWeight = uint32(1) +) + +// Registry abstracts peer discovery for sharding. +type Registry interface { + Self() sharder.PeerInfo + Snapshot() []sharder.PeerInfo + Run(ctx context.Context) error +} + +type leaseRegistry struct { + ns, namePrefix string + self sharder.PeerInfo + cli crclient.Client + + mu sync.RWMutex + peers map[string]sharder.PeerInfo + + ttl time.Duration + renew time.Duration +} + +// NewLeaseRegistry creates a Registry using coordination.k8s.io Leases. 
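+// Each peer maintains a single Lease named "<namePrefix>-<selfID>"; the set of
+// unexpired Leases carrying that prefix label forms the membership snapshot.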
+// ns : namespace where peer Leases live (e.g., "kube-system") +// namePrefix : common prefix for Lease names (e.g., "mcr-peer") +// selfID : this process identity (defaults to hostname if empty) +// weight : optional weight for HRW (defaults to 1 if 0) +func NewLeaseRegistry(cli crclient.Client, ns, namePrefix string, selfID string, weight uint32) Registry { + if selfID == "" { + if hn, _ := os.Hostname(); hn != "" { + selfID = hn + } else { + selfID = "unknown" + } + } + if weight == 0 { + weight = defaultWeight + } + return &leaseRegistry{ + ns: ns, + namePrefix: namePrefix, + self: sharder.PeerInfo{ID: selfID, Weight: weight}, + cli: cli, + peers: map[string]sharder.PeerInfo{}, + ttl: 30 * time.Second, + renew: 10 * time.Second, + } +} + +func (r *leaseRegistry) Self() sharder.PeerInfo { return r.self } + +func (r *leaseRegistry) Snapshot() []sharder.PeerInfo { + r.mu.RLock() + defer r.mu.RUnlock() + out := make([]sharder.PeerInfo, 0, len(r.peers)) + for _, p := range r.peers { + out = append(out, p) + } + return out +} + +func (r *leaseRegistry) Run(ctx context.Context) error { + // Tick frequently enough to renew well within ttl. + t := time.NewTicker(r.renew) + defer t.Stop() + + for { + // Do one pass immediately so we publish our presence promptly. + if err := r.renewSelfLease(ctx); err != nil && ctx.Err() == nil { + // Non-fatal; we'll try again on next tick. + } + if err := r.refreshPeers(ctx); err != nil && ctx.Err() == nil { + // Non-fatal; we'll try again on next tick. + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + // loop + } + } +} + +// renewSelfLease upserts our own Lease with fresh RenewTime and duration. +func (r *leaseRegistry) renewSelfLease(ctx context.Context) error { + now := metav1.MicroTime{Time: time.Now()} + ttlSec := int32(r.ttl / time.Second) + name := fmt.Sprintf("%s-%s", r.namePrefix, r.self.ID) + + lease := &coordv1.Lease{} + err := r.cli.Get(ctx, crclient.ObjectKey{Namespace: r.ns, Name: name}, lease) + switch { + case apierrors.IsNotFound(err): + lease = &coordv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: r.ns, + Name: name, + Labels: map[string]string{ + labelPartOf: partOfValue, + labelPeer: "true", + labelPrefix: r.namePrefix, + }, + Annotations: map[string]string{ + annotWeight: strconv.FormatUint(uint64(r.self.Weight), 10), + }, + }, + Spec: coordv1.LeaseSpec{ + HolderIdentity: strPtr(r.self.ID), + RenewTime: &now, + LeaseDurationSeconds: int32Ptr(ttlSec), + }, + } + return r.cli.Create(ctx, lease) + + case err != nil: + return err + + default: + // Update the existing Lease + lease.Spec.HolderIdentity = strPtr(r.self.ID) + lease.Spec.RenewTime = &now + lease.Spec.LeaseDurationSeconds = int32Ptr(ttlSec) + if lease.Annotations == nil { + lease.Annotations = map[string]string{} + } + lease.Annotations[annotWeight] = strconv.FormatUint(uint64(r.self.Weight), 10) + return r.cli.Update(ctx, lease) + } +} + +// refreshPeers lists peer Leases and updates the in-memory snapshot. +func (r *leaseRegistry) refreshPeers(ctx context.Context) error { + list := &coordv1.LeaseList{} + // Only list our labeled peer leases with our prefix for efficiency. 
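+	// The label selector keeps this cheap even in namespaces that also hold many
+	// unrelated Leases (for example leader-election locks in kube-system).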
+ if err := r.cli.List(ctx, list, + crclient.InNamespace(r.ns), + crclient.MatchingLabels{ + labelPeer: "true", + labelPrefix: r.namePrefix, + }, + ); err != nil { + return err + } + + now := time.Now() + next := make(map[string]sharder.PeerInfo, len(list.Items)) + + for i := range list.Items { + l := &list.Items[i] + // Basic sanity for holder identity + if l.Spec.HolderIdentity == nil || *l.Spec.HolderIdentity == "" { + continue + } + id := *l.Spec.HolderIdentity + + // Respect expiry: RenewTime + LeaseDurationSeconds + if l.Spec.RenewTime == nil || l.Spec.LeaseDurationSeconds == nil { + // If missing, treat as expired/stale. + continue + } + exp := l.Spec.RenewTime.Time.Add(time.Duration(*l.Spec.LeaseDurationSeconds) * time.Second) + if now.After(exp) { + continue // stale peer + } + + // Weight from annotation (optional) + weight := defaultWeight + if wStr := l.Annotations[annotWeight]; wStr != "" { + if w64, err := strconv.ParseUint(wStr, 10, 32); err == nil && w64 > 0 { + weight = uint32(w64) + } + } + + next[id] = sharder.PeerInfo{ID: id, Weight: weight} + } + + // Store snapshot (including ourselves; if not listed yet, ensure we're present). + next[r.self.ID] = sharder.PeerInfo{ID: r.self.ID, Weight: r.self.Weight} + + r.mu.Lock() + r.peers = next + r.mu.Unlock() + return nil +} + +func strPtr(s string) *string { return &s } +func int32Ptr(i int32) *int32 { return &i } diff --git a/pkg/manager/sharder/hrw.go b/pkg/manager/sharder/hrw.go new file mode 100644 index 00000000..74d5eae7 --- /dev/null +++ b/pkg/manager/sharder/hrw.go @@ -0,0 +1,31 @@ +package sharder + +import ( + "hash/fnv" +) + +type HRW struct{} + +func NewHRW() *HRW { return &HRW{} } + +func (h *HRW) ShouldOwn(clusterID string, peers []PeerInfo, self PeerInfo) bool { + if len(peers) == 0 { return true } + var best PeerInfo + var bestScore uint64 + for _, p := range peers { + score := hash64(clusterID + "|" + p.ID) + if p.Weight == 0 { p.Weight = 1 } + score *= uint64(p.Weight) + if score >= bestScore { + bestScore = score + best = p + } + } + return best.ID == self.ID +} + +func hash64(s string) uint64 { + h := fnv.New64a() + _, _ = h.Write([]byte(s)) + return h.Sum64() +} diff --git a/pkg/manager/sharder/sharder.go b/pkg/manager/sharder/sharder.go new file mode 100644 index 00000000..2e963981 --- /dev/null +++ b/pkg/manager/sharder/sharder.go @@ -0,0 +1,10 @@ +package sharder + +type PeerInfo struct { + ID string + Weight uint32 // optional (default 1) +} + +type Sharder interface { + ShouldOwn(clusterID string, peers []PeerInfo, self PeerInfo) bool +} diff --git a/pkg/manager/sharding.go b/pkg/manager/sharding.go new file mode 100644 index 00000000..42c7dea6 --- /dev/null +++ b/pkg/manager/sharding.go @@ -0,0 +1,47 @@ +package manager + +import ( + "time" + + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" +) + +// Option mutates mcManager after construction. +type Option func(*mcManager) + +// WithSharder replaces the default HRW sharder. +func WithSharder(s sharder.Sharder) Option { return func(m *mcManager) { m.sharder = s } } + +// WithPeerWeight allows heterogenous peers. +func WithPeerWeight(w uint32) Option { return func(m *mcManager) { m.peerWeight = w } } + +// WithShardLease configures the shard lease name/namespace (for fencing) +func WithShardLease(ns, name string) Option { + return func(m *mcManager) { m.shardLeaseNS, m.shardLeaseName = ns, name } +} + +// WithPerClusterLease enables/disables per-cluster fencing. 
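+// When disabled, every cluster is fenced by the single Lease named by
+// WithShardLease, so only the current holder of that one Lease can start any
+// clusters at a time.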
+func WithPerClusterLease(on bool) Option { return func(m *mcManager) { m.perClusterLease = on } } + +// WithOwnershipIntervals tunes loop cadences. +func WithOwnershipIntervals(probe, rehash time.Duration) Option { + return func(m *mcManager) { m.ownershipProbe = probe; m.ownershipRehash = rehash } +} + +// WithLeaseTimings configures fencing lease timings. +func WithLeaseTimings(duration, renew, throttle time.Duration) Option { + return func(m *mcManager) { + m.leaseDuration = duration + m.leaseRenew = renew + m.fenceThrottle = throttle + } +} + +// Configure applies options to a Manager if it is an *mcManager. +func Configure(m Manager, opts ...Option) { + if x, ok := m.(*mcManager); ok { + for _, o := range opts { + o(x) + } + } +} diff --git a/pkg/source/kind.go b/pkg/source/kind.go index 78f70e4e..8a2fdd8c 100644 --- a/pkg/source/kind.go +++ b/pkg/source/kind.go @@ -1,87 +1,282 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - package source import ( + "context" + "sync" + "time" + + toolscache "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + + crcache "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/controller-runtime/pkg/source" + crsource "sigs.k8s.io/controller-runtime/pkg/source" mchandler "sigs.k8s.io/multicluster-runtime/pkg/handler" mcreconcile "sigs.k8s.io/multicluster-runtime/pkg/reconcile" ) // Kind creates a KindSource with the given cache provider. -func Kind[object client.Object]( - obj object, - handler mchandler.TypedEventHandlerFunc[object, mcreconcile.Request], - predicates ...predicate.TypedPredicate[object], -) SyncingSource[object] { - return TypedKind[object, mcreconcile.Request](obj, handler, predicates...) +func Kind[O client.Object]( + obj O, + hf mchandler.TypedEventHandlerFunc[O, mcreconcile.Request], + preds ...predicate.TypedPredicate[O], +) SyncingSource[O] { + return TypedKind[O, mcreconcile.Request](obj, hf, preds...) } // TypedKind creates a KindSource with the given cache provider. 
-func TypedKind[object client.Object, request mcreconcile.ClusterAware[request]]( - obj object, - handler mchandler.TypedEventHandlerFunc[object, request], - predicates ...predicate.TypedPredicate[object], -) TypedSyncingSource[object, request] { - return &kind[object, request]{ +func TypedKind[O client.Object, R mcreconcile.ClusterAware[R]]( + obj O, + hf mchandler.TypedEventHandlerFunc[O, R], + preds ...predicate.TypedPredicate[O], +) TypedSyncingSource[O, R] { + return &kind[O, R]{ obj: obj, - handler: handler, - predicates: predicates, - project: func(_ cluster.Cluster, obj object) (object, error) { return obj, nil }, + handler: hf, + predicates: preds, + project: func(_ cluster.Cluster, o O) (O, error) { return o, nil }, + resync: 0, // no periodic resync by default } } -type kind[object client.Object, request mcreconcile.ClusterAware[request]] struct { - obj object - handler mchandler.TypedEventHandlerFunc[object, request] - predicates []predicate.TypedPredicate[object] - project func(cluster.Cluster, object) (object, error) +type kind[O client.Object, R mcreconcile.ClusterAware[R]] struct { + obj O + handler mchandler.TypedEventHandlerFunc[O, R] + predicates []predicate.TypedPredicate[O] + project func(cluster.Cluster, O) (O, error) + resync time.Duration } -type clusterKind[object client.Object, request mcreconcile.ClusterAware[request]] struct { - source.TypedSyncingSource[request] +type clusterKind[O client.Object, R mcreconcile.ClusterAware[R]] struct { + clusterName string + cl cluster.Cluster + obj O + h handler.TypedEventHandler[O, R] + preds []predicate.TypedPredicate[O] + resync time.Duration + + mu sync.Mutex + registration toolscache.ResourceEventHandlerRegistration + activeCtx context.Context } // WithProjection sets the projection function for the KindSource. -func (k *kind[object, request]) WithProjection(project func(cluster.Cluster, object) (object, error)) TypedSyncingSource[object, request] { +func (k *kind[O, R]) WithProjection(project func(cluster.Cluster, O) (O, error)) TypedSyncingSource[O, R] { k.project = project return k } -func (k *kind[object, request]) ForCluster(name string, cl cluster.Cluster) (source.TypedSource[request], error) { +func (k *kind[O, R]) ForCluster(name string, cl cluster.Cluster) (crsource.TypedSource[R], error) { obj, err := k.project(cl, k.obj) if err != nil { return nil, err } - return &clusterKind[object, request]{ - TypedSyncingSource: source.TypedKind(cl.GetCache(), obj, k.handler(name, cl), k.predicates...), + return &clusterKind[O, R]{ + clusterName: name, + cl: cl, + obj: obj, + h: k.handler(name, cl), + preds: k.predicates, + resync: k.resync, }, nil } -func (k *kind[object, request]) SyncingForCluster(name string, cl cluster.Cluster) (source.TypedSyncingSource[request], error) { - obj, err := k.project(cl, k.obj) +func (k *kind[O, R]) SyncingForCluster(name string, cl cluster.Cluster) (crsource.TypedSyncingSource[R], error) { + src, err := k.ForCluster(name, cl) if err != nil { return nil, err } - return &clusterKind[object, request]{ - TypedSyncingSource: source.TypedKind(cl.GetCache(), obj, k.handler(name, cl), k.predicates...), - }, nil + return src.(crsource.TypedSyncingSource[R]), nil +} + +// WaitForSync satisfies TypedSyncingSource. +func (ck *clusterKind[O, R]) WaitForSync(ctx context.Context) error { + if !ck.cl.GetCache().WaitForCacheSync(ctx) { + return ctx.Err() + } + return nil +} + +// Start registers a removable handler on the (scoped) informer and removes it on ctx.Done(). 
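+// This allows a cluster's watch to be attached and detached repeatedly as shard
+// ownership moves between peers, without tearing down the shared informer.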
+func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLimitingInterface[R]) error { + log := log.FromContext(ctx).WithValues("cluster", ck.clusterName, "source", "kind") + + // Check if we're already started with this context + ck.mu.Lock() + if ck.registration != nil && ck.activeCtx != nil { + // Check if the active context is still valid + select { + case <-ck.activeCtx.Done(): + // Previous context cancelled, need to clean up and re-register + log.V(1).Info("previous context cancelled, cleaning up for re-registration") + // Clean up old registration is handled below + default: + // Still active with same context - check if it's the same context + if ck.activeCtx == ctx { + ck.mu.Unlock() + log.V(1).Info("handler already registered with same context") + return nil + } + // Different context but old one still active - this shouldn't happen + log.V(1).Info("different context while old one active, will re-register") + } + } + ck.mu.Unlock() + + inf, err := ck.getInformer(ctx, ck.obj) + if err != nil { + log.Error(err, "get informer failed") + return err + } + + // If there's an old registration, remove it first + ck.mu.Lock() + if ck.registration != nil { + log.V(1).Info("removing old event handler registration") + if err := inf.RemoveEventHandler(ck.registration); err != nil { + log.Error(err, "failed to remove old event handler") + } + ck.registration = nil + ck.activeCtx = nil + } + ck.mu.Unlock() + + // predicate helpers + passCreate := func(e event.TypedCreateEvent[O]) bool { + for _, p := range ck.preds { + if !p.Create(e) { + return false + } + } + return true + } + passUpdate := func(e event.TypedUpdateEvent[O]) bool { + for _, p := range ck.preds { + if !p.Update(e) { + return false + } + } + return true + } + passDelete := func(e event.TypedDeleteEvent[O]) bool { + for _, p := range ck.preds { + if !p.Delete(e) { + return false + } + } + return true + } + + // typed event builders + makeCreate := func(o client.Object) event.TypedCreateEvent[O] { + return event.TypedCreateEvent[O]{Object: any(o).(O)} + } + makeUpdate := func(oo, no client.Object) event.TypedUpdateEvent[O] { + return event.TypedUpdateEvent[O]{ObjectOld: any(oo).(O), ObjectNew: any(no).(O)} + } + makeDelete := func(o client.Object) event.TypedDeleteEvent[O] { + return event.TypedDeleteEvent[O]{Object: any(o).(O)} + } + + // Adapter that forwards to controller handler, honoring ctx. + h := toolscache.ResourceEventHandlerFuncs{ + AddFunc: func(i interface{}) { + if ctx.Err() != nil { + return + } + if o, ok := i.(client.Object); ok { + e := makeCreate(o) + if passCreate(e) { + ck.h.Create(ctx, e, q) + } + } + }, + UpdateFunc: func(oo, no interface{}) { + if ctx.Err() != nil { + return + } + ooObj, ok1 := oo.(client.Object) + noObj, ok2 := no.(client.Object) + if ok1 && ok2 { + log.V(1).Info("kind source update", "cluster", ck.clusterName, + "name", noObj.GetName(), "ns", noObj.GetNamespace()) + e := makeUpdate(ooObj, noObj) + if passUpdate(e) { + ck.h.Update(ctx, e, q) + } + } + }, + DeleteFunc: func(i interface{}) { + if ctx.Err() != nil { + return + } + // be robust to tombstones (provider should already unwrap) + if ts, ok := i.(toolscache.DeletedFinalStateUnknown); ok { + i = ts.Obj + } + if o, ok := i.(client.Object); ok { + e := makeDelete(o) + if passDelete(e) { + ck.h.Delete(ctx, e, q) + } + } + }, + } + + // Register via removable API. 
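+	// The returned registration handle is what RemoveEventHandler needs later,
+	// so it is stored (together with the active context) under the mutex below.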
+ reg, addErr := inf.AddEventHandlerWithResyncPeriod(h, ck.resync) + if addErr != nil { + log.Error(addErr, "AddEventHandlerWithResyncPeriod failed") + return addErr + } + + // Store registration and context + ck.mu.Lock() + ck.registration = reg + ck.activeCtx = ctx + ck.mu.Unlock() + + log.V(1).Info("kind source handler registered", "hasRegistration", reg != nil) + + // Defensive: ensure cache is synced. + if !ck.cl.GetCache().WaitForCacheSync(ctx) { + ck.mu.Lock() + _ = inf.RemoveEventHandler(ck.registration) + ck.registration = nil + ck.activeCtx = nil + ck.mu.Unlock() + log.V(1).Info("cache not synced; handler removed") + return ctx.Err() + } + log.V(1).Info("kind source cache synced") + + // Wait for context cancellation in a goroutine + go func() { + <-ctx.Done() + ck.mu.Lock() + defer ck.mu.Unlock() + + // Only remove if this is still our active registration + if ck.activeCtx == ctx && ck.registration != nil { + if err := inf.RemoveEventHandler(ck.registration); err != nil { + log.Error(err, "failed to remove event handler on context cancel") + } + ck.registration = nil + ck.activeCtx = nil + log.V(1).Info("kind source handler removed due to context cancellation") + } + }() + + return nil +} + +// getInformer resolves the informer from the cluster cache (provider returns a scoped informer). +func (ck *clusterKind[O, R]) getInformer(ctx context.Context, obj client.Object) (crcache.Informer, error) { + return ck.cl.GetCache().GetInformer(ctx, obj) } From 1db63bcd17621cdb732b40779c85987c07929b73 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Sat, 23 Aug 2025 00:36:58 -0700 Subject: [PATCH 02/16] feat: controller sharding --- examples/sharded-namespace/README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/sharded-namespace/README.md b/examples/sharded-namespace/README.md index ebab4203..cb75cdc2 100644 --- a/examples/sharded-namespace/README.md +++ b/examples/sharded-namespace/README.md @@ -131,9 +131,4 @@ The peer registry uses mcr-peer-* automatically and derives the peer ID from the kubectl delete -k examples/sharded-namespace/manifests kind delete cluster --name mcr-demo -``` - -## Notes - -- This example assumes the `peers` and `sharder` code we wrote is integrated and `mcmanager.Configure` hooks it up (defaults: HRW + Lease registry; hostname as peer ID). -- If your repo uses a different module path, the Dockerfile build context may need minor tweaking—otherwise it compiles against your local packages. +``` \ No newline at end of file From ed2a440a76906f380a4a5a666ea5113a65aa706d Mon Sep 17 00:00:00 2001 From: Zach Smith <33258732+zachsmith1@users.noreply.github.com> Date: Sat, 23 Aug 2025 14:53:02 -0700 Subject: [PATCH 03/16] chore: remove misleading comment Co-authored-by: Nelo-T. 
Wallus <10514301+ntnn@users.noreply.github.com> --- pkg/manager/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 4a7d8a6f..9a1d0241 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -316,7 +316,7 @@ func (m *mcManager) recomputeOwnership(parent context.Context) { } e.cancel = nil e.started = false - e.nextTry = time.Time{} // reset throttle so new owner can grab quickly + e.nextTry = time.Time{} m.GetLogger().Info("ownership stop", "cluster", name, "peer", self.ID) } } From 0abdcad6b1b5c465b31bac380806d67aac95a8ab Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Sat, 23 Aug 2025 15:06:44 -0700 Subject: [PATCH 04/16] fix: same cluster name re-engaged --- pkg/controller/controller.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index ec412e63..7358fc7c 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -116,20 +116,18 @@ func (c *mcController[request]) Engage(ctx context.Context, name string, cl clus c.lock.Lock() defer c.lock.Unlock() - // Check if we already have this cluster engaged with the SAME context - if old, ok := c.clusters[name]; ok { - // If the context is different or cancelled, we need to re-engage - select { - case <-old.ctx.Done(): - // Old context is cancelled, need to re-engage - delete(c.clusters, name) - default: - if old.cluster == cl { - // Same cluster, same active context, nothing to do - return nil - } - } - } + // Check if we already have this cluster engaged with the SAME context + if old, ok := c.clusters[name]; ok { + if old.cluster == cl && old.ctx.Err() == nil { + // Same impl, engagement still live → nothing to do + return nil + } + // Re-engage: either old ctx is done, or impl changed. Stop the old one if still live. + if old.ctx.Err() == nil { + old.cancel() + } + delete(c.clusters, name) + } engCtx, cancel := context.WithCancel(ctx) From 1493d97173737dd147fded4c0994d34b5644af7f Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Sat, 23 Aug 2025 17:13:15 -0700 Subject: [PATCH 05/16] fix: cleanup and some fixes for engage --- pkg/manager/leaseguard.go | 49 ++++- pkg/manager/leaseguard_test.go | 173 ++++++++++++++++ pkg/manager/manager.go | 247 ++--------------------- pkg/manager/ownership.go | 357 +++++++++++++++++++++++++++++++++ pkg/manager/peers/registry.go | 28 ++- pkg/manager/sharder/hrw.go | 59 +++++- pkg/manager/sharder/sharder.go | 35 ++++ pkg/manager/sharding.go | 74 +++++-- pkg/source/kind.go | 97 +++++---- 9 files changed, 825 insertions(+), 294 deletions(-) create mode 100644 pkg/manager/leaseguard_test.go create mode 100644 pkg/manager/ownership.go diff --git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go index a40d5a9e..478fffb1 100644 --- a/pkg/manager/leaseguard.go +++ b/pkg/manager/leaseguard.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package manager import ( @@ -11,6 +27,32 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// leaseGuard manages a single Lease as a *fence* for one shard/cluster. +// +// Design/semantics: +// +// - TryAcquire(ctx) attempts to create/adopt the Lease for holder `id`, +// and, on success, starts a background renew loop. It is idempotent: +// calling it while already held is a cheap true. +// +// - Renewing: a ticker renews the Lease every `renew`. If renewal fails +// (API error, conflict, or we observe another holder while still valid), +// `onLost` is invoked once (best-effort), the fence is released, and the +// loop exits. +// +// - Release(ctx) stops renewing and, if we still own the Lease, clears +// HolderIdentity (best-effort). Safe to call multiple times. +// +// - Thread-safety: leaseGuard is intended to be used by a single goroutine +// (caller) per fence. External synchronization is required if multiple +// goroutines might call its methods concurrently. +// +// - Timings: choose ldur (duration) > renew. A common pattern is +// renew ≈ ldur/3. Too small ldur increases churn; too large slows +// failover. +// +// RBAC: the caller’s client must be allowed to get/list/watch/create/update/patch +// Leases in namespace `ns`. type leaseGuard struct { c client.Client ns, nm string @@ -23,11 +65,14 @@ type leaseGuard struct { cancel context.CancelFunc } +// newLeaseGuard builds a guard; it does not contact the API server. func newLeaseGuard(c client.Client, ns, name, id string, ldur, renew time.Duration, onLost func()) *leaseGuard { return &leaseGuard{c: c, ns: ns, nm: name, id: id, ldur: ldur, renew: renew, onLost: onLost} } -// TryAcquire attempts to acquire the lease. Returns true if we own it (or already owned). +// TryAcquire creates/adopts the Lease for g.id and starts renewing it. +// Returns true iff we own it after this call (or already owned). +// Fails (returns false) when another non-expired holder exists or API calls error. func (g *leaseGuard) TryAcquire(ctx context.Context) bool { if g.held { return true @@ -87,6 +132,8 @@ func (g *leaseGuard) TryAcquire(ctx context.Context) bool { return true } +// Internal: renew loop and single-step renew. If renewal observes a different, +// valid holder or API errors persist, onLost() is invoked once and the fence is released. 
func (g *leaseGuard) renewLoop(ctx context.Context, key types.NamespacedName) { t := time.NewTicker(g.renew) defer t.Stop() diff --git a/pkg/manager/leaseguard_test.go b/pkg/manager/leaseguard_test.go new file mode 100644 index 00000000..8dc6f157 --- /dev/null +++ b/pkg/manager/leaseguard_test.go @@ -0,0 +1,173 @@ +package manager + +import ( + "context" + "testing" + "time" + + coordinationv1 "k8s.io/api/coordination/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + crclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// test constants +const ( + testNS = "kube-system" + testName = "mcr-shard-default" + testID = "peer-0" + otherID = "peer-1" +) + +func newScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := coordinationv1.AddToScheme(s); err != nil { + t.Fatalf("add scheme: %v", err) + } + return s +} + +func getLease(t *testing.T, c crclient.Client) *coordinationv1.Lease { + t.Helper() + var ls coordinationv1.Lease + if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNS, Name: testName}, &ls); err != nil { + t.Fatalf("get lease: %v", err) + } + return &ls +} + +func makeLease(holder string, renewAgo time.Duration, durSec int32) *coordinationv1.Lease { + now := time.Now() + renew := metav1.NewMicroTime(now.Add(-renewAgo)) + return &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{Namespace: testNS, Name: testName}, + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &holder, + LeaseDurationSeconds: &durSec, + RenewTime: &renew, + AcquireTime: &renew, + }, + } +} + +func TestTryAcquire_CreatesAndRenewsThenReleaseClearsHolder(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + + onLostCh := make(chan struct{}, 1) + g := newLeaseGuard(c, testNS, testName, testID, 3*time.Second, 200*time.Millisecond, func() { onLostCh <- struct{}{} }) + + if ok := g.TryAcquire(context.Background()); !ok { + t.Fatalf("expected TryAcquire to succeed on create") + } + if !g.held { + t.Fatalf("guard should be held after TryAcquire") + } + + ls := getLease(t, c) + if ls.Spec.HolderIdentity == nil || *ls.Spec.HolderIdentity != testID { + t.Fatalf("holder mismatch, got %v", ls.Spec.HolderIdentity) + } + if ls.Spec.LeaseDurationSeconds == nil || *ls.Spec.LeaseDurationSeconds != int32(3) { + t.Fatalf("duration mismatch, got %v", ls.Spec.LeaseDurationSeconds) + } + + // Idempotent + if ok := g.TryAcquire(context.Background()); !ok { + t.Fatalf("expected idempotent TryAcquire to return true") + } + + // Release clears holder (best-effort) + g.Release(context.Background()) + ls = getLease(t, c) + if ls.Spec.HolderIdentity != nil && *ls.Spec.HolderIdentity != "" { + t.Fatalf("expected holder cleared on release, got %q", *ls.Spec.HolderIdentity) + } +} + +func TestTryAcquire_FailsWhenOtherHoldsAndNotExpired(t *testing.T) { + s := newScheme(t) + // Other holder renewed recently, not expired + dur := int32(30) + ls := makeLease(otherID, 1*time.Second, dur) + c := fake.NewClientBuilder().WithScheme(s).WithObjects(ls).Build() + + g := newLeaseGuard(c, testNS, testName, testID, 10*time.Second, 2*time.Second, nil) + + if ok := g.TryAcquire(context.Background()); ok { + t.Fatalf("expected TryAcquire to fail while another valid holder exists") + } + if g.held { + t.Fatalf("guard should not be held") + } +} + +func TestTryAcquire_AdoptsWhenExpired(t *testing.T) { + s := newScheme(t) + // Other holder 
expired: renew time far in the past relative to duration + dur := int32(5) + ls := makeLease(otherID, 30*time.Second, dur) + c := fake.NewClientBuilder().WithScheme(s).WithObjects(ls).Build() + + g := newLeaseGuard(c, testNS, testName, testID, 10*time.Second, 2*time.Second, nil) + + if ok := g.TryAcquire(context.Background()); !ok { + t.Fatalf("expected TryAcquire to adopt expired lease") + } + got := getLease(t, c) + if got.Spec.HolderIdentity == nil || *got.Spec.HolderIdentity != testID { + t.Fatalf("expected holder=%q, got %v", testID, got.Spec.HolderIdentity) + } +} + +func TestRenewLoop_OnLostTriggersCallbackAndReleases(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + + lost := make(chan struct{}, 1) + g := newLeaseGuard(c, testNS, testName, testID, 3*time.Second, 50*time.Millisecond, func() { lost <- struct{}{} }) + + // Acquire + if ok := g.TryAcquire(context.Background()); !ok { + t.Fatalf("expected TryAcquire to succeed") + } + + // Flip lease to another holder (valid) so renewOnce observes loss. + ls := getLease(t, c) + now := metav1.NowMicro() + dur := int32(3) + other := otherID + ls.Spec.HolderIdentity = &other + ls.Spec.LeaseDurationSeconds = &dur + ls.Spec.RenewTime = &now + if err := c.Update(context.Background(), ls); err != nil { + t.Fatalf("update lease: %v", err) + } + + // Expect onLost to fire and guard to release within a few renew ticks + select { + case <-lost: + // ok + case <-time.After(2 * time.Second): + t.Fatalf("expected onLost to be called after renewal detects loss") + } + + // Give a moment for Release() to run + time.Sleep(50 * time.Millisecond) + if g.held { + t.Fatalf("guard should not be held after loss") + } +} + +func TestRelease_NoHoldIsNoop(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + + g := newLeaseGuard(c, testNS, testName, testID, 3*time.Second, 1*time.Second, nil) + // Not acquired yet; should be a no-op + g.Release(context.Background()) + // Nothing to assert other than "does not panic" +} diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 9a1d0241..3869bd73 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "net/http" - "sync" "time" "github.com/go-logr/logr" @@ -131,50 +130,7 @@ var _ Manager = &mcManager{} type mcManager struct { manager.Manager provider multicluster.Provider - - mcRunnables []multicluster.Aware - - peers peerRegistry - sharder sharder.Sharder - self sharder.PeerInfo - - engMu sync.Mutex - engaged map[string]*engagement - - shardLeaseNS, shardLeaseName string - perClusterLease bool - - leaseDuration time.Duration - leaseRenew time.Duration - fenceThrottle time.Duration - - peerLeasePrefix string - peerWeight uint32 - ownershipProbe, ownershipRehash time.Duration -} - -type engagement struct { - name string - cl cluster.Cluster - ctx context.Context - cancel context.CancelFunc - started bool - - fence *leaseGuard - nextTry time.Time -} - -func (m *mcManager) fenceName(cluster string) string { - if m.perClusterLease { - return fmt.Sprintf("%s-%s", m.shardLeaseName, cluster) - } - return m.shardLeaseName -} - -type peerRegistry interface { - Self() sharder.PeerInfo - Snapshot() []sharder.PeerInfo - Run(ctx context.Context) error + engine *ownershipEngine } // New returns a new Manager for creating Controllers. 
The provider is used to @@ -190,146 +146,29 @@ func New(config *rest.Config, provider multicluster.Provider, opts manager.Optio // WithMultiCluster wraps a host manager to run multi-cluster controllers. func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Manager, error) { - m := &mcManager{ - Manager: mgr, - provider: provider, - sharder: sharder.NewHRW(), - engaged: map[string]*engagement{}, - - shardLeaseNS: "kube-system", - shardLeaseName: "mcr-shard", + cfg := OwnershipConfig{ + FenceNS: "kube-system", FencePrefix: "mcr-shard", PerClusterLease: true, + LeaseDuration: 20 * time.Second, LeaseRenew: 10 * time.Second, FenceThrottle: 750 * time.Millisecond, + PeerPrefix: "mcr-peer", PeerWeight: 1, Probe: 5 * time.Second, Rehash: 15 * time.Second, + } - peerLeasePrefix: "mcr-peer", - perClusterLease: true, + pr := peers.NewLeaseRegistry(mgr.GetClient(), cfg.FenceNS, cfg.PeerPrefix, "", cfg.PeerWeight) + self := pr.Self() - leaseDuration: 20 * time.Second, - leaseRenew: 10 * time.Second, - fenceThrottle: 750 * time.Millisecond, + eng := newOwnershipEngine( + mgr.GetClient(), mgr.GetLogger(), + sharder.NewHRW(), pr, self, cfg, + ) - peerWeight: 1, - ownershipProbe: 5 * time.Second, - ownershipRehash: 15 * time.Second, - } - - m.peers = peers.NewLeaseRegistry(mgr.GetClient(), m.shardLeaseNS, m.peerLeasePrefix, "", m.peerWeight) - m.self = m.peers.Self() + m := &mcManager{Manager: mgr, provider: provider, engine: eng} // Start ownership loop as a manager Runnable. - if err := mgr.Add(m.newOwnershipRunnable()); err != nil { + if err := mgr.Add(eng.Runnable()); err != nil { return nil, err } return m, nil } -func (m *mcManager) newOwnershipRunnable() manager.Runnable { - return manager.RunnableFunc(func(ctx context.Context) error { - m.GetLogger().Info("ownership runnable starting", "peer", m.self.ID) - errCh := make(chan error, 1) - go func() { errCh <- m.peers.Run(ctx) }() - - m.recomputeOwnership(ctx) - t := time.NewTicker(m.ownershipProbe) - defer t.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case err := <-errCh: - if err != nil { - return fmt.Errorf("peer registry stopped: %w", err) - } - return nil - case <-t.C: - m.GetLogger().V(1).Info("ownership tick", "peers", len(m.peers.Snapshot())) - m.recomputeOwnership(ctx) - } - } - }) -} - -func (m *mcManager) recomputeOwnership(parent context.Context) { - peers := m.peers.Snapshot() - self := m.self - - type toStart struct { - name string - cl cluster.Cluster - ctx context.Context - } - var starts []toStart - var stops []context.CancelFunc - - now := time.Now() - - m.engMu.Lock() - for name, e := range m.engaged { - should := m.sharder.ShouldOwn(name, peers, self) - - switch { - case should && !e.started: - // ensure fence exists - if e.fence == nil { - onLost := func(cluster string) func() { - return func() { - // best-effort stop if we lose the lease mid-flight - m.GetLogger().Info("lease lost; stopping", "cluster", cluster, "peer", self.ID) - m.engMu.Lock() - if ee := m.engaged[cluster]; ee != nil && ee.started && ee.cancel != nil { - ee.cancel() - ee.started = false - } - m.engMu.Unlock() - } - }(name) - e.fence = newLeaseGuard( - m.Manager.GetClient(), - m.shardLeaseNS, m.fenceName(name), m.self.ID, - m.leaseDuration, m.leaseRenew, onLost, - ) - } - - // throttle attempts - if now.Before(e.nextTry) { - continue - } - - // try to take the fence; if we fail, set nextTry and retry on next tick - if !e.fence.TryAcquire(parent) { - e.nextTry = now.Add(m.fenceThrottle) - continue - } - - // 
acquired fence; start the cluster - ctx, cancel := context.WithCancel(parent) - e.ctx, e.cancel, e.started = ctx, cancel, true - starts = append(starts, toStart{name: e.name, cl: e.cl, ctx: ctx}) - m.GetLogger().Info("ownership start", "cluster", name, "peer", self.ID) - - case !should && e.started: - // stop + release fence - if e.cancel != nil { - stops = append(stops, e.cancel) - } - if e.fence != nil { - go e.fence.Release(parent) - } - e.cancel = nil - e.started = false - e.nextTry = time.Time{} - m.GetLogger().Info("ownership stop", "cluster", name, "peer", self.ID) - } - } - m.engMu.Unlock() - - for _, c := range stops { - c() - } - for _, s := range starts { - go m.startForCluster(s.ctx, s.name, s.cl) - } -} - // GetCluster returns a cluster for the given identifying cluster name. Get // returns an existing cluster if it has been created before. // If no cluster is known to the provider under the given cluster name, @@ -365,65 +204,15 @@ func (m *mcManager) GetProvider() multicluster.Provider { // Add will set requested dependencies on the component, and cause the component to be // started when Start is called. -func (m *mcManager) Add(r Runnable) (err error) { - m.mcRunnables = append(m.mcRunnables, r) - defer func() { - if err != nil { - m.mcRunnables = m.mcRunnables[:len(m.mcRunnables)-1] - } - }() - +func (m *mcManager) Add(r Runnable) error { + m.engine.AddRunnable(r) return m.Manager.Add(r) } // Engage gets called when the component should start operations for the given // Cluster. ctx is cancelled when the cluster is disengaged. -func (m *mcManager) Engage(parent context.Context, name string, cl cluster.Cluster) error { - m.engMu.Lock() - e, ok := m.engaged[name] - if !ok { - e = &engagement{name: name, cl: cl} - m.engaged[name] = e - - // cleanup when provider disengages the cluster - go func(pctx context.Context, key string) { - <-pctx.Done() - m.engMu.Lock() - if ee, ok := m.engaged[key]; ok { - if ee.started && ee.cancel != nil { - ee.cancel() - } - if ee.fence != nil { - // best-effort release fence so a new owner can acquire immediately - go ee.fence.Release(context.Background()) - } - delete(m.engaged, key) - } - m.engMu.Unlock() - }(parent, name) - } else { - // provider may hand a new cluster impl for the same name - e.cl = cl - } - m.engMu.Unlock() - // No starting here; recomputeOwnership() will start/stop safely. - return nil -} - -func (m *mcManager) startForCluster(ctx context.Context, name string, cl cluster.Cluster) { - for _, r := range m.mcRunnables { - if err := r.Engage(ctx, name, cl); err != nil { - m.GetLogger().Error(err, "failed to engage", "cluster", name) - // best-effort: cancel + mark stopped so next tick can retry - m.engMu.Lock() - if e := m.engaged[name]; e != nil && e.cancel != nil { - e.cancel() - e.started = false - } - m.engMu.Unlock() - return - } - } +func (m *mcManager) Engage(ctx context.Context, name string, cl cluster.Cluster) error { + return m.engine.Engage(ctx, name, cl) } func (m *mcManager) GetManager(ctx context.Context, clusterName string) (manager.Manager, error) { diff --git a/pkg/manager/ownership.go b/pkg/manager/ownership.go new file mode 100644 index 00000000..f21b9d5f --- /dev/null +++ b/pkg/manager/ownership.go @@ -0,0 +1,357 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/manager" + + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" +) + +// OwnershipConfig holds the knobs for shard ownership and fencing. +// +// Fencing: +// - FenceNS/FencePrefix: namespace and name prefix for per-shard Lease objects +// (e.g. mcr-shard- when PerClusterLease is true). +// - PerClusterLease: when true, use one Lease per cluster (recommended). When false, +// use a single shared fence name for all clusters. +// - LeaseDuration / LeaseRenew: total TTL and renew cadence. Choose renew < duration +// (commonly renew ≈ duration/3). +// - FenceThrottle: backoff before retrying failed fence acquisition, to reduce API churn. +// +// Peers (membership): +// - PeerPrefix: prefix used by the peer registry for membership Leases. +// - PeerWeight: relative capacity hint used by the sharder (0 treated as 1). +// +// Cadence: +// - Probe: periodic ownership tick interval (decision loop). +// - Rehash: optional slower cadence for planned redistribution (unused in basic HRW). +type OwnershipConfig struct { + // FenceNS is the namespace where fence Leases live (usually "kube-system"). + FenceNS string + // FencePrefix is the base Lease name for fences; with PerClusterLease it becomes + // FencePrefix+"-"+ (e.g., "mcr-shard-zoo"). + FencePrefix string + // PerClusterLease controls whether we create one fence per cluster (true) or a + // single shared fence name (false). Per-cluster is recommended. + PerClusterLease bool + // LeaseDuration is the total TTL written to the Lease (spec.LeaseDurationSeconds). + LeaseDuration time.Duration + // LeaseRenew is the renewal cadence; must be < LeaseDuration (rule of thumb ≈ 1/3). + LeaseRenew time.Duration + // FenceThrottle backs off repeated failed TryAcquire attempts to reduce API churn. + FenceThrottle time.Duration + + // PeerPrefix is the Lease name prefix used by the peer registry for membership. + PeerPrefix string + // PeerWeight is this peer’s capacity hint for HRW (0 treated as 1). + PeerWeight uint32 + + // Probe is the ownership decision loop interval. + Probe time.Duration + // Rehash is an optional slower cadence for planned redistribution (currently unused). + Rehash time.Duration +} + +// ownershipEngine makes ownership decisions and starts/stops per-cluster work. +// +// It combines: +// - a peerRegistry (live peer snapshot), +// - a sharder (e.g., HRW) to decide "who should own", +// - a per-cluster leaseGuard to fence "who actually runs". +// +// The engine keeps per-cluster engagement state, ties watches/workers to an +// engagement context, and uses the fence to guarantee single-writer semantics. +type ownershipEngine struct { + // kube is the host cluster client used for Leases and provider operations. + kube client.Client + // log is the engine’s logger. + log logr.Logger + // sharder decides “shouldOwn” given clusterID, peers, and self (e.g., HRW). 
+ sharder sharder.Sharder + // peers provides a live membership snapshot for sharding decisions. + peers peerRegistry + // self is this process’s identity/weight as known by the peer registry. + self sharder.PeerInfo + // cfg holds all ownership/fencing configuration. + cfg OwnershipConfig + + // mu guards engaged and runnables. + mu sync.Mutex + // engaged tracks per-cluster engagement state keyed by cluster name. + engaged map[string]*engagement + // runnables are the multicluster components to (de)start per cluster. + runnables []multicluster.Aware +} + +// engagement tracks per-cluster lifecycle within the engine. +// +// ctx/cancel: engagement context; cancellation stops sources/workers. +// started: whether runnables have been engaged for this cluster. +// fence: the per-cluster Lease guard (nil until first start attempt). +// nextTry: throttle timestamp for fence acquisition retries. +type engagement struct { + // name is the cluster identifier (e.g., namespace). + name string + // cl is the controller-runtime cluster handle for this engagement. + cl cluster.Cluster + // ctx is the engagement context; cancelling it stops sources/work. + ctx context.Context + // cancel cancels ctx. + cancel context.CancelFunc + // started is true after runnables have been engaged for this cluster. + started bool + + // fence is the per-cluster Lease guard (nil until first start attempt). + fence *leaseGuard + // nextTry defers the next TryAcquire attempt until this time (throttling). + nextTry time.Time +} + +// peerRegistry abstracts peer discovery/membership for sharding decisions. +// Self returns this process identity; Snapshot returns the current peer set; +// Run keeps the registry fresh until ctx is cancelled. +type peerRegistry interface { + // Self returns this process’s identity and weight as known by the registry. + Self() sharder.PeerInfo + // Snapshot returns the current set of known peers. The slice order is arbitrary + // and the returned data should be treated as read-only by callers. + Snapshot() []sharder.PeerInfo + // Run keeps membership fresh until ctx is cancelled or an error occurs. + // It should block, periodically updating internal state for Self/Snapshot. + Run(ctx context.Context) error +} + +// newOwnershipEngine wires an engine with its dependencies and initial config. +func newOwnershipEngine(kube client.Client, log logr.Logger, shard sharder.Sharder, peers peerRegistry, self sharder.PeerInfo, cfg OwnershipConfig) *ownershipEngine { + return &ownershipEngine{ + kube: kube, log: log, + sharder: shard, peers: peers, self: self, cfg: cfg, + engaged: make(map[string]*engagement), + } +} + +func (e *ownershipEngine) fenceName(cluster string) string { + // Per-cluster fence: mcr-shard-; otherwise a single global fence + if e.cfg.PerClusterLease { + return fmt.Sprintf("%s-%s", e.cfg.FencePrefix, cluster) + } + return e.cfg.FencePrefix +} + +// Runnable returns a Runnable that manages ownership of clusters. 
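+// The runnable starts the peer registry in the background, performs one
+// immediate recompute, and then re-evaluates ownership on every cfg.Probe
+// tick until ctx is cancelled or the registry stops.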
+func (e *ownershipEngine) Runnable() manager.Runnable { + return manager.RunnableFunc(func(ctx context.Context) error { + e.log.Info("ownership runnable starting", "peer", e.self.ID) + errCh := make(chan error, 1) + go func() { errCh <- e.peers.Run(ctx) }() + e.recompute(ctx) + t := time.NewTicker(e.cfg.Probe) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return ctx.Err() + case err := <-errCh: + return err + case <-t.C: + e.log.V(1).Info("ownership tick", "peers", len(e.peers.Snapshot())) + e.recompute(ctx) + } + } + }) +} + +// Engage registers a cluster for ownership management. +func (e *ownershipEngine) Engage(parent context.Context, name string, cl cluster.Cluster) error { + // If provider already canceled, don't engage a dead cluster. + if err := parent.Err(); err != nil { + return err + } + + var doRecompute bool + + e.mu.Lock() + defer e.mu.Unlock() + + if cur, ok := e.engaged[name]; ok { + // Re-engage same name: replace token; stop old engagement; preserve fence. + var fence *leaseGuard + if cur.fence != nil { + fence = cur.fence // keep the existing fence/renewer + } + if cur.cancel != nil { + cur.cancel() // stop old sources/workers + } + + newEng := &engagement{name: name, cl: cl, fence: fence} + e.engaged[name] = newEng + + // cleanup tied to the *new* token; old goroutine will no-op (ee != token) + go func(pctx context.Context, key string, token *engagement) { + <-pctx.Done() + e.mu.Lock() + if ee, ok := e.engaged[key]; ok && ee == token { + if ee.started && ee.cancel != nil { + ee.cancel() + } + if ee.fence != nil { + go ee.fence.Release(context.Background()) + } + delete(e.engaged, key) + } + e.mu.Unlock() + }(parent, name, newEng) + + doRecompute = true + } else { + eng := &engagement{name: name, cl: cl} + e.engaged[name] = eng + go func(pctx context.Context, key string, token *engagement) { + <-pctx.Done() + e.mu.Lock() + if ee, ok := e.engaged[key]; ok && ee == token { + if ee.started && ee.cancel != nil { + ee.cancel() + } + if ee.fence != nil { + go ee.fence.Release(context.Background()) + } + delete(e.engaged, key) + } + e.mu.Unlock() + }(parent, name, eng) + doRecompute = true + } + + // Kick a decision outside the lock for faster attach. + if doRecompute { + go e.recompute(parent) + } + + return nil +} + +// recomputeOwnership checks the current ownership state and starts/stops clusters as needed. 
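+// For each engaged cluster it asks the sharder whether this peer should own it.
+// Winners acquire (or keep) the per-cluster fence before any runnables are
+// engaged; losers cancel their engagement context and release the fence. A
+// failed fence acquisition is retried no earlier than cfg.FenceThrottle later.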
+func (e *ownershipEngine) recompute(parent context.Context) { + peers := e.peers.Snapshot() + self := e.self + + type toStart struct { + name string + cl cluster.Cluster + ctx context.Context + } + var starts []toStart + var stops []context.CancelFunc + + now := time.Now() + + e.mu.Lock() + defer e.mu.Unlock() + for name, engm := range e.engaged { + should := e.sharder.ShouldOwn(name, peers, self) + + switch { + case should && !engm.started: + // ensure fence exists + if engm.fence == nil { + onLost := func(cluster string) func() { + return func() { + // best-effort stop if we lose the lease mid-flight + e.log.Info("lease lost; stopping", "cluster", cluster, "peer", self.ID) + e.mu.Lock() + if ee := e.engaged[cluster]; ee != nil && ee.started && ee.cancel != nil { + ee.cancel() + ee.started = false + } + e.mu.Unlock() + } + }(name) + engm.fence = newLeaseGuard( + e.kube, + e.cfg.FenceNS, e.fenceName(name), e.self.ID, + e.cfg.LeaseDuration, e.cfg.LeaseRenew, onLost, + ) + } + + // throttle attempts + if now.Before(engm.nextTry) { + continue + } + + // try to take the fence; if we fail, set nextTry and retry on next tick + if !engm.fence.TryAcquire(parent) { + engm.nextTry = now.Add(e.cfg.FenceThrottle) + continue + } + + // acquired fence; start the cluster + ctx, cancel := context.WithCancel(parent) + engm.ctx, engm.cancel, engm.started = ctx, cancel, true + starts = append(starts, toStart{name: engm.name, cl: engm.cl, ctx: ctx}) + e.log.Info("ownership start", "cluster", name, "peer", self.ID) + + case !should && engm.started: + // stop + release fence + if engm.cancel != nil { + stops = append(stops, engm.cancel) + } + if engm.fence != nil { + go engm.fence.Release(parent) + } + engm.cancel = nil + engm.started = false + engm.nextTry = time.Time{} + e.log.Info("ownership stop", "cluster", name, "peer", self.ID) + } + } + for _, c := range stops { + c() + } + for _, s := range starts { + go e.startForCluster(s.ctx, s.name, s.cl) + } +} + +// startForCluster engages all runnables for the given cluster. +func (e *ownershipEngine) startForCluster(ctx context.Context, name string, cl cluster.Cluster) { + for _, r := range e.runnables { + if err := r.Engage(ctx, name, cl); err != nil { + e.log.Error(err, "failed to engage", "cluster", name) + // best-effort: cancel + mark stopped so next tick can retry + e.mu.Lock() + if engm := e.engaged[name]; e != nil && engm.cancel != nil { + engm.cancel() + engm.started = false + } + e.mu.Unlock() + return + } + } +} + +func (e *ownershipEngine) AddRunnable(r multicluster.Aware) { e.runnables = append(e.runnables, r) } diff --git a/pkg/manager/peers/registry.go b/pkg/manager/peers/registry.go index 911a41ff..bdea6270 100644 --- a/pkg/manager/peers/registry.go +++ b/pkg/manager/peers/registry.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package peers import ( @@ -11,6 +27,7 @@ import ( coordv1 "k8s.io/api/coordination/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" @@ -130,9 +147,9 @@ func (r *leaseRegistry) renewSelfLease(ctx context.Context) error { }, }, Spec: coordv1.LeaseSpec{ - HolderIdentity: strPtr(r.self.ID), + HolderIdentity: ptr.To(r.self.ID), RenewTime: &now, - LeaseDurationSeconds: int32Ptr(ttlSec), + LeaseDurationSeconds: ptr.To(ttlSec), }, } return r.cli.Create(ctx, lease) @@ -142,9 +159,9 @@ func (r *leaseRegistry) renewSelfLease(ctx context.Context) error { default: // Update the existing Lease - lease.Spec.HolderIdentity = strPtr(r.self.ID) + lease.Spec.HolderIdentity = ptr.To(r.self.ID) lease.Spec.RenewTime = &now - lease.Spec.LeaseDurationSeconds = int32Ptr(ttlSec) + lease.Spec.LeaseDurationSeconds = ptr.To(ttlSec) if lease.Annotations == nil { lease.Annotations = map[string]string{} } @@ -207,6 +224,3 @@ func (r *leaseRegistry) refreshPeers(ctx context.Context) error { r.mu.Unlock() return nil } - -func strPtr(s string) *string { return &s } -func int32Ptr(i int32) *int32 { return &i } diff --git a/pkg/manager/sharder/hrw.go b/pkg/manager/sharder/hrw.go index 74d5eae7..db583d61 100644 --- a/pkg/manager/sharder/hrw.go +++ b/pkg/manager/sharder/hrw.go @@ -1,20 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package sharder import ( "hash/fnv" ) +// HRW implements Highest-Random-Weight (aka Rendezvous) hashing. +// +// Given a stable cluster identifier and a snapshot of peers, HRW selects +// the peer with the highest score for that cluster. All peers compute the +// same scores independently, so the winner is deterministic as long as +// the inputs (clusterID, peer IDs, weights) are identical. +// +// Weighting: this implementation biases selection by multiplying the +// score with the peer's Weight (0 is treated as 1). This is a simple and +// fast heuristic; if you need proportional weighting with stronger +// distribution guarantees, consider the canonical weighted rendezvous +// formula (e.g., score = -ln(u)/w). type HRW struct{} +// NewHRW returns a new HRW sharder. func NewHRW() *HRW { return &HRW{} } +// ShouldOwn returns true if self is the HRW winner for clusterID among peers. +// +// Inputs: +// - clusterID: a stable string that identifies the shard/cluster (e.g., namespace name). +// - peers: the full membership snapshot used for the decision; all participants +// should use the same list (order does not matter). +// - self: the caller's peer info. +// +// Behavior & caveats: +// - If peers is empty, this returns true (caller may choose to gate with fencing). +// - Weight 0 is treated as 1 (no special meaning). +// - Ties are broken by "last wins" in the iteration order (practically +// unreachable with 64-bit hashing, but documented for completeness). 
+// - The hash (FNV-1a, 64-bit) is fast and stable but not cryptographically secure. +// - This method does not enforce ownership; callers should still use a +// per-shard fence (e.g., a Lease) to serialize actual work. +// +// Determinism requirements: +// - All peers must see the same peer IDs and weights when computing. +// - clusterID must be stable across processes (same input → same winner). func (h *HRW) ShouldOwn(clusterID string, peers []PeerInfo, self PeerInfo) bool { - if len(peers) == 0 { return true } + if len(peers) == 0 { + return true + } var best PeerInfo var bestScore uint64 for _, p := range peers { score := hash64(clusterID + "|" + p.ID) - if p.Weight == 0 { p.Weight = 1 } + if p.Weight == 0 { + p.Weight = 1 + } score *= uint64(p.Weight) if score >= bestScore { bestScore = score @@ -24,6 +77,8 @@ func (h *HRW) ShouldOwn(clusterID string, peers []PeerInfo, self PeerInfo) bool return best.ID == self.ID } +// hash64 returns a stable 64-bit FNV-1a hash of s. +// Fast, non-cryptographic; suitable for rendezvous hashing. func hash64(s string) uint64 { h := fnv.New64a() _, _ = h.Write([]byte(s)) diff --git a/pkg/manager/sharder/sharder.go b/pkg/manager/sharder/sharder.go index 2e963981..5c00b393 100644 --- a/pkg/manager/sharder/sharder.go +++ b/pkg/manager/sharder/sharder.go @@ -1,10 +1,45 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package sharder +// PeerInfo describes a participating peer in sharding decisions. +// +// ID must be a stable, unique identifier for the peer (e.g., pod hostname). +// +// Weight is a relative capacity hint. The current HRW implementation in this +// package treats 0 as 1 and multiplies the peer’s score by Weight. With many +// shards, a peer’s expected share of ownership is roughly proportional to +// Weight relative to the sum of all peers’ weights. If you need stricter, +// provably proportional weighting, use a canonical weighted rendezvous score +// (e.g., s = -ln(u)/w) instead of simple multiplication. type PeerInfo struct { ID string Weight uint32 // optional (default 1) } +// Sharder chooses whether the local peer should "own" (run) a given cluster. +// +// Implementations must be deterministic across peers given the same inputs. +// ShouldOwn does not enforce ownership; callers should still gate actual work +// behind a per-cluster fence (e.g., a Lease) to guarantee single-writer. type Sharder interface { + // ShouldOwn returns true if self is the winner for clusterID among peers. + // - clusterID: stable identifier for the shard + // - peers: full membership snapshot (order-independent) + // - self: the caller’s PeerInfo ShouldOwn(clusterID string, peers []PeerInfo, self PeerInfo) bool } diff --git a/pkg/manager/sharding.go b/pkg/manager/sharding.go index 42c7dea6..90a1b850 100644 --- a/pkg/manager/sharding.go +++ b/pkg/manager/sharding.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package manager import ( @@ -7,37 +23,71 @@ import ( ) // Option mutates mcManager after construction. +// Options must be applied after WithMultiCluster(...) and before Start(...). type Option func(*mcManager) // WithSharder replaces the default HRW sharder. -func WithSharder(s sharder.Sharder) Option { return func(m *mcManager) { m.sharder = s } } +func WithSharder(s sharder.Sharder) Option { + return func(m *mcManager) { + if m.engine != nil { + m.engine.sharder = s + } + } +} -// WithPeerWeight allows heterogenous peers. -func WithPeerWeight(w uint32) Option { return func(m *mcManager) { m.peerWeight = w } } +// WithPeerWeight allows heterogeneous peers (capacity hint). +// Effective share tends toward w_i/Σw under many shards. +func WithPeerWeight(w uint32) Option { + return func(m *mcManager) { + if m.engine != nil { + m.engine.cfg.PeerWeight = w + } + } +} -// WithShardLease configures the shard lease name/namespace (for fencing) +// WithShardLease configures the fencing Lease ns/prefix (mcr-shard-* by default). func WithShardLease(ns, name string) Option { - return func(m *mcManager) { m.shardLeaseNS, m.shardLeaseName = ns, name } + return func(m *mcManager) { + if m.engine != nil { + m.engine.cfg.FenceNS = ns + m.engine.cfg.FencePrefix = name + } + } } -// WithPerClusterLease enables/disables per-cluster fencing. -func WithPerClusterLease(on bool) Option { return func(m *mcManager) { m.perClusterLease = on } } +// WithPerClusterLease enables/disables per-cluster fencing (true -> mcr-shard-). +func WithPerClusterLease(on bool) Option { + return func(m *mcManager) { + if m.engine != nil { + m.engine.cfg.PerClusterLease = on + } + } +} -// WithOwnershipIntervals tunes loop cadences. +// WithOwnershipIntervals tunes the ownership probe/rehash cadences. func WithOwnershipIntervals(probe, rehash time.Duration) Option { - return func(m *mcManager) { m.ownershipProbe = probe; m.ownershipRehash = rehash } + return func(m *mcManager) { + if m.engine != nil { + m.engine.cfg.Probe = probe + m.engine.cfg.Rehash = rehash + } + } } // WithLeaseTimings configures fencing lease timings. +// Choose renew < duration (e.g., renew ≈ duration/3). func WithLeaseTimings(duration, renew, throttle time.Duration) Option { return func(m *mcManager) { - m.leaseDuration = duration - m.leaseRenew = renew - m.fenceThrottle = throttle + if m.engine != nil { + m.engine.cfg.LeaseDuration = duration + m.engine.cfg.LeaseRenew = renew + m.engine.cfg.FenceThrottle = throttle + } } } // Configure applies options to a Manager if it is an *mcManager. +// Must be called before Start(); options do not rewire the running peer registry. func Configure(m Manager, opts ...Option) { if x, ok := m.(*mcManager); ok { for _, o := range opts { diff --git a/pkg/source/kind.go b/pkg/source/kind.go index 8a2fdd8c..c5a9d0dc 100644 --- a/pkg/source/kind.go +++ b/pkg/source/kind.go @@ -1,3 +1,16 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package source import ( @@ -22,43 +35,43 @@ import ( ) // Kind creates a KindSource with the given cache provider. -func Kind[O client.Object]( - obj O, - hf mchandler.TypedEventHandlerFunc[O, mcreconcile.Request], - preds ...predicate.TypedPredicate[O], -) SyncingSource[O] { - return TypedKind[O, mcreconcile.Request](obj, hf, preds...) +func Kind[object client.Object]( + obj object, + handler mchandler.TypedEventHandlerFunc[object, mcreconcile.Request], + predicates ...predicate.TypedPredicate[object], +) SyncingSource[object] { + return TypedKind[object, mcreconcile.Request](obj, handler, predicates...) } // TypedKind creates a KindSource with the given cache provider. -func TypedKind[O client.Object, R mcreconcile.ClusterAware[R]]( - obj O, - hf mchandler.TypedEventHandlerFunc[O, R], - preds ...predicate.TypedPredicate[O], -) TypedSyncingSource[O, R] { - return &kind[O, R]{ +func TypedKind[object client.Object, request mcreconcile.ClusterAware[request]]( + obj object, + handler mchandler.TypedEventHandlerFunc[object, request], + predicates ...predicate.TypedPredicate[object], +) TypedSyncingSource[object, request] { + return &kind[object, request]{ obj: obj, - handler: hf, - predicates: preds, - project: func(_ cluster.Cluster, o O) (O, error) { return o, nil }, + handler: handler, + predicates: predicates, + project: func(_ cluster.Cluster, o object) (object, error) { return o, nil }, resync: 0, // no periodic resync by default } } -type kind[O client.Object, R mcreconcile.ClusterAware[R]] struct { - obj O - handler mchandler.TypedEventHandlerFunc[O, R] - predicates []predicate.TypedPredicate[O] - project func(cluster.Cluster, O) (O, error) +type kind[object client.Object, request mcreconcile.ClusterAware[request]] struct { + obj object + handler mchandler.TypedEventHandlerFunc[object, request] + predicates []predicate.TypedPredicate[object] + project func(cluster.Cluster, object) (object, error) resync time.Duration } -type clusterKind[O client.Object, R mcreconcile.ClusterAware[R]] struct { +type clusterKind[object client.Object, request mcreconcile.ClusterAware[request]] struct { clusterName string cl cluster.Cluster - obj O - h handler.TypedEventHandler[O, R] - preds []predicate.TypedPredicate[O] + obj object + h handler.TypedEventHandler[object, request] + preds []predicate.TypedPredicate[object] resync time.Duration mu sync.Mutex @@ -67,17 +80,17 @@ type clusterKind[O client.Object, R mcreconcile.ClusterAware[R]] struct { } // WithProjection sets the projection function for the KindSource. 
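// A projection runs once per cluster when the source is built for that cluster
// (see ForCluster below); the default set in TypedKind is the identity function,
// and returning an error aborts source construction for that cluster.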
-func (k *kind[O, R]) WithProjection(project func(cluster.Cluster, O) (O, error)) TypedSyncingSource[O, R] { +func (k *kind[object, request]) WithProjection(project func(cluster.Cluster, object) (object, error)) TypedSyncingSource[object, request] { k.project = project return k } -func (k *kind[O, R]) ForCluster(name string, cl cluster.Cluster) (crsource.TypedSource[R], error) { +func (k *kind[object, request]) ForCluster(name string, cl cluster.Cluster) (crsource.TypedSource[request], error) { obj, err := k.project(cl, k.obj) if err != nil { return nil, err } - return &clusterKind[O, R]{ + return &clusterKind[object, request]{ clusterName: name, cl: cl, obj: obj, @@ -87,16 +100,16 @@ func (k *kind[O, R]) ForCluster(name string, cl cluster.Cluster) (crsource.Typed }, nil } -func (k *kind[O, R]) SyncingForCluster(name string, cl cluster.Cluster) (crsource.TypedSyncingSource[R], error) { +func (k *kind[object, request]) SyncingForCluster(name string, cl cluster.Cluster) (crsource.TypedSyncingSource[request], error) { src, err := k.ForCluster(name, cl) if err != nil { return nil, err } - return src.(crsource.TypedSyncingSource[R]), nil + return src.(crsource.TypedSyncingSource[request]), nil } // WaitForSync satisfies TypedSyncingSource. -func (ck *clusterKind[O, R]) WaitForSync(ctx context.Context) error { +func (ck *clusterKind[object, request]) WaitForSync(ctx context.Context) error { if !ck.cl.GetCache().WaitForCacheSync(ctx) { return ctx.Err() } @@ -104,7 +117,7 @@ func (ck *clusterKind[O, R]) WaitForSync(ctx context.Context) error { } // Start registers a removable handler on the (scoped) informer and removes it on ctx.Done(). -func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLimitingInterface[R]) error { +func (ck *clusterKind[object, request]) Start(ctx context.Context, q workqueue.TypedRateLimitingInterface[request]) error { log := log.FromContext(ctx).WithValues("cluster", ck.clusterName, "source", "kind") // Check if we're already started with this context @@ -148,7 +161,7 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim ck.mu.Unlock() // predicate helpers - passCreate := func(e event.TypedCreateEvent[O]) bool { + passCreate := func(e event.TypedCreateEvent[object]) bool { for _, p := range ck.preds { if !p.Create(e) { return false @@ -156,7 +169,7 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim } return true } - passUpdate := func(e event.TypedUpdateEvent[O]) bool { + passUpdate := func(e event.TypedUpdateEvent[object]) bool { for _, p := range ck.preds { if !p.Update(e) { return false @@ -164,7 +177,7 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim } return true } - passDelete := func(e event.TypedDeleteEvent[O]) bool { + passDelete := func(e event.TypedDeleteEvent[object]) bool { for _, p := range ck.preds { if !p.Delete(e) { return false @@ -174,14 +187,14 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim } // typed event builders - makeCreate := func(o client.Object) event.TypedCreateEvent[O] { - return event.TypedCreateEvent[O]{Object: any(o).(O)} + makeCreate := func(o client.Object) event.TypedCreateEvent[object] { + return event.TypedCreateEvent[object]{Object: any(o).(object)} } - makeUpdate := func(oo, no client.Object) event.TypedUpdateEvent[O] { - return event.TypedUpdateEvent[O]{ObjectOld: any(oo).(O), ObjectNew: any(no).(O)} + makeUpdate := func(oo, no client.Object) event.TypedUpdateEvent[object] { + 
return event.TypedUpdateEvent[object]{ObjectOld: any(oo).(object), ObjectNew: any(no).(object)} } - makeDelete := func(o client.Object) event.TypedDeleteEvent[O] { - return event.TypedDeleteEvent[O]{Object: any(o).(O)} + makeDelete := func(o client.Object) event.TypedDeleteEvent[object] { + return event.TypedDeleteEvent[object]{Object: any(o).(object)} } // Adapter that forwards to controller handler, honoring ctx. @@ -204,8 +217,6 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim ooObj, ok1 := oo.(client.Object) noObj, ok2 := no.(client.Object) if ok1 && ok2 { - log.V(1).Info("kind source update", "cluster", ck.clusterName, - "name", noObj.GetName(), "ns", noObj.GetNamespace()) e := makeUpdate(ooObj, noObj) if passUpdate(e) { ck.h.Update(ctx, e, q) @@ -277,6 +288,6 @@ func (ck *clusterKind[O, R]) Start(ctx context.Context, q workqueue.TypedRateLim } // getInformer resolves the informer from the cluster cache (provider returns a scoped informer). -func (ck *clusterKind[O, R]) getInformer(ctx context.Context, obj client.Object) (crcache.Informer, error) { +func (ck *clusterKind[object, request]) getInformer(ctx context.Context, obj client.Object) (crcache.Informer, error) { return ck.cl.GetCache().GetInformer(ctx, obj) } From 8c521f9e3b3b901e20a572861c5c3fbc72425f76 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Sat, 23 Aug 2025 17:28:17 -0700 Subject: [PATCH 06/16] chore: cleanup --- pkg/source/kind.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/source/kind.go b/pkg/source/kind.go index c5a9d0dc..36e444d5 100644 --- a/pkg/source/kind.go +++ b/pkg/source/kind.go @@ -1,9 +1,12 @@ /* Copyright 2025 The Kubernetes Authors. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
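As a quick illustration of the HRW sharder defined earlier in this series, the following standalone sketch (not part of the diff; peer IDs and cluster names are illustrative) shows that all peers evaluating `ShouldOwn` against the same snapshot agree on a single winner per cluster:

```go
package main

import (
	"fmt"

	"sigs.k8s.io/multicluster-runtime/pkg/manager/sharder"
)

func main() {
	// A static membership snapshot; in the real manager this comes from the
	// peer registry's Lease-backed Snapshot().
	peers := []sharder.PeerInfo{
		{ID: "peer-0", Weight: 1},
		{ID: "peer-1", Weight: 2}, // Weight biases the split (see the weighting caveats in the HRW docs)
	}
	hrw := sharder.NewHRW()

	for _, clusterID := range []string{"cluster-a", "cluster-b", "cluster-c"} {
		// Every peer runs the same computation independently.
		for _, self := range peers {
			if hrw.ShouldOwn(clusterID, peers, self) {
				fmt.Printf("%s -> %s\n", clusterID, self.ID)
			}
		}
	}
}
```

Running it prints exactly one `cluster -> peer` line per cluster, regardless of which peer performs the computation, because the inputs are identical for all peers.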
From 434aeb57eba4e5a4b4e78529615f3d5d48319adb Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Sat, 23 Aug 2025 18:52:36 -0700 Subject: [PATCH 07/16] fix: formatting, tests, cleanup --- examples/sharded-namespace/main.go | 6 +- pkg/controller/controller.go | 4 +- pkg/manager/leaseguard.go | 13 +- pkg/manager/leaseguard_test.go | 17 +++ pkg/manager/manager.go | 2 +- pkg/manager/ownership.go | 24 +--- pkg/manager/peers/doc.go | 30 ++++ pkg/manager/peers/registry.go | 33 +++-- pkg/manager/peers/registry_test.go | 223 +++++++++++++++++++++++++++++ 9 files changed, 313 insertions(+), 39 deletions(-) create mode 100644 pkg/manager/peers/doc.go create mode 100644 pkg/manager/peers/registry_test.go diff --git a/examples/sharded-namespace/main.go b/examples/sharded-namespace/main.go index 4049efa9..f4521c4d 100644 --- a/examples/sharded-namespace/main.go +++ b/examples/sharded-namespace/main.go @@ -7,12 +7,12 @@ import ( "fmt" "os" - "github.com/go-logr/logr" "golang.org/x/sync/errgroup" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -31,13 +31,13 @@ func main() { ctx := ctrl.SetupSignalHandler() - if err := run(ctx, log); err != nil { + if err := run(ctx); err != nil { log.Error(err, "exiting") os.Exit(1) } } -func run(ctx context.Context, log logr.Logger) error { +func run(ctx context.Context) error { // use in-cluster config; fall back to default loading rules for local runs cfg, err := ctrl.GetConfig() if err != nil { diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 7358fc7c..b24f7684 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -169,7 +169,7 @@ func (c *mcController[request]) Engage(ctx context.Context, name string, cl clus // note: cancel() is driven by parent; no need to call here }(engCtx, name, ec) - return nil //nolint:govet // cancel is called in the error case only. + return nil } func (c *mcController[request]) MultiClusterWatch(src mcsource.TypedSource[client.Object, request]) error { @@ -188,7 +188,7 @@ func (c *mcController[request]) MultiClusterWatch(src mcsource.TypedSource[clien c.sources = append(c.sources, src) - return nil //nolint:govet // cancel is called in the error case only. + return nil } func startWithinContext[request mcreconcile.ClusterAware[request]](ctx context.Context, src source.TypedSource[request]) source.TypedSource[request] { diff --git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go index 478fffb1..8a0c90d6 100644 --- a/pkg/manager/leaseguard.go +++ b/pkg/manager/leaseguard.go @@ -24,6 +24,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -54,12 +55,12 @@ import ( // RBAC: the caller’s client must be allowed to get/list/watch/create/update/patch // Leases in namespace `ns`. 
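// (In RBAC terms that corresponds to a rule on apiGroup "coordination.k8s.io",
// resource "leases", with those verbs, bound in namespace ns.)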
type leaseGuard struct { - c client.Client - ns, nm string - id string - ldur time.Duration // lease duration - renew time.Duration // renew period - onLost func() // callback when we lose the lease + c client.Client + ns, nm string + id string + ldur time.Duration // lease duration + renew time.Duration // renew period + onLost func() // callback when we lose the lease held bool cancel context.CancelFunc diff --git a/pkg/manager/leaseguard_test.go b/pkg/manager/leaseguard_test.go index 8dc6f157..37f44060 100644 --- a/pkg/manager/leaseguard_test.go +++ b/pkg/manager/leaseguard_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package manager import ( @@ -9,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 3869bd73..707e0f25 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -152,7 +152,7 @@ func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Mana PeerPrefix: "mcr-peer", PeerWeight: 1, Probe: 5 * time.Second, Rehash: 15 * time.Second, } - pr := peers.NewLeaseRegistry(mgr.GetClient(), cfg.FenceNS, cfg.PeerPrefix, "", cfg.PeerWeight) + pr := peers.NewLeaseRegistry(mgr.GetClient(), cfg.FenceNS, cfg.PeerPrefix, "", cfg.PeerWeight, mgr.GetLogger()) self := pr.Self() eng := newOwnershipEngine( diff --git a/pkg/manager/ownership.go b/pkg/manager/ownership.go index f21b9d5f..e3e64fff 100644 --- a/pkg/manager/ownership.go +++ b/pkg/manager/ownership.go @@ -23,10 +23,12 @@ import ( "time" "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/manager/peers" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" ) @@ -93,7 +95,7 @@ type ownershipEngine struct { // sharder decides “shouldOwn” given clusterID, peers, and self (e.g., HRW). sharder sharder.Sharder // peers provides a live membership snapshot for sharding decisions. - peers peerRegistry + peers peers.Registry // self is this process’s identity/weight as known by the peer registry. self sharder.PeerInfo // cfg holds all ownership/fencing configuration. @@ -131,22 +133,8 @@ type engagement struct { nextTry time.Time } -// peerRegistry abstracts peer discovery/membership for sharding decisions. -// Self returns this process identity; Snapshot returns the current peer set; -// Run keeps the registry fresh until ctx is cancelled. -type peerRegistry interface { - // Self returns this process’s identity and weight as known by the registry. - Self() sharder.PeerInfo - // Snapshot returns the current set of known peers. 
The slice order is arbitrary - // and the returned data should be treated as read-only by callers. - Snapshot() []sharder.PeerInfo - // Run keeps membership fresh until ctx is cancelled or an error occurs. - // It should block, periodically updating internal state for Self/Snapshot. - Run(ctx context.Context) error -} - // newOwnershipEngine wires an engine with its dependencies and initial config. -func newOwnershipEngine(kube client.Client, log logr.Logger, shard sharder.Sharder, peers peerRegistry, self sharder.PeerInfo, cfg OwnershipConfig) *ownershipEngine { +func newOwnershipEngine(kube client.Client, log logr.Logger, shard sharder.Sharder, peers peers.Registry, self sharder.PeerInfo, cfg OwnershipConfig) *ownershipEngine { return &ownershipEngine{ kube: kube, log: log, sharder: shard, peers: peers, self: self, cfg: cfg, @@ -251,7 +239,7 @@ func (e *ownershipEngine) Engage(parent context.Context, name string, cl cluster if doRecompute { go e.recompute(parent) } - + return nil } @@ -344,7 +332,7 @@ func (e *ownershipEngine) startForCluster(ctx context.Context, name string, cl c e.log.Error(err, "failed to engage", "cluster", name) // best-effort: cancel + mark stopped so next tick can retry e.mu.Lock() - if engm := e.engaged[name]; e != nil && engm.cancel != nil { + if engm := e.engaged[name]; engm != nil && engm.cancel != nil { engm.cancel() engm.started = false } diff --git a/pkg/manager/peers/doc.go b/pkg/manager/peers/doc.go new file mode 100644 index 00000000..b52f83bb --- /dev/null +++ b/pkg/manager/peers/doc.go @@ -0,0 +1,30 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* +Package peers implements a lightweight membership registry for sharding. + +Each peer advertises presence via a coordination.k8s.io/Lease named +- (e.g. mcr-peer-sharded-namespace-0). The registry: + + - upserts our own Lease on a cadence (heartbeat), + - lists other peer Leases with matching labels/prefix, + - exposes a stable snapshot for HRW/placement. + +This registry only tracks membership; single-writer ownership is enforced +separately via per-cluster fencing Leases. +*/ +package peers diff --git a/pkg/manager/peers/registry.go b/pkg/manager/peers/registry.go index bdea6270..b8b50232 100644 --- a/pkg/manager/peers/registry.go +++ b/pkg/manager/peers/registry.go @@ -24,10 +24,13 @@ import ( "sync" "time" + "github.com/go-logr/logr" + coordv1 "k8s.io/api/coordination/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" @@ -42,10 +45,14 @@ const ( defaultWeight = uint32(1) ) -// Registry abstracts peer discovery for sharding. +// Registry provides peer membership for sharding decisions. type Registry interface { + // Self returns this process's identity and weight. Self() sharder.PeerInfo + // Snapshot returns the current set of live peers. Treat as read-only. 
Snapshot() []sharder.PeerInfo + // Run blocks, periodically renewing our Lease and refreshing peer membership + // until ctx is cancelled or an error occurs. Run(ctx context.Context) error } @@ -59,14 +66,19 @@ type leaseRegistry struct { ttl time.Duration renew time.Duration + + log logr.Logger } -// NewLeaseRegistry creates a Registry using coordination.k8s.io Leases. -// ns : namespace where peer Leases live (e.g., "kube-system") -// namePrefix : common prefix for Lease names (e.g., "mcr-peer") -// selfID : this process identity (defaults to hostname if empty) -// weight : optional weight for HRW (defaults to 1 if 0) -func NewLeaseRegistry(cli crclient.Client, ns, namePrefix string, selfID string, weight uint32) Registry { +// NewLeaseRegistry constructs a Lease-based Registry. +// +// Params: +// - ns: namespace where peer Leases live (e.g. "kube-system") +// - namePrefix: Lease name prefix (e.g. "mcr-peer") +// - selfID: this process ID (defaults to os.Hostname() if empty) +// - weight: relative capacity (0 treated as 1) +// - log: logger; use logr.Discard() to silence +func NewLeaseRegistry(cli crclient.Client, ns, namePrefix string, selfID string, weight uint32, log logr.Logger) Registry { if selfID == "" { if hn, _ := os.Hostname(); hn != "" { selfID = hn @@ -85,6 +97,7 @@ func NewLeaseRegistry(cli crclient.Client, ns, namePrefix string, selfID string, peers: map[string]sharder.PeerInfo{}, ttl: 30 * time.Second, renew: 10 * time.Second, + log: log.WithName("lease-registry").WithValues("ns", ns, "prefix", namePrefix, "selfID", selfID, "weight", weight), } } @@ -101,6 +114,8 @@ func (r *leaseRegistry) Snapshot() []sharder.PeerInfo { } func (r *leaseRegistry) Run(ctx context.Context) error { + r.log.Info("peer registry starting", "ns", r.ns, "prefix", r.namePrefix, "self", r.self.ID) + // Tick frequently enough to renew well within ttl. t := time.NewTicker(r.renew) defer t.Stop() @@ -108,10 +123,10 @@ func (r *leaseRegistry) Run(ctx context.Context) error { for { // Do one pass immediately so we publish our presence promptly. if err := r.renewSelfLease(ctx); err != nil && ctx.Err() == nil { - // Non-fatal; we'll try again on next tick. + r.log.V(1).Info("renewSelfLease failed; will retry", "err", err) } if err := r.refreshPeers(ctx); err != nil && ctx.Err() == nil { - // Non-fatal; we'll try again on next tick. + r.log.V(1).Info("refreshPeers failed; will retry", "err", err) } select { diff --git a/pkg/manager/peers/registry_test.go b/pkg/manager/peers/registry_test.go new file mode 100644 index 00000000..19255807 --- /dev/null +++ b/pkg/manager/peers/registry_test.go @@ -0,0 +1,223 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package peers + +import ( + "context" + "testing" + "time" + + "github.com/go-logr/logr" + + coordv1 "k8s.io/api/coordination/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + + crclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +const ( + testNS = "kube-system" + prefix = "mcr-peer" + selfID = "peer-0" + otherID = "peer-1" +) + +func newScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := coordv1.AddToScheme(s); err != nil { + t.Fatalf("add scheme: %v", err) + } + return s +} + +func getLease(t *testing.T, c crclient.Client, name string) *coordv1.Lease { + t.Helper() + var ls coordv1.Lease + if err := c.Get(context.Background(), crclient.ObjectKey{Namespace: testNS, Name: name}, &ls); err != nil { + t.Fatalf("get lease %s: %v", name, err) + } + return &ls +} + +func TestRenewSelfLease_CreateThenUpdate(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + + r := NewLeaseRegistry(c, testNS, prefix, selfID, 2, logr.Discard()).(*leaseRegistry) + + // First call creates the Lease + if err := r.renewSelfLease(context.Background()); err != nil { + t.Fatalf("renewSelfLease create: %v", err) + } + name := prefix + "-" + selfID + ls := getLease(t, c, name) + if ls.Spec.HolderIdentity == nil || *ls.Spec.HolderIdentity != selfID { + t.Fatalf("holder = %v, want %q", ls.Spec.HolderIdentity, selfID) + } + if got := ls.Labels[labelPeer]; got != "true" { + t.Fatalf("label %s = %q, want true", labelPeer, got) + } + if got := ls.Labels[labelPrefix]; got != prefix { + t.Fatalf("label %s = %q, want %s", labelPrefix, got, prefix) + } + if got := ls.Annotations[annotWeight]; got != "2" { + t.Fatalf("weight annot = %q, want 2", got) + } + + // Update path: change weight and ensure it is written + r.self.Weight = 3 + time.Sleep(1 * time.Millisecond) // ensure RenewTime advances + if err := r.renewSelfLease(context.Background()); err != nil { + t.Fatalf("renewSelfLease update: %v", err) + } + ls2 := getLease(t, c, name) + if got := ls2.Annotations[annotWeight]; got != "3" { + t.Fatalf("updated weight annot = %q, want 3", got) + } + if ls.Spec.RenewTime == nil || ls2.Spec.RenewTime == nil || + !ls2.Spec.RenewTime.Time.After(ls.Spec.RenewTime.Time) { + t.Fatalf("RenewTime did not advance: old=%v new=%v", ls.Spec.RenewTime, ls2.Spec.RenewTime) + } +} + +func TestRefreshPeers_FiltersAndParses(t *testing.T) { + s := newScheme(t) + + now := time.Now() + validRenew := metav1.NewMicroTime(now) + expiredRenew := metav1.NewMicroTime(now.Add(-time.Hour)) + dur := int32(60) + + // valid peer (otherID), correct labels/prefix, not expired, weight 5 + valid := &coordv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNS, Name: prefix + "-" + otherID, + Labels: map[string]string{ + labelPartOf: partOfValue, + labelPeer: "true", + labelPrefix: prefix, + }, + Annotations: map[string]string{ + annotWeight: "5", + }, + }, + Spec: coordv1.LeaseSpec{ + HolderIdentity: &[]string{otherID}[0], + RenewTime: &validRenew, + LeaseDurationSeconds: &dur, + }, + } + + // expired peer (should be ignored) + expired := &coordv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNS, Name: prefix + "-expired", + Labels: map[string]string{ + labelPeer: "true", + labelPrefix: prefix, + }, + }, + Spec: coordv1.LeaseSpec{ + HolderIdentity: &[]string{"nobody"}[0], + RenewTime: &expiredRenew, + LeaseDurationSeconds: &dur, + }, + } + + // wrong prefix (should be 
ignored) + wrong := &coordv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: testNS, Name: "otherprefix-" + "x", + Labels: map[string]string{ + labelPeer: "true", + labelPrefix: "otherprefix", + }, + }, + Spec: coordv1.LeaseSpec{ + HolderIdentity: &[]string{"x"}[0], + RenewTime: &validRenew, + LeaseDurationSeconds: &dur, + }, + } + + c := fake.NewClientBuilder().WithScheme(s).WithObjects(valid, expired, wrong).Build() + + r := NewLeaseRegistry(c, testNS, prefix, selfID, 1, logr.Discard()).(*leaseRegistry) + if err := r.refreshPeers(context.Background()); err != nil { + t.Fatalf("refreshPeers: %v", err) + } + + snap := r.Snapshot() + + // Expect self + valid other + want := map[string]uint32{ + selfID: 1, + otherID: 5, + } + got := map[string]uint32{} + for _, p := range snap { + got[p.ID] = p.Weight + } + for id, w := range want { + if got[id] != w { + t.Fatalf("snapshot missing/mismatch for %s: got %d want %d; full=%v", id, got[id], w, got) + } + } + // Should not include expired/wrong + if _, ok := got["expired"]; ok { + t.Fatalf("snapshot unexpectedly contains expired") + } + if _, ok := got["x"]; ok { + t.Fatalf("snapshot unexpectedly contains wrong prefix") + } +} + +func TestRun_PublishesSelfAndStopsOnCancel(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + r := NewLeaseRegistry(c, testNS, prefix, selfID, 1, logr.Discard()).(*leaseRegistry) + + // Speed up the loop for the test + r.ttl = 2 * time.Second + r.renew = 50 * time.Millisecond + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + + go func() { + _ = r.Run(ctx) + close(done) + }() + + // wait for at least one tick + time.Sleep(120 * time.Millisecond) + + // self lease should exist + name := prefix + "-" + selfID + _ = getLease(t, c, name) // will fatal if missing + + // cancel and ensure Run returns + cancel() + select { + case <-done: + case <-time.After(1 * time.Second): + t.Fatalf("Run did not exit after cancel") + } +} From c1bd6b35d99664ff75a65a1f1b38e7395a9a1da0 Mon Sep 17 00:00:00 2001 From: Zach Smith <33258732+zachsmith1@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:47:01 -0400 Subject: [PATCH 08/16] fix comment Co-authored-by: Marvin Beckers --- pkg/manager/leaseguard.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go index 8a0c90d6..4316f6a0 100644 --- a/pkg/manager/leaseguard.go +++ b/pkg/manager/leaseguard.go @@ -72,7 +72,7 @@ func newLeaseGuard(c client.Client, ns, name, id string, ldur, renew time.Durati } // TryAcquire creates/adopts the Lease for g.id and starts renewing it. -// Returns true iff we own it after this call (or already owned). +// Returns true if we own it after this call (or already owned). // Fails (returns false) when another non-expired holder exists or API calls error. 
func (g *leaseGuard) TryAcquire(ctx context.Context) bool { if g.held { From 0a6a229b791db78e3647042f9fd6b936c3f39b2d Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 09:50:02 -0700 Subject: [PATCH 09/16] fix: replace ownership with synchronization naming --- examples/sharded-namespace/README.md | 16 +++--- pkg/manager/manager.go | 8 +-- pkg/manager/sharding.go | 4 +- .../{ownership.go => synchronization.go} | 50 ++++++++++--------- 4 files changed, 40 insertions(+), 38 deletions(-) rename pkg/manager/{ownership.go => synchronization.go} (84%) diff --git a/examples/sharded-namespace/README.md b/examples/sharded-namespace/README.md index cb75cdc2..3e0a3fa9 100644 --- a/examples/sharded-namespace/README.md +++ b/examples/sharded-namespace/README.md @@ -1,12 +1,12 @@ # Sharded Namespace Example -This demo runs two replicas of a multicluster manager that **split ownership** of downstream "clusters" discovered by the *namespace provider* (each Kubernetes Namespace == one cluster). Ownership is decided by HRW hashing across peers, then serialized with per-cluster fencing Leases so exactly one peer reconciles a given cluster at a time. +This demo runs two replicas of a multicluster manager that **splits ownership** of downstream "clusters" discovered by the *namespace provider* (each Kubernetes Namespace == one cluster). Synchronization is decided by HRW hashing across peers, then serialized with per-cluster fencing Leases so exactly one peer reconciles a given cluster at a time. **Key Components:** - **Peer membership (presence)**: `coordination.k8s.io/Lease` with prefix `mcr-peer-*` -- **Per-cluster fencing (ownership)**: `coordination.k8s.io/Lease` with prefix `mcr-shard-` +- **Per-cluster fencing (synchronization)**: `coordination.k8s.io/Lease` with prefix `mcr-shard-` -Controllers attach per-cluster watches when ownership starts, and cleanly detach & re-attach when ownership transfers. +Controllers attach per-cluster watches when synchronization starts, and cleanly detach & re-attach when it transfers. 
## Build the image @@ -39,8 +39,8 @@ kubectl -n mcr-demo logs statefulset/sharded-namespace -f You should see lines like: ```bash -"ownership start {"cluster": "zoo", "peer": "sharded-namespace-0"}" -"ownership start {"cluster": "jungle", "peer": "sharded-namespace-1"}" +"synchronization start {"cluster": "zoo", "peer": "sharded-namespace-0"}" +"synchronization start {"cluster": "jungle", "peer": "sharded-namespace-1"}" Reconciling ConfigMap {"controller": "multicluster-configmaps", "controllerGroup": "", "controllerKind": "ConfigMap", "reconcileID": "4f1116b3-b5 │ │ 4e-4e6a-b84f-670ca5cfc9ce", "cluster": "zoo", "ns": "default", "name": "elephant"} @@ -66,9 +66,9 @@ kubectl -n kube-system get lease mcr-shard-$C \ ``` -## Test Ownership +## Test Synchronization -Scale down to 1 replica and watch ownership consolidate: +Scale down to 1 replica and watch synchronization consolidate: ```bash # Scale down kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=1 @@ -92,7 +92,7 @@ kubectl -n "$C" create cm test-$(date +%s) --from-literal=ts=$(date +%s) --dry-r kubectl -n mcr-demo logs pod/sharded-namespace-0 --since=100s | grep "Reconciling ConfigMap.*$C" ``` -Scale up to 3 replicas and watch ownership rebalance: +Scale up to 3 replicas and watch synchronization rebalance: ```bash # Scale up kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=3 diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 707e0f25..c7d736d1 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -130,7 +130,7 @@ var _ Manager = &mcManager{} type mcManager struct { manager.Manager provider multicluster.Provider - engine *ownershipEngine + engine *synchronizationEngine } // New returns a new Manager for creating Controllers. The provider is used to @@ -146,7 +146,7 @@ func New(config *rest.Config, provider multicluster.Provider, opts manager.Optio // WithMultiCluster wraps a host manager to run multi-cluster controllers. func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Manager, error) { - cfg := OwnershipConfig{ + cfg := SynchronizationConfig{ FenceNS: "kube-system", FencePrefix: "mcr-shard", PerClusterLease: true, LeaseDuration: 20 * time.Second, LeaseRenew: 10 * time.Second, FenceThrottle: 750 * time.Millisecond, PeerPrefix: "mcr-peer", PeerWeight: 1, Probe: 5 * time.Second, Rehash: 15 * time.Second, @@ -155,14 +155,14 @@ func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Mana pr := peers.NewLeaseRegistry(mgr.GetClient(), cfg.FenceNS, cfg.PeerPrefix, "", cfg.PeerWeight, mgr.GetLogger()) self := pr.Self() - eng := newOwnershipEngine( + eng := newSynchronizationEngine( mgr.GetClient(), mgr.GetLogger(), sharder.NewHRW(), pr, self, cfg, ) m := &mcManager{Manager: mgr, provider: provider, engine: eng} - // Start ownership loop as a manager Runnable. + // Start synchronization loop as a manager Runnable. if err := mgr.Add(eng.Runnable()); err != nil { return nil, err } diff --git a/pkg/manager/sharding.go b/pkg/manager/sharding.go index 90a1b850..a1fdc10a 100644 --- a/pkg/manager/sharding.go +++ b/pkg/manager/sharding.go @@ -64,8 +64,8 @@ func WithPerClusterLease(on bool) Option { } } -// WithOwnershipIntervals tunes the ownership probe/rehash cadences. -func WithOwnershipIntervals(probe, rehash time.Duration) Option { +// WithSynchronizationIntervals tunes the synchronization probe/rehash cadences. 
+func WithSynchronizationIntervals(probe, rehash time.Duration) Option { return func(m *mcManager) { if m.engine != nil { m.engine.cfg.Probe = probe diff --git a/pkg/manager/ownership.go b/pkg/manager/synchronization.go similarity index 84% rename from pkg/manager/ownership.go rename to pkg/manager/synchronization.go index e3e64fff..740ff75f 100644 --- a/pkg/manager/ownership.go +++ b/pkg/manager/synchronization.go @@ -33,7 +33,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/multicluster" ) -// OwnershipConfig holds the knobs for shard ownership and fencing. +// SynchronizationConfig holds the knobs for shard synchronization and fencing. // // Fencing: // - FenceNS/FencePrefix: namespace and name prefix for per-shard Lease objects @@ -49,9 +49,9 @@ import ( // - PeerWeight: relative capacity hint used by the sharder (0 treated as 1). // // Cadence: -// - Probe: periodic ownership tick interval (decision loop). +// - Probe: periodic synchronization tick interval (decision loop). // - Rehash: optional slower cadence for planned redistribution (unused in basic HRW). -type OwnershipConfig struct { +type SynchronizationConfig struct { // FenceNS is the namespace where fence Leases live (usually "kube-system"). FenceNS string // FencePrefix is the base Lease name for fences; with PerClusterLease it becomes @@ -72,13 +72,13 @@ type OwnershipConfig struct { // PeerWeight is this peer’s capacity hint for HRW (0 treated as 1). PeerWeight uint32 - // Probe is the ownership decision loop interval. + // Probe is the synchronization decision loop interval. Probe time.Duration // Rehash is an optional slower cadence for planned redistribution (currently unused). Rehash time.Duration } -// ownershipEngine makes ownership decisions and starts/stops per-cluster work. +// synchronizationEngine makes synchronization decisions and starts/stops per-cluster work. // // It combines: // - a peerRegistry (live peer snapshot), @@ -87,7 +87,7 @@ type OwnershipConfig struct { // // The engine keeps per-cluster engagement state, ties watches/workers to an // engagement context, and uses the fence to guarantee single-writer semantics. -type ownershipEngine struct { +type synchronizationEngine struct { // kube is the host cluster client used for Leases and provider operations. kube client.Client // log is the engine’s logger. @@ -98,8 +98,8 @@ type ownershipEngine struct { peers peers.Registry // self is this process’s identity/weight as known by the peer registry. self sharder.PeerInfo - // cfg holds all ownership/fencing configuration. - cfg OwnershipConfig + // cfg holds all synchronization/fencing configuration. + cfg SynchronizationConfig // mu guards engaged and runnables. mu sync.Mutex @@ -133,16 +133,16 @@ type engagement struct { nextTry time.Time } -// newOwnershipEngine wires an engine with its dependencies and initial config. -func newOwnershipEngine(kube client.Client, log logr.Logger, shard sharder.Sharder, peers peers.Registry, self sharder.PeerInfo, cfg OwnershipConfig) *ownershipEngine { - return &ownershipEngine{ +// newSynchronizationEngine wires an engine with its dependencies and initial config. 
+func newSynchronizationEngine(kube client.Client, log logr.Logger, shard sharder.Sharder, peers peers.Registry, self sharder.PeerInfo, cfg SynchronizationConfig) *synchronizationEngine { + return &synchronizationEngine{ kube: kube, log: log, sharder: shard, peers: peers, self: self, cfg: cfg, engaged: make(map[string]*engagement), } } -func (e *ownershipEngine) fenceName(cluster string) string { +func (e *synchronizationEngine) fenceName(cluster string) string { // Per-cluster fence: mcr-shard-; otherwise a single global fence if e.cfg.PerClusterLease { return fmt.Sprintf("%s-%s", e.cfg.FencePrefix, cluster) @@ -150,10 +150,10 @@ func (e *ownershipEngine) fenceName(cluster string) string { return e.cfg.FencePrefix } -// Runnable returns a Runnable that manages ownership of clusters. -func (e *ownershipEngine) Runnable() manager.Runnable { +// Runnable returns a Runnable that manages synchronization of clusters. +func (e *synchronizationEngine) Runnable() manager.Runnable { return manager.RunnableFunc(func(ctx context.Context) error { - e.log.Info("ownership runnable starting", "peer", e.self.ID) + e.log.Info("synchronization runnable starting", "peer", e.self.ID) errCh := make(chan error, 1) go func() { errCh <- e.peers.Run(ctx) }() e.recompute(ctx) @@ -166,15 +166,15 @@ func (e *ownershipEngine) Runnable() manager.Runnable { case err := <-errCh: return err case <-t.C: - e.log.V(1).Info("ownership tick", "peers", len(e.peers.Snapshot())) + e.log.V(1).Info("synchronization tick", "peers", len(e.peers.Snapshot())) e.recompute(ctx) } } }) } -// Engage registers a cluster for ownership management. -func (e *ownershipEngine) Engage(parent context.Context, name string, cl cluster.Cluster) error { +// Engage registers a cluster for synchronization management. +func (e *synchronizationEngine) Engage(parent context.Context, name string, cl cluster.Cluster) error { // If provider already canceled, don't engage a dead cluster. if err := parent.Err(); err != nil { return err @@ -243,8 +243,8 @@ func (e *ownershipEngine) Engage(parent context.Context, name string, cl cluster return nil } -// recomputeOwnership checks the current ownership state and starts/stops clusters as needed. -func (e *ownershipEngine) recompute(parent context.Context) { +// recompute checks the current synchronization state and starts/stops clusters as needed. +func (e *synchronizationEngine) recompute(parent context.Context) { peers := e.peers.Snapshot() self := e.self @@ -301,7 +301,7 @@ func (e *ownershipEngine) recompute(parent context.Context) { ctx, cancel := context.WithCancel(parent) engm.ctx, engm.cancel, engm.started = ctx, cancel, true starts = append(starts, toStart{name: engm.name, cl: engm.cl, ctx: ctx}) - e.log.Info("ownership start", "cluster", name, "peer", self.ID) + e.log.Info("synchronization start", "cluster", name, "peer", self.ID) case !should && engm.started: // stop + release fence @@ -314,7 +314,7 @@ func (e *ownershipEngine) recompute(parent context.Context) { engm.cancel = nil engm.started = false engm.nextTry = time.Time{} - e.log.Info("ownership stop", "cluster", name, "peer", self.ID) + e.log.Info("synchronization stop", "cluster", name, "peer", self.ID) } } for _, c := range stops { @@ -326,7 +326,7 @@ func (e *ownershipEngine) recompute(parent context.Context) { } // startForCluster engages all runnables for the given cluster. 
-func (e *ownershipEngine) startForCluster(ctx context.Context, name string, cl cluster.Cluster) { +func (e *synchronizationEngine) startForCluster(ctx context.Context, name string, cl cluster.Cluster) { for _, r := range e.runnables { if err := r.Engage(ctx, name, cl); err != nil { e.log.Error(err, "failed to engage", "cluster", name) @@ -342,4 +342,6 @@ func (e *ownershipEngine) startForCluster(ctx context.Context, name string, cl c } } -func (e *ownershipEngine) AddRunnable(r multicluster.Aware) { e.runnables = append(e.runnables, r) } +func (e *synchronizationEngine) AddRunnable(r multicluster.Aware) { + e.runnables = append(e.runnables, r) +} From 335a35b6c44d5ddbd981fce8ba0a6225c0615778 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 10:19:14 -0700 Subject: [PATCH 10/16] fix: remove configure in favor of opts --- examples/sharded-namespace/README.md | 13 ++++++++----- examples/sharded-namespace/main.go | 16 +++++++--------- pkg/manager/manager.go | 11 ++++++++--- pkg/manager/sharding.go | 14 ++------------ 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/examples/sharded-namespace/README.md b/examples/sharded-namespace/README.md index 3e0a3fa9..82537da9 100644 --- a/examples/sharded-namespace/README.md +++ b/examples/sharded-namespace/README.md @@ -75,7 +75,7 @@ kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=1 # Watch leases lose their holders as pods terminate -watch 'kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-"' +kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-" # Wait for all clusters to be owned by the single remaining pod (~30s) kubectl -n kube-system wait --for=jsonpath='{.spec.holderIdentity}'=sharded-namespace-0 \ @@ -97,7 +97,7 @@ Scale up to 3 replicas and watch synchronization rebalance: # Scale up kubectl -n mcr-demo scale statefulset/sharded-namespace --replicas=3 # Watch leases regain holders as pods start -watch 'kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-"' +kubectl -n kube-system get lease -o custom-columns=NAME:.metadata.name,HOLDER:.spec.holderIdentity | grep "^mcr-shard-" # Create a cm in the default ns which belongs to sharded-namespace-2 C=default @@ -110,17 +110,20 @@ kubectl -n mcr-demo logs pod/sharded-namespace-2 --since=100s | grep "Reconcilin In your example app (e.g., examples/sharded-namespace/main.go), configure fencing and timings: ```go -mcmanager.Configure(mgr, +mgr, err := mcmanager.New(cfg, provider, manager.Options{}, // Per-cluster fencing Leases live here as mcr-shard- mcmanager.WithShardLease("kube-system", "mcr-shard"), mcmanager.WithPerClusterLease(true), // enabled by default - + // Optional: tune fencing timings (duration, renew, throttle): // mcmanager.WithLeaseTimings(30*time.Second, 10*time.Second, 750*time.Millisecond), - + // Optional: peer weight for HRW: // mcmanager.WithPeerWeight(1), ) +if err != nil { + // handle error +} ``` The peer registry uses mcr-peer-* automatically and derives the peer ID from the pod hostname (StatefulSet ordinal). 
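For intuition about the HRW hashing that drives these per-cluster assignments, the sketch below shows one way rendezvous hashing can pick a single owner per cluster so that every peer reaches the same decision independently. This is a hypothetical illustration, not the repository's `pkg/manager/sharder/hrw.go`: the `shouldOwn` helper, the FNV-64a hash, and the simple weight multiplier (rather than proper logarithmic weighted-HRW scaling) are assumptions made for brevity.

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// PeerInfo mirrors the ID/weight shape used by the sharder in this patch.
type PeerInfo struct {
	ID     string
	Weight uint32
}

// shouldOwn scores hash(cluster, peer) for every peer and returns true when
// self has the highest score. Because the score depends only on the cluster
// name and the peer ID, all peers compute the same owner without coordinating,
// and membership changes only move the clusters that hashed to the peer that
// joined or left.
func shouldOwn(cluster string, peers []PeerInfo, self PeerInfo) bool {
	bestID := ""
	bestScore := -1.0
	for _, p := range peers {
		h := fnv.New64a()
		h.Write([]byte(cluster + "/" + p.ID))
		w := p.Weight
		if w == 0 {
			w = 1
		}
		// Simplified weighting: scale the raw hash by the peer's weight.
		score := float64(h.Sum64()) * float64(w)
		if score > bestScore || (score == bestScore && p.ID < bestID) {
			bestID, bestScore = p.ID, score
		}
	}
	return bestID == self.ID
}

func main() {
	peers := []PeerInfo{
		{ID: "sharded-namespace-0", Weight: 1},
		{ID: "sharded-namespace-1", Weight: 1},
	}
	for _, c := range []string{"zoo", "jungle", "island", "default"} {
		for _, p := range peers {
			if shouldOwn(c, peers, p) {
				fmt.Printf("%s -> %s\n", c, p.ID)
			}
		}
	}
}
```

Running two copies of this with the same peer list always prints the same owner per cluster, which is why the engine only needs the fencing Lease to guard the hand-off window, not to elect owners.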
diff --git a/examples/sharded-namespace/main.go b/examples/sharded-namespace/main.go index f4521c4d..94420126 100644 --- a/examples/sharded-namespace/main.go +++ b/examples/sharded-namespace/main.go @@ -52,20 +52,18 @@ func run(ctx context.Context) error { provider := namespace.New(host) // Multicluster manager (no peer ID passed; pod hostname becomes peer ID). - mgr, err := mcmanager.New(cfg, provider, manager.Options{}) - if err != nil { - return fmt.Errorf("create mc manager: %w", err) - } - // Configure sharding: - // - fencing prefix: "mcr-shard" (per-cluster Lease names become mcr-shard-) - // - peer membership still uses "mcr-peer" internally (set in WithMultiCluster) - // Peer ID defaults to os.Hostname(). - mcmanager.Configure(mgr, + // - fencing prefix: "mcr-shard" (per-cluster Lease names become mcr-shard-) + // - peer membership still uses "mcr-peer" internally (set in WithMultiCluster) + // Peer ID defaults to os.Hostname(). + mgr, err := mcmanager.New(cfg, provider, manager.Options{}, mcmanager.WithShardLease("kube-system", "mcr-shard"), // optional but explicit (your manager already defaults this to true) mcmanager.WithPerClusterLease(true), ) + if err != nil { + return fmt.Errorf("create mc manager: %w", err) + } // A simple controller that logs ConfigMaps per owned “cluster” (namespace). if err := mcbuilder.ControllerManagedBy(mgr). diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index c7d736d1..c16c82ea 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -136,16 +136,16 @@ type mcManager struct { // New returns a new Manager for creating Controllers. The provider is used to // discover and manage clusters. With a provider set to nil, the manager will // behave like a regular controller-runtime manager. -func New(config *rest.Config, provider multicluster.Provider, opts manager.Options) (Manager, error) { +func New(config *rest.Config, provider multicluster.Provider, opts manager.Options, mcOpts ...Option) (Manager, error) { mgr, err := manager.New(config, opts) if err != nil { return nil, err } - return WithMultiCluster(mgr, provider) + return WithMultiCluster(mgr, provider, mcOpts...) } // WithMultiCluster wraps a host manager to run multi-cluster controllers. -func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Manager, error) { +func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider, mcOpts ...Option) (Manager, error) { cfg := SynchronizationConfig{ FenceNS: "kube-system", FencePrefix: "mcr-shard", PerClusterLease: true, LeaseDuration: 20 * time.Second, LeaseRenew: 10 * time.Second, FenceThrottle: 750 * time.Millisecond, @@ -162,6 +162,11 @@ func WithMultiCluster(mgr manager.Manager, provider multicluster.Provider) (Mana m := &mcManager{Manager: mgr, provider: provider, engine: eng} + // Apply options before wiring the Runnable so overrides take effect early. + for _, o := range mcOpts { + o(m) + } + // Start synchronization loop as a manager Runnable. if err := mgr.Add(eng.Runnable()); err != nil { return nil, err diff --git a/pkg/manager/sharding.go b/pkg/manager/sharding.go index a1fdc10a..10ad93a3 100644 --- a/pkg/manager/sharding.go +++ b/pkg/manager/sharding.go @@ -22,8 +22,8 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" ) -// Option mutates mcManager after construction. -// Options must be applied after WithMultiCluster(...) and before Start(...). +// Option mutates mcManager configuration. 
+// Prefer passing options directly to New/WithMultiCluster so overrides apply before wiring runnables. type Option func(*mcManager) // WithSharder replaces the default HRW sharder. @@ -85,13 +85,3 @@ func WithLeaseTimings(duration, renew, throttle time.Duration) Option { } } } - -// Configure applies options to a Manager if it is an *mcManager. -// Must be called before Start(); options do not rewire the running peer registry. -func Configure(m Manager, opts ...Option) { - if x, ok := m.(*mcManager); ok { - for _, o := range opts { - o(x) - } - } -} From 83c6d32202188bc4f87109153e7eb6fb12de7496 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 10:19:47 -0700 Subject: [PATCH 11/16] fix: remove configure in favor of opts --- examples/sharded-namespace/main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sharded-namespace/main.go b/examples/sharded-namespace/main.go index 94420126..f5c966cf 100644 --- a/examples/sharded-namespace/main.go +++ b/examples/sharded-namespace/main.go @@ -53,9 +53,9 @@ func run(ctx context.Context) error { // Multicluster manager (no peer ID passed; pod hostname becomes peer ID). // Configure sharding: - // - fencing prefix: "mcr-shard" (per-cluster Lease names become mcr-shard-) - // - peer membership still uses "mcr-peer" internally (set in WithMultiCluster) - // Peer ID defaults to os.Hostname(). + // - fencing prefix: "mcr-shard" (per-cluster Lease names become mcr-shard-) + // - peer membership still uses "mcr-peer" internally (set in WithMultiCluster) + // Peer ID defaults to os.Hostname(). mgr, err := mcmanager.New(cfg, provider, manager.Options{}, mcmanager.WithShardLease("kube-system", "mcr-shard"), // optional but explicit (your manager already defaults this to true) From 9181aba193c85eebb66c83fdcf623cb0d16486ca Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 10:25:41 -0700 Subject: [PATCH 12/16] fix: ns/nm to namespace/name --- pkg/manager/leaseguard.go | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go index 4316f6a0..919409e3 100644 --- a/pkg/manager/leaseguard.go +++ b/pkg/manager/leaseguard.go @@ -53,22 +53,23 @@ import ( // failover. // // RBAC: the caller’s client must be allowed to get/list/watch/create/update/patch -// Leases in namespace `ns`. +// Leases in namespace. type leaseGuard struct { - c client.Client - ns, nm string - id string - ldur time.Duration // lease duration - renew time.Duration // renew period - onLost func() // callback when we lose the lease + client client.Client + namespace string + name string + id string + ldur time.Duration // lease duration + renew time.Duration // renew period + onLost func() // callback when we lose the lease held bool cancel context.CancelFunc } // newLeaseGuard builds a guard; it does not contact the API server. -func newLeaseGuard(c client.Client, ns, name, id string, ldur, renew time.Duration, onLost func()) *leaseGuard { - return &leaseGuard{c: c, ns: ns, nm: name, id: id, ldur: ldur, renew: renew, onLost: onLost} +func newLeaseGuard(client client.Client, namespace, name, id string, ldur, renew time.Duration, onLost func()) *leaseGuard { + return &leaseGuard{client: client, namespace: namespace, name: name, id: id, ldur: ldur, renew: renew, onLost: onLost} } // TryAcquire creates/adopts the Lease for g.id and starts renewing it. 
@@ -79,17 +80,17 @@ func (g *leaseGuard) TryAcquire(ctx context.Context) bool { return true } - key := types.NamespacedName{Namespace: g.ns, Name: g.nm} + key := types.NamespacedName{Namespace: g.namespace, Name: g.name} now := metav1.NowMicro() ldurSec := int32(g.ldur / time.Second) var ls coordinationv1.Lease - err := g.c.Get(ctx, key, &ls) + err := g.client.Get(ctx, key, &ls) switch { case apierrors.IsNotFound(err): ls = coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{Namespace: g.ns, Name: g.nm}, + ObjectMeta: metav1.ObjectMeta{Namespace: g.namespace, Name: g.name}, Spec: coordinationv1.LeaseSpec{ HolderIdentity: &g.id, LeaseDurationSeconds: &ldurSec, @@ -97,7 +98,7 @@ func (g *leaseGuard) TryAcquire(ctx context.Context) bool { RenewTime: &now, }, } - if err := g.c.Create(ctx, &ls); err != nil { + if err := g.client.Create(ctx, &ls); err != nil { return false } case err != nil: @@ -120,7 +121,7 @@ func (g *leaseGuard) TryAcquire(ctx context.Context) bool { ls.Spec.AcquireTime = &now } ls.Spec.RenewTime = &now - if err := g.c.Update(ctx, &ls); err != nil { + if err := g.client.Update(ctx, &ls); err != nil { return false } } @@ -158,7 +159,7 @@ func (g *leaseGuard) renewLoop(ctx context.Context, key types.NamespacedName) { func (g *leaseGuard) renewOnce(ctx context.Context, key types.NamespacedName) bool { now := metav1.NowMicro() var ls coordinationv1.Lease - if err := g.c.Get(ctx, key, &ls); err != nil { + if err := g.client.Get(ctx, key, &ls); err != nil { return false } // another holder? @@ -170,7 +171,7 @@ func (g *leaseGuard) renewOnce(ctx context.Context, key types.NamespacedName) bo ls.Spec.HolderIdentity = &g.id ls.Spec.LeaseDurationSeconds = &ldurSec ls.Spec.RenewTime = &now - if err := g.c.Update(ctx, &ls); err != nil { + if err := g.client.Update(ctx, &ls); err != nil { return false } return true @@ -186,14 +187,14 @@ func (g *leaseGuard) Release(ctx context.Context) { } g.held = false - key := types.NamespacedName{Namespace: g.ns, Name: g.nm} + key := types.NamespacedName{Namespace: g.namespace, Name: g.name} var ls coordinationv1.Lease - if err := g.c.Get(ctx, key, &ls); err == nil { + if err := g.client.Get(ctx, key, &ls); err == nil { if ls.Spec.HolderIdentity != nil && *ls.Spec.HolderIdentity == g.id { empty := "" ls.Spec.HolderIdentity = &empty // keep RenewTime/AcquireTime; just clear holder - _ = g.c.Update(ctx, &ls) // ignore errors + _ = g.client.Update(ctx, &ls) // ignore errors } } } From 80158258cb4f844b1d285d6f6421bdfe086a1007 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 10:29:17 -0700 Subject: [PATCH 13/16] fix: remove sleep in test --- pkg/manager/leaseguard_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/manager/leaseguard_test.go b/pkg/manager/leaseguard_test.go index 37f44060..b487b3a5 100644 --- a/pkg/manager/leaseguard_test.go +++ b/pkg/manager/leaseguard_test.go @@ -172,8 +172,6 @@ func TestRenewLoop_OnLostTriggersCallbackAndReleases(t *testing.T) { t.Fatalf("expected onLost to be called after renewal detects loss") } - // Give a moment for Release() to run - time.Sleep(50 * time.Millisecond) if g.held { t.Fatalf("guard should not be held after loss") } From 9588d26f436a3bd0512ba566592e679d03575b8c Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 10:49:29 -0700 Subject: [PATCH 14/16] fix: reorder loss handling (cleanup before notify) --- pkg/manager/leaseguard.go | 6 ++--- pkg/manager/leaseguard_test.go | 41 ++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 5 deletions(-) diff 
--git a/pkg/manager/leaseguard.go b/pkg/manager/leaseguard.go index 919409e3..563e3cd6 100644 --- a/pkg/manager/leaseguard.go +++ b/pkg/manager/leaseguard.go @@ -135,7 +135,7 @@ func (g *leaseGuard) TryAcquire(ctx context.Context) bool { } // Internal: renew loop and single-step renew. If renewal observes a different, -// valid holder or API errors persist, onLost() is invoked once and the fence is released. +// valid holder or API errors persist, the fence is released first and onLost() is invoked once. func (g *leaseGuard) renewLoop(ctx context.Context, key types.NamespacedName) { t := time.NewTicker(g.renew) defer t.Stop() @@ -145,11 +145,11 @@ func (g *leaseGuard) renewLoop(ctx context.Context, key types.NamespacedName) { return case <-t.C: if ok := g.renewOnce(ctx, key); !ok { - // best-effort notify once, then release + // best-effort: release, then notify once + g.Release(context.Background()) if g.onLost != nil { g.onLost() } - g.Release(context.Background()) return } } diff --git a/pkg/manager/leaseguard_test.go b/pkg/manager/leaseguard_test.go index b487b3a5..44365fb1 100644 --- a/pkg/manager/leaseguard_test.go +++ b/pkg/manager/leaseguard_test.go @@ -140,7 +140,7 @@ func TestTryAcquire_AdoptsWhenExpired(t *testing.T) { } } -func TestRenewLoop_OnLostTriggersCallbackAndReleases(t *testing.T) { +func TestRenewLoop_LossTriggersOnLostAndReleases(t *testing.T) { s := newScheme(t) c := fake.NewClientBuilder().WithScheme(s).Build() @@ -171,7 +171,6 @@ func TestRenewLoop_OnLostTriggersCallbackAndReleases(t *testing.T) { case <-time.After(2 * time.Second): t.Fatalf("expected onLost to be called after renewal detects loss") } - if g.held { t.Fatalf("guard should not be held after loss") } @@ -186,3 +185,41 @@ func TestRelease_NoHoldIsNoop(t *testing.T) { g.Release(context.Background()) // Nothing to assert other than "does not panic" } + +func TestRenewLoop_ReleasesBeforeOnLost(t *testing.T) { + s := newScheme(t) + c := fake.NewClientBuilder().WithScheme(s).Build() + + heldAtCallback := make(chan bool, 1) + var g *leaseGuard + g = newLeaseGuard(c, testNS, testName, testID, 3*time.Second, 50*time.Millisecond, func() { + heldAtCallback <- g.held + }) + + // Acquire + if ok := g.TryAcquire(context.Background()); !ok { + t.Fatalf("expected TryAcquire to succeed") + } + + // Flip lease to another holder so renewOnce observes loss. 
+ ls := getLease(t, c) + now := metav1.NowMicro() + dur := int32(3) + other := otherID + ls.Spec.HolderIdentity = &other + ls.Spec.LeaseDurationSeconds = &dur + ls.Spec.RenewTime = &now + if err := c.Update(context.Background(), ls); err != nil { + t.Fatalf("update lease: %v", err) + } + + // Expect callback to observe held == false (Release runs before onLost) + select { + case v := <-heldAtCallback: + if v { + t.Fatalf("expected g.held=false at onLost callback time") + } + case <-time.After(2 * time.Second): + t.Fatalf("expected onLost to be called after renewal detects loss") + } +} From 8b9beaf2169f7dfbae45d9ceca894ac1412358d3 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 12:28:30 -0700 Subject: [PATCH 15/16] feat: add pluggable peer registry --- pkg/manager/sharding.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/manager/sharding.go b/pkg/manager/sharding.go index 10ad93a3..cf0c7323 100644 --- a/pkg/manager/sharding.go +++ b/pkg/manager/sharding.go @@ -19,11 +19,11 @@ package manager import ( "time" + "sigs.k8s.io/multicluster-runtime/pkg/manager/peers" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" ) // Option mutates mcManager configuration. -// Prefer passing options directly to New/WithMultiCluster so overrides apply before wiring runnables. type Option func(*mcManager) // WithSharder replaces the default HRW sharder. @@ -85,3 +85,15 @@ func WithLeaseTimings(duration, renew, throttle time.Duration) Option { } } } + +// WithPeerRegistry injects a custom peer Registry. When set, it overrides the +// default Lease-based registry. Peer weight should be provided by the custom +// registry; WithPeerWeight does not apply. +func WithPeerRegistry(reg peers.Registry) Option { + return func(m *mcManager) { + if m.engine != nil && reg != nil { + m.engine.peers = reg + m.engine.self = reg.Self() + } + } +} From 2b5537acce6d42fe799d89d88fdc9da37d6f14fd Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 10 Sep 2025 12:29:19 -0700 Subject: [PATCH 16/16] chore: unit tests and env tests for sharding --- pkg/manager/manager_envtest_suite_test.go | 41 ++++++ pkg/manager/peers/registry.go | 3 + pkg/manager/peers/registry_test.go | 2 + pkg/manager/sharding_test.go | 60 +++++++++ pkg/manager/synchronization.go | 3 +- pkg/manager/synchronization_envtest_test.go | 93 ++++++++++++++ pkg/manager/synchronization_test.go | 132 ++++++++++++++++++++ pkg/util/util.go | 37 ++++++ 8 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 pkg/manager/manager_envtest_suite_test.go create mode 100644 pkg/manager/sharding_test.go create mode 100644 pkg/manager/synchronization_envtest_test.go create mode 100644 pkg/manager/synchronization_test.go create mode 100644 pkg/util/util.go diff --git a/pkg/manager/manager_envtest_suite_test.go b/pkg/manager/manager_envtest_suite_test.go new file mode 100644 index 00000000..e97c1a44 --- /dev/null +++ b/pkg/manager/manager_envtest_suite_test.go @@ -0,0 +1,41 @@ +package manager_test + +import ( + "testing" + + "k8s.io/client-go/rest" + + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestManager(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Manager Envtest Suite") +} + +var testEnv *envtest.Environment +var cfg *rest.Config + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + testEnv = &envtest.Environment{} + var err error + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + // Prevent the metrics listener being created + metricsserver.DefaultBindAddress = "0" +}) + +var _ = AfterSuite(func() { + if testEnv != nil { + Expect(testEnv.Stop()).To(Succeed()) + } + // Put the DefaultBindAddress back + metricsserver.DefaultBindAddress = ":8080" +}) diff --git a/pkg/manager/peers/registry.go b/pkg/manager/peers/registry.go index b8b50232..f5e929dd 100644 --- a/pkg/manager/peers/registry.go +++ b/pkg/manager/peers/registry.go @@ -34,6 +34,7 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" + "sigs.k8s.io/multicluster-runtime/pkg/util" ) const ( @@ -86,6 +87,8 @@ func NewLeaseRegistry(cli crclient.Client, ns, namePrefix string, selfID string, selfID = "unknown" } } + // Sanitize to DNS-1123 subdomain for Lease names: lowercase, [a-z0-9-.], start/end alphanumeric. + selfID = util.SanitizeDNS1123(selfID) if weight == 0 { weight = defaultWeight } diff --git a/pkg/manager/peers/registry_test.go b/pkg/manager/peers/registry_test.go index 19255807..f45c3fe8 100644 --- a/pkg/manager/peers/registry_test.go +++ b/pkg/manager/peers/registry_test.go @@ -29,6 +29,8 @@ import ( crclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + + _ "github.com/onsi/ginkgo/v2" ) const ( diff --git a/pkg/manager/sharding_test.go b/pkg/manager/sharding_test.go new file mode 100644 index 00000000..bdeed9da --- /dev/null +++ b/pkg/manager/sharding_test.go @@ -0,0 +1,60 @@ +package manager + +import ( + "context" + "testing" + "time" + + "github.com/go-logr/logr" + + coordinationv1 "k8s.io/api/coordination/v1" + "k8s.io/apimachinery/pkg/runtime" + + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" +) + +type fakeRegistry struct{ p sharder.PeerInfo } + +func (f *fakeRegistry) Self() sharder.PeerInfo { return f.p } +func (f *fakeRegistry) Snapshot() []sharder.PeerInfo { return []sharder.PeerInfo{f.p} } +func (f *fakeRegistry) Run(ctx context.Context) error { <-ctx.Done(); return ctx.Err() } + +func TestOptions_ApplyPeerRegistry(t *testing.T) { + s := runtime.NewScheme() + _ = coordinationv1.AddToScheme(s) + cli := fake.NewClientBuilder().WithScheme(s).Build() + + cfg := SynchronizationConfig{} + reg := &fakeRegistry{p: sharder.PeerInfo{ID: "x", Weight: 7}} + eng := newSynchronizationEngine(cli, logr.Discard(), sharder.NewHRW(), reg, reg.Self(), cfg) + m := &mcManager{Manager: nil, provider: nil, engine: eng} + + WithPeerRegistry(reg)(m) + if m.engine.peers != reg { + t.Fatalf("expected custom registry applied") + } + if m.engine.self != reg.Self() { + t.Fatalf("expected self to be updated from registry") + } +} + +func TestOptions_ApplyLeaseAndTimings(t *testing.T) { + m := &mcManager{engine: &synchronizationEngine{cfg: SynchronizationConfig{}}} + WithShardLease("ns", "name")(m) + WithPerClusterLease(true)(m) + WithLeaseTimings(30*time.Second, 10*time.Second, 750*time.Millisecond)(m) + WithSynchronizationIntervals(5*time.Second, 15*time.Second)(m) + + cfg := m.engine.cfg + if cfg.FenceNS != "ns" || cfg.FencePrefix != 
"name" || !cfg.PerClusterLease { + t.Fatalf("lease cfg not applied: %+v", cfg) + } + if cfg.LeaseDuration != 30*time.Second || cfg.LeaseRenew != 10*time.Second || cfg.FenceThrottle != 750*time.Millisecond { + t.Fatalf("timings not applied: %+v", cfg) + } + if cfg.Probe != 5*time.Second || cfg.Rehash != 15*time.Second { + t.Fatalf("cadence not applied: %+v", cfg) + } +} diff --git a/pkg/manager/synchronization.go b/pkg/manager/synchronization.go index 740ff75f..54e6f86e 100644 --- a/pkg/manager/synchronization.go +++ b/pkg/manager/synchronization.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/multicluster-runtime/pkg/manager/peers" "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + "sigs.k8s.io/multicluster-runtime/pkg/util" ) // SynchronizationConfig holds the knobs for shard synchronization and fencing. @@ -145,7 +146,7 @@ func newSynchronizationEngine(kube client.Client, log logr.Logger, shard sharder func (e *synchronizationEngine) fenceName(cluster string) string { // Per-cluster fence: mcr-shard-; otherwise a single global fence if e.cfg.PerClusterLease { - return fmt.Sprintf("%s-%s", e.cfg.FencePrefix, cluster) + return fmt.Sprintf("%s-%s", e.cfg.FencePrefix, util.SanitizeDNS1123(cluster)) } return e.cfg.FencePrefix } diff --git a/pkg/manager/synchronization_envtest_test.go b/pkg/manager/synchronization_envtest_test.go new file mode 100644 index 00000000..17a55f5c --- /dev/null +++ b/pkg/manager/synchronization_envtest_test.go @@ -0,0 +1,93 @@ +package manager_test + +import ( + "context" + "fmt" + "time" + + coordinationv1 "k8s.io/api/coordination/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/cluster" + "sigs.k8s.io/controller-runtime/pkg/manager" + + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/providers/namespace" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Synchronization engine envtest (namespace provider)", func() { + It("distributes leases and reconciles per-namespace", func(ctx SpecContext) { + // Build a direct client (no cache) for setup/reads + sch := runtime.NewScheme() + Expect(corev1.AddToScheme(sch)).To(Succeed()) + Expect(coordinationv1.AddToScheme(sch)).To(Succeed()) + directCli, err := client.New(cfg, client.Options{Scheme: sch}) + Expect(err).NotTo(HaveOccurred()) + + // Ensure kube-system exists for fencing leases + { + var ks corev1.Namespace + err := directCli.Get(ctx, client.ObjectKey{Name: "kube-system"}, &ks) + if apierrors.IsNotFound(err) { + Expect(directCli.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "kube-system"}})).To(Succeed()) + } else { + Expect(err).NotTo(HaveOccurred()) + } + } + + // Create N namespaces to act as clusters + nsNames := []string{"zoo", "jungle", "island"} + for _, n := range nsNames { + Expect(directCli.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: n}})).To(Succeed()) + } + + // Provider: namespaces as clusters (uses its own cache via cluster) + host, err := cluster.New(cfg) + Expect(err).NotTo(HaveOccurred()) + prov := namespace.New(host) + + // Build mc manager with short timings + m, err := mcmanager.New(cfg, prov, manager.Options{}, + mcmanager.WithShardLease("kube-system", "mcr-shard"), + mcmanager.WithPerClusterLease(true), + mcmanager.WithLeaseTimings(6*time.Second, 2*time.Second, 100*time.Millisecond), + mcmanager.WithSynchronizationIntervals(200*time.Millisecond, 1*time.Second), + ) + Expect(err).NotTo(HaveOccurred()) + + // Add a trivial runnable to exercise engagement + r := &noopRunnable{} + Expect(m.Add(r)).To(Succeed()) + + cctx, cancel := context.WithCancel(ctx) + defer cancel() + + // Start manager and provider + go func() { _ = prov.Run(cctx, m) }() + go func() { _ = host.Start(cctx) }() + go func() { _ = m.Start(cctx) }() + + // Eventually expect three leases with holders set (read via direct client) + Eventually(func(g Gomega) { + for _, n := range nsNames { + var ls coordinationv1.Lease + key := client.ObjectKey{Namespace: "kube-system", Name: fmt.Sprintf("mcr-shard-%s", n)} + g.Expect(directCli.Get(ctx, key, &ls)).To(Succeed()) + g.Expect(ls.Spec.HolderIdentity).NotTo(BeNil()) + g.Expect(*ls.Spec.HolderIdentity).NotTo(BeEmpty()) + } + }).WithTimeout(20 * time.Second).WithPolling(200 * time.Millisecond).Should(Succeed()) + }) +}) + +type noopRunnable struct{} + +func (n *noopRunnable) Start(ctx context.Context) error { <-ctx.Done(); return ctx.Err() } +func (n *noopRunnable) Engage(ctx context.Context, name string, cl cluster.Cluster) error { return nil } diff --git a/pkg/manager/synchronization_test.go b/pkg/manager/synchronization_test.go new file mode 100644 index 00000000..7d1e44f6 --- /dev/null +++ b/pkg/manager/synchronization_test.go @@ -0,0 +1,132 @@ +package manager + +import ( + "context" + "testing" + "time" + + "github.com/go-logr/logr" + + coordinationv1 "k8s.io/api/coordination/v1" + "k8s.io/apimachinery/pkg/runtime" + + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/cluster" + + "sigs.k8s.io/multicluster-runtime/pkg/manager/sharder" +) + +type stubSharder struct{ own bool } + +func (s *stubSharder) ShouldOwn(clusterID string, _ []sharder.PeerInfo, _ sharder.PeerInfo) bool { + return s.own +} + +type stubRegistry struct{ self sharder.PeerInfo } + +func (r *stubRegistry) Self() 
sharder.PeerInfo { return r.self } +func (r *stubRegistry) Snapshot() []sharder.PeerInfo { return []sharder.PeerInfo{r.self} } +func (r *stubRegistry) Run(ctx context.Context) error { <-ctx.Done(); return ctx.Err() } + +type stubRunnable struct{ called chan string } + +func (s *stubRunnable) Engage(ctx context.Context, name string, cl cluster.Cluster) error { + select { + case s.called <- name: + default: + } + return nil +} + +func TestSynchronization_StartsWhenShouldOwnAndFenceAcquired(t *testing.T) { + s := runtime.NewScheme() + if err := coordinationv1.AddToScheme(s); err != nil { + t.Fatalf("scheme: %v", err) + } + cli := fake.NewClientBuilder().WithScheme(s).Build() + + cfg := SynchronizationConfig{ + FenceNS: "kube-system", FencePrefix: "mcr-shard", PerClusterLease: true, + LeaseDuration: 3 * time.Second, LeaseRenew: 50 * time.Millisecond, FenceThrottle: 50 * time.Millisecond, + PeerPrefix: "mcr-peer", PeerWeight: 1, Probe: 10 * time.Millisecond, + } + reg := &stubRegistry{self: sharder.PeerInfo{ID: "peer-0", Weight: 1}} + sh := &stubSharder{own: true} + e := newSynchronizationEngine(cli, logr.Discard(), sh, reg, reg.self, cfg) + + sink := &stubRunnable{called: make(chan string, 1)} + e.AddRunnable(sink) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if err := e.Engage(ctx, "zoo", nil); err != nil { + t.Fatalf("engage: %v", err) + } + // Force a recompute to decide and start + e.recompute(ctx) + + select { + case name := <-sink.called: + if name != "zoo" { + t.Fatalf("expected engage for zoo, got %s", name) + } + case <-time.After(2 * time.Second): + t.Fatalf("expected runnable to be engaged") + } + + // Verify Lease created and held by self + var ls coordinationv1.Lease + key := client.ObjectKey{Namespace: cfg.FenceNS, Name: e.fenceName("zoo")} + if err := cli.Get(ctx, key, &ls); err != nil { + t.Fatalf("get lease: %v", err) + } + if ls.Spec.HolderIdentity == nil || *ls.Spec.HolderIdentity != reg.self.ID { + t.Fatalf("expected holder %q, got %+v", reg.self.ID, ls.Spec.HolderIdentity) + } +} + +func TestSynchronization_StopsAndReleasesWhenShouldOwnFalse(t *testing.T) { + s := runtime.NewScheme() + if err := coordinationv1.AddToScheme(s); err != nil { + t.Fatalf("scheme: %v", err) + } + cli := fake.NewClientBuilder().WithScheme(s).Build() + + cfg := SynchronizationConfig{ + FenceNS: "kube-system", FencePrefix: "mcr-shard", PerClusterLease: true, + LeaseDuration: 3 * time.Second, LeaseRenew: 50 * time.Millisecond, FenceThrottle: 50 * time.Millisecond, + PeerPrefix: "mcr-peer", PeerWeight: 1, Probe: 10 * time.Millisecond, + } + reg := &stubRegistry{self: sharder.PeerInfo{ID: "peer-0", Weight: 1}} + sh := &stubSharder{own: true} + e := newSynchronizationEngine(cli, logr.Discard(), sh, reg, reg.self, cfg) + + e.AddRunnable(&stubRunnable{called: make(chan string, 1)}) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if err := e.Engage(ctx, "zoo", nil); err != nil { + t.Fatalf("engage: %v", err) + } + e.recompute(ctx) // start and acquire lease + + // Flip ownership to false and recompute; engine should stop and release fence + sh.own = false + e.recompute(ctx) + + // Poll for lease holder cleared by Release() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + var ls coordinationv1.Lease + if err := cli.Get(ctx, client.ObjectKey{Namespace: cfg.FenceNS, Name: e.fenceName("zoo")}, &ls); err == nil { + if ls.Spec.HolderIdentity != nil && *ls.Spec.HolderIdentity == "" { + return + } + } + 
time.Sleep(10 * time.Millisecond) + } + t.Fatalf("expected lease holder to be cleared after release") +} diff --git a/pkg/util/util.go b/pkg/util/util.go new file mode 100644 index 00000000..8cea02b2 --- /dev/null +++ b/pkg/util/util.go @@ -0,0 +1,37 @@ +package util + +// SanitizeDNS1123 converts s to a DNS-1123 subdomain-compatible string for resource names. +// It lowercases, replaces unsupported characters with '-', and trims leading/trailing non-alphanumerics. +func SanitizeDNS1123(s string) string { + b := make([]rune, 0, len(s)) + for _, r := range s { + switch { + case r >= 'A' && r <= 'Z': + b = append(b, r+('a'-'A')) + case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '.': + b = append(b, r) + default: + b = append(b, '-') + } + } + // trim leading/trailing non-alphanumeric + start, end := 0, len(b) + for start < end { + r := b[start] + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { + break + } + start++ + } + for end > start { + r := b[end-1] + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') { + break + } + end-- + } + if start >= end { + return "unknown" + } + return string(b[start:end]) +}
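To make the lowercasing, replacement, and trimming rules of SanitizeDNS1123 concrete, here is a small table-driven sketch of the expected behavior; the test name and the specific cases are illustrative additions, not part of the patch.

```go
package util

import "testing"

func TestSanitizeDNS1123_Sketch(t *testing.T) {
	cases := map[string]string{
		"Sharded-Namespace-0": "sharded-namespace-0", // uppercase is lowered
		"peer_0":              "peer-0",              // '_' becomes '-'
		"--edge--":            "edge",                // leading/trailing non-alphanumerics trimmed
		"***":                 "unknown",             // nothing usable left after trimming
	}
	for in, want := range cases {
		if got := SanitizeDNS1123(in); got != want {
			t.Errorf("SanitizeDNS1123(%q) = %q, want %q", in, got, want)
		}
	}
}
```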