Skip to content

Commit 9159f70

Browse files
authored
[Feature] Disaster recovery (#590)
1 parent f74977e commit 9159f70

File tree

13 files changed

+309
-8
lines changed

13 files changed

+309
-8
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
- Add Label and Envs Pod customization
1111
- Improved JWT Rotation
1212
- Allow to customize Security Context in pods
13+
- Remove dead Coordinators in Cluster mode
14+
- Add AutoRecovery flag to recover cluster in case of deadlock
1315

1416
## [1.0.3](https://github.com/arangodb/kube-arangodb/tree/1.0.3) (2020-05-25)
1517
- Prevent deletion of not known PVC's

pkg/apis/deployment/v1/deployment_spec.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ type DeploymentSpec struct {
9494

9595
Chaos ChaosSpec `json:"chaos"`
9696

97+
Recovery *ArangoDeploymentRecoverySpec `json:"recovery,omitempty"`
98+
9799
Bootstrap BootstrapSpec `json:"bootstrap,omitempty"`
98100
}
99101

pkg/apis/deployment/v1/plan.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ const (
113113
ActionTypeJWTRefresh ActionType = "JWTRefresh"
114114
// ActionTypeJWTPropagated change propagated flag
115115
ActionTypeJWTPropagated ActionType = "JWTPropagated"
116+
// ActionTypeClusterMemberCleanup removes member from cluster
117+
ActionTypeClusterMemberCleanup ActionType = "ClusterMemberCleanup"
116118
)
117119

118120
const (
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package v1
24+
25+
import "github.com/arangodb/kube-arangodb/pkg/util"
26+
27+
// ArangoDeploymentRecoverySpec holds the recovery related settings of a deployment.
type ArangoDeploymentRecoverySpec struct {
	// AutoRecover is an optional flag; when it is not set, GetAutoRecover
	// falls back to false. The field is omitted from the serialized spec
	// while unset, so an empty spec is written as `{}` instead of
	// `{"autoRecover": null}`.
	AutoRecover *bool `json:"autoRecover,omitempty"`
}
30+
31+
func (a *ArangoDeploymentRecoverySpec) Get() ArangoDeploymentRecoverySpec {
32+
if a != nil {
33+
return *a
34+
}
35+
36+
return ArangoDeploymentRecoverySpec{}
37+
}
38+
39+
func (a ArangoDeploymentRecoverySpec) GetAutoRecover() bool {
40+
return util.BoolOrDefault(a.AutoRecover, false)
41+
}

pkg/apis/deployment/v1/server_group_spec.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ type ServerGroupSpecSecurityContext struct {
103103

104104
AllowPrivilegeEscalation *bool `json:"allowPrivilegeEscalation,omitempty"`
105105
Privileged *bool `json:"privileged,omitempty"`
106-
ReadOnlyRootFilesystem *bool `json:"readOnlyFileSystem,omitempty"`
106+
ReadOnlyRootFilesystem *bool `json:"readOnlyRootFilesystem,omitempty"`
107107
RunAsNonRoot *bool `json:"runAsNonRoot,omitempty"`
108108
RunAsUser *int64 `json:"runAsUser,omitempty"`
109109
RunAsGroup *int64 `json:"runAsGroup,omitempty"`

pkg/apis/deployment/v1/zz_generated.deepcopy.go

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package reconcile
24+
25+
import (
26+
"context"
27+
28+
"github.com/arangodb/go-driver"
29+
30+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
31+
"github.com/rs/zerolog"
32+
)
33+
34+
func init() {
35+
registerAction(api.ActionTypeClusterMemberCleanup, newClusterMemberCleanupAction)
36+
}
37+
38+
// newClusterMemberCleanupAction creates a new Action that implements the given
39+
// planned ClusterMemberCleanup action.
40+
func newClusterMemberCleanupAction(log zerolog.Logger, action api.Action, actionCtx ActionContext) Action {
41+
a := &actionClusterMemberCleanup{}
42+
43+
a.actionImpl = newActionImplDefRef(log, action, actionCtx, addMemberTimeout)
44+
45+
return a
46+
}
47+
48+
// actionClusterMemberCleanup implements an ClusterMemberCleanup.
49+
type actionClusterMemberCleanup struct {
50+
// actionImpl implement timeout and member id functions
51+
actionImpl
52+
53+
// actionEmptyCheckProgress implement check progress with empty implementation
54+
actionEmptyCheckProgress
55+
}
56+
57+
// Start performs the start of the action.
58+
// Returns true if the action is completely finished, false in case
59+
// the start time needs to be recorded and a ready condition needs to be checked.
60+
func (a *actionClusterMemberCleanup) Start(ctx context.Context) (bool, error) {
61+
if err := a.start(ctx); err != nil {
62+
a.log.Warn().Err(err).Msgf("Unable to clean cluster member")
63+
}
64+
65+
return true, nil
66+
}
67+
68+
func (a *actionClusterMemberCleanup) start(ctx context.Context) error {
69+
id := driver.ServerID(a.MemberID())
70+
71+
c, err := a.actionCtx.GetDatabaseClient(ctx)
72+
if err != nil {
73+
return err
74+
}
75+
76+
cluster, err := c.Cluster(ctx)
77+
if err != nil {
78+
return err
79+
}
80+
81+
health, err := cluster.Health(ctx)
82+
if err != nil {
83+
return err
84+
}
85+
86+
if _, ok := health.Health[id]; !ok {
87+
return nil
88+
}
89+
90+
if err := cluster.RemoveServer(ctx, id); err != nil {
91+
return err
92+
}
93+
94+
return nil
95+
}

pkg/deployment/reconcile/plan_builder.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,6 @@ func createPlan(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIOb
245245
plan = pb.Apply(createKeyfileRenewalPlan)
246246
}
247247

248-
// Check for the need to rotate TLS certificate of a members
249-
//if plan.IsEmpty() {
250-
// plan = pb.Apply(createRotateTLSServerCertificatePlan)
251-
//}
252-
253248
// Check for changes storage classes or requirements
254249
if plan.IsEmpty() {
255250
plan = pb.Apply(createRotateServerStoragePlan)
@@ -271,6 +266,10 @@ func createPlan(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIOb
271266
plan = pb.Apply(createCACleanPlan)
272267
}
273268

269+
if plan.IsEmpty() {
270+
plan = pb.Apply(createClusterOperationPlan)
271+
}
272+
274273
// Return plan
275274
return plan, true
276275
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package reconcile
24+
25+
import (
26+
"context"
27+
"time"
28+
29+
"github.com/arangodb/go-driver"
30+
31+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
32+
"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
33+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
34+
"github.com/rs/zerolog"
35+
)
36+
37+
// coordinatorHealthFailedTimeout is how long ago the last acknowledged
// heartbeat of a failed coordinator must be before its removal is planned.
const coordinatorHealthFailedTimeout = time.Minute
38+
39+
func createClusterOperationPlan(ctx context.Context,
40+
log zerolog.Logger, apiObject k8sutil.APIObject,
41+
spec api.DeploymentSpec, status api.DeploymentStatus,
42+
cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan {
43+
44+
if spec.GetMode() != api.DeploymentModeCluster {
45+
return nil
46+
}
47+
48+
c, err := context.GetDatabaseClient(ctx)
49+
if err != nil {
50+
return nil
51+
}
52+
53+
cluster, err := c.Cluster(ctx)
54+
if err != nil {
55+
log.Warn().Err(err).Msgf("Unable to get Cluster client")
56+
return nil
57+
}
58+
59+
health, err := cluster.Health(ctx)
60+
if err != nil {
61+
log.Warn().Err(err).Msgf("Unable to get Cluster health")
62+
return nil
63+
}
64+
65+
membersHealth := health.Health
66+
67+
status.Members.ForeachServerGroup(func(group api.ServerGroup, list api.MemberStatusList) error {
68+
for _, m := range list {
69+
delete(membersHealth, driver.ServerID(m.ID))
70+
}
71+
72+
return nil
73+
})
74+
75+
if len(membersHealth) == 0 {
76+
return nil
77+
}
78+
79+
for id, member := range membersHealth {
80+
switch member.Role {
81+
case driver.ServerRoleCoordinator:
82+
if member.Status != driver.ServerStatusFailed {
83+
continue
84+
}
85+
86+
if member.LastHeartbeatAcked.Add(coordinatorHealthFailedTimeout).Before(time.Now()) {
87+
return api.Plan{
88+
api.NewAction(api.ActionTypeClusterMemberCleanup, api.ServerGroupCoordinators, string(id)),
89+
}
90+
}
91+
}
92+
}
93+
94+
return nil
95+
}

pkg/deployment/reconcile/plan_builder_context.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ type PlanBuilderContext interface {
5858
RenderPodForMember(cachedStatus inspector.Inspector, spec api.DeploymentSpec, status api.DeploymentStatus, memberID string, imageInfo api.ImageInfo) (*core.Pod, error)
5959
// SelectImage select currently used image by pod
6060
SelectImage(spec api.DeploymentSpec, status api.DeploymentStatus) (api.ImageInfo, bool)
61+
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
62+
// creating one if needed.
63+
GetDatabaseClient(ctx context.Context) (driver.Client, error)
6164
// GetServerClient returns a cached client for a specific server.
6265
GetServerClient(ctx context.Context, group api.ServerGroup, id string) (driver.Client, error)
6366
// SecretsInterface return secret interface

0 commit comments

Comments
 (0)