Skip to content

Commit 453eb4f

Browse files
Merge pull request #5 from prajwalakhuj/monitoring-enabled
enabled basic alerts and slack integration
2 parents f41ad7d + fb1d616 commit 453eb4f

File tree

11 files changed

+513
-37
lines changed

11 files changed

+513
-37
lines changed

README.md

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ Features
2222

2323
7. Logging and Monitoring: Easily configure logging destinations for slow logs and engine logs, allowing you to monitor the performance and troubleshoot any issues efficiently.
2424

25+
8. CloudWatch Alerts: Set up CloudWatch alarms to monitor the health and performance of your Redis cluster. Integrate these alarms with AWS Simple Notification Service (SNS) to receive real-time alerts. Use AWS Lambda functions to customize your alerting logic, and send notifications to Slack channels for immediate visibility into your Redis cluster's status.
26+
27+
2528
## Usage Example
2629

2730
```hcl
@@ -31,22 +34,27 @@ module "redis" {
3134
environment = "production"
3235
name = "redis"
3336
family = "redis6.x"
34-
vpc_id = "vpc-06eb7eskaf"
35-
subnets = ["subnet-0bfa3eskaf","subnet-0140bskaf"]
36-
node_type = "cache.t3.small"
37-
kms_key_arn = "arn:aws:kms:us-east-2:222222222222:key/kms_key_arn"
38-
num_cache_nodes = 2
39-
engine_version = "6.x"
40-
multi_az_enabled = false
41-
availability_zones = 2
42-
automatic_failover_enabled = true
43-
snapshot_retention_limit = 7
44-
at_rest_encryption_enabled = true
45-
transit_encryption_enabled = false
46-
notification_topic_arn = null
47-
allowed_security_groups = [sg-0132a18skaf]
48-
snapshot_window = "07:00-08:00"
49-
maintenance_window = "sun:09:00-sun:10:00"
37+
vpc_id = "vpc-06eb7eskaf"
38+
subnets = ["subnet-0bfa3eskaf","subnet-0140bskaf"]
39+
node_type = "cache.t3.small"
40+
kms_key_arn = "arn:aws:kms:us-east-2:222222222222:key/kms_key_arn"
41+
num_cache_nodes = 2
42+
engine_version = "6.x"
43+
multi_az_enabled = false
44+
availability_zones = 2
45+
automatic_failover_enabled = true
46+
snapshot_retention_limit = 7
47+
transit_encryption_enabled = false
48+
notification_topic_arn = null
49+
allowed_security_groups = ["sg-0132a18skaf"]
50+
snapshot_window = "07:00-08:00"
51+
maintenance_window = "sun:09:00-sun:10:00"
52+
cloudwatch_metric_alarms_enabled = true # For enabling basic alerting
53+
alarm_cpu_threshold_percent = 70
54+
alarm_memory_threshold_bytes = 10000000 # in bytes
55+
slack_username = "john"
56+
slack_channel = "redis-alerts"
57+
slack_webhook_url = "https://hooks.slack.com/services/xxxxxxxxx"
5058
}
5159
5260
```
@@ -79,37 +87,52 @@ Security scanning is graciously provided by Prowler. Prowler is the leading ful
7987

8088
| Name | Version |
8189
|------|---------|
90+
| <a name="provider_archive"></a> [archive](#provider\_archive) | n/a |
8291
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 4.23 |
8392
| <a name="provider_random"></a> [random](#provider\_random) | >= 3.0.0 |
8493

8594
## Modules
8695

8796
| Name | Source | Version |
8897
|------|--------|---------|
98+
| <a name="module_cw_sns_slack"></a> [cw\_sns\_slack](#module\_cw\_sns\_slack) | ./lambda | n/a |
8999
| <a name="module_security_group_redis"></a> [security\_group\_redis](#module\_security\_group\_redis) | terraform-aws-modules/security-group/aws | 4.13.0 |
90100

91101
## Resources
92102

93103
| Name | Type |
94104
|------|------|
105+
| [aws_cloudwatch_metric_alarm.cache_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
106+
| [aws_cloudwatch_metric_alarm.cache_memory](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
95107
| [aws_elasticache_parameter_group.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_parameter_group) | resource |
96108
| [aws_elasticache_replication_group.redis](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_replication_group) | resource |
97109
| [aws_elasticache_subnet_group.elasticache](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_subnet_group) | resource |
110+
| [aws_kms_ciphertext.slack_url](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_ciphertext) | resource |
111+
| [aws_kms_key.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource |
112+
| [aws_lambda_permission.sns_lambda_slack_invoke](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
98113
| [aws_secretsmanager_secret.secret_redis](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
99114
| [aws_security_group_rule.cidr_ingress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
100115
| [aws_security_group_rule.default_ingress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
116+
| [aws_sns_topic.slack_topic](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic) | resource |
117+
| [aws_sns_topic_subscription.slack-endpoint](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource |
101118
| [random_password.password](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
119+
| [archive_file.lambdazip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
102120
| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
103121

104122
## Inputs
105123

106124
| Name | Description | Type | Default | Required |
107125
|------|-------------|------|---------|:--------:|
126+
| <a name="input_alarm_actions"></a> [alarm\_actions](#input\_alarm\_actions) | Alarm action list | `list(string)` | `[]` | no |
127+
| <a name="input_alarm_cpu_threshold_percent"></a> [alarm\_cpu\_threshold\_percent](#input\_alarm\_cpu\_threshold\_percent) | CPU threshold alarm level | `number` | `75` | no |
128+
| <a name="input_alarm_memory_threshold_bytes"></a> [alarm\_memory\_threshold\_bytes](#input\_alarm\_memory\_threshold\_bytes) | Ram threshold alarm level in bytes | `number` | `10000000` | no |
108129
| <a name="input_allowed_cidr_blocks"></a> [allowed\_cidr\_blocks](#input\_allowed\_cidr\_blocks) | A list of CIDR blocks which are allowed to access the database | `list(any)` | `[]` | no |
109130
| <a name="input_allowed_security_groups"></a> [allowed\_security\_groups](#input\_allowed\_security\_groups) | A list of Security Group ID's to allow access to | `list(any)` | `[]` | no |
110131
| <a name="input_at_rest_encryption_enabled"></a> [at\_rest\_encryption\_enabled](#input\_at\_rest\_encryption\_enabled) | (Optional) Whether to enable encryption at rest | `bool` | `true` | no |
111132
| <a name="input_automatic_failover_enabled"></a> [automatic\_failover\_enabled](#input\_automatic\_failover\_enabled) | Enable automatic failover | `bool` | `true` | no |
112133
| <a name="input_availability_zones"></a> [availability\_zones](#input\_availability\_zones) | The no. of AZs | `string` | `2` | no |
134+
| <a name="input_cloudwatch_metric_alarms_enabled"></a> [cloudwatch\_metric\_alarms\_enabled](#input\_cloudwatch\_metric\_alarms\_enabled) | Boolean flag to enable/disable CloudWatch metrics alarms | `bool` | `false` | no |
135+
| <a name="input_cw_sns_topic_arn"></a> [cw\_sns\_topic\_arn](#input\_cw\_sns\_topic\_arn) | The ARN of the SNS topic to use for CloudWatch alarm notifications. | `string` | `""` | no |
113136
| <a name="input_engine_log_destination"></a> [engine\_log\_destination](#input\_engine\_log\_destination) | The destination for engine logs(eg. Cloudwatch log-group name or kinesis firehose stream name) | `string` | `null` | no |
114137
| <a name="input_engine_log_destination_type"></a> [engine\_log\_destination\_type](#input\_engine\_log\_destination\_type) | The type of destination for engine logs(eg . cloudwatch-logs or kinesis-firehose) | `string` | `""` | no |
115138
| <a name="input_engine_log_format"></a> [engine\_log\_format](#input\_engine\_log\_format) | the format for logs eg. json/text | `string` | `"json"` | no |
@@ -124,9 +147,13 @@ Security scanning is graciously provided by Prowler. Prowler is the leading ful
124147
| <a name="input_node_type"></a> [node\_type](#input\_node\_type) | The instance size of the redis cluster | `string` | `"cache.t3.micro"` | no |
125148
| <a name="input_notification_topic_arn"></a> [notification\_topic\_arn](#input\_notification\_topic\_arn) | (Optional) ARN of an SNS topic to send ElastiCache notifications | `string` | `null` | no |
126149
| <a name="input_num_cache_nodes"></a> [num\_cache\_nodes](#input\_num\_cache\_nodes) | The number of cache nodes | `number` | `1` | no |
150+
| <a name="input_ok_actions"></a> [ok\_actions](#input\_ok\_actions) | The list of actions to execute when this alarm transitions into an OK state from any other state. Each action is specified as an Amazon Resource Name (ARN) | `list(string)` | `[]` | no |
127151
| <a name="input_parameter_group_description"></a> [parameter\_group\_description](#input\_parameter\_group\_description) | Parameter group | `string` | `null` | no |
128152
| <a name="input_port"></a> [port](#input\_port) | The redis port | `number` | `6379` | no |
129153
| <a name="input_recovery_window_aws_secret"></a> [recovery\_window\_aws\_secret](#input\_recovery\_window\_aws\_secret) | Number of days that AWS Secrets Manager waits before it can delete the secret. This value can be 0 to force deletion without recovery or range from 7 to 30 days. | `number` | `0` | no |
154+
| <a name="input_slack_channel"></a> [slack\_channel](#input\_slack\_channel) | The Slack channel where notifications will be posted. | `string` | `""` | no |
155+
| <a name="input_slack_username"></a> [slack\_username](#input\_slack\_username) | The username to use when sending notifications to Slack. | `string` | `""` | no |
156+
| <a name="input_slack_webhook_url"></a> [slack\_webhook\_url](#input\_slack\_webhook\_url) | The Slack Webhook URL where notifications will be sent. | `string` | `""` | no |
130157
| <a name="input_slow_log_destination"></a> [slow\_log\_destination](#input\_slow\_log\_destination) | The destination for slow logs(eg. Cloudwatch log-group name or kinesis firehose stream name.) | `string` | `null` | no |
131158
| <a name="input_slow_log_destination_type"></a> [slow\_log\_destination\_type](#input\_slow\_log\_destination\_type) | The type of destination for slow logs(eg . cloudwatch-logs or kinesis-firehose) | `string` | `""` | no |
132159
| <a name="input_slow_log_format"></a> [slow\_log\_format](#input\_slow\_log\_format) | the format for logs eg. json/text | `string` | `"json"` | no |

examples/complete/main.tf

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
locals {
22
name = "redis"
3-
region = "us-east-1"
3+
region = "us-east-2"
44
family = "redis6.x"
55
node_type = "cache.t3.small"
6-
vpc_id = "vpc-06f1a2f3a7"
7-
subnet_ids = ["subnet-0bb128ab", "subnet-0b54928666a"]
8-
kms_key_arn = "arn:aws:kms:us-east-1:2222222222:key/bcfdc1c5-1bbbdb467d90"
6+
vpc_id = "vpc-0220830b5260698db"
7+
subnet_ids = ["subnet-0d4dee4a7ea31a96d", "subnet-07fdc14616382f833"]
8+
kms_key_arn = ""
99
environment = "prod"
1010
redis_engine_version = "6.0"
11-
allowed_security_groups = ["sg-0e8dab08e40"]
11+
allowed_security_groups = ["sg-02c3f55874f6e0c64"]
1212
additional_tags = {
1313
Owner = "Organization_Name"
1414
Expires = "Never"
@@ -17,19 +17,25 @@ locals {
1717
}
1818

1919
module "redis" {
20-
source = "squareops/elasticache-redis/aws"
21-
name = local.name
22-
family = local.family
23-
node_type = local.node_type
24-
environment = local.environment
25-
engine_version = local.redis_engine_version
26-
num_cache_nodes = 2
27-
vpc_id = local.vpc_id
28-
subnets = local.subnet_ids
29-
kms_key_arn = local.kms_key_arn
30-
multi_az_enabled = false
31-
availability_zones = 2
32-
snapshot_window = "07:00-08:00"
33-
maintenance_window = "sun:09:00-sun:10:00"
34-
allowed_security_groups = local.allowed_security_groups
20+
source = "squareops/elasticache-redis/aws"
21+
name = local.name
22+
family = local.family
23+
node_type = local.node_type
24+
environment = local.environment
25+
engine_version = local.redis_engine_version
26+
num_cache_nodes = 2
27+
vpc_id = local.vpc_id
28+
subnets = local.subnet_ids
29+
kms_key_arn = local.kms_key_arn
30+
multi_az_enabled = false
31+
availability_zones = 2
32+
snapshot_window = "07:00-08:00"
33+
maintenance_window = "sun:09:00-sun:10:00"
34+
allowed_security_groups = local.allowed_security_groups
35+
cloudwatch_metric_alarms_enabled = true
36+
alarm_cpu_threshold_percent = 70
37+
alarm_memory_threshold_bytes = "10000000" # in bytes
38+
slack_username = ""
39+
slack_channel = ""
40+
slack_webhook_url = ""
3541
}

lambda/README.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
## Lambda for SNS
2+
![squareops_avatar]
3+
4+
[squareops_avatar]: https://squareops.com/wp-content/uploads/2022/12/squareops-logo.png
5+
6+
### [SquareOps Technologies](https://squareops.com/) Your DevOps Partner for Accelerating cloud journey.
7+
<br>
8+
9+
This module deploys the Lambda function that calls the Slack webhook and passes the alarm message as the payload.
10+
<!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
11+
## Requirements
12+
13+
No requirements.
14+
15+
## Providers
16+
17+
| Name | Version |
18+
|------|---------|
19+
| <a name="provider_aws"></a> [aws](#provider\_aws) | n/a |
20+
21+
## Modules
22+
23+
No modules.
24+
25+
## Resources
26+
27+
| Name | Type |
28+
|------|------|
29+
| [aws_cloudwatch_log_group.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
30+
| [aws_iam_role.lambda_exec_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
31+
| [aws_iam_role_policy.lambda_cwl_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
32+
| [aws_lambda_function.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
33+
| [aws_iam_policy_document.lambda_cwl_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
34+
| [aws_iam_policy_document.lambda_exec_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
35+
36+
## Inputs
37+
38+
| Name | Description | Type | Default | Required |
39+
|------|-------------|------|---------|:--------:|
40+
| <a name="input_artifact_file"></a> [artifact\_file](#input\_artifact\_file) | The path to the function's deployment package within the local filesystem | `string` | `null` | no |
41+
| <a name="input_cwl_retention_days"></a> [cwl\_retention\_days](#input\_cwl\_retention\_days) | The retention time in days for the CloudWatch Logs Stream. | `number` | `30` | no |
42+
| <a name="input_description"></a> [description](#input\_description) | Description of what the Lambda Function does. | `string` | `null` | no |
43+
| <a name="input_environment"></a> [environment](#input\_environment) | The Lambda environment's configuration settings. | `map(string)` | `{}` | no |
44+
| <a name="input_handler"></a> [handler](#input\_handler) | The function entrypoint in the code. | `string` | `"index.handler"` | no |
45+
| <a name="input_memory_size"></a> [memory\_size](#input\_memory\_size) | Amount of memory in MB your Lambda Function can use at runtime. | `number` | `128` | no |
46+
| <a name="input_name"></a> [name](#input\_name) | A unique name for the Lambda Function. | `string` | n/a | yes |
47+
| <a name="input_runtime"></a> [runtime](#input\_runtime) | The Runtime used in the Lambda Function. | `string` | n/a | yes |
48+
| <a name="input_tags"></a> [tags](#input\_tags) | A mapping of tags to assign to the module resources. | `map(string)` | `{}` | no |
49+
| <a name="input_timeout"></a> [timeout](#input\_timeout) | The amount of time your Lambda Function has to run in seconds. | `number` | `6` | no |
50+
51+
## Outputs
52+
53+
| Name | Description |
54+
|------|-------------|
55+
| <a name="output_arn"></a> [arn](#output\_arn) | The ARN identifying the Lambda Function. |
56+
| <a name="output_exec_role_id"></a> [exec\_role\_id](#output\_exec\_role\_id) | The ID of the Function's IAM Role. |
57+
| <a name="output_invoke_arn"></a> [invoke\_arn](#output\_invoke\_arn) | The ARN to be used for invoking Lambda Function from API Gateway. |
58+
| <a name="output_name"></a> [name](#output\_name) | The name of the Lambda Function. |
59+
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->

lambda/data.tf

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Lambda Assume Role policy
2+
data "aws_iam_policy_document" "lambda_exec_role_policy" {
3+
statement {
4+
sid = "LambdaExecRolePolicy"
5+
effect = "Allow"
6+
principals {
7+
identifiers = [
8+
"lambda.amazonaws.com",
9+
]
10+
type = "Service"
11+
}
12+
actions = [
13+
"sts:AssumeRole",
14+
]
15+
}
16+
}
17+
18+
# Lambda CloudWatch Logs access
19+
data "aws_iam_policy_document" "lambda_cwl_access" {
20+
statement {
21+
sid = "LambdaCreateCloudWatchLogGroup"
22+
effect = "Allow"
23+
actions = [
24+
"logs:PutLogEvents",
25+
"logs:CreateLogStream",
26+
"logs:CreateLogGroup"
27+
]
28+
resources = [
29+
"arn:aws:logs:*:*:log-group:/aws/lambda/*:*:*"
30+
]
31+
}
32+
}

lambda/iam.tf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
resource "aws_iam_role" "lambda_exec_role" {
2+
name = "${replace(title(var.name), "-", "")}LambdaExecRole"
3+
assume_role_policy = data.aws_iam_policy_document.lambda_exec_role_policy.json
4+
}
5+
6+
resource "aws_iam_role_policy" "lambda_cwl_policy" {
7+
name = "${replace(title(var.name), "-", "")}LambdaCWLogsPolicy"
8+
role = aws_iam_role.lambda_exec_role.id
9+
policy = data.aws_iam_policy_document.lambda_cwl_access.json
10+
}

lambda/main.tf

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
resource "aws_cloudwatch_log_group" "lambda" {
2+
name = "/aws/lambda/${var.name}"
3+
retention_in_days = var.cwl_retention_days
4+
tags = var.tags
5+
}
6+
7+
resource "aws_lambda_function" "this" {
8+
function_name = var.name
9+
description = var.description
10+
filename = var.artifact_file
11+
source_code_hash = var.artifact_file != null ? filebase64sha256(var.artifact_file) : null
12+
role = aws_iam_role.lambda_exec_role.arn
13+
handler = var.handler
14+
runtime = var.runtime
15+
memory_size = var.memory_size
16+
timeout = var.timeout
17+
18+
dynamic "environment" {
19+
for_each = (length(var.environment) > 0 ? [1] : [])
20+
content {
21+
variables = var.environment
22+
}
23+
}
24+
25+
tags = var.tags
26+
}

lambda/outputs.tf

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
output "name" {
2+
description = "The name of the Lambda Function."
3+
value = aws_lambda_function.this.function_name
4+
}
5+
6+
output "arn" {
7+
description = "The ARN identifying the Lambda Function."
8+
value = aws_lambda_function.this.arn
9+
}
10+
11+
output "invoke_arn" {
12+
description = "The ARN to be used for invoking Lambda Function from API Gateway."
13+
value = aws_lambda_function.this.invoke_arn
14+
}
15+
16+
output "exec_role_id" {
17+
description = "The ID of the Function's IAM Role."
18+
value = aws_iam_role.lambda_exec_role.id
19+
}

0 commit comments

Comments
 (0)