Skip to content
This repository has been archived by the owner on Nov 14, 2024. It is now read-only.

Feature/alarm creator #24

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
104a3ab
test
Jerpen80 Apr 11, 2024
9efa539
Update variables.tf
Jerpen80 Apr 11, 2024
5c661a5
test2
Jerpen80 Apr 11, 2024
63a232a
test3
Jerpen80 Apr 12, 2024
0e17400
test4
Jerpen80 Apr 12, 2024
bb98b12
test5
Jerpen80 Apr 12, 2024
fe40b1d
test6
Jerpen80 Apr 12, 2024
07eadc1
test7
Jerpen80 Apr 12, 2024
9872af0
test8
Jerpen80 Apr 12, 2024
e35f5b0
test9
Jerpen80 Apr 12, 2024
964587d
bucket policies
Jerpen80 Apr 12, 2024
20f3f65
bucket policies
Jerpen80 Apr 12, 2024
8cd740c
Update variables.tf with description how to pass file
Jerpen80 Apr 12, 2024
44fcfa0
Update variables.tf
Jerpen80 Apr 12, 2024
29fba5f
added default alarmsfile
Jerpen80 Apr 12, 2024
16fae61
fixed bug
Jerpen80 Apr 12, 2024
79f96dd
s3 putbucketpolicy issue
Jerpen80 Apr 12, 2024
c47adf2
added dependency
Jerpen80 Apr 12, 2024
6f2246f
removed duplicate data block
Jerpen80 Apr 12, 2024
0625b48
open bucket policy
Jerpen80 Apr 12, 2024
eb82ea4
added lambda s3 permissions
Jerpen80 Apr 12, 2024
e1f620a
removed public access
Jerpen80 Apr 12, 2024
fe0abe5
added default bucketname
Jerpen80 Apr 12, 2024
04b41b1
fiddling with bucket access permissions
Jerpen80 Apr 12, 2024
96f5a05
Added lambda trigger for s3 upload
Jerpen80 May 2, 2024
b1d33b5
removed global vars to avoid caching
Jerpen80 May 2, 2024
646198d
replacing obsolte alarms
Jerpen80 May 2, 2024
a192f10
remove all non autosclaing alarms then create alarms
Jerpen80 May 6, 2024
57e33d7
fixed dimensions
Jerpen80 Aug 15, 2024
4fc635c
added elasticache permissions
Jerpen80 Aug 15, 2024
0bf2277
added elasticache alarms
Jerpen80 Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 81 additions & 58 deletions alarm_creator/actions.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,100 @@
import boto3, json

from pip import main
import boto3, json, os

# Create boto3 clients once at import time; the functions in this module
# use these module-level handles instead of creating their own.
CWclient = boto3.client("cloudwatch")
ec2 = boto3.resource("ec2")  # resource-style EC2 handle (ec2client is the low-level client)
rds = boto3.client("rds")
ec2client = boto3.client("ec2")
ecsclient = boto3.client("ecs")
s3client = boto3.client("s3")
elasticlient = boto3.client("elasticache")

# Name of the S3 bucket that holds alarms.json. Indexing os.environ raises
# KeyError at import time when BUCKET_NAME is not set.
alarm_bucket = os.environ["BUCKET_NAME"]

# Load json file containing the alarms
with open('./alarms.json') as alarms_file:
alarms = json.load(alarms_file)
def get_alarm_params():
    """Fetch and parse the alarm definitions stored in S3.

    Reads the object with key ``alarms.json`` from the bucket named by the
    module-level ``alarm_bucket`` and returns its body decoded as JSON.
    """
    s3_object = s3client.get_object(Bucket=alarm_bucket, Key='alarms.json')
    # json.load reads directly from the response body stream.
    return json.load(s3_object.get('Body'))

# Alarm creator
# Multipliers that turn the human-friendly MB/GB thresholds from the alarms
# file into the byte values CloudWatch needs; any other unit is used as-is.
_THRESHOLD_UNIT_FACTORS = {"GB": 1000000000, "MB": 1000000}


def _get_service_instances(service):
    """Return the running resource ids for *service*, or None if it is unknown."""
    if service in ("EC2", "CWAgent"):
        return GetRunningInstances()
    if service == "RDS":
        return GetRunningDBInstances()
    if service == "ECS":
        return GetRunningClusters()
    if service == "ElastiCache":
        return GetRunningCacheClusters()
    return None


def AWS_Alarms():
    """Create CloudWatch alarms for every running resource of each service.

    The alarm definitions come from ``get_alarm_params()`` and are laid out
    as ``{service: {alarm_name: settings}}``. For each alarm whose metric
    was recently active in its namespace, one alarm per (priority,
    threshold) pair is created for every running resource of the service.

    Services that are not EC2, RDS, ECS, ElastiCache or CWAgent are skipped
    with a log line instead of reusing the previous service's resources.
    """
    alarms = get_alarm_params()

    for service, service_alarms in alarms.items():
        instances = _get_service_instances(service)
        if instances is None:
            print(f"Skipping unknown service in alarms file: {service}")
            continue
        if not instances:
            # Nothing is running for this service, so no alarm can be created.
            continue

        for alarm, config in service_alarms.items():
            # Filter on the metric name server-side: one call answers "was
            # this metric recently active?" without scanning the namespace.
            response = CWclient.list_metrics(
                Namespace=f"{config['Namespace']}",
                MetricName=config['MetricName'],
                RecentlyActive='PT3H',
            )
            if not response["Metrics"]:
                continue

            description = config['Description']
            unit_factor = _THRESHOLD_UNIT_FACTORS.get(description['ThresholdUnit'], 1)
            thresholds = config['AlarmThresholds']
            # Optional extra dimensions (e.g. disk path) appended after the
            # per-resource dimension.
            extra_dimensions = config.get('ExtraDimensions', [])

            for priority, threshold in zip(thresholds["priority"], thresholds["alarm_threshold"]):
                # Alarm names show MB/GB; the actual threshold must be in bytes.
                cw_threshold = int(threshold) * unit_factor

                for instance in instances:
                    dimensionlist = [
                        {"Name": f"{config['Dimensions']}", "Value": instance},
                        *extra_dimensions,
                    ]

                    # Each alarm is written exactly once; put_metric_alarm
                    # overwrites an existing alarm with the same name.
                    CWclient.put_metric_alarm(
                        AlarmName=f"{instance}-{alarm} {description['Operatorsymbol']} {threshold} {description['ThresholdUnit']}",
                        ComparisonOperator=config['ComparisonOperator'],
                        EvaluationPeriods=config['EvaluationPeriods'],
                        MetricName=config['MetricName'],
                        Namespace=config['Namespace'],
                        Period=config['Period'],
                        Statistic=config['Statistic'],
                        Threshold=cw_threshold,
                        ActionsEnabled=True,
                        TreatMissingData=config['TreatMissingData'],
                        AlarmDescription=f"{priority}",
                        Dimensions=dimensionlist,
                        Tags=[{"Key": "CreatedbyLambda", "Value": "True"}],
                    )

def GetRunningInstances():
get_running_instances = ec2client.describe_instances(
Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
)

# instantiate empty array to store instance-id's
# Instantiate empty array to store instance-id's
RunningInstances = []

# create an array with a list of instance names
# Create an array with a list of instance names
for reservations in get_running_instances["Reservations"]:
for instance in reservations["Instances"]:
RunningInstances.append(instance["InstanceId"])
Expand All @@ -109,17 +118,28 @@ def GetRunningClusters():

return RunningClusterNames

def GetRunningCacheClusters():
    """Return the ``CacheClusterId`` of every ElastiCache cluster.

    Uses the paginator so clusters beyond the first result page are
    included. No status filter is applied: every cluster returned by
    ``describe_cache_clusters`` is listed, whatever its state.
    """
    paginator = elasticlient.get_paginator("describe_cache_clusters")
    RunningCacheClusters = []
    for page in paginator.paginate():
        RunningCacheClusters.extend(
            cachecluster['CacheClusterId'] for cachecluster in page["CacheClusters"]
        )

    return RunningCacheClusters

def DeleteAlarms():
get_alarm_info = CWclient.describe_alarms()
RunningInstances = GetRunningInstances()
RunningRDSInstances = GetRunningDBInstances()
RunningClusters = GetRunningClusters()
RunningCacheClusters = GetRunningCacheClusters()

# collect alarm metrics and compare alarm metric instanceId with instance id's in array. if the state reason is breaching and instance does not exist delete alarm.
# Collect alarm metrics and compare alarm metric instanceId with instance id's in array.
# If the state reason is breaching and instance does not exist, delete alarm.
for metricalarm in get_alarm_info["MetricAlarms"]:
instance_id = list(filter(lambda x: x["Name"] == "InstanceId", metricalarm["Dimensions"]))
rds_instance_name = list(filter(lambda x: x["Name"] == "DBInstanceIdentifier", metricalarm["Dimensions"]))
cluster_name = list(filter(lambda x: x["Name"] == "ClusterName", metricalarm["Dimensions"]))
cache_cluster_name = list(filter(lambda x: x["Name"] == "CacheClusterId", metricalarm["Dimensions"]))

if len(instance_id) == 1:
if instance_id[0]["Value"] not in RunningInstances:
Expand All @@ -130,3 +150,6 @@ def DeleteAlarms():
elif len(cluster_name) == 1:
if cluster_name[0]["Value"] not in RunningClusters:
CWclient.delete_alarms(AlarmNames=[metricalarm["AlarmName"]])
elif len(cache_cluster_name) == 1:
if cache_cluster_name[0]["Value"] not in RunningCacheClusters:
CWclient.delete_alarms(AlarmNames=[metricalarm["AlarmName"]])
78 changes: 78 additions & 0 deletions alarms_s3.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Bucket that stores the alarm definitions file for the alarm creator.
resource "aws_s3_bucket" "alarm_bucket" {
  #bucket_prefix = "alarm-creator-alarms"
  # The bucket name starts with this prefix; the rest is generated.
  bucket_prefix = var.alarm_bucket_name

  # Let the bucket be destroyed even when it still contains objects.
  force_destroy = true
}

# Attaches the rendered alarm bucket policy document to the alarm bucket.
resource "aws_s3_bucket_policy" "alarm_bucket_policy" {
  bucket = aws_s3_bucket.alarm_bucket.id
  policy = data.aws_iam_policy_document.alarm_bucket_policy_doc.json

  # Put the policy only after the public access block settings are in place.
  depends_on = [aws_s3_bucket_public_access_block.this]
}

# Sets the canned "private" ACL on the alarm bucket.
resource "aws_s3_bucket_acl" "bucket_access" {
  bucket = aws_s3_bucket.alarm_bucket.id
  acl    = "private"

  # Wait for the ownership controls before setting the ACL.
  depends_on = [aws_s3_bucket_ownership_controls.this]
}

# Bucket policy for the alarm bucket. Access is limited to principals of the
# owning account; nothing is granted to anonymous ("*") principals.
data "aws_iam_policy_document" "alarm_bucket_policy_doc" {
  # Read access to the objects in the bucket (object-level action, so only
  # object ARNs are listed).
  statement {
    principals {
      type        = "AWS"
      identifiers = [data.aws_caller_identity.current.account_id]
    }
    actions   = ["s3:GetObject"]
    resources = ["${aws_s3_bucket.alarm_bucket.arn}/*"]
  }

  # Allow the account to manage the bucket policy (bucket-level action, so
  # only the bucket ARN is listed).
  statement {
    principals {
      type        = "AWS"
      identifiers = [data.aws_caller_identity.current.account_id]
    }
    actions   = ["s3:PutBucketPolicy"]
    resources = [aws_s3_bucket.alarm_bucket.arn]
  }
}

# Object ownership setting for the alarm bucket.
resource "aws_s3_bucket_ownership_controls" "this" {
  bucket = aws_s3_bucket.alarm_bucket.id

  rule {
    # Objects uploaded with the bucket-owner-full-control ACL become owned
    # by the bucket owner.
    object_ownership = "BucketOwnerPreferred"
  }
}

# Public access block settings for the alarm bucket.
resource "aws_s3_bucket_public_access_block" "this" {
  bucket = aws_s3_bucket.alarm_bucket.id

  # Public ACLs are rejected on write and ignored on read.
  block_public_acls  = true
  ignore_public_acls = true

  # NOTE(review): false means S3 will not reject a bucket policy that grants
  # public access - confirm this is still required.
  block_public_policy = false

  # Buckets with a public policy stay limited to AWS service principals and
  # authorized users of the owning account.
  restrict_public_buckets = true
}

# Uploads the alarm definitions as alarms.json. When var.alarm_file is null
# the default file shipped with this module is used.
resource "aws_s3_object" "alarms_file" {
  bucket = aws_s3_bucket.alarm_bucket.id
  key    = "alarms.json"

  # path.module keeps the lookup relative to this module, so the default
  # file is found even when the module is called from another directory.
  content = var.alarm_file == null ? file("${path.module}/default_alarms.json") : var.alarm_file
}

# Invokes the alarm creator Lambda whenever an object is created in the
# alarm bucket.
resource "aws_s3_bucket_notification" "lambdatrigger" {
  bucket = aws_s3_bucket.alarm_bucket.id

  lambda_function {
    events              = ["s3:ObjectCreated:*"]
    lambda_function_arn = module.lambda_cw_alarm_creator.lambda_function_arn
  }

  # The invoke permission has to exist before the notification is configured.
  depends_on = [aws_lambda_permission.allow_s3_trigger]
}

# Allows the S3 service to invoke the alarm creator Lambda for events coming
# from the alarm bucket.
resource "aws_lambda_permission" "allow_s3_trigger" {
  statement_id  = "AllowExecutionFromS3Bucket"
  action        = "lambda:InvokeFunction"
  function_name = module.lambda_cw_alarm_creator.lambda_function_arn
  principal     = "s3.amazonaws.com"
  source_arn    = aws_s3_bucket.alarm_bucket.arn

  # S3 bucket ARNs carry no account id, so also pin the owning account.
  source_account = data.aws_caller_identity.current.account_id
}
6 changes: 3 additions & 3 deletions alarm_creator/alarms.json → default_alarms.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
"Statistic" : "Average",
"TreatMissingData" : "breaching",
"Dimensions" : "InstanceId",
"DiskDimensions": [
"ExtraDimensions": [
{
"Name": "path",
"Value": "/"
Expand Down Expand Up @@ -90,7 +90,7 @@
"Statistic" : "Average",
"TreatMissingData" : "breaching",
"Dimensions" : "InstanceId",
"DiskDimensions": [
"ExtraDimensions": [
{
"Name": "path",
"Value": "/sys/fs/cgroup"
Expand Down Expand Up @@ -123,7 +123,7 @@
"Statistic" : "Average",
"TreatMissingData" : "breaching",
"Dimensions" : "InstanceId",
"DiskDimensions": [
"ExtraDimensions": [
{
"Name": "path",
"Value": "/dev"
Expand Down
1 change: 1 addition & 0 deletions lambda_cw_alarm_creator.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ module "lambda_cw_alarm_creator" {

environment_variables = {
SNS_ARN = "${aws_sns_topic.notification_receiver.arn}"
BUCKET_NAME = "${aws_s3_bucket.alarm_bucket.id}"
}

sqs_dlq_arn = var.sqs_dlq_arn
Expand Down
22 changes: 22 additions & 0 deletions lambda_cw_alarm_creator_role.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ module "iam_role_lambda_cw_alarm_creator" {
"lambda_ec2_read_access" : jsondecode(data.aws_iam_policy_document.lambda_ec2_read_access.json)
"lambda_rds_read_access" : jsondecode(data.aws_iam_policy_document.lambda_rds_read_access.json)
"lambda_ecs_read_access" : jsondecode(data.aws_iam_policy_document.lambda_ecs_read_access.json)
"lambda_s3_read_access" : jsondecode(data.aws_iam_policy_document.lambda_s3_read_access.json)
"lambda_elasticache_read_access" : jsondecode(data.aws_iam_policy_document.lambda_elasticache_read_access.json)
}

trust_relationship = {
Expand Down Expand Up @@ -93,6 +95,26 @@ data "aws_iam_policy_document" "lambda_ecs_read_access" {
}
}

# Read access for the Lambda role, limited to the alarm bucket and its
# objects instead of every bucket in the account.
data "aws_iam_policy_document" "lambda_s3_read_access" {
  statement {
    sid = "AllowS3Access"

    actions = ["s3:Get*"]

    resources = [
      aws_s3_bucket.alarm_bucket.arn,
      "${aws_s3_bucket.alarm_bucket.arn}/*",
    ]
  }
}

# Lets the Lambda role call the ElastiCache Describe* actions on all
# resources.
data "aws_iam_policy_document" "lambda_elasticache_read_access" {
  statement {
    sid       = "AllowLambdaElasticacheAccess"
    actions   = ["elasticache:Describe*"]
    resources = ["*"]
  }
}

# The Lambda role needs to access KMS key in order to access SNS topic.
resource "aws_kms_grant" "give_lambda_role_access" {
name = "lambda-role-kms-grant-access"
Expand Down
11 changes: 11 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,14 @@ variable "monitoring_account_configuration" {
sqs_account = number
})
}

# Content of the alarms file to upload as alarms.json. Leave null to use the
# default alarms file.
variable "alarm_file" {
  description = "json alarms file"
  type        = string
  default     = null
}
# Used as bucket_prefix of the alarm bucket, so the final bucket name starts
# with this value.
variable "alarm_bucket_name" {
  description = "prefix for alarm bucket name"
  type        = string
  default     = "alarm-creator-alarm-parameters-"
}
Loading