Skip to content

Commit

Permalink
add self-managed grafana option
Browse files Browse the repository at this point in the history
Co-authored-by: Matthew Nightingale <nghtm@amazon.com>
  • Loading branch information
KeitaW and nghtm committed Nov 21, 2024
1 parent 0e68c54 commit 7dd63d8
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 0 deletions.
7 changes: 7 additions & 0 deletions 4.validation_and_observability/4.prometheus-grafana/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ If you have already created your HyperPod cluster, you can follow [these instruc
>[!IMPORTANT]
> It is strongly recommended that you deploy this stack into the same region and same account as your SageMaker HyperPod Cluster. This will ensure successful execution of the Lifecycle Scripts, specifically `install_prometheus.sh`, which relies on AWS CLI commands that assume the same account and same region.
[<kbd> <br> 1-Click Deploy 🚀 <br> </kbd>](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.amazonaws.com/templates/cluster-observability.yaml&stackName=Cluster-Observability)

>[!NOTE]
> If the deployment region does not support Amazon Managed Grafana, you can use the following stack instead, which deploys a single EC2 instance and installs Grafana on it through the instance's user data script.
[<kbd> <br> 1-Click Deploy 🚀 <br> </kbd>](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://awsome-distributed-training.s3.us-east-1.amazonaws.com/templates/cluster-observability-with-os-grafana.yaml&stackName=Cluster-Observability)

### Connect to the cluster
Connect to the controller node of your cluster via ssm:
>[!NOTE]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Template header: format version plus the human-readable stack description
# shown in the CloudFormation console.
AWSTemplateFormatVersion: "2010-09-09"
Description: >-
  CloudFormation template to monitor SageMaker Hyperpod - launches a
  t2.medium instance with 30GB of storage, security group, IAM role for
  Prometheus access, Grafana setup, and a Prometheus workspace.

Parameters:
  # Resolved at deploy time from the public SSM parameter, so the instance is
  # launched from the current Amazon Linux 2 image of the deployment region.
  LatestAmiId:
    Type: 'AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>'
    Default: '/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2'
    Description: "The latest Amazon Linux 2 AMI ID."
  # Source range allowed to reach the Grafana UI. The default keeps the
  # original open-to-the-world behaviour; Grafana starts with the default
  # admin/admin login, so restricting this range is strongly advised.
  GrafanaAllowedCidr:
    Type: String
    Default: "0.0.0.0/0"
    AllowedPattern: '^(\d{1,3}\.){3}\d{1,3}/(\d|[12]\d|3[0-2])$'
    ConstraintDescription: "Must be a valid IPv4 CIDR block, for example 203.0.113.0/24."
    Description: "IPv4 CIDR range allowed to reach Grafana on port 3000. Narrow this to your own network where possible."

Resources:
  # Security group exposing only the Grafana HTTP port (3000).
  MySecurityGroup:
    Type: "AWS::EC2::SecurityGroup"
    Properties:
      GroupDescription: "Allow ingress on port 3000 for Grafana access"
      SecurityGroupIngress:
        - IpProtocol: "tcp"
          FromPort: 3000
          ToPort: 3000
          CidrIp: !Ref GrafanaAllowedCidr

  # Instance role: read/query access to Amazon Managed Service for Prometheus
  # (used by Grafana) plus SSM Session Manager access to the instance.
  GrafanaEC2Role:
    Type: "AWS::IAM::Role"
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
            Action: "sts:AssumeRole"
      Policies:
        - PolicyName: "PrometheusAccessPolicy"
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - aps:ListWorkspaces
                  - aps:DescribeWorkspace
                  - aps:QueryMetrics
                  - aps:GetLabels
                  - aps:GetSeries
                  - aps:GetMetricMetadata
                Resource: "*"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore

  # Instance profile that attaches GrafanaEC2Role to the EC2 instance.
  MyInstanceProfile:
    Type: "AWS::IAM::InstanceProfile"
    Properties:
      Roles:
        - !Ref GrafanaEC2Role

  # Amazon Managed Service for Prometheus workspace that receives the
  # cluster metrics.
  APSWorkspace:
    Type: "AWS::APS::Workspace"
    Properties:
      Alias: !Sub "${AWS::StackName}-Hyperpod-WorkSpace"
      Tags:
        - Key: "Name"
          Value: "SageMaker Hyperpod PrometheusMetrics"

  # EC2 instance that runs self-managed Grafana in a Docker container.
  MyInstance:
    Type: "AWS::EC2::Instance"
    Properties:
      InstanceType: "t2.medium"
      ImageId: !Ref LatestAmiId
      IamInstanceProfile: !Ref MyInstanceProfile
      SecurityGroupIds:
        - !Ref MySecurityGroup
      BlockDeviceMappings:
        - DeviceName: "/dev/xvda"
          Ebs:
            VolumeSize: 30
      UserData:
        # Plain literal block (no Fn::Sub): the script needs no template
        # substitutions, and this keeps shell "${...}" expansions safe.
        Fn::Base64: |
          #!/bin/bash

          # Update system packages
          sudo yum update -y

          # Install Docker
          echo "Installing Docker..."
          sudo amazon-linux-extras install docker -y

          # Start Docker service
          echo "Starting Docker service..."
          sudo systemctl start docker

          # Enable Docker to start on boot
          sudo systemctl enable docker

          # Add the current user (ec2-user) to the Docker group to run Docker commands without sudo
          echo "Adding ec2-user to Docker group..."
          sudo usermod -aG docker ec2-user

          # Pull the latest Grafana image
          echo "Pulling the latest Grafana Docker image..."
          docker pull grafana/grafana:latest

          # Run Grafana container with automatic restart.
          # SigV4 auth is enabled so the Prometheus data source can sign
          # requests to Amazon Managed Service for Prometheus with the
          # instance role credentials.
          echo "Starting Grafana container with restart policy..."
          docker run -d -p 3000:3000 --name=grafana --restart always \
            -e "AWS_SDK_LOAD_CONFIG=true" \
            -e "GF_AUTH_SIGV4_AUTH_ENABLED=true" \
            grafana/grafana:latest

          # Look up the public IP through IMDSv2 (session token), which also
          # works when IMDSv1 is disabled for the instance or account.
          IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
          PUBLIC_IP=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" http://169.254.169.254/latest/meta-data/public-ipv4)

          # Print Grafana access info
          echo "Docker and Grafana setup complete."
          echo "Grafana is running at http://$PUBLIC_IP:3000"
          echo "Default Grafana login credentials are admin/admin. Please change the password after the first login."

          # Note: Log out and log back in for Docker permissions to take effect
          echo "Please log out and back in for Docker group permissions to apply."
      Tags:
        - Key: "Name"
          Value: "OS-Grafana"


# Values exported by the stack for wiring up the cluster and reaching Grafana.
Outputs:
  InstanceId:
    Description: "Instance ID of the EC2 instance"
    Value: !Ref MyInstance
  PrometheusWorkspaceId:
    Description: "ID of the Amazon Managed Prometheus Workspace"
    # Ref on AWS::APS::Workspace yields the workspace ARN; the WorkspaceId
    # attribute is what actually matches this output's name and description.
    Value: !GetAtt APSWorkspace.WorkspaceId
  AMPRemoteWriteURL:
    Description: "Remote write URL of the Amazon Managed Prometheus Workspace"
    # The workspace endpoint is concatenated directly with the API path,
    # exactly as the original Fn::Join did.
    Value: !Sub "${APSWorkspace.PrometheusEndpoint}api/v1/remote_write"
  GrafanaInstanceAddress:
    Description: "Grafana address with port 3000 for the EC2 instance"
    Value: !Sub "http://${MyInstance.PublicIp}:3000"

0 comments on commit 7dd63d8

Please sign in to comment.