From cf25cf5cd5dbade28a38fc41398fc8b584f3b643 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 2 Apr 2024 15:24:51 -0400 Subject: [PATCH] E2E: use a self-hosted Consul for easier WI testing (#20256) Our `consulcompat` tests exercise both the Workload Identity and legacy Consul token workflow, but they are limited to running single node tests. The E2E cluster is network isolated, so using our HCP Consul cluster runs into a problem validating WI tokens because it can't reach the JWKS endpoint. In real production environments, you'd solve this with a CNAME pointing to a public IP pointing to a proxy with a real domain name. But that's logistically impractical for our ephemeral nightly cluster. Migrate the HCP Consul to a single-node Consul cluster on AWS EC2 alongside our Nomad cluster. Bootstrap TLS and ACLs in Terraform and ensure all nodes can reach each other. This will allow us to update our Consul tests so they can use Workload Identity, in a separate PR. Ref: #19698 --- e2e/terraform/README.md | 12 +- e2e/terraform/compute.tf | 17 ++ e2e/terraform/consul-clients.tf | 81 ++++++++ e2e/terraform/consul-servers.tf | 178 ++++++++++++++++++ e2e/terraform/etc/consul.d/.environment | 1 + e2e/terraform/etc/consul.d/client_acl.json | 8 - e2e/terraform/etc/consul.d/clients.hcl | 42 +++++ e2e/terraform/etc/consul.d/clients.json | 12 -- .../etc/consul.d/consul-server.service | 20 ++ e2e/terraform/etc/consul.d/servers.hcl | 47 +++++ .../nomad.d/{consul.hcl => client-consul.hcl} | 1 + e2e/terraform/etc/nomad.d/server-consul.hcl | 10 + e2e/terraform/hcp_consul.tf | 130 ------------- e2e/terraform/network.tf | 45 +++++ e2e/terraform/nomad-acls.tf | 21 ++- e2e/terraform/outputs.tf | 3 + .../provision-nomad/install-linux.tf | 8 +- e2e/terraform/provision-nomad/main.tf | 20 +- e2e/terraform/scripts/bootstrap-consul.sh | 29 +++ .../scripts/consul-agents-policy.hcl | 12 ++ .../scripts/nomad-cluster-consul-policy.hcl | 34 ++++ e2e/terraform/terraform.tfvars | 5 +- 
e2e/terraform/tls_ca.tf | 4 +- e2e/terraform/variables.tf | 8 +- 24 files changed, 564 insertions(+), 184 deletions(-) create mode 100644 e2e/terraform/consul-clients.tf create mode 100644 e2e/terraform/consul-servers.tf create mode 100644 e2e/terraform/etc/consul.d/.environment delete mode 100644 e2e/terraform/etc/consul.d/client_acl.json create mode 100644 e2e/terraform/etc/consul.d/clients.hcl delete mode 100644 e2e/terraform/etc/consul.d/clients.json create mode 100644 e2e/terraform/etc/consul.d/consul-server.service create mode 100644 e2e/terraform/etc/consul.d/servers.hcl rename e2e/terraform/etc/nomad.d/{consul.hcl => client-consul.hcl} (81%) create mode 100644 e2e/terraform/etc/nomad.d/server-consul.hcl delete mode 100644 e2e/terraform/hcp_consul.tf create mode 100755 e2e/terraform/scripts/bootstrap-consul.sh create mode 100644 e2e/terraform/scripts/consul-agents-policy.hcl create mode 100644 e2e/terraform/scripts/nomad-cluster-consul-policy.hcl diff --git a/e2e/terraform/README.md b/e2e/terraform/README.md index c3e94fe7d40..b7d77a22bde 100644 --- a/e2e/terraform/README.md +++ b/e2e/terraform/README.md @@ -4,10 +4,10 @@ This folder contains Terraform resources for provisioning a Nomad cluster on EC2 instances on AWS to use as the target of end-to-end tests. -Terraform provisions the AWS infrastructure assuming that EC2 AMIs -have already been built via Packer and HCP Consul and HCP Vault -clusters are already running. It deploys a build of Nomad from your -local machine along with configuration files. +Terraform provisions the AWS infrastructure assuming that EC2 AMIs have already +been built via Packer and an HCP Vault cluster is already running. It deploys a +build of Nomad from your local machine along with configuration files, as well +as a single-node Consul server cluster. ## Setup @@ -30,8 +30,6 @@ team's vault under `nomad-e2e`. 
``` export HCP_CLIENT_ID= export HCP_CLIENT_SECRET= -export CONSUL_HTTP_TOKEN= -export CONSUL_HTTP_ADDR= ``` The Vault admin token will expire after 6 hours. If you haven't @@ -57,6 +55,8 @@ client_count_ubuntu_jammy_amd64 = "4" client_count_windows_2016_amd64 = "1" ``` +You will also need a Consul Enterprise license file. + Optionally, edit the `nomad_local_binary` variable in the `terraform.tfvars` file to change the path to the local binary of Nomad you'd like to upload. diff --git a/e2e/terraform/compute.tf b/e2e/terraform/compute.tf index d340d0ca7fe..ddb101b85fe 100644 --- a/e2e/terraform/compute.tf +++ b/e2e/terraform/compute.tf @@ -58,6 +58,23 @@ resource "aws_instance" "client_windows_2016_amd64" { } } +resource "aws_instance" "consul_server" { + ami = data.aws_ami.ubuntu_jammy_amd64.image_id + instance_type = var.instance_type + key_name = module.keys.key_name + vpc_security_group_ids = [aws_security_group.consul_server.id] + iam_instance_profile = data.aws_iam_instance_profile.nomad_e2e_cluster.name + availability_zone = var.availability_zone + + # Instance tags + tags = { + Name = "${local.random_name}-consul-server-ubuntu-jammy-amd64" + ConsulAutoJoin = "auto-join-${local.random_name}" + User = data.aws_caller_identity.current.arn + } +} + + data "external" "packer_sha" { program = ["/bin/sh", "-c", </dev/null 2>&1 && pwd )" + +echo "waiting for Consul leader to be up..." 
+while true +do + consul info && break + echo "Consul server not ready, waiting 5s" + sleep 5 +done + +consul acl bootstrap || echo "Consul ACLs already bootstrapped" + +if consul info | grep -q "version_metadata = ent"; then # grep -q prints nothing, so branch on its exit status + echo "writing namespaces" + consul namespace create -name "prod" + consul namespace create -name "dev" +fi + +echo "writing Nomad cluster policy and token" +consul acl policy create -name nomad-cluster -rules "@${DIR}/nomad-cluster-consul-policy.hcl" +consul acl token create -policy-name=nomad-cluster -secret "$NOMAD_CLUSTER_CONSUL_TOKEN" + +echo "writing Consul cluster policy and token" +consul acl policy create -name consul-agents -rules "@${DIR}/consul-agents-policy.hcl" +consul acl token create -policy-name=consul-agents -secret "$CONSUL_AGENT_TOKEN" diff --git a/e2e/terraform/scripts/consul-agents-policy.hcl b/e2e/terraform/scripts/consul-agents-policy.hcl new file mode 100644 index 00000000000..28d74cf503d --- /dev/null +++ b/e2e/terraform/scripts/consul-agents-policy.hcl @@ -0,0 +1,12 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +# Consul agents only need to register themselves and read services; ACL rule names are not globs, so an empty prefix is what matches every node + +node_prefix "" { + policy = "write" +} + +service_prefix "" { + policy = "read" +} diff --git a/e2e/terraform/scripts/nomad-cluster-consul-policy.hcl b/e2e/terraform/scripts/nomad-cluster-consul-policy.hcl new file mode 100644 index 00000000000..c07dc09b03a --- /dev/null +++ b/e2e/terraform/scripts/nomad-cluster-consul-policy.hcl @@ -0,0 +1,34 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +// The Nomad Client will be registering things into its buddy Consul Client. +// Note: because we also test the use of Consul namespaces, this token must be +// able to register services, read the keystore, and read node data for any +// namespace. +// The operator=write permission is required for creating config entries for +// connect ingress gateways. 
operator ACLs are not namespaced, though the +// config entries they can generate are. +operator = "write" + +agent_prefix "" { + policy = "read" +} + +namespace_prefix "" { + // The acl=write permission is required for generating Consul Service Identity + // tokens for consul connect services. Those services could be configured for + // any Consul namespace the job-submitter has access to. + acl = "write" + + key_prefix "" { + policy = "read" + } + + node_prefix "" { + policy = "read" + } + + service_prefix "" { + policy = "write" + } +} diff --git a/e2e/terraform/terraform.tfvars b/e2e/terraform/terraform.tfvars index f531e298327..324bb7c7ce3 100644 --- a/e2e/terraform/terraform.tfvars +++ b/e2e/terraform/terraform.tfvars @@ -8,5 +8,8 @@ nomad_local_binary = "../../pkg/linux_amd64/nomad" nomad_local_binary_client_windows_2016_amd64 = ["../../pkg/windows_amd64/nomad.exe"] -# For testing enterprise, set via --var: +# The Consul server is Consul Enterprise, so provide a license via --var: +# consul_license = + +# For testing Nomad enterprise, also set via --var: # nomad_license = diff --git a/e2e/terraform/tls_ca.tf b/e2e/terraform/tls_ca.tf index e30da79ca5f..992c165b5ca 100644 --- a/e2e/terraform/tls_ca.tf +++ b/e2e/terraform/tls_ca.tf @@ -22,12 +22,12 @@ resource "tls_self_signed_cert" "ca" { allowed_uses = ["cert_signing"] } -resource "local_file" "ca_key" { +resource "local_sensitive_file" "ca_key" { filename = "keys/tls_ca.key" content = tls_private_key.ca.private_key_pem } -resource "local_file" "ca_cert" { +resource "local_sensitive_file" "ca_cert" { filename = "keys/tls_ca.crt" content = tls_self_signed_cert.ca.cert_pem } diff --git a/e2e/terraform/variables.tf b/e2e/terraform/variables.tf index 5867f209c02..81c612c9bdc 100644 --- a/e2e/terraform/variables.tf +++ b/e2e/terraform/variables.tf @@ -53,7 +53,13 @@ variable "nomad_local_binary" { variable "nomad_license" { type = string - description = "If nomad_license is set, deploy a license to override the 
temporary license" + description = "If nomad_license is set, deploy a license" + default = "" +} + +variable "consul_license" { + type = string + description = "If consul_license is set, deploy a license" default = "" }