Skip to content

Commit

Permalink
Automate GCP VM image creation (#2130)
Browse files Browse the repository at this point in the history
With this change, updating the CI Linux & Windows images is as simple as
triggering builds in the following pipelines:

- https://buildkite.com/bazel-trusted/create-windows-vm-image
- https://buildkite.com/bazel-trusted/create-linux-vm-image

Check the documentation for more details.
  • Loading branch information
meteorcloudy authored Dec 2, 2024
1 parent ca51d31 commit dc1fc9a
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 126 deletions.
99 changes: 7 additions & 92 deletions buildkite/create_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,9 @@
# limitations under the License.

from datetime import datetime
import json
import os
import queue
import subprocess
import sys
import tempfile
import threading

import gcloud
import gcloud_utils
Expand Down Expand Up @@ -59,12 +55,6 @@
},
}

WORK_QUEUE = queue.Queue()


def run(args, **kwargs):
    """Run ``args`` as a subprocess and return the result.

    Every positional and keyword argument is forwarded unchanged to
    ``subprocess.run``; whatever that call returns is handed back.
    """
    completed = subprocess.run(args, **kwargs)
    return completed


def preprocess_setup_script(setup_script, is_windows):
output_file = tempfile.mkstemp()[1]
Expand All @@ -77,6 +67,7 @@ def preprocess_setup_script(setup_script, is_windows):
if is_windows:
f.write("'@\n")
f.write('[System.IO.File]::WriteAllLines("c:\\setup.ps1", $setup_script)\n')
f.write('Start-Process -FilePath "powershell.exe" -ArgumentList "-File c:\\setup.ps1" -RedirectStandardOutput "c:\\setup-stdout.log" -RedirectStandardError "c:\\setup-stderr.log" -NoNewWindow\n')
return output_file


Expand Down Expand Up @@ -112,47 +103,6 @@ def create_instance(instance_name, params):
os.remove(setup_script)


# https://stackoverflow.com/a/25802742
def write_to_clipboard(output):
    """Pipe ``output`` (UTF-8 encoded) into a ``pbcopy`` process.

    The child process is started with an environment that contains only
    ``LANG=en_US.UTF-8``.
    """
    clipboard_env = {"LANG": "en_US.UTF-8"}
    pbcopy = subprocess.Popen("pbcopy", env=clipboard_env, stdin=subprocess.PIPE)
    payload = output.encode("utf-8")
    # communicate() writes the payload to stdin, closes it and waits for exit.
    pbcopy.communicate(payload)


def print_windows_instructions(project, zone, instance_name):
    """Walk the operator through the manual RDP setup of a Windows VM.

    The function tails the serial console until the startup scripts have
    finished, resets the Windows password, writes an ``.rdp`` file pointing at
    the VM and opens it, copies the password to the clipboard and prints the
    instructions. It then keeps tailing the console until the guest agent
    reports that it started again and opens the ``.rdp`` file a second time.

    Args:
        project: Project passed through to the gcloud helpers.
        zone: Zone passed through to the gcloud helpers.
        instance_name: Name of the VM instance to connect to.

    Returns:
        The value returned by the last ``tail_serial_console`` call, which the
        caller can pass back as ``start`` to continue tailing from there.
    """
    tail_start = gcloud_utils.tail_serial_console(
        instance_name, project=project, zone=zone, until="Finished running startup scripts"
    )

    pw = json.loads(
        gcloud.reset_windows_password(
            instance_name, format="json", project=project, zone=zone
        ).stdout
    )

    # mkstemp() returns an already-open OS-level descriptor together with the
    # path. Wrap the descriptor instead of discarding it so it gets closed.
    rdp_fd, rdp_file = tempfile.mkstemp(suffix=".rdp")
    with os.fdopen(rdp_fd, "w") as f:
        f.write("full address:s:" + pw["ip_address"] + "\n")
        f.write("username:s:" + pw["username"] + "\n")
    # The file is intentionally not deleted: it is opened again further down.
    print("Opening ", rdp_file)
    subprocess.run(["open", rdp_file])
    write_to_clipboard(pw["password"])
    with gcloud.PRINT_LOCK:
        print("Use this password to connect to the Windows VM: " + pw["password"])
        print("Please run the setup script C:\\setup.ps1 once you're logged in.")

    # Wait until the VM reboots once, then open RDP again.
    tail_start = gcloud_utils.tail_serial_console(
        instance_name,
        project=project,
        zone=zone,
        start=tail_start,
        until="GCEGuestAgent: GCE Agent Started",
    )
    print("Connecting via RDP a second time to finish the setup...")
    write_to_clipboard(pw["password"])
    subprocess.run(["open", rdp_file])
    return tail_start


def workflow(name, params):
instance_name = "%s-image-%s" % (name, int(datetime.now().timestamp()))
project = params["project"]
Expand All @@ -164,16 +114,8 @@ def workflow(name, params):
# Wait for the VM to become ready.
gcloud_utils.wait_for_instance(instance_name, project=project, zone=zone, status="RUNNING")

if "windows" in instance_name:
# Wait for VM to be ready, then print setup instructions.
tail_start = print_windows_instructions(project, zone, instance_name)
# Continue printing the serial console until the VM shuts down.
gcloud_utils.tail_serial_console(
instance_name, project=project, zone=zone, start=tail_start
)
else:
# Continuously print the serial console.
gcloud_utils.tail_serial_console(instance_name, project=project, zone=zone)
# Continuously print the serial console.
gcloud_utils.tail_serial_console(instance_name, project=project, zone=zone)

# Wait for the VM to completely shutdown.
gcloud_utils.wait_for_instance(
Expand All @@ -194,17 +136,6 @@ def workflow(name, params):
gcloud.delete_instance(instance_name, project=project, zone=zone)


def worker():
    """Run ``workflow`` for items taken from ``WORK_QUEUE`` until told to stop.

    Each item is a dict of keyword arguments for ``workflow``. A falsy item
    (such as ``None``) ends the loop. ``task_done`` is reported for every
    processed item, even when ``workflow`` raises.
    """
    item = WORK_QUEUE.get()
    while item:
        try:
            workflow(**item)
        finally:
            WORK_QUEUE.task_done()
        item = WORK_QUEUE.get()


def main(argv=None):
if argv is None:
argv = sys.argv[1:]
Expand All @@ -222,27 +153,11 @@ def main(argv=None):
)
return 1

# Put VM creation instructions into the work queue.
for name in argv:
WORK_QUEUE.put({"name": name, "params": IMAGE_CREATION_VMS[name]})

# Spawn worker threads that will create the VMs.
threads = []
for _ in range(WORK_QUEUE.qsize()):
t = threading.Thread(target=worker)
t.start()
threads.append(t)

# Wait for all VMs to be created.
WORK_QUEUE.join()

# Signal worker threads to exit.
for _ in range(len(threads)):
WORK_QUEUE.put(None)
if len(argv) > 1:
print("Only one platform can be created at a time.")
return 1

# Wait for worker threads to exit.
for t in threads:
t.join()
workflow(argv[0], IMAGE_CREATION_VMS[argv[0]])

return 0

Expand Down
4 changes: 2 additions & 2 deletions buildkite/gcloud_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ def prettify_logs(instance_name, log, with_prefix=True):
# Then drop the common prefix to make the output easier to read.
# For unknown platforms, we just take every line unmodified.
if "ubuntu" in instance_name or "docker" in instance_name:
match = re.match(r".*GCEMetadataScripts: startup-script: (.*)", line)
match = re.match(r".*: startup-script: (.*)", line)
if not match:
continue
line = match.group(1)
elif "windows" in instance_name:
match = re.match(r".*windows-startup-script-ps1: (.*)", line)
match = re.match(r".*\[setup-windows.ps1\]: (.*)", line)
if not match:
continue
line = match.group(1)
Expand Down
24 changes: 22 additions & 2 deletions buildkite/setup-windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,21 @@ Add-Type -AssemblyName "System.IO.Compression.FileSystem"
## Use TLS1.2 for HTTPS (fixes an issue where later steps can't connect to github.com)
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12

## If choco is already installed, this is the second time the VM starts up, run GCESysprep and then shutdown
if (Get-Command choco -ErrorAction SilentlyContinue) {
$port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
$port.Open()
$port.WriteLine("[setup-windows.ps1]: choco is already installed, this is the second time the VM starts up, running GCESysprep and then shutdown...")
$port.Close()
GCESysprep
exit 0
}

$port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
$port.Open()
$port.WriteLine("[setup-windows.ps1]: Starting to setup windows... This could take up to one hour, check C:/setup-stdout.log on the VM for progress.")
$port.Close()

## Create C:\temp
Write-Host "Creating temporary folder C:\temp..."
if (-Not (Test-Path "c:\temp")) {
Expand Down Expand Up @@ -337,6 +352,11 @@ $pagefile.InitialSize = 4 * 1024;
$pagefile.MaximumSize = 64 * 1024;
$pagefile.Put();

Write-Host "All done, adding GCESysprep to RunOnce and rebooting..."
Set-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\RunOnce" -Name "GCESysprep" -Value "c:\Program Files\Google\Compute Engine\sysprep\gcesysprep.bat"
Write-Host "All done, rebooting..."

$port = New-Object System.IO.Ports.SerialPort COM1,9600,None,8,one
$port.Open()
$port.WriteLine("[setup-windows.ps1]: Setup windows done, rebooting...")
$port.Close()

Restart-Computer
55 changes: 25 additions & 30 deletions docs/ci-playbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,60 +6,55 @@ This guide describes several maintenance workflows that have to be executed freq

## Deploying new CI worker images

Our Linux and Windows CI workers run on GCE instances. The basic update process consists of the following two steps:
Our **Linux** and **Windows** CI workers run on GCE instances. The basic update process consists of the following two steps and has been **automated** in the bazel-trusted BuildKite org:

1. Run [create_images](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/create_images.py) to create new VM images. This step starts a temporary VM, configures it as a CI worker, and then saves its image in GCE before destroying the temporary VM. This step does not affect running builds.
1. Run [create_instances](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/create_instances.py) to deploy instances with the new VM images. This step deletes the existing instances, then reads the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml) to determine how many instances are needed, and finally creates new instances with the new images. As a result, any running builds will be interrupted.

Note: Many changes to the Linux workers don't require these two steps since we run Docker containers on Linux. See below for a description on how to create and deploy new Docker images.

### Prerequisites
For **macOS**, follow the internal playbook (go/bazel-ci-playbook).

All steps require `git` and the [Google Cloud SDK](https://cloud.google.com/sdk/install) to be installed on your machine.
### Windows & Linux

### Windows
You will need to be a member of the `bazel-trusted` BuildKite org.

You need a machine with a recent version of MacOS and Microsoft Remote Desktop (10) installed.
1. Submit your change to this repository.
1. Initiate a build on the [Create Windows VM image](https://buildkite.com/bazel-trusted/create-windows-vm-image) or [Create Linux VM image](https://buildkite.com/bazel-trusted/create-linux-vm-image) pipeline.
1. Wait for the first build step to finish. This will create a new VM image for the selected platform (Windows or Linux).
1. Deploy the new image to the `bazel-testing` org by unblocking the next step.
1. Initiate a new build on the [Bazel](https://buildkite.com/bazel-testing/bazel-bazel) pipeline to test the new image.
1. Push the image to prod by unblocking the next step (e.g. `bk-testing-windows` to `bk-windows`).
1. Wait for the VMs to be recreated in the bazel and bazel-trusted orgs and the new image to be deployed.

1. First, create new images.
1. Clone the continuous-integration repository.
1. `cd` into the `continuous-integration/buildkite` directory.
1. Create new images by running `create_images.py <platform1> <platform2> <...>`. For Windows, this usually means to include `bk-windows` and `bk-trusted-windows`, whereas the `windows-playground` platform is optional. Hint: You can see a list of available platforms by running the script without any arguments.
1. The script opens Microsoft Remote Desktop to establish a connection to the VM that is used for building the image. Accept any popups and log into the machine by pasting the password into the password field (the script has already copied it to the clipboard).
1. Run the setup script by executing `\setup.ps1`.
1. Wait until the script has finished. At one point the VM will be rebooted, so the script has to open the remote connection again. The whole process can take up to 30 minutes.
1. Log in to the Google Cloud Console and check that the created images are no longer busy. Make sure to select the project that matches the image (e.g. `bazel-public` for `trusted` images, `bazel-untrusted` for "normal" images).
1. If something fails, you can always run `create_images` again.
1. Deploy CI workers with the newly created image by running `create_instances.py --local_config <instance_group1> <instance_group2> <...>`. The available instances group names can be found in the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml). Moreover, you can run the script without any arguments to get a list of available instance groups or check the configuration file. For Windows you would usually pass `bk-windows bk-trusted-windows` to the script.
Note: if anything goes wrong in the new image, you can always revert to the previous image by deleting the new image in the GCP console and re-creating the VMs.

### Linux
### Deploy new Docker images for Linux

Most changes can be rolled out by creating and deploying new Docker images. This step requires that Docker is installed and set up, and you need permissions to access the container registry in our GCP project.
Most changes can be rolled out by creating and deploying new Docker images. This step requires that

- You are on a Linux machine (images built on macOS may cause problems).
- Docker is installed and set up.
- You have permissions to access the container registry in our GCP project.

Follow these steps to build and deploy a new Docker image:

1. Clear your local Docker cache via `docker builder prune -a -f`.
1. Clone the continuous-integration repository.
1. `cd` into the `continuous-integration/buildkite/docker` directory.
1. Run `build.sh`.
1. Run `push.sh`.

If you need to create and deploy new VM images, you can follow these steps:

1. Clone the continuous-integration repository.
1. `cd` into the `continuous-integration/buildkite` directory.
1. Create new images by running `python3.6 create_images.py <platform1> <platform2> <...>`. For Linux, this usually means to include `bk-docker` and `bk-trusted-docker`. Hint: You can see a list of available platforms by running the script without any arguments.
1. Deploy CI workers with the newly created image by running `python3.6 create_instances.py --local_config <instance_group1> <instance_group2> <...>`. The available instances group names can be found in the [configuration file](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/instances.yml). For Linux you would usually pass `bk-docker bk-trusted-docker` to the script.

### MacOS

We are operating a number of physical Mac machines in our office. Please see [go/bazel-ci-playbook](http://go/bazel-ci-playbook) if you're in the Google network.
If you are on the `testing` branch, the new image will be pushed to the `gcr.io/bazel-public/testing` repository. If you are on the `master` branch, the new image will be pushed to the `gcr.io/bazel-public` repository.

## Deploying a new Bazelisk version

1. Create a [new Bazelisk release](https://github.com/bazelbuild/bazelisk/releases). This step has to be done on a Mac machine (due to [cross-compilation problems](https://github.com/golang/go/issues/22510)), and requires permissions to create a release.
1. To deploy this release on MacOS:
1. Update the [Bazelisk Homebrew formula](https://github.com/fweikert/homebrew-tap/blob/master/Formula/bazelisk.rb).
1. SSH into the machines and update them via Homebrew (see internal instructions for more details).
1. Update the startup script for macOS VMs to install the latest Bazelisk version.
1. To deploy this release on Linux:
1. Update the [Dockerfile](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/docker/Dockerfile).
1. Follow the instructions [here](#linux) to deploy new Docker images.
1. Follow the above instructions to deploy new Docker images.
1. To deploy this release on Windows:
1. Create and deploy new VM images by following the [instructions](#windows). There is no need to update any files manually since the [setup script](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/setup-windows.ps1) always fetches the latest version of Bazelisk
1. Create and deploy new VM images by following the above instructions. There is no need to update any files manually since the [setup script](https://github.com/bazelbuild/continuous-integration/blob/master/buildkite/setup-windows.ps1) always fetches the latest version of Bazelisk
97 changes: 97 additions & 0 deletions pipelines/publish-vm-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
---
# Pipeline that builds a VM image, rolls it out for testing and then promotes
# it. Each command step runs inside the gcr.io/bazel-public/ubuntu2204 Docker
# image; manual "block" steps gate every stage after the first.
# NOTE(review): BAZEL_TEST_VM_NAME and BAZEL_VM_NAME are presumably set in the
# pipeline's environment settings - confirm.
steps:
  # Stage 1: create the new VM image.
  - command: |-
      cd buildkite
      ./create_images.py ${BAZEL_TEST_VM_NAME}
    label: ":pipeline:"
    agents:
      - "queue=default"
    plugins:
      - docker#v3.8.0:
          always-pull: true
          environment:
            - "ANDROID_HOME"
            - "ANDROID_NDK_HOME"
            - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
          image: "gcr.io/bazel-public/ubuntu2204"
          network: "host"
          privileged: true
          propagate-environment: true
          propagate-uid-gid: true
          volumes:
            - "/etc/group:/etc/group:ro"
            - "/etc/passwd:/etc/passwd:ro"
            - "/etc/shadow:/etc/shadow:ro"
            - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
            - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
            - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
            - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
            - "/var/run/docker.sock:/var/run/docker.sock"
  - wait

  # Stage 2: after manual approval, re-create the instances with the new image.
  - block: ":arrows_counterclockwise: Re-create instance group in the bazel-testing org"

  - command: |-
      cd buildkite
      ./create_instances.py ${BAZEL_TEST_VM_NAME}
    label: ":pipeline:"
    agents:
      - "queue=default"
    plugins:
      - docker#v3.8.0:
          always-pull: true
          environment:
            - "ANDROID_HOME"
            - "ANDROID_NDK_HOME"
            - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
          image: "gcr.io/bazel-public/ubuntu2204"
          network: "host"
          privileged: true
          propagate-environment: true
          propagate-uid-gid: true
          volumes:
            - "/etc/group:/etc/group:ro"
            - "/etc/passwd:/etc/passwd:ro"
            - "/etc/shadow:/etc/shadow:ro"
            - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
            - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
            - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
            - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
            - "/var/run/docker.sock:/var/run/docker.sock"
  - wait

  # Stage 3: two manual confirmations, then promote the image.
  - block: ":white_check_mark: Confirm you have tested the new VM image in the bazel-testing org"

  - wait

  - block: ":rocket: Promote the VM image to Prod"

  - command: |-
      cd buildkite
      ./promote_images.py ${BAZEL_VM_NAME}
    label: ":pipeline:"
    agents:
      - "queue=default"
    plugins:
      - docker#v3.8.0:
          always-pull: true
          environment:
            - "ANDROID_HOME"
            - "ANDROID_NDK_HOME"
            - "BUILDKITE_ARTIFACT_UPLOAD_DESTINATION"
          image: "gcr.io/bazel-public/ubuntu2204"
          network: "host"
          privileged: true
          propagate-environment: true
          propagate-uid-gid: true
          volumes:
            - "/etc/group:/etc/group:ro"
            - "/etc/passwd:/etc/passwd:ro"
            - "/etc/shadow:/etc/shadow:ro"
            - "/opt/android-ndk-r15c:/opt/android-ndk-r15c:ro"
            - "/opt/android-sdk-linux:/opt/android-sdk-linux:ro"
            - "/var/lib/buildkite-agent:/var/lib/buildkite-agent"
            - "/var/lib/gitmirrors:/var/lib/gitmirrors:ro"
            - "/var/run/docker.sock:/var/run/docker.sock"

0 comments on commit dc1fc9a

Please sign in to comment.