# Fly pg_dump to AWS S3

This is a **hacky** way to have a Fly app that dumps Postgres databases, also hosted on Fly, to AWS S3 buckets.
It uses a dedicated app for the *backup worker*, which is woken up to start the dump. When it finishes, it is scaled back to 0, meaning it is not billable when idle*.

*The machine is not billable, but any volumes will be. This could be improved further by deleting volumes after each run. Volumes are required because the temporary disk is small and of unknown size.

## Why this?

Fly's Postgres images do support wal-g backups to S3 via env vars. But I wanted a way to periodically create simple archives with pg_dump, making it easy for developers to replicate databases and have point-in-time recovery.

Since the backup worker runs on Fly, and not in some external service like AWS or GitHub Actions, backups can be created rather quickly. The latency/bandwidth from Fly to AWS is also quite good (in the regions I've tested).

And what about Fly machines? I haven't tried them.
## Requirements

1. A Fly Postgres instance and a user with read permissions.
   Create the `db_backup_worker` user with:
   ```sql
   CREATE USER db_backup_worker WITH PASSWORD '<password>';
   GRANT CONNECT ON DATABASE <db_name> TO db_backup_worker;
   -- For all schemas (example for public):
   GRANT USAGE ON SCHEMA public TO db_backup_worker;
   GRANT SELECT ON ALL TABLES IN SCHEMA public TO db_backup_worker;
   GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO db_backup_worker;
   ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO db_backup_worker;
   ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO db_backup_worker;
   ```

2. An AWS S3 bucket and an access token with write permissions to it.
   IAM policy:
   ```json
   {
     "Version": "2012-10-17",
     "Statement": [
       {
         "Sid": "WriteDatabaseBackups",
         "Effect": "Allow",
         "Action": [
           "s3:PutObject",
           "s3:AbortMultipartUpload",
           "s3:ListMultipartUploadParts"
         ],
         "Resource": [
           "arn:aws:s3:::your-s3-bucket/backup.tar.gz"
         ]
       }
     ]
   }
   ```

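If you manage AWS with the CLI, the policy above can be generated for your bucket and attached to a dedicated IAM user. A minimal sketch, where the bucket, key, and user names are placeholders to adjust:

```shell
#!/bin/sh
# Sketch: generate the policy above for your bucket and key.
# BUCKET, KEY, and the IAM user name are placeholders -- adjust to your setup.
BUCKET="your-s3-bucket"
KEY="backup.tar.gz"

cat > backup-policy.json <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "WriteDatabaseBackups",
      "Effect": "Allow",
      "Action": [
        "s3:PutObject",
        "s3:AbortMultipartUpload",
        "s3:ListMultipartUploadParts"
      ],
      "Resource": ["arn:aws:s3:::${BUCKET}/${KEY}"]
    }
  ]
}
EOF

# Attach it and create the access key (requires the AWS CLI):
#   aws iam create-user --user-name db-backup-worker
#   aws iam put-user-policy --user-name db-backup-worker \
#     --policy-name WriteDatabaseBackups \
#     --policy-document file://backup-policy.json
#   aws iam create-access-key --user-name db-backup-worker
```

The access key from the last command is what goes into `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` below.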
## Installation

1. Launch your database backup worker with `fly launch --image ghcr.io/significa/fly-pg-dump-to-s3`

2. Create a volume for temporary files with `fly volumes create --no-encryption --size $SIZE_IN_GB temp_data`

3. Add the volume to your `fly.toml`:
   ```toml
   [mounts]
   destination = "/tmp/db-backups"
   source = "temp_data"
   ```

4. Set the required Fly secrets (env vars). Example:
   ```env
   AWS_ACCESS_KEY_ID=XXXX
   AWS_SECRET_ACCESS_KEY=XXXX
   DATABASE_URL=postgresql://username:password@my-fly-db-instance.internal:5432/my_database
   S3_DESTINATON=s3://your-s3-bucket/backup.tar.gz
   FLY_API_TOKEN=XXXX
   ```

5. Run `flyctl scale count 1` whenever you want to start a backup. To run it periodically, add this command to any scheduled runner, along with the `FLY_APP` and `FLY_API_TOKEN` env vars.

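For the periodic trigger, a minimal cron-friendly script might look like this; a sketch assuming `flyctl` is installed on the scheduler host (the app name in the crontab example is a placeholder):

```shell
#!/bin/sh
# Sketch: write a trigger script that wakes the backup worker.
# FLY_APP and FLY_API_TOKEN must be present in the scheduler's environment.
cat > trigger-backup.sh <<'EOF'
#!/bin/sh
set -eu
: "${FLY_APP:?set FLY_APP to the backup worker app name}"
: "${FLY_API_TOKEN:?set FLY_API_TOKEN to a Fly API token}"
# Scaling to 1 wakes the worker; it scales itself back to 0 when done.
flyctl scale count 1 --app "$FLY_APP"
EOF
chmod +x trigger-backup.sh

# Example crontab entry (nightly at 03:00; app name is a placeholder):
#   0 3 * * * FLY_APP=my-backup-worker FLY_API_TOKEN=XXXX /path/to/trigger-backup.sh
```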
## What about backup history?

You could add a date to the `S3_DESTINATON` filename (by changing the Docker `CMD`). But I recommend enabling versioning on your S3 bucket and managing retention via lifecycle policies.

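With bucket versioning enabled, retention can then be capped with a lifecycle rule. A sketch of one possible configuration for `aws s3api put-bucket-lifecycle-configuration` (the 30-day window is an arbitrary example):

```json
{
  "Rules": [
    {
      "ID": "ExpireOldBackupVersions",
      "Status": "Enabled",
      "Filter": { "Prefix": "" },
      "NoncurrentVersionExpiration": { "NoncurrentDays": 30 }
    }
  ]
}
```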
## Backup multiple databases in one go?

Just use the env vars like so:

```env
BACKUP_CONFIGURATION_NAMES=ENV1,STAGING_ENVIRONMENT,test

ENV1_DATABASE_URL=postgresql://username:password@env1/my_database
ENV1_S3_DESTINATON=s3://sample-bucket/sample.tar.gz

STAGING_ENVIRONMENT_DATABASE_URL=postgresql://username:password@sample/staging
STAGING_ENVIRONMENT_S3_DESTINATON=s3://sample-db-backups/staging_backup.tar.gz

TEST_DATABASE_URL=postgresql://username:password@sample/test
TEST_S3_DESTINATON=s3://sample-db-backups/test_backup.tar.gz
```

It will back up all the databases to their respective S3 destinations. The AWS and Fly tokens are reused across configurations.

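The prefix convention amounts to something like the following lookup (a simplified sketch, not the actual worker code; values are the sample ones from above):

```shell
#!/bin/sh
# Simplified sketch of the prefix lookup -- not the actual worker code.
# Each name in BACKUP_CONFIGURATION_NAMES is upper-cased and used as a
# prefix for its own DATABASE_URL / S3_DESTINATON pair.
BACKUP_CONFIGURATION_NAMES="ENV1,test"
ENV1_DATABASE_URL="postgresql://username:password@env1/my_database"
ENV1_S3_DESTINATON="s3://sample-bucket/sample.tar.gz"
TEST_DATABASE_URL="postgresql://username:password@sample/test"
TEST_S3_DESTINATON="s3://sample-db-backups/test_backup.tar.gz"

echo "$BACKUP_CONFIGURATION_NAMES" | tr ',' '\n' | while read -r name; do
  prefix=$(echo "$name" | tr '[:lower:]' '[:upper:]')
  eval "db_url=\${${prefix}_DATABASE_URL}"
  eval "s3_dest=\${${prefix}_S3_DESTINATON}"
  echo "backing up ${db_url} -> ${s3_dest}"
done > backup-plan.txt
```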
## Env vars documentation

- `DATABASE_URL`: Postgres database URL. Example: `postgresql://username:password@test:5432/my_database`
- `S3_DESTINATON`: AWS S3 destination for the backup file. Example: `s3://your-s3-bucket/backup.tar.gz`
- `BACKUP_CONFIGURATION_NAMES`: Optional: configuration names/prefixes for `DATABASE_URL` and `S3_DESTINATON`
- `FLY_APP_NAME`: Optional, used to scale down the worker. Automatically set by Fly.
- `FLY_API_TOKEN`: Optional, used to scale down the worker. Fly API token created via flyctl or the UI.
- `BACKUPS_TEMP_DIR`: Optional: where the temp files should go. Defaults to `/tmp/db-backups`
- `PG_DUMP_ARGS`: Optional: override the default `pg_dump` args: `--no-owner --clean --no-privileges --no-sync --jobs=4 --format=directory --compress=0`

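Since the default args produce a directory-format dump, restoring a downloaded backup for local replication could look like this. A sketch, assuming the uploaded archive is a gzipped tarball of `pg_dump`'s directory-format output; bucket, paths, and the connection URL are placeholders (the download is simulated with an empty archive so the sketch is self-contained):

```shell
#!/bin/sh
# Sketch: restoring a backup produced with the default PG_DUMP_ARGS.
# Assumes the uploaded archive is a gzipped tarball of pg_dump's
# directory-format output; bucket and connection URL are placeholders.
set -eu

# 1. Download the archive (requires the AWS CLI):
#      aws s3 cp s3://your-s3-bucket/backup.tar.gz .
# (simulated here with an empty archive so the sketch runs standalone)
mkdir -p dump_dir && touch dump_dir/toc.dat
tar -czf backup.tar.gz dump_dir

# 2. Unpack it:
mkdir -p restore
tar -xzf backup.tar.gz -C restore

# 3. Restore in parallel (directory format supports --jobs):
#      pg_restore --no-owner --clean --jobs=4 \
#        --dbname 'postgresql://user:password@localhost:5432/restored_db' \
#        restore/dump_dir
```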
## Is this hacky? Does it work in production environments?

Yes. Yes :sweat_smile:

## Will this work outside Fly?

Yes: if `FLY_APP_NAME` or `FLY_API_TOKEN` are not present, the fly commands are ignored.