Skip to content

Oracle: iterative_call_for_database_backup #1554

Oracle: iterative_call_for_database_backup

Oracle: iterative_call_for_database_backup #1554

# Workflow display name shown in the Actions UI.
name: "Oracle: Backup"
# Run title. Manual dispatches are labelled
# "<TargetEnvironment>_<Period>_<TargetHost>_database_backup"; every other
# trigger is labelled "iterative_call_for_database_backup".
run-name: "Oracle: ${{ github.event_name == 'workflow_dispatch' && format('{0}_{1}_{2}', github.event.inputs.TargetEnvironment, github.event.inputs.Period, github.event.inputs.TargetHost) || 'iterative_call_for' }}_database_backup"
on:
  # Manual run from the Actions UI.
  workflow_dispatch:
    inputs:
      TargetEnvironment:
        description: "Target environment"
        required: true
        type: choice
        options:
          - "delius-core-dev"
          - "delius-core-test"
          - "delius-core-training"
          - "delius-core-stage"
          - "delius-core-preprod"
          - "delius-core-prod"
          - "delius-mis-dev"
          - "delius-mis-stage"
          - "delius-mis-preprod"
          - "delius-mis-prod"
      TargetHost:
        description: "Backup target host"
        required: true
        type: choice
        options:
          - "delius_primarydb"
          - "mis_primarydb"
          - "boe_primarydb"
          - "dsd_primarydb"
          - "delius_standbydb1"
          - "mis_standbydb1"
          - "boe_standbydb1"
          - "dsd_standbydb1"
          - "delius_standbydb2"
      Period:
        description: "Is this a daily or weekly backup?"
        required: true
        type: choice
        options:
          - "daily"
          - "weekly"
      BackupStandbyOnPrimaryFailure:
        description: "Retry the backup on the standby if primary backup fails"
        type: choice
        default: "yes"
        options:
          - "yes"
          - "no"
      FixAbsentChunks:
        description: "Validate and fix absent S3 missing backup chunks"
        type: choice
        default: "yes"
        options:
          - "yes"
          - "no"
      # When supplied, the first job treats this JSON as if it were the payload
      # of a repository_dispatch event and ignores the other inputs.
      ParametersJSON:
        description: "JSON parameters for backup job (overrides any other specified inputs)"
        type: string
        required: false
      EnableTrace:
        description: "Enable RMAN trace for debugging"
        required: false
        type: choice
        default: "no"
        options:
          - "yes"
          - "no"
      # Passed verbatim on the ansible-playbook command line.
      VerboseOutput:
        description: "Verbose Output level"
        type: choice
        options:
          - ""
          - "-v"
          - "-vv"
          - "-vvv"
          - "-vvvv"
      SourceCodeVersion:
        description: "Source version for the hmpps-delius-operation-automation. Enter a pull request, branch, commit ID, tag, or reference."
        type: string
        default: "main"
      SourceConfigVersion:
        description: "Source version for the modernisation-platform-configuration-management. Enter a pull request, branch, commit ID, tag, or reference."
        type: string
        default: "main"
  # Invocation from another workflow; only the three mandatory values are accepted.
  workflow_call:
    inputs:
      TargetEnvironment:
        description: "Target environment"
        required: true
        type: string
      TargetHost:
        description: "Backup target host"
        required: true
        type: string
      Period:
        description: "Is this a daily or weekly backup?"
        required: true
        type: string
  # Re-entry point: these events report the outcome of a backup back to this workflow.
  repository_dispatch:
    types: ["oracle-db-backup-success","oracle-db-backup-failure"]
# Token scopes granted to every job: read the repository, read packages
# (the job container image), and request an OIDC token.
permissions:
  contents: read
  packages: read
  # Required for requesting the JWT.
  id-token: write
jobs:
  # Works out which host to back up and in which state the run is
  # (initial start, success re-entry, failure re-entry), then publishes
  # the result as job outputs.
  build_rman_target_name:
    runs-on: ubuntu-latest
    outputs:
      RmanTarget: ${{ steps.preparermantargetname.outputs.RmanTarget }}
      # The deployment environment name is the target environment plus "-preapproved".
      TargetEnvironment: ${{ steps.preparermantargetname.outputs.TargetEnvironment }}-preapproved
      BackupStatus: ${{ steps.preparermantargetname.outputs.BackupStatus }}
      BackupStandbyOnPrimaryFailure: ${{ steps.preparermantargetname.outputs.BackupStandbyOnPrimaryFailure }}
      # The step writes EnableTrace to GITHUB_OUTPUT; expose it so that
      # dependent jobs can read it instead of silently losing the value.
      EnableTrace: ${{ steps.preparermantargetname.outputs.EnableTrace }}
      SourceCodeVersion: ${{ steps.preparermantargetname.outputs.SourceCodeVersion }}
      SourceConfigVersion: ${{ steps.preparermantargetname.outputs.SourceConfigVersion }}
      Inputs: ${{ steps.preparermantargetname.outputs.Inputs }}
      SlackChannel: ${{ steps.prepare-slack-channel-name.outputs.SlackChannel }}
      IsRetryJob: ${{ steps.preparermantargetname.outputs.IsRetryJob }}
    steps:
- name: Prepare Rman Target
  id: preparermantargetname
  # Event data reaches the script through environment variables instead of
  # being expanded into the script text, so quotes or shell metacharacters in
  # an input or dispatch payload cannot change the commands that are run.
  env:
    EVENT_NAME: ${{ github.event_name }}
    EVENT_ACTION: ${{ github.event.action }}
    CLIENT_PAYLOAD_JSON: ${{ toJson(github.event.client_payload) }}
    PARAMETERS_JSON: ${{ github.event.inputs.ParametersJSON }}
    DISPATCH_TARGET_HOST: ${{ github.event.inputs.TargetHost }}
    DISPATCH_TARGET_ENVIRONMENT: ${{ github.event.inputs.TargetEnvironment }}
    DISPATCH_PERIOD: ${{ github.event.inputs.Period }}
    DISPATCH_BACKUP_STANDBY: ${{ github.event.inputs.BackupStandbyOnPrimaryFailure }}
    DISPATCH_FIX_ABSENT_CHUNKS: ${{ github.event.inputs.FixAbsentChunks }}
    DISPATCH_ENABLE_TRACE: ${{ github.event.inputs.EnableTrace }}
    DISPATCH_VERBOSE_OUTPUT: ${{ github.event.inputs.VerboseOutput }}
    DISPATCH_SOURCE_CODE_VERSION: ${{ github.event.inputs.SourceCodeVersion }}
    DISPATCH_SOURCE_CONFIG_VERSION: ${{ github.event.inputs.SourceConfigVersion }}
    CALLED_TARGET_HOST: ${{ inputs.TargetHost }}
    CALLED_TARGET_ENVIRONMENT: ${{ inputs.TargetEnvironment }}
    CALLED_PERIOD: ${{ inputs.Period }}
  run: |
    # set_rman_target <TargetEnvironment> <TargetHost>
    # Determines the inventory name for the RMAN target on which the backup
    # should be run, e.g. delius-core-dev + delius_primarydb gives
    # environment_name_delius_core_development_dev_delius_primarydb.
    # Sets RmanTarget and writes RmanTarget/TargetEnvironment as step outputs.
    # NOTE(review): the selectable environments are spelled "...-preprod", which
    # matches neither the "pre-prod" nor the "-prod" substitution below, so they
    # only get the final "-" to "_" conversion - confirm against the inventory.
    function set_rman_target()
    {
      RmanTarget="environment_name_$(echo "$1" | sed 's/dev/development_dev/;s/test/test_test/;s/training/test_training/;s/stage/preproduction_stage/;s/pre-prod/preproduction_pre_prod/;s/-prod/_production_prod/;s/-/_/g')_$2"
      echo "RmanTarget=$RmanTarget" >> "$GITHUB_OUTPUT"
      echo "TargetEnvironment=$TargetEnvironment" >> "$GITHUB_OUTPUT"
    }

    if [[ "$EVENT_NAME" != "repository_dispatch" ]] && [[ -z "$PARAMETERS_JSON" ]]; then
      # Fresh start of a backup: take the parameters from the trigger.
      if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
        TargetHost="$DISPATCH_TARGET_HOST"
        TargetEnvironment="$DISPATCH_TARGET_ENVIRONMENT"
        Period="$DISPATCH_PERIOD"
        BackupStandbyOnPrimaryFailure="$DISPATCH_BACKUP_STANDBY"
        FixAbsentChunks="$DISPATCH_FIX_ABSENT_CHUNKS"
        EnableTrace="$DISPATCH_ENABLE_TRACE"
        VerboseOutput="$DISPATCH_VERBOSE_OUTPUT"
        SourceCodeVersion="$DISPATCH_SOURCE_CODE_VERSION"
        SourceConfigVersion="$DISPATCH_SOURCE_CONFIG_VERSION"
      elif [[ "$EVENT_NAME" == "schedule" ]]; then
        # Values come from the workflow_call inputs in this case.
        TargetHost="$CALLED_TARGET_HOST"
        TargetEnvironment="$CALLED_TARGET_ENVIRONMENT"
        Period="$CALLED_PERIOD"
        BackupStandbyOnPrimaryFailure="yes"
      fi
      BackupStandbyOnPrimaryFailure="${BackupStandbyOnPrimaryFailure:-yes}"
      set_rman_target "$TargetEnvironment" "$TargetHost"
      # Build a JSON variable capturing all the inputs which have been supplied.
      # jq does the quoting, so a value containing a quote cannot corrupt the document.
      # The TargetHost is the type of the host, e.g. delius_primarydb
      # The RmanTarget is the full group name including both the environment name and the database type, and may
      # be updated during the course of the backup if it is retried on a standby database.
      # Note that we deliberately exclude the EnableTrace attribute from the JSON since GitHub currently only
      # supports 10 elements and this was deemed the least important. This means that we cannot activate
      # RMAN trace on retry backups (only on the initial backup). However, retry backups are expected to be
      # rare, so the requirement for tracing them is very limited.
      Inputs=$(jq -cn \
        --arg TargetEnvironment "$TargetEnvironment" \
        --arg TargetHost "$TargetHost" \
        --arg RmanTarget "$RmanTarget" \
        --arg Period "$Period" \
        --arg BackupStandbyOnPrimaryFailure "$BackupStandbyOnPrimaryFailure" \
        --arg FixAbsentChunks "${FixAbsentChunks:-yes}" \
        --arg VerboseOutput "$VerboseOutput" \
        --arg SourceCodeVersion "$SourceCodeVersion" \
        --arg SourceConfigVersion "$SourceConfigVersion" \
        '{TargetEnvironment: $TargetEnvironment,
          TargetHost: $TargetHost,
          RmanTarget: $RmanTarget,
          Period: $Period,
          BackupStandbyOnPrimaryFailure: $BackupStandbyOnPrimaryFailure,
          FixAbsentChunks: $FixAbsentChunks,
          VerboseOutput: $VerboseOutput,
          SourceCodeVersion: $SourceCodeVersion,
          SourceConfigVersion: $SourceConfigVersion,
          Phase: "Initialising Backup"}')
    else
      # We come here if we have either:
      # (1) Responded to a Repository Dispatch event, or
      # (2) Supplied ParametersJSON as an input
      #     (ParametersJSON is only intended for test purposes; we can supply JSON from
      #      the command line to simulate it being passed as the payload of a
      #      Repository Dispatch event)
      # In this case, the incoming JSON Inputs override any other command line parameters
      if [[ "$EVENT_NAME" == "repository_dispatch" ]]; then
        echo "Responding to Repository Dispatch event"
        Inputs=$(echo "$CLIENT_PAYLOAD_JSON" | jq -r)
        Action="$EVENT_ACTION"
      else
        echo "Overriding command line with supplied JSON inputs"
        Inputs="$PARAMETERS_JSON"
        Action=$(echo "$Inputs" | jq -r '.BackupAction // "unknown"')
        if [[ "$Action" == "unknown" ]]; then
          echo "Action is not specified in JSON inputs"
          exit 6
        fi
      fi
      if [[ "$Action" == "oracle-db-backup-success" ]]; then
        BackupStatus="success"
        # If the backup has been successful then we validate the presence of all
        # of the backup chunks in the S3 backup bucket.
        Inputs=$(echo "$Inputs" | jq '.Phase = "Validating Chunks"')
      else
        echo "Backup has failed"
        # If the backup has failed then we decide what to do next. If BackupStandbyOnPrimaryFailure
        # has the value of "yes" then we attempt to retry the backup on the next host. i.e.:
        # 1. If the backup was running on the 1st Standby then we attempt to retry on the 2nd Standby.
        # 2. If the backup was running on the Primary then we attempt to retry on the 1st Standby.
        # Note that we do not attempt to check if Standby databases exist at this point, since that
        # requires checking out of the inventory. We simply decide which host we would use next
        # assuming that it exists. The subsequent job will fail if the selected host does not exist.
        BackupStatus="failed"
        RetryOnStandby=$(echo "$Inputs" | jq -r '.BackupStandbyOnPrimaryFailure // "yes"')
        echo "BackupOnStandby: $RetryOnStandby"
        if [[ "$RetryOnStandby" == "yes" ]]; then
          if [[ $(echo "$Inputs" | jq -r '.RmanTarget') =~ standbydb2 ]]; then
            echo "No Further Retry Targets Available"
            BackupStatus="fatal"
          else
            Inputs=$(echo "$Inputs" | jq '.Phase = "Retrying Failed Backup on Standby"')
            echo "Selecting Retry Target"
            # If we are allowed to retry a failed backup then change to the next host in sequence.
            # standbydb1 must be advanced before primarydb, otherwise a primary would jump to standbydb2.
            Inputs=$(echo "$Inputs" | jq '.RmanTarget |= gsub("standbydb1"; "standbydb2")')
            Inputs=$(echo "$Inputs" | jq '.RmanTarget |= gsub("primarydb"; "standbydb1")')
            # The IsRetryJob parameter is simply used for informational purposes to indicate
            # a job which was attempted on the originally selected host but had to be retried
            # on another host due to a failure.
            Inputs=$(echo "$Inputs" | jq '.IsRetryJob = "yes"')
          fi
        fi
      fi
      # "// empty" turns a missing key into an empty string rather than the text
      # "null", so that the mandatory-parameter checks below can actually fire.
      RmanTarget=$(echo "$Inputs" | jq -r '.RmanTarget // empty')
      TargetEnvironment=$(echo "$Inputs" | jq -r '.TargetEnvironment // empty')
      TargetHost=$(echo "$Inputs" | jq -r '.TargetHost // empty')
      Period=$(echo "$Inputs" | jq -r '.Period // empty')
      IsRetryJob=$(echo "$Inputs" | jq -r '.IsRetryJob // "no"')
      BackupStandbyOnPrimaryFailure=$(echo "$Inputs" | jq -r '.BackupStandbyOnPrimaryFailure // "yes"')
      # Carry the source versions through re-entry so later phases check out
      # the same code as the run that started the backup.
      SourceCodeVersion=$(echo "$Inputs" | jq -r '.SourceCodeVersion // empty')
      SourceConfigVersion=$(echo "$Inputs" | jq -r '.SourceConfigVersion // empty')
      # The following 2 parameters are mandatory; abort if they are missing from JSON inputs
      if [[ -z "$TargetEnvironment" ]]; then
        echo "Target Environment is not specified in JSON inputs"
        exit 3
      fi
      if [[ -z "$Period" ]]; then
        echo "Backup Period (daily/weekly) is not specified in JSON inputs"
        exit 4
      fi
      if [[ -z "$RmanTarget" ]]; then
        # We calculate the inventory name for the RMAN target by combining the environment name
        # and desired host name. This is also added back into the Inputs JSON.
        if [[ -z "$TargetHost" ]]; then
          echo "Neither RmanTarget nor TargetHost is specified in JSON inputs"
          exit 5
        fi
        set_rman_target "$TargetEnvironment" "$TargetHost"
        Inputs=$(echo "$Inputs" | jq --arg RMANTARGET "$RmanTarget" '.RmanTarget = $RMANTARGET')
      else
        echo "$RmanTarget"
      fi
    fi

    echo "RmanTarget: $RmanTarget"
    echo "Inputs: $Inputs"
    # Inputs is published as a JSON-encoded string so that it stays on one line.
    EncodedInputs=$(echo "$Inputs" | jq @json)
    echo "Inputs=$EncodedInputs"
    {
      echo "RmanTarget=$RmanTarget"
      echo "TargetEnvironment=$TargetEnvironment"
      echo "BackupStatus=$BackupStatus"
      # Declared as a job output, so it must be written here for the
      # "no retry allowed" check in the backup job to see it.
      echo "BackupStandbyOnPrimaryFailure=${BackupStandbyOnPrimaryFailure:-yes}"
      echo "Inputs=$EncodedInputs"
      echo "EnableTrace=${EnableTrace:-no}"
      echo "SourceCodeVersion=$SourceCodeVersion"
      echo "SourceConfigVersion=$SourceConfigVersion"
      echo "IsRetryJob=$IsRetryJob"
    } >> "$GITHUB_OUTPUT"
- name: Prepare Slack Channel Name
  id: prepare-slack-channel-name
  # Use the environment resolved by the previous step rather than
  # github.event.inputs, which is empty for repository_dispatch and
  # workflow_call runs and would send production alerts to the dev channel.
  env:
    ResolvedEnvironment: ${{ steps.preparermantargetname.outputs.TargetEnvironment }}
  run: |
    # Only delius-core-prod is routed to the production alerts channel.
    if [[ "$ResolvedEnvironment" == "delius-core-prod" ]]; then
      echo "SlackChannel=delius-aws-oracle-prod-alerts" >> "$GITHUB_OUTPUT"
    else
      echo "SlackChannel=delius-aws-oracle-dev-alerts" >> "$GITHUB_OUTPUT"
    fi
# Runs the backup, or the post-backup validation, against the target chosen
# by build_rman_target_name, inside the operational-automation container.
oracle-backup:
  name: oracle-backup
  needs: build_rman_target_name
  environment: ${{needs.build_rman_target_name.outputs.TargetEnvironment}}
  runs-on: ubuntu-latest
  container:
    image: ghcr.io/ministryofjustice/hmpps-delius-operational-automation:0.78.0
  timeout-minutes: 1440
  env:
    backup_command: ansible-playbook operations/playbooks/oracle_backup/backup.yml
    validate_command: ansible-playbook operations/playbooks/oracle_backup/validate.yml
    inventory: inventory/ansible
    RmanTarget: "${{needs.build_rman_target_name.outputs.RmanTarget}}"
    TargetEnvironment: "${{needs.build_rman_target_name.outputs.TargetEnvironment}}"
    # One SSM parameter per RMAN target records the phase of its latest backup.
    SSMParameter: "/oracle-backups/${{needs.build_rman_target_name.outputs.RmanTarget}}"
    BackupStatus: "${{needs.build_rman_target_name.outputs.BackupStatus}}"
    BackupStandbyOnPrimaryFailure: "${{needs.build_rman_target_name.outputs.BackupStandbyOnPrimaryFailure}}"
    # The backup step passes $EnableTrace to Ansible as enable_trace, so it has
    # to be defined in this job. It is empty when the upstream job does not
    # expose an EnableTrace output.
    EnableTrace: "${{needs.build_rman_target_name.outputs.EnableTrace}}"
    SourceCodeVersion: "${{needs.build_rman_target_name.outputs.SourceCodeVersion}}"
    SourceConfigVersion: "${{needs.build_rman_target_name.outputs.SourceConfigVersion}}"
    # JSON-encoded string; steps decode it with "jq -r" before use.
    Inputs: "${{needs.build_rman_target_name.outputs.Inputs}}"
    SlackChannel: ${{ needs.build_rman_target_name.outputs.SlackChannel }}
    IsRetryJob: "${{needs.build_rman_target_name.outputs.IsRetryJob }}"
    ansible_config: operations/playbooks/ansible.cfg
  continue-on-error: false
  steps:
# Fetch only the backup playbooks and the Ansible config from this
# repository, placed under ./operations.
- name: Checkout hmpps-delius-operation-automation
  uses: actions/checkout@v4
  with:
    ref: "${{ env.SourceCodeVersion }}"
    path: operations
    fetch-depth: 0
    sparse-checkout-cone-mode: false
    sparse-checkout: |
      playbooks/oracle_backup
      playbooks/ansible.cfg
# Fetch the Ansible hosts and group_vars from the configuration-management
# repository, placed under ./inventory.
- name: Checkout Ansible Inventory From modernisation-platform-configuration-management
  uses: actions/checkout@v4
  with:
    repository: ministryofjustice/modernisation-platform-configuration-management
    ref: "${{ env.SourceConfigVersion }}"
    path: inventory
    fetch-depth: 0
    sparse-checkout-cone-mode: false
    sparse-checkout: |
      ansible/hosts
      ansible/group_vars
# Fetch the two shared roles from the configuration-management repository,
# placed under ./roles.
- name: Checkout Ansible Required Roles From modernisation-platform-configuration-management
  uses: actions/checkout@v4
  with:
    repository: ministryofjustice/modernisation-platform-configuration-management
    ref: "${{ env.SourceConfigVersion }}"
    path: roles
    fetch-depth: 0
    sparse-checkout-cone-mode: false
    sparse-checkout: |
      ansible/roles/secretsmanager-passwords
      ansible/roles/get-modernisation-platform-facts
# Assume the CI/CD role in the target account through OIDC; the session
# is requested for 21600 seconds.
- name: Configure AWS Credentials
  id: login-aws
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: "eu-west-2"
    role-to-assume: "arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/modernisation-platform-oidc-cicd"
    role-session-name: "hmpps-delius-operational-automation-${{ github.run_number }}"
    role-duration-seconds: 21600
- name: Get Slack Token
  id: get-slack-token
  run: |
    # The OEM Account ID which is associated with this Delius account is held in the SSM Parameter Store.
    # The account_ids parameter value is itself JSON text: the tr/sed stages strip the escaping and the
    # surrounding quotes, then the first entry whose key contains "hmpps-oem-" is selected.
    OEM_ACCOUNT_ID=$(aws ssm get-parameter --name account_ids --region eu-west-2 --with-decryption --output json | \
      jq '.Parameter.Value' | tr -d '\' | sed 's/^\"//' | sed 's/\"$//' | \
      jq -r 'to_entries | map(select(.key | contains("hmpps-oem-"))) | first' | jq -r '.value' )
    # We can then access the Slack Token from the AWS Secrets for that OEM Account
    SECRET_ARN="arn:aws:secretsmanager:eu-west-2:${OEM_ACCOUNT_ID}:secret:/oracle/database/EMREP/shared-passwords"
    SECRET_VALUE=$(aws secretsmanager get-secret-value --secret-id "${SECRET_ARN}" --query SecretString --output json)
    SLACK_TOKEN=$(echo ${SECRET_VALUE} | jq -r | jq -r 'to_entries[] | select(.key=="slack_token").value')
    # Register the token as a secret before publishing it, so that it is
    # redacted if it ever appears in the logs of this or a later step.
    if [ -n "${SLACK_TOKEN}" ]; then
      echo "::add-mask::${SLACK_TOKEN}"
    fi
    echo "slack_token=${SLACK_TOKEN}" >> $GITHUB_OUTPUT
- name: Check And Set Backup Runtime
  id: check-and-set-backup-runtime
  shell: bash
  run: |
    if [[ "$BackupStatus" == "fatal" ]]; then
      echo "All Possible Backup Retry Targets Exhausted"
      # jq "=" assigns the phase. "==" is a comparison and would replace the
      # whole status document with the boolean false.
      Inputs=$(echo "$Inputs" | jq -r | jq '.Phase = "Failed And No More Retry Targets Remain"' | jq @json)
      aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite --value "$(echo "$Inputs" | jq -r)"
      exit 11
    fi
    # The SSM Parameter store is used to record information about a backup run. This is mostly done simply for
    # informational purposes and is not used to marshal calls to steps in the backup (those are done instead by
    # repository_dispatch events).
    # However, the ONE case where we DO act on the contents of the parameter store is to abort a backup if there
    # is already a backup running on the same host. This is detected if the previous backup is not at the
    # Inactive, Done or Failed phase.
    # (Ignore errors fetching the parameter - this happens if the backup has not been run before: we
    #  just default this to "Inactive")
    RUNTIME=$(aws ssm get-parameter --region ${AWS_REGION} --name "$SSMParameter" --query "Parameter.Value" --output text) || true
    if echo "$RUNTIME" | jq -r 2>/dev/null | jq 'if type == "object" then "VALID_JSON_OBJECT" else "not" end' | grep -q VALID_JSON_OBJECT; then
      PHASE=$(echo "${RUNTIME:-"{}"}" | jq -r '.Phase // "Inactive"')
    else
      echo "Invalid JSON in $SSMParameter - resetting to Inactive"
      SSMParameterValue="{\"Phase\":\"Inactive\"}"
      aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite --value "$SSMParameterValue"
      PHASE="Inactive"
    fi
    # Initialize the Backup Status for this RMAN Backup Target
    if [[ "${{ github.event_name }}" != "repository_dispatch" ]]; then
      # If this is a scheduled or manually started job then check that it does not conflict with
      # one which is already in progress.
      # This check is done by looking at the current phase of the backup as written to the SSM
      # parameter store.
      if [[ $PHASE != "Inactive" ]] && [[ $PHASE != "Done" ]] && [[ ! $PHASE =~ Failed ]]; then
        # If we have detected what looks like a backup running concurrently, also check the
        # last time that the SSM parameter was updated. If it was over 23 hours ago (82800 seconds)
        # then we can assume that the previous backup has failed in such a way that it was
        # unable to update the SSM parameter first, so we assume it is no longer running and
        # start the new backup anyway.
        # NOTE(review): this arithmetic assumes LastModifiedDate is printed as epoch seconds
        # ("<seconds>.<fraction>"). An AWS CLI configured to print ISO 8601 timestamps would
        # break it - confirm against the CLI shipped in the job container.
        LAST_UPDATED=$(aws ssm get-parameter --region ${AWS_REGION} --name "$SSMParameter" --query "Parameter.LastModifiedDate" --output text | cut -d. -f1)
        CURRENT_EPOCH=$(date +%s)
        if [[ $((CURRENT_EPOCH-LAST_UPDATED)) -le 82800 ]]; then
          echo "Backup is already active on this host. Multiple concurrent runs are not allowed."
          Inputs=$(echo "$Inputs" | jq -r | jq '.Phase = "Concurrent Backup Attempt"' | jq @json)
          # A shell variable does not outlive this step. Publish the updated document
          # through GITHUB_ENV so the failure handler sees the "Concurrent" phase and
          # leaves the SSM parameter of the run that is already in progress untouched.
          echo "Inputs=$Inputs" >> "$GITHUB_ENV"
          exit 1
        fi
      fi
    fi
    aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite \
      --value "$(echo "$Inputs" | jq -r )" \
      --description "Runtime phase and status details of RMAN backups running on the $RmanTarget host"
- name: Start Ansible Backup On Selected Host
  if: env.BackupStatus != 'success'
  id: backup
  continue-on-error: false
  shell: bash
  run: |
    # This step starts running the backup by calling the Ansible backup play
    # with the appropriate inputs.
    # This play is intended to run asynchronously, so once it calls the rman_backup.sh shell
    # script, it does not wait for the script to complete. This is because:
    # (1) Some backups can exceed the maximum 6 hour limit for a job on a hosted GitHub Runner
    # (2) GitHub Runners are billed all the time they are active, even if they are not doing
    #     anything. Running the backup asynchronously allows the runner to complete as soon
    #     as the backup starts and therefore saves on hosting costs.
    # The backup script, rman_backup.sh, is itself responsible for raising a repository_dispatch
    # event to re-enter this workflow so steps after the backup can continue, whether the backup has
    # succeeded or failed.
    echo "Period: "$(echo "$Inputs" | jq -r | jq -r '.Period' )
    # The following section determines what happens if the previous attempt to run the backup has
    # failed:
    #   If BackupStandbyOnPrimaryFailure is set to "yes" and a Standby database exists, then the
    #   backup may be retried on the Standby. Otherwise the job aborts with a failure since there
    #   is nowhere else where the backup may be attempted.
    echo "BackupStatus: $BackupStatus"
    echo "BackupStandbyOnPrimaryFailure: $BackupStandbyOnPrimaryFailure"
    echo "RmanTarget: $RmanTarget"
    if [[ "$BackupStatus" == "failed" ]]; then
      echo "The Backup Failed"
    fi
    # Diagnostic output only: report whether the selected standby has a group_vars file.
    if [[ "$RmanTarget" =~ "standbydb1" ]]; then
      echo "Target is 1st Standby"
      echo "Standby Config File is ${inventory}/group_vars/${RmanTarget/primarydb/standbydb1}.yml"
      if [[ ! -e ${inventory}/group_vars/${RmanTarget/primarydb/standbydb1}.yml ]]; then
        echo "Standby config file does not exist"
      fi
    fi
    if [[ "$RmanTarget" =~ "standbydb2" ]]; then
      echo "Target is 2nd Standby"
      echo "Standby Config File is ${inventory}/group_vars/${RmanTarget/primarydb/standbydb2}.yml"
      if [[ ! -e ${inventory}/group_vars/${RmanTarget/primarydb/standbydb2}.yml ]]; then
        echo "Standby config file does not exist"
      fi
    fi
    # Each abort below records its reason in the SSM parameter and uses its own exit code.
    if [[ "$BackupStatus" == "failed" ]]; then
      if [[ "$BackupStandbyOnPrimaryFailure" == "no" ]]; then
        echo "Backup Failed And No Retry Allowed"
        Inputs=$(echo $Inputs | jq -r | jq '.Phase = "Failed And No Retry Allowed"' | jq @json)
        aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite --value "$(echo $Inputs | jq -r)"
        exit 10
      else
        if [[ "$RmanTarget" =~ "standbydb1" ]] && [[ ! -e ${inventory}/group_vars/${RmanTarget/primarydb/standbydb1}.yml ]]; then
          echo "Failed And No Standby Exists"
          Inputs=$(echo $Inputs | jq -r | jq '.Phase = "Failed And No Standby Exists"' | jq @json)
          aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite --value "$(echo $Inputs | jq -r)"
          exit 20
        fi
        if [[ "$RmanTarget" =~ "standbydb2" ]] && [[ ! -e ${inventory}/group_vars/${RmanTarget/primarydb/standbydb2}.yml ]]; then
          echo "Failed on First Standby And No Second Standby Exists"
          Inputs=$(echo $Inputs | jq -r | jq '.Phase = "Failed on First Standby And No Second Standby Exists"' | jq @json)
          aws ssm put-parameter --region ${AWS_REGION} --name "$SSMParameter" --type String --overwrite --value "$(echo $Inputs | jq -r)"
          exit 30
        fi
      fi
    fi
    export ANSIBLE_CONFIG=$ansible_config
    # A symlink is used to access the Backup Role as if it were a standalone Ansible Role
    ln -s $PWD/roles/ansible/roles $PWD/operations/playbooks/oracle_backup/roles
    # EnableTrace falls back to "no" when it is not set in this job's environment, so that
    # enable_trace is never handed to the play as an empty value.
    $backup_command -i $inventory \
      -e ansible_aws_ssm_bucket_name=${{ vars.ANSIBLE_AWS_SSM_BUCKET_NAME }} \
      -e rman_target=$RmanTarget \
      -e daily_weekly=$(echo "$Inputs" | jq -r | jq -r '.Period' ) \
      -e enable_trace=${EnableTrace:-no} \
      -e ssm_parameter=$SSMParameter \
      -e repository_dispatch="${{ github.repository }}" \
      -e json_inputs="$Inputs" \
      $(echo "$Inputs" | jq -r | jq -r '.VerboseOutput // ""')
# Announce on Slack that a failed backup is being retried on another host.
- name: Report Backup Retry Attempt
  id: slack-retry
  if: env.BackupStatus == 'failed'
  uses: slackapi/slack-github-action@v1.27.0
  env:
    SLACK_BOT_TOKEN: ${{ steps.get-slack-token.outputs.slack_token }}
  with:
    channel-id: "${{ env.SlackChannel }}"
    payload: |
      {"username":"Retrying Failed Backup on Standby","icon_emoji":"spinning-circle-of-death","text":"Retrying backup on: ${{ env.RmanTarget }}"}
# Successful backups are normally silent to keep the channel uncluttered;
# only a success that follows a retry after an earlier failure is reported.
- name: Report Backup Retry Attempt Success
  id: slack-retry-success
  if: env.BackupStatus == 'success' && env.IsRetryJob == 'yes'
  uses: slackapi/slack-github-action@v1.27.0
  env:
    SLACK_BOT_TOKEN: ${{ steps.get-slack-token.outputs.slack_token }}
  with:
    channel-id: "${{ env.SlackChannel }}"
    payload: |
      {"username":"Retrying Failed Backup on Standby Successful","icon_emoji":"github-tick","text":"Retrying Failed Backup on Standby Successful for: ${{ env.RmanTarget }}"}
- name: Run Ansible Validate And Fix Absent Chunks
  if: env.BackupStatus == 'success'
  run: |
    # Reached only through a repository_dispatch event reporting a successful backup.
    # A second Ansible play then confirms that every backup chunk piece is present in the
    # S3 bucket. This is a safeguard kept from earlier incidents in which the backup
    # metadata did not match the chunks actually stored. Recent OSBWS versions have not
    # shown the problem and it is believed fixed, but Oracle never identified the root
    # cause, so the check stays on.
    # This play runs synchronously (unlike the backup play) and is not expected to
    # approach the job time limits.
    export ANSIBLE_CONFIG=$ansible_config
    # Expose the checked-out shared roles to the playbook directory.
    ln -s $PWD/roles/ansible/roles $PWD/operations/playbooks/oracle_backup/roles
    $validate_command -i $inventory \
      -e ansible_aws_ssm_bucket_name=${{ vars.ANSIBLE_AWS_SSM_BUCKET_NAME }} \
      -e rman_target=$RmanTarget \
      -e fix_absent_chunks=$(echo "$Inputs" | jq -r | jq -r '.FixAbsentChunks // "yes"') \
      $(echo "$Inputs" | jq -r | jq -r '.VerboseOutput // ""')
# Mark the run as finished: set the phase to "Done" and store the status
# document in this target's SSM parameter.
- name: Record Job Completed in SSM Parameter Store
  if: env.BackupStatus == 'success'
  run: |
    Inputs=$(echo $Inputs | jq -r | jq '.Phase = "Done"' | jq @json)
    aws ssm put-parameter \
      --region ${AWS_REGION} \
      --name "$SSMParameter" \
      --type String \
      --overwrite \
      --value "$(echo $Inputs | jq -r)"
- name: Record Job Failure in SSM Parameter Store
  if: failure()
  shell: bash
  run: |
    # Append a failure marker to whatever phase the run had reached.
    Inputs=$(echo $Inputs | jq -r | jq '.Phase += " -- Failed"' | jq @json)
    # A run rejected as a concurrent attempt must not overwrite the parameter
    # that belongs to the original run, so the update is skipped in that case.
    if [[ ! $(echo $Inputs | jq -r | jq '.Phase') =~ Concurrent ]]; then
      aws ssm put-parameter \
        --region ${AWS_REGION} \
        --name "$SSMParameter" \
        --type String \
        --overwrite \
        --value "$(echo $Inputs | jq -r)"
    fi
# Post a failure message with links to the run, job, repository and RMAN
# target whenever any earlier step in this job has failed.
- name: Slack Failure Notification
  id: slack-failure
  if: failure()
  uses: slackapi/slack-github-action@v1.27.0
  env:
    SLACK_BOT_TOKEN: ${{ steps.get-slack-token.outputs.slack_token }}
  with:
    channel-id: "${{ env.SlackChannel }}"
    payload: |
      {"username":"Failed Backup","icon_emoji":"red_circle","text":"Failed Backup","blocks":[{"type": "section","text": {"type": "mrkdwn","text": "Failed Backup:"}},{"type": "section","fields":[{"type": "mrkdwn","text": "*Workflow:*\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ github.workflow }}>"},{"type": "mrkdwn","text": "*Job:*\n${{ github.job }}"},{"type": "mrkdwn","text": "*Repo:*\n${{ github.repository }}"},{"type": "mrkdwn","text": "*Rman Target:*\n${{ env.RmanTarget }}"}]}]}