From 467294b734e3d426825706ff501b32dfc5f4f4d4 Mon Sep 17 00:00:00 2001 From: "Ruben L. Mendoza" Date: Fri, 8 Nov 2024 14:50:08 -0500 Subject: [PATCH] Fix minute replication job script (#348) * Update livenessProbe for replication job * Update script for minute replication file process * Update replication script to send message to slack * Reset gitsha for web api --- images/replication-job/Dockerfile | 3 +- images/replication-job/start.sh | 89 +++++++++++++------ .../jobs/replication-job-deployment.yaml | 10 ++- osm-seed/values.yaml | 3 + 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/images/replication-job/Dockerfile b/images/replication-job/Dockerfile index 992421f8..843d7c1a 100644 --- a/images/replication-job/Dockerfile +++ b/images/replication-job/Dockerfile @@ -5,7 +5,8 @@ RUN apt-get update && \ nginx \ python3-pip \ python3-venv \ - procps && \ + procps \ + curl && \ rm -rf /var/lib/apt/lists/* RUN python3 -m venv /opt/venv diff --git a/images/replication-job/start.sh b/images/replication-job/start.sh index 5653eb7a..30453c14 100755 --- a/images/replication-job/start.sh +++ b/images/replication-job/start.sh @@ -8,6 +8,9 @@ else echo JAVACMD_OPTIONS=\"-server -Xmx$memory\" >~/.osmosis fi +slack_message_count=0 +max_slack_messages=2 + workingDirectory="/mnt/data" mkdir -p $workingDirectory @@ -49,6 +52,10 @@ function get_current_state_file() { --file $workingDirectory/state.txt --query="name" fi fi + else + echo "File $workingDirectory/state.txt exist in local storage" + echo "File $workingDirectory/state.txt content:" + cat $workingDirectory/state.txt fi } @@ -70,30 +77,71 @@ function upload_file_cloud() { fi } +function send_slack_message() { + # Check if Slack messaging is enabled + if [ "${ENABLE_SEND_SLACK_MESSAGE}" != "true" ]; then + echo "Slack messaging is disabled. Set ENABLE_SEND_SLACK_MESSAGE to true to enable." + return + fi + + # Check if the Slack webhook URL is set + if [ -z "${SLACK_WEBHOOK_URL}" ]; then + echo "SLACK_WEBHOOK_URL is not set. Unable to send message to Slack." + return 1 + fi + + # Limit Slack message count to 3 + if [ "$slack_message_count" -ge "$max_slack_messages" ]; then + echo "Max Slack messages limit reached. No further messages will be sent." + return + fi + + local message="$1" + curl -X POST -H 'Content-type: application/json' --data "{\"text\": \"$message\"}" "$SLACK_WEBHOOK_URL" + echo "Message sent to Slack: $message" + slack_message_count=$((slack_message_count + 1)) +} + + function monitor_minute_replication() { - # Function to handle continuous monitoring, minutminutes replication and upload to cloud provider - # Directory to store a log of processed files + # Function to handle continuous monitoring, minute replication, and sequential upload to cloud provider + # Directory to store a log of the last processed file processed_files_log="$workingDirectory/processed_files.log" max_log_size_mb=1 - while true; do - upload_file_cloud /mnt/data/state.txt - sleep 60s - done & - while true; do if [ -e "$processed_files_log" ]; then log_size=$(du -m "$processed_files_log" | cut -f1) if [ "$log_size" -gt "$max_log_size_mb" ]; then echo $(date +%F_%H:%M:%S)": Cleaning processed_files_log..." >"$processed_files_log" fi - for local_minute_file in $(find $workingDirectory/ -cmin -1); do + # Find new .gz files created within the last minute + for local_minute_file in $(find $workingDirectory/ -name "*.gz" -cmin -1); do if [ -f "$local_minute_file" ]; then - if grep -q "$local_minute_file" "$processed_files_log"; then - continue + echo "Processing $local_minute_file..." + # Ensure the file is uploaded only once + if ! grep -q "$local_minute_file: SUCCESS" "$processed_files_log" && ! grep -q "$local_minute_file: FAILURE" "$processed_files_log"; then + # Verify gz file integrity + if gzip -t "$local_minute_file" 2>/dev/null; then + # Upload the file sequentially + upload_file_cloud $local_minute_file + echo "$local_minute_file: SUCCESS" >>"$processed_files_log" + # Upload and update state.txt after successful upload + upload_file_cloud "$workingDirectory/state.txt" + else + echo $(date +%F_%H:%M:%S)": $local_minute_file is corrupted and will not be uploaded." >>"$processed_files_log" + echo "$local_minute_file: FAILURE" >>"$processed_files_log" + # Ensure state.txt maintains the current ID to regenerate the corrupted file + current_state_id=$(( $(echo "$local_minute_file" | sed 's/[^0-9]//g' | sed 's/^0*//') - 1 )) + sed -i "s/sequenceNumber=.*/sequenceNumber=$current_state_id/" "$workingDirectory/state.txt" + rm "$local_minute_file" + echo "Stopping any existing Osmosis processes..." + pkill -f "osmosis.*--replicate-apidb" + echo "Regenerating $local_minute_file..." + send_slack_message "${ENVIROMENT}: Corrupted file $local_minute_file detected. Regenerating the file..." + generate_replication + fi fi - upload_file_cloud $local_minute_file - echo "$local_minute_file" >>"$processed_files_log" fi done else @@ -120,23 +168,6 @@ function generate_replication() { workingDirectory=$workingDirectory } -# function start_nginx() { -# if [ "$STAR_NGINX_SERVER" = "true" ]; then -# echo 'server { -# listen 8080; -# server_name localhost; - -# location / { -# root /mnt/data; -# index index.html; -# } -# }' >/etc/nginx/nginx.conf -# service nginx restart -# else -# echo "STAR_NGINX_SERVER is either not set or not set to true." -# fi -# } - ######################## Start minutes replication process ######################## get_current_state_file flag=true diff --git a/osm-seed/templates/jobs/replication-job-deployment.yaml b/osm-seed/templates/jobs/replication-job-deployment.yaml index fe2ea3a8..5c18bdeb 100644 --- a/osm-seed/templates/jobs/replication-job-deployment.yaml +++ b/osm-seed/templates/jobs/replication-job-deployment.yaml @@ -28,9 +28,9 @@ spec: - /bin/bash - -c - /liveness.sh - initialDelaySeconds: 10 + initialDelaySeconds: 60 timeoutSeconds: 5 - periodSeconds: 10 + periodSeconds: 30 failureThreshold: 3 {{- if .Values.replicationJob.resources.enabled }} resources: @@ -78,6 +78,12 @@ spec: - name: MEMORY_JAVACMD_OPTIONS value: {{ .Values.replicationJob.resources.requests.memory | default "2Gi" | quote}} {{- end }} + - name: ENABLE_SEND_SLACK_MESSAGE + value: {{ .Values.replicationJob.env.ENABLE_SEND_SLACK_MESSAGE | quote}} + - name: SLACK_WEBHOOK_URL + value: {{ .Values.replicationJob.env.SLACK_WEBHOOK_URL | quote}} + - name: ENVIROMENT + value: {{ .Values.environment | quote}} {{- if .Values.replicationJob.nodeSelector.enabled }} nodeSelector: {{ .Values.replicationJob.nodeSelector.label_key }} : {{ .Values.replicationJob.nodeSelector.label_value }} diff --git a/osm-seed/values.yaml b/osm-seed/values.yaml index b30717bf..2a345d6d 100644 --- a/osm-seed/values.yaml +++ b/osm-seed/values.yaml @@ -293,6 +293,9 @@ replicationJob: image: name: "" tag: "" + env: + ENABLE_SEND_SLACK_MESSAGE: "false" + SLACK_WEBHOOK_URL: "null" resources: enabled: false requests: