diff --git a/mcserver/tasks.py b/mcserver/tasks.py index 009d0cb..20b804a 100644 --- a/mcserver/tasks.py +++ b/mcserver/tasks.py @@ -1,6 +1,7 @@ import os import json import requests +import time from http import HTTPStatus from django.conf import settings from django.core.files import File @@ -58,10 +59,10 @@ def download_session_archive(self, session_id, user_id=None): try: session_dir_path = SessionDirectoryConstructor().build(session_id) session_zip_path = zipdir(session_dir_path) - with open(session_zip_path, "rb") as archive: - log = DownloadLog.objects.create(task_id=str(self.request.id), user_id=user_id) - log.media.save(os.path.basename(session_zip_path), archive) - os.remove(session_zip_path) + + create_download_log(session_zip_path, self.request.id, user_id) + os_remove_with_retry(session_zip_path) + except Exception as e: # Delete files and send the traceback to Sentry if something went wrong if session_dir_path and os.path.isfile(session_dir_path): @@ -197,6 +198,38 @@ def submit_cloudwatch_metrics(): from mcserver.utils import submit_number_of_pending_trials_to_cloudwatch submit_number_of_pending_trials_to_cloudwatch() +# Helper functions +def create_download_log(zip_path, task_id, user_id, + max_retries=5, backoff=0.1): + archive = None + for attempt in range(max_retries): + try: + archive = open(zip_path, "rb") + break + + except (FileNotFoundError, PermissionError, OSError) as e: + if attempt == max_retries - 1: + raise + time.sleep(backoff * (2**attempt)) + + if archive is not None: + with archive: + log = DownloadLog.objects.create(task_id=task_id, user_id=user_id) + log.media.save(os.path.basename(zip_path), archive) + +def os_remove_with_retry(path, max_retries=5, backoff=0.1): + """ + os.remove(path) with exponential backoff for robustness in cloud + """ + for attempt in range(max_retries): + try: + os.remove(path) + return + + except (FileNotFoundError, PermissionError, OSError) as e: + if attempt == max_retries - 1: + raise + time.sleep(backoff * (2**attempt)) # TODO: temporary disabled - need testing # @shared_task diff --git a/mcserver/zipsession_v2.py b/mcserver/zipsession_v2.py index e186955..23a00da 100644 --- a/mcserver/zipsession_v2.py +++ b/mcserver/zipsession_v2.py @@ -1,6 +1,7 @@ import os import pathlib import shutil +import time import pickle import boto3 import zipfile @@ -316,16 +317,49 @@ def zipdir(dir_path, delete_directory_after_zip=True): if os.path.isfile(zip_path): os.remove(zip_path) - zip_file = zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) - for root, dirs, files in os.walk(dir_path): - for f in files: - zip_file.write( - os.path.join(root, f), - os.path.relpath(os.path.join(root, f), os.path.join(dir_path, '..')) - ) - zip_file.close() + zipdir_contents_with_retry(dir_path, zip_path) - if delete_directory_after_zip and os.path.exists(dir_path): - shutil.rmtree(dir_path) + if delete_directory_after_zip: + rmtree_with_retry(dir_path) return zip_path + +# Helper functions +def rmtree_with_retry(path, max_retries=5, backoff=0.1): + """ + Retries shutil.rmtree() to handle both race conditions inherent in the + function itself (prior to Python 3.13) and in containers with possible + file system delays or other race conditions. + """ + for attempt in range(max_retries): + try: + shutil.rmtree(path) + return + + except (FileNotFoundError, PermissionError, OSError) as e: + if attempt == max_retries - 1: + raise + time.sleep(backoff * (2**attempt)) + +def zipdir_contents_with_retry(dir_path, zip_path, max_retries=5, backoff=0.1): + """ + Given a source dir_path and a target zip_path, zip the folder with + retrying with exponential backoff to avoid temporary issues with + cloud storage. + """ + for attempt in range(max_retries): + try: + zip_file = zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) + for root, dirs, files in os.walk(dir_path): + for f in files: + zip_file.write( + os.path.join(root, f), + os.path.relpath(os.path.join(root, f), os.path.join(dir_path, '..')) + ) + zip_file.close() + return + + except (FileNotFoundError, PermissionError, OSError) as e: + if attempt == max_retries - 1: + raise + time.sleep(backoff * (2 ** attempt))