From 873cdfdf9770915b5210182b1c113d27d2217ed7 Mon Sep 17 00:00:00 2001
From: Matt Reimer
Date: Thu, 25 Apr 2024 16:19:36 -0700
Subject: [PATCH] fixing md5 hashes #120

---
 src/classes/data_exchange/uploader.py |  2 +-
 src/classes/util.py                   | 46 +++++++++++----------
 2 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/src/classes/data_exchange/uploader.py b/src/classes/data_exchange/uploader.py
index 6068fd8..00e7f3a 100644
--- a/src/classes/data_exchange/uploader.py
+++ b/src/classes/data_exchange/uploader.py
@@ -188,7 +188,7 @@ def handle_done():
                         self.file_upload_log(f"ERROR: uploading chunk {start}-{end} to {url}: {self.error}", Qgis.Critical)
                         # Don't return here so we can retry
                     else:
-                        self.file_upload_log(f"SUCCESS: Finished uploading chunk {start}-{end}", Qgis.Info)
+                        self.file_upload_log(f"SUCCESS: Finished uploading chunk {start:,}-{end:,}", Qgis.Info)
                         return True
             except Exception as e:
                 self.uploaded_size = original_size
diff --git a/src/classes/util.py b/src/classes/util.py
index b9c5b92..b9e8b66 100644
--- a/src/classes/util.py
+++ b/src/classes/util.py
@@ -131,14 +131,17 @@ def error_level_to_str(level: int) -> str:
         return 'Info'
 
 
-def calculate_etag(
-    file_path: str,
-    chunk_size_bytes: int = MULTIPART_CHUNK_SIZE,
-    chunk_thresh_bytes: int = MULTIPART_THRESHOLD
-) -> Dict[str, str]:
+def calculate_etag(file_path: str) -> Dict[str, str]:
     """ We need a way to calculate the ETag for a file, which is a hash of the file contents.
-        If the file is small enough, we can just hash the whole thing.
-        If it's too big, we need to hash it in chunks.
+
+        NOTE: This used to use the multi-part upload method, where files larger than the chunk
+        size get an etag that is a hash of hashes, suffixed with the number of parts.
+
+        That is no longer necessary because the file we are comparing against has been copied,
+        so the etag we're looking for is simply the MD5 hash of the entire file.
+
+        What really matters here is calculating the MD5 as efficiently as possible,
+        so that we're not loading the whole file into memory just to compute the hash.
 
         This should mirror the way AWS S3 calculates ETags for multipart uploads.
 
@@ -152,31 +155,18 @@ def calculate_etag(
 
     """
     file_size_in_bytes = os.path.getsize(file_path)
 
-    if file_size_in_bytes < chunk_thresh_bytes:
-        with open(file_path, 'rb') as f:
-            fbytes_read = f.read()
-            etag = hashlib.md5(fbytes_read).hexdigest()
-    else:
-        parts = file_size_in_bytes // chunk_size_bytes
-        if file_size_in_bytes % chunk_size_bytes > 0:
-            parts += 1
-        total_md5 = ''
-        with open(file_path, 'rb') as file:
-            for part in range(parts):
-                skip_bytes = chunk_size_bytes * part
-                total_bytes_left = file_size_in_bytes - skip_bytes
-                bytes_to_read = min(total_bytes_left, chunk_size_bytes)
-                file.seek(skip_bytes)
-                buffer = file.read(bytes_to_read)
-                total_md5 += hashlib.md5(buffer).hexdigest()
-        combined_hash = hashlib.md5(bytes.fromhex(total_md5)).hexdigest()
-        etag = f'{combined_hash}-{parts}'
+    hash_md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            hash_md5.update(chunk)
+    etag = hash_md5.hexdigest()
 
     return {
         'size': file_size_in_bytes,
-        'etag': f'"{etag}"',
+        'etag': f'"{etag}"'
     }
 
+
 def humane_bytes(size: int, precision: int = 1) -> str:
     """ Convert a byte size to a human readable string.
@@ -193,4 +183,4 @@ def humane_bytes(size: int, precision: int = 1) -> str:
         if unit == 'B':
             precision = 0
 
-    return f"{size:.{precision}f} {unit}"
\ No newline at end of file
+    return f"{size:.{precision}f} {unit}"
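
Reviewer sketch (not part of the patch): the snippet below shows, under stated assumptions, how the reworked hashing is expected to be exercised: stream the file through MD5 in small chunks, quote the hex digest the same way calculate_etag() does, and compare it against the ETag recorded for the already-copied file. The helper name stream_md5_etag, the local path and the remote_etag value are illustrative assumptions, not code from this repository.

import hashlib
import os


def stream_md5_etag(file_path: str) -> str:
    # Same streaming approach as the patched calculate_etag(): read 8 KiB at a time
    # so large files never have to sit in memory in full.
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_md5.update(chunk)
    # Wrap in double quotes to match the '"<hex>"' form calculate_etag() returns.
    return f'"{hash_md5.hexdigest()}"'


# Hypothetical comparison against an ETag recorded for the already-copied file.
local_path = "project.rs.xml"                       # illustrative path only
remote_etag = '"9e107d9d372bb6826bd81d3542a419d6"'  # illustrative value only
if os.path.exists(local_path) and stream_md5_etag(local_path) == remote_etag:
    print("Local file matches the stored ETag; no upload needed")
else:
    print("Local file differs; upload it")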