From 873cdfdf9770915b5210182b1c113d27d2217ed7 Mon Sep 17 00:00:00 2001
From: Matt Reimer
Date: Thu, 25 Apr 2024 16:19:36 -0700
Subject: [PATCH] fixing md5 hashes #120

---
 src/classes/data_exchange/uploader.py |  2 +-
 src/classes/util.py                   | 46 +++++++++++----------
 2 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/src/classes/data_exchange/uploader.py b/src/classes/data_exchange/uploader.py
index 6068fd8..00e7f3a 100644
--- a/src/classes/data_exchange/uploader.py
+++ b/src/classes/data_exchange/uploader.py
@@ -188,7 +188,7 @@ def handle_done():
                         self.file_upload_log(f"ERROR: uploading chunk {start}-{end} to {url}: {self.error}", Qgis.Critical)
                         # Don't return here so we can retry
                     else:
-                        self.file_upload_log(f"SUCCESS: Finished uploading chunk {start}-{end}", Qgis.Info)
+                        self.file_upload_log(f"SUCCESS: Finished uploading chunk {start:,}-{end:,}", Qgis.Info)
                         return True
             except Exception as e:
                 self.uploaded_size = original_size
diff --git a/src/classes/util.py b/src/classes/util.py
index b9c5b92..b9e8b66 100644
--- a/src/classes/util.py
+++ b/src/classes/util.py
@@ -131,14 +131,17 @@ def error_level_to_str(level: int) -> str:
         return 'Info'
 
 
-def calculate_etag(
-    file_path: str,
-    chunk_size_bytes: int = MULTIPART_CHUNK_SIZE,
-    chunk_thresh_bytes: int = MULTIPART_THRESHOLD
-) -> Dict[str, str]:
+def calculate_etag(file_path: str) -> Dict[str, str]:
     """ We need a way to calculate the ETag for a file, which is a hash of the file contents.
-        If the file is small enough, we can just hash the whole thing.
-        If it's too big, we need to hash it in chunks.
+
+        NOTE: This used to use the multi-part upload method, where files larger than the chunk
+        size get an etag that is a hash of hashes, suffixed with the number of parts.
+
+        That is no longer necessary because the file we are comparing against has been copied,
+        so the etag we're looking for is simply the MD5 hash of the entire file.
+
+        What really matters here is calculating the MD5 as efficiently as possible,
+        so that we're not loading the whole file into memory just to compute the hash.
 
         This should mirror the way AWS S3 calculates ETags for multipart uploads.
 
@@ -152,31 +155,18 @@ def calculate_etag(
 
     """
     file_size_in_bytes = os.path.getsize(file_path)
 
-    if file_size_in_bytes < chunk_thresh_bytes:
-        with open(file_path, 'rb') as f:
-            fbytes_read = f.read()
-            etag = hashlib.md5(fbytes_read).hexdigest()
-    else:
-        parts = file_size_in_bytes // chunk_size_bytes
-        if file_size_in_bytes % chunk_size_bytes > 0:
-            parts += 1
-        total_md5 = ''
-        with open(file_path, 'rb') as file:
-            for part in range(parts):
-                skip_bytes = chunk_size_bytes * part
-                total_bytes_left = file_size_in_bytes - skip_bytes
-                bytes_to_read = min(total_bytes_left, chunk_size_bytes)
-                file.seek(skip_bytes)
-                buffer = file.read(bytes_to_read)
-                total_md5 += hashlib.md5(buffer).hexdigest()
-        combined_hash = hashlib.md5(bytes.fromhex(total_md5)).hexdigest()
-        etag = f'{combined_hash}-{parts}'
+    hash_md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            hash_md5.update(chunk)
+    etag = hash_md5.hexdigest()
 
     return {
         'size': file_size_in_bytes,
-        'etag': f'"{etag}"',
+        'etag': f'"{etag}"'
     }
 
+
 def humane_bytes(size: int, precision: int = 1) -> str:
     """ Convert a byte size to a human readable string.
@@ -193,4 +183,4 @@ def humane_bytes(size: int, precision: int = 1) -> str:
         if unit == 'B':
             precision = 0
 
-    return f"{size:.{precision}f} {unit}"
\ No newline at end of file
+    return f"{size:.{precision}f} {unit}"
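
Reviewer sketch (not part of the patch): the snippet below shows, under stated assumptions, how the reworked hashing is expected to be exercised: stream the file through MD5 in small chunks, quote the hex digest the same way calculate_etag() does, and compare it against the ETag recorded for the already-copied file. The helper name stream_md5_etag, the local path and the remote_etag value are illustrative assumptions, not code from this repository.

import hashlib
import os


def stream_md5_etag(file_path: str) -> str:
    # Same streaming approach as the patched calculate_etag(): read 8 KiB at a time
    # so large files never have to sit in memory in full.
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_md5.update(chunk)
    # Wrap in double quotes to match the '"<hex>"' form calculate_etag() returns.
    return f'"{hash_md5.hexdigest()}"'


# Hypothetical comparison against an ETag recorded for the already-copied file.
local_path = "project.rs.xml"                       # illustrative path only
remote_etag = '"9e107d9d372bb6826bd81d3542a419d6"'  # illustrative value only
if os.path.exists(local_path) and stream_md5_etag(local_path) == remote_etag:
    print("Local file matches the stored ETag; no upload needed")
else:
    print("Local file differs; upload it")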