Skip to content

Commit

Permalink
wip: threshold-based diffing
Browse files Browse the repository at this point in the history
  • Loading branch information
in03 committed Oct 12, 2022
1 parent bef1f34 commit ca3d2cf
Showing 1 changed file with 40 additions and 59 deletions.
99 changes: 40 additions & 59 deletions rex/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from yaspin import yaspin
from glob import glob
from difflib import SequenceMatcher

import hashlib
from pydavinci import davinci

rich_tracebacks.install()
Expand All @@ -25,14 +25,22 @@
class Backup:
def __init__(self):

# TODO: Ensure no wrongful file collisions
# Use database name, type, ip address, and project path/folder structure in project manager
# to create a unique hash for the backup name. Keep the timestamp. So backups are still unique.

self.project = resolve.project
self.db_name = resolve.project_manager.db["DbName"]
self.timestamp = datetime.now().strftime("%H%M%S")
self.backup_filename = f"{self.project.name}_{self.timestamp}.drp"
static_dir = os.path.normpath(settings["backup"]["static_dir"])
self.backup_filepath = os.path.join(static_dir, self.backup_filename)

print(f"Backup Name: {self.backup_filename}")
print(f"Backup Path: {self.backup_filepath}")
self.backup_filename = (
f"{self.db_name}_{self.project.name}_{self.timestamp}.drp"
)
self.static_dir = os.path.normpath(settings["backup"]["static_dir"])
self.backup_filepath = os.path.join(self.static_dir, self.backup_filename)

print(f"Backup Name: '{self.backup_filename}'")
print(f"Backup Path: '{self.static_dir}'")

def run(self, generate_checksum: bool = True, de_duplicate: bool = True):
"""
Expand All @@ -43,23 +51,30 @@ def run(self, generate_checksum: bool = True, de_duplicate: bool = True):
de_duplicate (bool, optional): Compare neighbouring checksums and symlink file if they're identical. Defaults to True.
"""

if not generate_checksum and de_duplicate:
raise ValueError("generate_checksum must be enabled to de-duplicate!")

if de_duplicate:
logger.info("Gathering existing checksums...")
self.gather_checksums()
self.gather_project_series()

logger.info("Exporting project backup...")
self.export_project()

if de_duplicate:
logger.info("De-duplicating...")
self.de_duplicate()

if generate_checksum:
logger.info("Generating checksum...")
self.generate_checksum()

if de_duplicate:
logger.info("De-duplicating...")
self.de_duplicate()
def generate_checksum(self):

hash_md5 = hashlib.md5()
with open(self.backup_filepath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)

with open(self.backup_filepath + ".md5", "x") as checksum_file:
checksum_file.write(hash_md5.hexdigest())

def export_project(self):

Expand All @@ -69,30 +84,13 @@ def export_project(self):
stills_and_luts=True,
)

def generate_checksum(self):

assert os.path.exists(self.backup_filepath)

def md5sum(file_):
hash = md5()
with open(file_, "rb") as f:
for chunk in iter(lambda: f.read(128 * hash.block_size), b""):
hash.update(chunk)
return hash.hexdigest()

self.checksum_filepath = self.backup_filepath + ".md5"
with open(self.checksum_filepath, "x") as checksum_file:
checksum_file.write(md5sum(self.backup_filepath))

def gather_checksums(self):
def gather_project_series(self):
"""
Get checksums.
Gathers all checksums that match the project backup series.
Get all projects in the project series (all backups matching current project)
"""
self.backup_filepath = os.path.normpath(self.backup_filepath)
parent_dir = os.path.dirname(self.backup_filepath)
self.matching_checksums = glob(f"{parent_dir}/*{self.project.name}*.drp.md5")
self.matching_checksums = glob(f"{parent_dir}/*{self.project.name}*.drp")
sorted(
self.matching_checksums,
key=lambda name: os.path.getmtime(os.path.join(name)),
Expand All @@ -103,47 +101,30 @@ def de_duplicate(self):

most_recent_checksums = self.matching_checksums[:3]

def compare_checksums(x: str, y: str):
"""
Compare two checksum files.
Args:
x (str): First file to compare
y (str): Second file to compare
def compare_projects(x: str, y: str):

Returns:
bool: Returns True if they match
"""
file_x = open(x, "rb").read()
file_y = open(y, "rb").read()
m = SequenceMatcher(None, file_x, file_y)
ratio = m.quick_ratio()

file_x = open(x, "rb")
file_y = open(y, "rb")
logger.debug(f"[magenta]Match ratio is: {ratio}")

checksum_x = file_x.read()
checksum_y = file_y.read()
logger.debug(f"[magenta]{checksum_x} - {checksum_y}")

if checksum_x == checksum_y:
if ratio >= 0.990:
return True

return False

matching_checksum = None

x = self.checksum_filepath
x = self.backup_filepath
for y in most_recent_checksums:
logger.debug(
f"[magenta]Comparing checksum files: '{os.path.basename(x)}' and '{os.path.basename(y)}'"
)
if compare_checksums(x, y):
if compare_projects(x, y):
matching_checksum = y
break

if matching_checksum:
logger.info(f"Most recent match {matching_checksum}")

def compare_projects(self):

text1 = open(file1).read()
text2 = open(file2).read()
m = SequenceMatcher(None, text1, text2)
m.ratio()

0 comments on commit ca3d2cf

Please sign in to comment.