Skip to content

Commit

Permalink
Merge pull request #3 from jwestgard/checkbag
Browse files Browse the repository at this point in the history
add bag and partition folders
  • Loading branch information
jwestgard authored May 1, 2021
2 parents 737ec2c + 3cabb79 commit 727aabb
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 0 deletions.
82 changes: 82 additions & 0 deletions bag/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import argparse
import bagit
import json
import os
import sys


def parse_args():
    """Build the bagging tool's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace carrying ``path`` (required positional) and
        ``config`` (``None`` when the option is not supplied).
    """
    cli = argparse.ArgumentParser(
        description='Create APTrust-compatible BagIt bags',
    )

    # 'store' is argparse's default action, so it is not spelled out here.
    cli.add_argument('path', help='Root directory to be bagged')
    cli.add_argument(
        '-c', '--config',
        help='Path to a config file containing tag info',
    )
    cli.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s 0.1',
        help='Print version number and exit',
    )

    return cli.parse_args()


def main():
    """Run the bagging tool end to end.

    Steps: print a banner, parse the command line, optionally load a JSON
    tag configuration, turn ``args.path`` into a bag with md5 and sha256
    checksums, apply the configured tags, and save the bag.

    The configuration maps tag file names to objects of tag/value pairs.
    Entries under ``bag-info.txt`` are merged into the bag's info; every
    other entry is written as a ``key: value`` text file in the bag root.

    Any exception is reported on stderr and converted to exit status 1.
    """
    try:
        print("\n================")
        print("| |")
        print("| Bagging Tool |")
        print("| |")
        print("================\n")

        # (1) Parse args
        args = parse_args()
        print("Running with the following arguments:")
        settings = vars(args)
        width = max(len(k) for k in settings)
        for k, v in settings.items():
            print(f" {k:>{width}} : {v}")

        # (2) Read configuration
        # Default to an empty mapping so that running without --config
        # still produces a bag instead of failing on an unbound name.
        config = {}
        if args.config:
            with open(args.config) as handle:
                config = json.load(handle)
            # Reject a malformed config now: make_bag() restructures the
            # directory in place, so failing afterwards would leave a
            # bag behind with only some of its tags applied.
            if not isinstance(config, dict) or not all(
                isinstance(tags, dict) for tags in config.values()
            ):
                raise ValueError(
                    "Config must be a JSON object mapping tag file names "
                    "to objects of tag/value pairs"
                )

        # (3) Create bag
        bag = bagit.make_bag(args.path, checksums=['md5', 'sha256'])
        for tagfile, tags in config.items():
            if tagfile == 'bag-info.txt':
                for k, v in tags.items():
                    bag.info[k] = v
            else:
                tagfilepath = os.path.join(args.path, tagfile)
                with open(tagfilepath, 'w') as handle:
                    for k, v in tags.items():
                        handle.write(f"{k}: {v}\n")
        # Save once after all tags are in place.
        bag.save()

        # (4) Summarize results
        print("Bagging complete.")

    except Exception as err:
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)


# Run the bagging tool only when this module is executed as a script.
if __name__ == "__main__":
    main()
127 changes: 127 additions & 0 deletions partition/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python3

from .classes import FileSet
from .exceptions import ConfigError, DuplicateFileError
import argparse
import os
import shutil
import sys


def parse_args():
    """Build the partition tool's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace carrying ``source`` and ``destination``
        (required positionals) and ``mode`` (one of 'copy', 'move',
        'dryrun'; defaults to 'dryrun').
    """
    cli = argparse.ArgumentParser(
        description='Partition a tree of files based on various schemes',
    )

    # 'store' is argparse's default action, so it is not spelled out here.
    cli.add_argument('source', help='Root directory to be partitioned')
    cli.add_argument(
        'destination',
        help='Output directory (if exists, must be empty)',
    )
    cli.add_argument(
        '-m', '--mode',
        default='dryrun',
        choices=['copy', 'move', 'dryrun'],
        help='Dryrun, move, or copy files to destination',
    )
    cli.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s 0.1',
        help='Print version number and exit',
    )

    return cli.parse_args()


def check_args(args):
    """Validate the parsed command-line arguments before any work is done.

    Args:
        args: Namespace with ``source`` and ``destination`` path attributes.

    Raises:
        ConfigError: if the source is not an existing directory, if the
            destination exists but is not a directory, or if the
            destination directory already contains entries.
    """
    if not os.path.isdir(args.source):
        raise ConfigError("Source directory not found")

    if os.path.isdir(args.destination):
        if os.listdir(args.destination):
            raise ConfigError("Destination directory is not empty")
    elif os.path.exists(args.destination):
        # Something other than a directory already occupies the destination
        # path; report it here rather than failing later when the
        # partition subdirectories are created.
        raise ConfigError("Destination exists but is not a directory")


def has_duplicates(mapping):
    """Find destinations that more than one source maps to.

    Args:
        mapping: dict of source path -> destination path.

    Returns:
        A list of tuples, one per contested destination, each holding the
        source paths that share it; ``False`` when every destination is
        unique.
    """
    sources_by_dest = {}
    for src, dest in mapping.items():
        sources_by_dest.setdefault(dest, []).append(src)

    collisions = [
        tuple(sources)
        for sources in sources_by_dest.values()
        if len(sources) > 1
    ]
    # An empty list is falsy, so this yields False when nothing collides.
    return collisions or False


def main():
    """Run the partition tool end to end.

    Steps: print a banner, parse and validate the command line, scan the
    source tree into a FileSet, map every file to its partitioned
    destination, refuse to continue if two files share a destination, and
    then copy, move, or merely list the files according to ``args.mode``.

    Any exception is reported on stderr and converted to exit status 1.
    """
    try:
        for banner_line in (
            "\n==================",
            "| |",
            "| Partition Tool |",
            "| |",
            "==================\n",
        ):
            print(banner_line)

        # (1) Parse args
        args = parse_args()

        # (2) Validate the provided arguments
        check_args(args)
        print("Running with the following arguments:")
        settings = vars(args)
        width = max(len(name) for name in settings)
        for name, value in settings.items():
            print(f" {name:>{width}} : {value}")

        # (3) Create FileSet
        fileset = FileSet.from_filesystem(args.source)
        gib = round(fileset.bytes / 2**30, 2)
        print(f"\nAnalyzing files: {len(fileset)} files, {gib} GiB")

        # (4) Create partition map
        print("Creating mapping to partitioned tree...")
        pattern = r"^([a-z]+?)-(\d+?)-\d+?\.\w+?$"
        mapping = fileset.partition_by(pattern, args.destination)

        # (5) Check for duplicate files
        duplicates = has_duplicates(mapping)
        if duplicates:
            raise DuplicateFileError(f"Duplicate filenames detected: {duplicates}")
        print("Destination paths are all confirmed to be unique...")

        # (6) Move, copy, or print
        print(f"Partitioning files ({args.mode} mode)...")
        for n, (source, destination) in enumerate(mapping.items(), 1):
            print(f" {n}. {source} -> {destination}")
            if args.mode == 'dryrun':
                # Dry runs only list the planned transfers.
                continue
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            if args.mode == 'copy':
                shutil.copyfile(source, destination)
            elif args.mode == 'move':
                shutil.move(source, destination)

        # (7) Summarize results
        print("Partitioning complete.")

    except Exception as err:
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)


# Run the partition tool only when this module is executed as a script.
if __name__ == "__main__":
    main()
52 changes: 52 additions & 0 deletions partition/classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from collections import namedtuple, UserDict
import csv
import os
import re


# Immutable record for a single file: its base name, its MD5 digest
# (may be None when not known), and its size in bytes.
Asset = namedtuple('Asset', ['filename', 'md5', 'bytes'])


class FileSet(UserDict):
    """Dictionary of file path -> Asset describing a collection of files."""

    def __init__(self, data=None):
        """Create a file set from an optional path -> Asset mapping.

        ``data`` is optional so that an empty set can be built directly and
        so that inherited UserDict helpers which call ``cls()`` with no
        arguments (such as ``fromkeys``) keep working.
        """
        super().__init__(data)

    @classmethod
    def from_csv(cls, csvfile):
        """Build a file set from a CSV inventory.

        The file must have a header row with the columns PATH, FILENAME,
        MD5 and BYTES; BYTES is converted to ``int``.
        """
        data = dict()
        # newline='' lets the csv module do its own newline handling, as
        # its documentation requires for correct parsing of quoted fields.
        with open(csvfile, newline='') as handle:
            for row in csv.DictReader(handle):
                data[row['PATH']] = Asset(
                    row['FILENAME'], row['MD5'], int(row['BYTES'])
                )
        return cls(data)

    @classmethod
    def from_filesystem(cls, root):
        """Build a file set by walking the directory tree under *root*.

        Files whose names start with "." are skipped. No checksum is
        computed, so each Asset's ``md5`` is ``None``.
        """
        data = dict()
        for directory, _subdirs, files in os.walk(root):
            for filename in files:
                if filename.startswith("."):
                    continue
                filepath = os.path.join(directory, filename)
                size = os.path.getsize(filepath)
                data[filepath] = Asset(filename, None, size)
        return cls(data)

    @property
    def bytes(self):
        """Total size in bytes of all assets in the set."""
        return sum(asset.bytes for asset in self.data.values())

    def __repr__(self):
        return f"<FileSet containing {len(self)} assets, {self.bytes} bytes>"

    def partition_by(self, pattern, destination):
        """Map each source path to its location in the partitioned tree.

        Args:
            pattern: regular expression matched against each asset's
                filename; its first two groups, joined by "-", name the
                partition directory.
            destination: root directory of the partitioned tree.

        Returns:
            dict of source path -> destination path. Files whose names do
            not match *pattern* are placed in an "extra" directory.
        """
        regex = re.compile(pattern)  # resolve the pattern once, not per file
        mapping = dict()
        for path, asset in self.items():
            m = regex.match(asset.filename)
            if m:
                dest_dir = f"{m.group(1)}-{m.group(2)}"
            else:
                dest_dir = "extra"
            mapping[path] = os.path.join(destination, dest_dir, asset.filename)
        return mapping
14 changes: 14 additions & 0 deletions partition/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class ConfigError(Exception):
    """Raised when the supplied arguments are invalid."""

    def __init__(self, message):
        # Initialise the base exception first, then keep the text on a
        # ``message`` attribute for convenient access.
        super().__init__(message)
        self.message = message


class DuplicateFileError(Exception):
    """Raised when repeated filenames are encountered."""

    def __init__(self, message):
        # Initialise the base exception first, then keep the text on a
        # ``message`` attribute for convenient access.
        super().__init__(message)
        self.message = message

0 comments on commit 727aabb

Please sign in to comment.