forked from jwestgard/preserve
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from jwestgard/checkbag
add bag and partition folders
- Loading branch information
Showing
4 changed files
with
275 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
import bagit | ||
import json | ||
import os | ||
import sys | ||
|
||
|
||
def parse_args():
    """Define the bagging tool's command-line interface and parse it.

    Returns the ``argparse.Namespace`` produced from the process arguments,
    carrying ``path`` (positional) and ``config`` (optional).
    """
    cli = argparse.ArgumentParser(
        description='Create APTrust-compatible BagIt bags'
    )

    # (flags, options) pairs, registered in the order shown by --help.
    argument_specs = (
        (
            ('path',),
            {'help': 'Root directory to be bagged', 'action': 'store'},
        ),
        (
            ('-c', '--config'),
            {
                'help': 'Path to a config file containing tag info',
                'action': 'store',
            },
        ),
        (
            ('-v', '--version'),
            {
                'action': 'version',
                'help': 'Print version number and exit',
                'version': '%(prog)s 0.1',
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
|
||
|
||
def main():
    """Run the bagging tool: parse arguments, load tag config, create the bag.

    The optional JSON config maps a tag file name to a dict of tag name ->
    value. Entries under ``bag-info.txt`` are merged into the bag's info;
    every other entry is written as a ``key: value`` tag file in the bag
    root. Any exception is reported on stderr and the process exits with
    status 1.
    """
    try:
        print("\n================")
        print("| |")
        print("| Bagging Tool |")
        print("| |")
        print("================\n")

        # (1) Parse args
        args = parse_args()
        print("Running with the following arguments:")
        arg_values = vars(args)
        width = max(len(k) for k in arg_values)
        for k, v in arg_values.items():
            print(f" {k:>{width}} : {v}")

        # (2) Read configuration
        # Default to no extra tags so the tool also works without --config;
        # previously `config` was unbound in that case and bagging aborted.
        config = {}
        if args.config:
            # JSON text is UTF-8; do not depend on the locale's encoding.
            with open(args.config, encoding="utf-8") as handle:
                config = json.load(handle)

        # (3) Create bag
        bag = bagit.make_bag(args.path, checksums=['md5', 'sha256'])
        for tagfile, tags in config.items():
            if tagfile == 'bag-info.txt':
                for k, v in tags.items():
                    bag.info[k] = v
            else:
                tagfilepath = os.path.join(args.path, tagfile)
                # Tag files are written as UTF-8 to match the encoding
                # declared in the bag's bagit.txt.
                with open(tagfilepath, 'w', encoding="utf-8") as handle:
                    for k, v in tags.items():
                        handle.write(f"{k}: {v}\n")
        # Save after all tag changes so they are recorded in the bag.
        bag.save()

        # (4) Summarize results
        print("Bagging complete.")

    except Exception as err:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from .classes import FileSet | ||
from .exceptions import ConfigError, DuplicateFileError | ||
import argparse | ||
import os | ||
import shutil | ||
import sys | ||
|
||
|
||
def parse_args():
    """Define the partition tool's command-line interface and parse it.

    Returns the ``argparse.Namespace`` produced from the process arguments,
    carrying ``source``, ``destination`` and ``mode`` (defaults to
    ``'dryrun'``).
    """
    cli = argparse.ArgumentParser(
        description='Partition a tree of files based on various schemes'
    )

    # (flags, options) pairs, registered in the order shown by --help.
    argument_specs = (
        (
            ('source',),
            {'help': 'Root directory to be partitioned', 'action': 'store'},
        ),
        (
            ('destination',),
            {
                'help': 'Output directory (if exists, must be empty)',
                'action': 'store',
            },
        ),
        (
            ('-m', '--mode'),
            {
                'choices': ['copy', 'move', 'dryrun'],
                'help': 'Dryrun, move, or copy files to destination',
                'action': 'store',
                'default': 'dryrun',
            },
        ),
        (
            ('-v', '--version'),
            {
                'action': 'version',
                'help': 'Print version number and exit',
                'version': '%(prog)s 0.1',
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
|
||
|
||
def check_args(args):
    """Validate the parsed arguments before any work is done.

    Checks that ``args.source`` is an existing directory and that
    ``args.destination`` is either absent or an empty directory.

    Raises:
        ConfigError: if the source directory is missing, the destination
            directory contains entries, or the destination exists but is
            not a directory.
    """
    if not os.path.isdir(args.source):
        raise ConfigError("Source directory not found")

    if os.path.isdir(args.destination):
        if os.listdir(args.destination):
            raise ConfigError("Destination directory is not empty")
    elif os.path.exists(args.destination):
        # A regular file (or other non-directory) at the destination path
        # would otherwise only fail later, when subdirectories are created.
        raise ConfigError("Destination exists but is not a directory")
|
||
|
||
def has_duplicates(mapping):
    """Find destinations that more than one source maps to.

    ``mapping`` is a dict of source -> destination. Returns a list with one
    tuple of sources per contested destination, or ``False`` when every
    destination is unique.
    """
    sources_by_destination = {}
    for src, dest in mapping.items():
        if dest in sources_by_destination:
            sources_by_destination[dest].append(src)
        else:
            sources_by_destination[dest] = [src]

    collisions = [
        tuple(sources)
        for sources in sources_by_destination.values()
        if len(sources) > 1
    ]
    # An empty list is replaced by False, matching the documented return.
    return collisions or False
|
||
|
||
def main():
    """Run the partition tool end to end.

    Parses and validates the arguments, scans the source tree, maps each
    file to a destination path, refuses to continue if two files would land
    on the same destination, then copies, moves, or merely lists the files
    depending on the selected mode. Any exception is reported on stderr and
    the process exits with status 1.
    """
    try:
        for banner_line in (
            "\n==================",
            "| |",
            "| Partition Tool |",
            "| |",
            "==================\n",
        ):
            print(banner_line)

        # (1) Parse args
        args = parse_args()

        # (2) Validate the provided arguments
        check_args(args)
        print("Running with the following arguments:")
        settings = vars(args)
        width = max(len(name) for name in settings)
        for name, value in settings.items():
            print(f" {name:>{width}} : {value}")

        # (3) Create FileSet
        fileset = FileSet.from_filesystem(args.source)
        gib = round(fileset.bytes / 2**30, 2)
        print(f"\nAnalyzing files: {len(fileset)} files, {gib} GiB")

        # (4) Create partition map
        print("Creating mapping to partitioned tree...")
        pattern = r"^([a-z]+?)-(\d+?)-\d+?\.\w+?$"
        mapping = fileset.partition_by(pattern, args.destination)

        # (5) Check for duplicate files
        duplicates = has_duplicates(mapping)
        if duplicates:
            raise DuplicateFileError(f"Duplicate filenames detected: {duplicates}")
        print("Destination paths are all confirmed to be unique...")

        # (6) Move, copy, or print
        print(f"Partitioning files ({args.mode} mode)...")
        transfer = {'copy': shutil.copyfile, 'move': shutil.move}.get(args.mode)
        for n, (source, destination) in enumerate(mapping.items(), 1):
            print(f" {n}. {source} -> {destination}")
            if args.mode == 'dryrun':
                # Dry run only lists the planned transfers.
                continue
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            if transfer is not None:
                transfer(source, destination)

        # (7) Summarize results
        print("Partitioning complete.")

    except Exception as err:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from collections import namedtuple, UserDict | ||
import csv | ||
import os | ||
import re | ||
|
||
|
||
# One file's record: its base name, MD5 digest (may be None), and byte size.
Asset = namedtuple('Asset', 'filename md5 bytes')


class FileSet(UserDict):
    """Dictionary of file path -> ``Asset`` with helpers for partitioning."""

    def __init__(self, data):
        """Initialise the set from a mapping of path -> ``Asset``."""
        super().__init__(data)

    @classmethod
    def from_csv(cls, csvfile):
        """Build a FileSet from a CSV inventory.

        The CSV must have ``PATH``, ``FILENAME``, ``MD5`` and ``BYTES``
        columns; ``BYTES`` is converted to ``int``.
        """
        data = dict()
        # newline='' lets the csv module do its own line handling, so
        # newlines embedded in quoted fields are read back unchanged.
        with open(csvfile, newline='') as handle:
            for row in csv.DictReader(handle):
                data[row['PATH']] = Asset(
                    row['FILENAME'], row['MD5'], int(row['BYTES'])
                )
        return cls(data)

    @classmethod
    def from_filesystem(cls, root):
        """Build a FileSet by walking ``root``.

        Files whose name starts with "." are skipped. No checksum is
        computed, so each asset's ``md5`` is ``None``.
        """
        data = dict()
        for directory, _subdirs, files in os.walk(root):
            for filename in files:
                if filename.startswith("."):
                    continue
                filepath = os.path.join(directory, filename)
                size = os.path.getsize(filepath)
                data[filepath] = Asset(filename, None, size)
        return cls(data)

    @property
    def bytes(self):
        """Total size in bytes of all assets in the set."""
        return sum(asset.bytes for asset in self.data.values())

    def __repr__(self):
        return f"<FileSet containing {len(self)} assets, {self.bytes} bytes>"

    def partition_by(self, pattern, destination):
        """Map every source path to a destination path under ``destination``.

        ``pattern`` is matched against each asset's filename. On a match the
        file goes into a subdirectory named ``<group 1>-<group 2>``;
        otherwise it goes into ``extra``. Returns a dict of source path ->
        destination path.
        """
        # Compile once rather than looking the pattern up for every file.
        matcher = re.compile(pattern)
        mapping = dict()
        for path, asset in self.items():
            m = matcher.match(asset.filename)
            if m:
                dest_dir = f"{m.group(1)}-{m.group(2)}"
            else:
                dest_dir = "extra"
            mapping[path] = os.path.join(destination, dest_dir, asset.filename)
        return mapping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
class ConfigError(Exception):
    """Raised when the supplied arguments are invalid."""

    def __init__(self, message):
        super().__init__(message)
        # Keep the text available as ``.message`` in addition to ``.args``.
        self.message = message
|
||
|
||
class DuplicateFileError(Exception):
    """Raised when repeated filenames are encountered."""

    def __init__(self, message):
        super().__init__(message)
        # Keep the text available as ``.message`` in addition to ``.args``.
        self.message = message