diff --git a/ipwb/indexer.py b/ipwb/indexer.py index ecb904be..7d0a9c3a 100755 --- a/ipwb/indexer.py +++ b/ipwb/indexer.py @@ -34,6 +34,7 @@ from ipwb.util import iso8601_to_digits14, ipfs_client from ipwb.util import is_wacz, extract_warcs_from_wacz +from ipwb.util import cleanup_warc_files_extracted_from_wacz import requests import datetime @@ -126,9 +127,11 @@ def index_file_at(warc_paths, encryption_key=None, warc_paths_to_append = [] warc_paths_to_remove = [] + warcs_to_cleanup_post_indexing = [] for warc_path in warc_paths: if is_wacz(warc_path): warc_paths_to_append += extract_warcs_from_wacz(warc_path) + warcs_to_cleanup_post_indexing = warc_paths_to_append warc_paths_to_remove.append(warc_path) # Manipulate list of WARCs extracted from WACZ @@ -184,6 +187,8 @@ def index_file_at(warc_paths, encryption_key=None, cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines) cdxj_lines = cdxj_metadata_lines + cdxj_lines + cleanup_warc_files_extracted_from_wacz(warcs_to_cleanup_post_indexing) + if quiet: return cdxj_lines diff --git a/ipwb/util.py b/ipwb/util.py index 98c0f7bf..ede08474 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -365,3 +365,13 @@ def extract_warcs_from_wacz(wacz_path): extract_warcs_to_disk(wacz_path, warc_paths) return glob.glob('archive/*') + + +def cleanup_warc_files_extracted_from_wacz(warc_paths): + for temporary_warc in warc_paths: + try: + if os.path.isfile(temporary_warc): + print(f'Deleting temporary WARC at {temporary_warc}') + os.remove(temporary_warc) + except OSError as e: + print(f'Error: {e.filename}, {e.strerror}')