Skip to content

Commit

Permalink
Add preliminary support for ingesting WACZ files
Browse files Browse the repository at this point in the history
Re:#710
  • Loading branch information
machawk1 committed May 17, 2022
1 parent 045dfaf commit 779978a
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
13 changes: 13 additions & 0 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from six import PY3

from ipwb.util import iso8601_to_digits14, ipfs_client
from ipwb.util import is_wacz, extract_warcs_from_wacz

import requests
import datetime
Expand Down Expand Up @@ -123,6 +124,18 @@ def index_file_at(warc_paths, encryption_key=None,
for warc_path in warc_paths:
verify_file_exists(warc_path)

warc_paths_to_append = []
warc_paths_to_remove = []
for warc_path in warc_paths:
if is_wacz(warc_path):
warc_paths_to_append.append(
extract_warcs_from_wacz(warc_path))
warc_paths_to_remove.append(warc_path)

# Manipulate list of WARCs extracted from WACZ
warc_paths.remove(warc_paths_to_remove)
warc_paths += warc_paths_to_append

cdxj_lines = []

if outfile:
Expand Down
29 changes: 29 additions & 0 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
import logging
import platform

# For extracting WARCs from WACZ
import glob
from zipfile import ZipFile, is_zipfile

from urllib.request import urlopen
from urllib.error import URLError

Expand Down Expand Up @@ -336,3 +340,28 @@ def check_for_update(_):
print("The installed version of ipwb is outdated.")
print(f"* Installed: {current}\n* Latest: {latest}")
print("Please run `pip install --upgrade ipwb` to upgrade.")


def is_wacz(path):
    """Report whether *path* looks like a WACZ file.

    A WACZ is a ZIP archive that carries a ``datapackage.json`` manifest at
    its root, so a file is accepted only when it is a readable ZIP and that
    manifest entry is present. This is a lightweight structural check, not
    full validation: the py-wacz validator pulls in many dependencies, so it
    is deliberately not used here.

    Args:
        path: Filesystem path of the candidate file.

    Returns:
        True when the file is a ZIP archive containing ``datapackage.json``,
        False otherwise (including when the file is missing or unreadable).
    """
    from zipfile import BadZipFile

    if not is_zipfile(path):
        return False

    try:
        with ZipFile(path) as z:
            return 'datapackage.json' in z.namelist()
    except (BadZipFile, OSError):
        # is_zipfile() only looks for the end-of-central-directory record;
        # a damaged archive can still fail to open, and is not a usable WACZ.
        return False


def get_warc_paths_in_wacz(wacz_path):
    """List the files stored under ``archive/`` inside a WACZ.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file to inspect.

    Returns:
        Member names, in archive order, of every file entry whose name starts
        with ``archive/``. Directory entries (such as ``archive/`` itself)
        are omitted because they are not extractable files.
    """
    with ZipFile(wacz_path) as z:
        return [
            entry.filename
            for entry in z.infolist()
            if entry.filename.startswith('archive/') and not entry.is_dir()
        ]


def extract_warcs_to_disk(wac_paths, wacz_path=None):
    """Extract the named members of a WACZ into the current working directory.

    Args:
        wac_paths: Member names inside the WACZ to extract. The parameter
            keeps its original spelling so the signature stays compatible.
        wacz_path: Filesystem path of the WACZ (ZIP) file that holds the
            members. Required; it defaults to None only so the original
            one-argument signature is still accepted.

    Returns:
        Paths of the files written, relative to the current working
        directory, in the order given by ``wac_paths``. Directory entries
        (names ending in ``/``) are skipped.

    Raises:
        ValueError: If ``wacz_path`` is not supplied.
    """
    import os

    if wacz_path is None:
        raise ValueError('wacz_path is required to extract WARCs from a WACZ')

    extracted = []
    # Open the archive once rather than once per member.
    with ZipFile(wacz_path) as z:
        for member in wac_paths:
            if member.endswith('/'):
                continue
            # extract() returns the normalized path it actually wrote to.
            extracted.append(os.path.relpath(z.extract(member)))
    return extracted


def extract_warcs_from_wacz(wacz_path):
    """Unpack the ``archive/`` contents of a WACZ and return the new paths.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file.

    Returns:
        Paths, relative to the current working directory, of the files that
        were extracted from this WACZ. Files that already existed on disk
        are not included.
    """
    warc_paths = get_warc_paths_in_wacz(wacz_path)
    return extract_warcs_to_disk(warc_paths, wacz_path)

0 comments on commit 779978a

Please sign in to comment.