Skip to content

Commit

Permalink
Tweak
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 19, 2024
1 parent 4dbeea7 commit fc56191
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
21 changes: 16 additions & 5 deletions newshomepages/extract/consolidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path

import click
import requests
from retry import retry
from rich import print
from rich.progress import track
Expand Down Expand Up @@ -173,10 +174,20 @@ def consolidate(
zip_path.unlink()


@retry(tries=3, delay=180, backoff=2)
@retry(tries=3, delay=300)
def _get_zip_archive(output_dir: Path) -> zipfile.ZipFile:
print("⬇️ Downloading latest data")
zip_url = "https://archive.org/compress/latest-homepages/formats=JSON,JPEG,ITEM%20TILE,ARCHIVE%20BITTORRENT,METADATA"
zip_path = output_dir / "latest.zip"
utils.download_url(zip_url, zip_path)
return zipfile.ZipFile(zip_path)
url = "https://archive.org/compress/latest-homepages/formats=JSON,JPEG,ITEM%20TILE,ARCHIVE%20BITTORRENT,METADATA"
output_path = output_dir / "latest.zip"
timeout = 60 * 10 # 10 minutes
_download_url(url, output_path, timeout)
return zipfile.ZipFile(output_path)


def _download_url(url: str, output_path: Path, timeout: int) -> None:
    """Download the provided URL to the provided path.

    The response body is streamed to a temporary sibling file and only moved
    to ``output_path`` once every chunk has been written, so an interrupted
    transfer never leaves a truncated file at the final location.

    Args:
        url: The address to fetch.
        output_path: Where the downloaded bytes are saved.
        timeout: Value handed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Only reached when the whole body was written without an exception.
        partial_path.replace(output_path)
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure this removes the incomplete download.
        if partial_path.exists():
            partial_path.unlink()
2 changes: 1 addition & 1 deletion newshomepages/extract/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def _get_json_url(url):
return pd.read_json(output_path)
else:
# Get the URL
data = utils.get_json_url(url)
data = utils.get_json_url(url, timeout=60, verbose=True)

# Parse as a dataframe
df = pd.DataFrame(data)
Expand Down

0 comments on commit fc56191

Please sign in to comment.