From 4afc37d4230412d2c01e61aa708ad877d816c768 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 6 Nov 2024 11:26:16 +0100 Subject: [PATCH] changed library for downloads --- notebooks/beginners_kit.ipynb | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/notebooks/beginners_kit.ipynb b/notebooks/beginners_kit.ipynb index 5a17a83..01fc148 100644 --- a/notebooks/beginners_kit.ipynb +++ b/notebooks/beginners_kit.ipynb @@ -28,7 +28,7 @@ "outputs": [], "source": [ "import os\n", - "from urllib.request import urlretrieve\n", + "import requests\n", "from urllib.parse import urlsplit\n", "import tarfile\n", "from pathlib import Path\n", @@ -47,7 +47,7 @@ "\n", "\n", "\n", - "def download_tar(url, path):\n", + "def download_and_extract(url, path):\n", " tar_name = urlsplit(url).path.split('/')[-1] # publication.tar\n", " tar_path = os.path.join(path, tar_name) # data/raw/publication.tar\n", " untarred_folder = tar_name.split('.')[0] # publication\n", @@ -55,7 +55,15 @@ " if not os.path.exists(untarred_path):\n", " if not os.path.exists(tar_path):\n", " print(f\"downloading ${url}\")\n", - " urlretrieve(url, tar_path)\n", + " # urlretrieve(url, tar_path)\n", + " try:\n", + " with requests.get(url, stream=True) as response:\n", + " response.raise_for_status()\n", + " with open(tar_path, 'wb') as f:\n", + " for chunk in response.iter_content(chunk_size=8192):\n", + " f.write(chunk)\n", + " except requests.exceptions.RequestException as e:\n", + " print(\"Error downloading the file:\", e)\n", "\n", " print(f\"untar ${tar_name}\")\n", " with tarfile.open(tar_path, \"r\") as tar:\n", @@ -65,10 +73,9 @@ " os.remove(tar_path)\n", "\n", "\n", - "\"\"\" Downloads data into /data/raw\n", - "\"\"\"\n", + "# Download data into /data/raw\n", "for tar in openaire_files:\n", - " download_tar(tar, \"/app/openaire/data/raw\")" + " download_and_extract(tar, \"/app/openaire/data/raw\")" ] }, {