Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Harmony 1714 - Disambiguate files when downloading files that have the same name from the same job #82

Merged
merged 8 commits into from
Mar 26, 2024
49 changes: 47 additions & 2 deletions harmony/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from tabnanny import check
import time
import platform
from uuid import UUID
from requests import Response
from requests.exceptions import JSONDecodeError
import requests.models
Expand Down Expand Up @@ -1078,13 +1079,57 @@ def result_urls(self,
if link['rel'] == 'data':
yield link['href']

def _is_staged_result(self, url: str) -> str:
"""Check if the URL indicates that the data is associated with actual
service ouputs (as opposed to a download link for example).

Args:
url: The location (URL) of the file to be downloaded

Returns:
A boolean indicating whether the data is staged data.
"""
url_parts = url.split('/')
possible_uuid = url_parts[-3]
possible_item_id = url_parts[-2]
try:
uuid_obj = UUID(possible_uuid, version=4)
except ValueError:
return False
if str(uuid_obj) != possible_uuid:
return False
if not possible_item_id.isnumeric():
return False
return True

def get_download_filename_from_url(self, url: str) -> str:
    """Return the filename that a download of the given URL will be saved as.

    The filename is the last '/'-separated part of the URL. When the URL is
    recognized as staged data, the part immediately before the filename (the
    Harmony generated item ID) is prepended, joined by an underscore.

    Args:
        url: The location (URL) of the file to be downloaded

    Returns:
        The filename that will be used to name the downloaded file.
    """
    segments = url.split('/')
    basename = segments[-1]

    # Only staged results get the item ID prefix; the ID is read after the
    # staged check has confirmed the URL layout.
    if self._is_staged_result(url):
        return f'{segments[-2]}_{basename}'
    return basename

def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
"""Downloads data, saves it to a file, and returns the filename.

Performance should be close to native with an appropriate chunk size. This can be changed
via environment variable DOWNLOAD_CHUNK_SIZE.

Filenames are automatically determined by using the latter portion of the provided URL.
Filenames are automatically determined by using the latter portion of the provided URL
and will be prefixed by the item id generated by Harmony when data was transformed
from the original.

Args:
url: The location (URL) of the file to be downloaded
Expand All @@ -1099,7 +1144,7 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
"""
chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
session = self._session()
filename = url.split('/')[-1]
filename = self.get_download_filename_from_url(url)

if directory:
filename = os.path.join(directory, filename)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,18 @@ def side_effect_for_get_json(extra_links) -> List[str]:

return [status_running1, status_running2, status_paused, status_resumed, status_successful, status_successful]

def test_get_file_name_staged_link():
    # A staged result URL must yield a filename prefixed with the work item id
    # so that identically named outputs from one job do not collide.
    harmony_client = Client(should_validate_auth=False)
    staged_url = 'https://harmony.earthdata.nasa.gov/service-results/staging-bucket/a7aee059-7531-4388-86e0-85af1de9c31a/1047412/C1254854453-LARC_CLOUD_merged.nc4'
    expected_file_name = '1047412_C1254854453-LARC_CLOUD_merged.nc4'
    assert harmony_client.get_download_filename_from_url(staged_url) == expected_file_name

def test_get_file_name_non_staged_link():
    # A non-staged URL (e.g. a direct download data link) must yield just the
    # final part of the URL path, with no prefix added.
    harmony_client = Client(should_validate_auth=False)
    direct_url = 'https://harmony.earthdata.nasa.gov/service-results/test-data/C1261703151-EEDTEST/ATL08_20181014001049_02350102_006_02.h5'
    expected_file_name = 'ATL08_20181014001049_02350102_006_02.h5'
    assert harmony_client.get_download_filename_from_url(direct_url) == expected_file_name

@pytest.mark.parametrize('link_type', [LinkType.http, LinkType.https, LinkType.s3])
def test_iterator(link_type, mocker):
extra_links = extra_links_for_iteration(link_type.value)
Expand Down
Loading