Skip to content

Commit 8d778c8

Browse files
authored
Merge pull request #82 from nasa/harmony-1714
Harmony 1714 - Disambiguate files when downloading files that have the same name from the same job
2 parents 5d35a0d + df60c5a commit 8d778c8

File tree

2 files changed

+59
-2
lines changed

2 files changed

+59
-2
lines changed

harmony/harmony.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from tabnanny import check
2727
import time
2828
import platform
29+
from uuid import UUID
2930
from requests import Response
3031
from requests.exceptions import JSONDecodeError
3132
import requests.models
@@ -1078,13 +1079,57 @@ def result_urls(self,
10781079
if link['rel'] == 'data':
10791080
yield link['href']
10801081

1082+
def _is_staged_result(self, url: str) -> str:
1083+
"""Check if the URL indicates that the data is associated with actual
1084+
service ouputs (as opposed to a download link for example).
1085+
1086+
Args:
1087+
url: The location (URL) of the file to be downloaded
1088+
1089+
Returns:
1090+
A boolean indicating whether the data is staged data.
1091+
"""
1092+
url_parts = url.split('/')
1093+
possible_uuid = url_parts[-3]
1094+
possible_item_id = url_parts[-2]
1095+
try:
1096+
uuid_obj = UUID(possible_uuid, version=4)
1097+
except ValueError:
1098+
return False
1099+
if str(uuid_obj) != possible_uuid:
1100+
return False
1101+
if not possible_item_id.isnumeric():
1102+
return False
1103+
return True
1104+
1105+
def get_download_filename_from_url(self, url: str) -> str:
    """Work out the local filename to use when downloading ``url``.

    The name is the final path segment of the URL. When the URL is
    recognised as a staged result, the preceding path segment (the item id)
    is prepended with an underscore so that identically named outputs do
    not collide.

    Args:
        url: The location (URL) of the file to be downloaded

    Returns:
        The filename that will be used to name the downloaded file.
    """
    segments = url.split('/')
    basename = segments[-1]

    if self._is_staged_result(url):
        # Staged outputs are disambiguated by their item id segment.
        return f'{segments[-2]}_{basename}'
    return basename
1123+
10811124
def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
10821125
"""Downloads data, saves it to a file, and returns the filename.
10831126
10841127
Performance should be close to native with an appropriate chunk size. This can be changed
10851128
via environment variable DOWNLOAD_CHUNK_SIZE.
10861129
1087-
Filenames are automatically determined by using the latter portion of the provided URL.
1130+
Filenames are automatically determined by using the latter portion of the provided URL
1131+
and will be prefixed by the item id generated by Harmony when data was transformed
1132+
from the original.
10881133
10891134
Args:
10901135
url: The location (URL) of the file to be downloaded
@@ -1099,7 +1144,7 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
10991144
"""
11001145
chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
11011146
session = self._session()
1102-
filename = url.split('/')[-1]
1147+
filename = self.get_download_filename_from_url(url)
11031148

11041149
if directory:
11051150
filename = os.path.join(directory, filename)

tests/test_client.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,6 +1162,18 @@ def side_effect_for_get_json(extra_links) -> List[str]:
11621162

11631163
return [status_running1, status_running2, status_paused, status_resumed, status_successful, status_successful]
11641164

1165+
def test_get_file_name_staged_link():
    # A staged-result URL must yield the work item id joined to the original
    # filename, so that same-named outputs do not collide on disk.
    staged_url = 'https://harmony.earthdata.nasa.gov/service-results/staging-bucket/a7aee059-7531-4388-86e0-85af1de9c31a/1047412/C1254854453-LARC_CLOUD_merged.nc4'
    harmony_client = Client(should_validate_auth=False)
    assert harmony_client.get_download_filename_from_url(staged_url) == '1047412_C1254854453-LARC_CLOUD_merged.nc4'
1170+
1171+
def test_get_file_name_non_staged_link():
    # A direct download link (not a staged result) keeps the last URL path
    # segment as the filename, with no prefix added.
    direct_url = 'https://harmony.earthdata.nasa.gov/service-results/test-data/C1261703151-EEDTEST/ATL08_20181014001049_02350102_006_02.h5'
    harmony_client = Client(should_validate_auth=False)
    assert harmony_client.get_download_filename_from_url(direct_url) == 'ATL08_20181014001049_02350102_006_02.h5'
1176+
11651177
@pytest.mark.parametrize('link_type', [LinkType.http, LinkType.https, LinkType.s3])
11661178
def test_iterator(link_type, mocker):
11671179
extra_links = extra_links_for_iteration(link_type.value)

0 commit comments

Comments
 (0)