26
26
from tabnanny import check
27
27
import time
28
28
import platform
29
+ from uuid import UUID
29
30
from requests import Response
30
31
from requests .exceptions import JSONDecodeError
31
32
import requests .models
@@ -1078,13 +1079,57 @@ def result_urls(self,
1078
1079
if link ['rel' ] == 'data' :
1079
1080
yield link ['href' ]
1080
1081
1082
+ def _is_staged_result (self , url : str ) -> str :
1083
+ """Check if the URL indicates that the data is associated with actual
1084
+ service ouputs (as opposed to a download link for example).
1085
+
1086
+ Args:
1087
+ url: The location (URL) of the file to be downloaded
1088
+
1089
+ Returns:
1090
+ A boolean indicating whether the data is staged data.
1091
+ """
1092
+ url_parts = url .split ('/' )
1093
+ possible_uuid = url_parts [- 3 ]
1094
+ possible_item_id = url_parts [- 2 ]
1095
+ try :
1096
+ uuid_obj = UUID (possible_uuid , version = 4 )
1097
+ except ValueError :
1098
+ return False
1099
+ if str (uuid_obj ) != possible_uuid :
1100
+ return False
1101
+ if not possible_item_id .isnumeric ():
1102
+ return False
1103
+ return True
1104
+
1105
def get_download_filename_from_url(self, url: str) -> str:
    """For a given URL, returns the filename that will be used for download.
    It will include a Harmony generated ID prefix if the data is staged.

    Args:
        url: The location (URL) of the file to be downloaded

    Returns:
        The filename that will be used to name the downloaded file: the
        last '/'-separated segment of the URL, prefixed with
        ``<item id>_`` when the URL is recognised as a staged result.
    """
    url_parts = url.split('/')
    original_filename = url_parts[-1]

    if not self._is_staged_result(url):
        return original_filename

    # For staged results the segment just before the filename is the item id.
    item_id = url_parts[-2]
    # No whitespace inside the literal: the name must be exactly
    # "<item id>_<filename>" so it is a clean, predictable path component.
    return f'{item_id}_{original_filename}'
1123
+
1081
1124
def _download_file (self , url : str , directory : str = '' , overwrite : bool = False ) -> str :
1082
1125
"""Downloads data, saves it to a file, and returns the filename.
1083
1126
1084
1127
Performance should be close to native with an appropriate chunk size. This can be changed
1085
1128
via environment variable DOWNLOAD_CHUNK_SIZE.
1086
1129
1087
- Filenames are automatically determined by using the latter portion of the provided URL.
1130
+ Filenames are automatically determined by using the latter portion of the provided URL
1131
+ and will be prefixed by the item id generated by Harmony when data was transformed
1132
+ from the original.
1088
1133
1089
1134
Args:
1090
1135
url: The location (URL) of the file to be downloaded
@@ -1099,7 +1144,7 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
1099
1144
"""
1100
1145
chunksize = int (self .config .DOWNLOAD_CHUNK_SIZE )
1101
1146
session = self ._session ()
1102
- filename = url . split ( '/' )[ - 1 ]
1147
+ filename = self . get_download_filename_from_url ( url )
1103
1148
1104
1149
if directory :
1105
1150
filename = os .path .join (directory , filename )
0 commit comments