Merge pull request #53 from VishnuSaiKarthikGindi/main
Added a notebook and Python files for the NSMv2.0 data pipeline
Showing 10 changed files with 3,446 additions and 0 deletions.
887 changes: 887 additions & 0 deletions
Data_Processing_NSMv2.0/.ipynb_checkpoints/DataProcessing_Pipeline-checkpoint.ipynb
Large diffs are not rendered by default.
350 changes: 350 additions & 0 deletions
Data_Processing_NSMv2.0/.ipynb_checkpoints/NSIDC_Data-checkpoint.py
@@ -0,0 +1,350 @@
from __future__ import print_function

import base64
import getopt
import itertools
import json
import math
import netrc
import os.path
import ssl
import sys
import time
import getpass
import shutil

try:
    from urllib.parse import urlparse
    from urllib.request import urlopen, Request, build_opener, HTTPCookieProcessor
    from urllib.error import HTTPError, URLError
except ImportError:
    from urlparse import urlparse
    from urllib2 import urlopen, Request, HTTPError, URLError, build_opener, HTTPCookieProcessor

"""
Example search parameters for the ASO_50M_SWE dataset:

short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-03T00:00:00Z'
time_end = '2019-07-16T23:59:59Z'
bounding_box = '-123.34078531,33.35825379,-105.07803558,48.97106571'
polygon = ''
filename_filter = ''
url_list = []
"""

CMR_URL = 'https://cmr.earthdata.nasa.gov'
URS_URL = 'https://urs.earthdata.nasa.gov'
CMR_PAGE_SIZE = 2000
CMR_FILE_URL = ('{0}/search/granules.json?provider=NSIDC_ECS'
                '&sort_key[]=start_date&sort_key[]=producer_granule_id'
                '&scroll=true&page_size={1}'.format(CMR_URL, CMR_PAGE_SIZE))

def get_login_credentials():
    """Get Earthdata Login credentials from ~/.netrc, or prompt for them."""
    try:
        info = netrc.netrc()
        username, _account, password = info.authenticators(urlparse(URS_URL).hostname)
        credentials = f'{username}:{password}'
        credentials = base64.b64encode(credentials.encode('ascii')).decode('ascii')
    except Exception:
        username = input("Earthdata Login Username: ")
        password = getpass.getpass("Earthdata Login Password: ")
        credentials = f'{username}:{password}'
        credentials = base64.b64encode(credentials.encode('ascii')).decode('ascii')
    return credentials

def build_version_query_params(version):
    desired_pad_length = 3
    if len(version) > desired_pad_length:
        print('Version string too long: "{0}"'.format(version))
        quit()

    version = str(int(version))  # Strip off any leading zeros
    query_params = ''

    while len(version) <= desired_pad_length:
        padded_version = version.zfill(desired_pad_length)
        query_params += '&version={0}'.format(padded_version)
        desired_pad_length -= 1
    return query_params


def filter_add_wildcards(filter):
    if not filter.startswith('*'):
        filter = '*' + filter
    if not filter.endswith('*'):
        filter = filter + '*'
    return filter


def build_filename_filter(filename_filter):
    filters = filename_filter.split(',')
    result = '&options[producer_granule_id][pattern]=true'
    for filter in filters:
        result += '&producer_granule_id[]=' + filter_add_wildcards(filter)
    return result


def build_cmr_query_url(short_name, version, time_start, time_end,
                        bounding_box, polygon=None,
                        filename_filter=None):
    params = '&short_name={0}'.format(short_name)
    params += build_version_query_params(version)
    params += '&temporal[]={0},{1}'.format(time_start, time_end)
    if polygon:
        params += '&polygon={0}'.format(polygon)
    elif bounding_box:
        params += '&bounding_box={0}'.format(bounding_box)
    if filename_filter:
        params += build_filename_filter(filename_filter)
    return CMR_FILE_URL + params
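
# Illustration only (not part of the original script): with the ASO_50M_SWE example
# values from the docstring above, build_cmr_query_url would return a URL like
#   https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS
#   &sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000
#   &short_name=ASO_50M_SWE&version=001&version=01&version=1
#   &temporal[]=2013-04-03T00:00:00Z,2019-07-16T23:59:59Z
#   &bounding_box=-123.34078531,33.35825379,-105.07803558,48.97106571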

def get_speed(time_elapsed, chunk_size):
    if time_elapsed <= 0:
        return ''
    speed = chunk_size / time_elapsed
    if speed <= 0:
        speed = 1
    size_name = ('', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    i = int(math.floor(math.log(speed, 1000)))
    p = math.pow(1000, i)
    return '{0:.1f}{1}B/s'.format(speed / p, size_name[i])


def output_progress(count, total, status='', bar_len=60):
    if total <= 0:
        return
    fraction = min(max(count / float(total), 0), 1)
    filled_len = int(round(bar_len * fraction))
    percents = int(round(100.0 * fraction))
    bar = '=' * filled_len + ' ' * (bar_len - filled_len)
    fmt = ' [{0}] {1:3d}% {2} '.format(bar, percents, status)
    print('\b' * (len(fmt) + 4), end='')  # clears the line
    sys.stdout.write(fmt)
    sys.stdout.flush()


def cmr_read_in_chunks(file_object, chunk_size=1024 * 1024):
    """Read a file in chunks using a generator. Default chunk size: 1Mb."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data


def get_login_response(url, credentials):
    opener = build_opener(HTTPCookieProcessor())
    req = Request(url)

    if credentials:
        req.add_header('Authorization', 'Basic {0}'.format(credentials))

    try:
        response = opener.open(req)
    except HTTPError as e:
        print('HTTP error {0}, {1}'.format(e.code, e.reason))
        sys.exit(1)
    except URLError as e:
        print('URL error: {0}'.format(e.reason))
        sys.exit(1)

    return response

def cmr_download(urls, folder, quiet=False):
    """Download the given URLs into `folder`, then move any .xml metadata
    files into a separate 'SWE_Data_xml' directory."""
    if not urls:
        return

    # Create the target directory if it doesn't exist
    if not os.path.exists(folder):
        os.makedirs(folder)

    if not quiet:
        print('Downloading {0} files to {1}...'.format(len(urls), folder))

    credentials = get_login_credentials()

    for index, url in enumerate(urls, start=1):
        filename = os.path.join(folder, url.split('/')[-1])  # Specify the full path to the file
        if not quiet:
            print('{0}/{1}: {2}'.format(str(index).zfill(len(str(len(urls)))), len(urls), filename))

        try:
            response = get_login_response(url, credentials)
            length = int(response.headers['content-length'])
            count = 0
            chunk_size = min(max(length, 1), 1024 * 1024)
            time_initial = time.time()
            with open(filename, 'wb') as out_file:
                for data in cmr_read_in_chunks(response, chunk_size=chunk_size):
                    out_file.write(data)
                    if not quiet:
                        count = count + 1
                        time_elapsed = time.time() - time_initial
                        download_speed = get_speed(time_elapsed, count * chunk_size)
                        output_progress(count, math.ceil(length / chunk_size), status=download_speed)

            if not quiet:
                print()
        except HTTPError as e:
            print('HTTP error {0}, {1}'.format(e.code, e.reason))
        except URLError as e:
            print('URL error: {0}'.format(e.reason))
        except IOError:
            raise

    orgnl_directory = folder
    destination_directory = 'SWE_Data_xml'

    if not os.path.exists(destination_directory):
        os.makedirs(destination_directory)

    files = os.listdir(orgnl_directory)

    for file in files:
        if file.endswith('.xml'):
            source_file_path = os.path.join(orgnl_directory, file)
            destination_file_path = os.path.join(destination_directory, file)

            # Move the file to the destination directory
            shutil.move(source_file_path, destination_file_path)

    print("Files with .xml extension moved to the destination folder.")

def cmr_filter_urls(search_results):
    """Select only the desired data files from CMR response."""
    if 'feed' not in search_results or 'entry' not in search_results['feed']:
        return []

    entries = [e['links']
               for e in search_results['feed']['entry']
               if 'links' in e]
    # Flatten "entries" to a simple list of links
    links = list(itertools.chain(*entries))

    urls = []
    unique_filenames = set()
    for link in links:
        if 'href' not in link:
            # Exclude links with nothing to download
            continue
        if 'inherited' in link and link['inherited'] is True:
            # Exclude links inherited from the parent collection (not granule-specific)
            continue
        if 'rel' in link and 'data#' not in link['rel']:
            # Exclude links which are not classified by CMR as "data" or "metadata"
            continue

        if 'title' in link and 'opendap' in link['title'].lower():
            # Exclude OPeNDAP links--they are responsible for many duplicates
            # This is a hack; when the metadata is updated to properly identify
            # non-datapool links, we should be able to do this in a non-hack way
            continue

        filename = link['href'].split('/')[-1]
        if filename in unique_filenames:
            # Exclude links with duplicate filenames (they would overwrite)
            continue
        unique_filenames.add(filename)

        urls.append(link['href'])

    return urls
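
# For reference (inferred from the key lookups above, not from CMR documentation):
# cmr_filter_urls expects a granule search response shaped roughly like
#   {"feed": {"entry": [{"links": [{"href": "...", "rel": "...data#", "title": "...",
#                                   "inherited": False}, ...]}, ...]}}
# and it returns the de-duplicated list of downloadable 'href' values.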

def cmr_search(short_name, version, time_start, time_end,
               bounding_box, polygon='', filename_filter='', quiet=False):
    cmr_query_url = build_cmr_query_url(short_name=short_name, version=version,
                                        time_start=time_start, time_end=time_end,
                                        bounding_box=bounding_box,
                                        polygon=polygon, filename_filter=filename_filter)
    if not quiet:
        print('Querying for data:\n\t{0}\n'.format(cmr_query_url))

    cmr_scroll_id = None
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    urls = []
    hits = 0
    while True:
        req = Request(cmr_query_url)
        if cmr_scroll_id:
            req.add_header('cmr-scroll-id', cmr_scroll_id)
        try:
            response = urlopen(req, context=ctx)
        except Exception as e:
            print('Error: ' + str(e))
            sys.exit(1)
        if not cmr_scroll_id:
            # Python 2 and 3 have different case for the http headers
            headers = {k.lower(): v for k, v in dict(response.info()).items()}
            cmr_scroll_id = headers['cmr-scroll-id']
            hits = int(headers['cmr-hits'])
            if not quiet:
                if hits > 0:
                    print('Found {0} matches.'.format(hits))
                else:
                    print('Found no matches.')
        search_page = response.read()
        search_page = json.loads(search_page.decode('utf-8'))
        url_scroll_results = cmr_filter_urls(search_page)
        if not url_scroll_results:
            break
        if not quiet and hits > CMR_PAGE_SIZE:
            print('.', end='')
            sys.stdout.flush()
        urls += url_scroll_results

    if not quiet and hits > CMR_PAGE_SIZE:
        print()
    return urls

def main(argv=None):
    global short_name, version, time_start, time_end, bounding_box, \
        polygon, filename_filter, url_list

    if argv is None:
        argv = sys.argv[1:]

    force = False
    quiet = False
    usage = 'usage: nsidc-download_***.py [--help, -h] [--force, -f] [--quiet, -q]'

    try:
        opts, args = getopt.getopt(argv, 'hfq', ['help', 'force', 'quiet'])
        for opt, _arg in opts:
            if opt in ('-f', '--force'):
                force = True
            elif opt in ('-q', '--quiet'):
                quiet = True
            elif opt in ('-h', '--help'):
                print(usage)
                sys.exit(0)
    except getopt.GetoptError as e:
        print(e.args[0])
        print(usage)
        sys.exit(1)

    # Supply some default search parameters, just for testing purposes.
    # These are only used if the parameters aren't filled in up above.
    if 'short_name' not in globals():
        short_name = 'ATL06'
        version = '003'
        time_start = '2018-10-14T00:00:00Z'
        time_end = '2021-01-08T21:48:13Z'
        bounding_box = '-123.34078531,33.35825379,-105.07803558,48.97106571'
        polygon = ''
        filename_filter = '*ATL06_2020111121*'
        url_list = []

    try:
        if not url_list:
            url_list = cmr_search(short_name, version, time_start, time_end,
                                  bounding_box=bounding_box, polygon=polygon,
                                  filename_filter=filename_filter, quiet=quiet)

        # cmr_download(url_list, force=force, quiet=quiet)
    except KeyboardInterrupt:
        quit()


if __name__ == '__main__':
    main()
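
A minimal usage sketch (not part of the commit), assuming the functions above are importable as a module named NSIDC_Data and reusing the ASO_50M_SWE parameters from the docstring; the target folder name here is just an example:

from NSIDC_Data import cmr_search, cmr_download

short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-03T00:00:00Z'
time_end = '2019-07-16T23:59:59Z'
bounding_box = '-123.34078531,33.35825379,-105.07803558,48.97106571'

urls = cmr_search(short_name, version, time_start, time_end,
                  bounding_box=bounding_box, quiet=False)
cmr_download(urls, folder='ASO_SWE_files', quiet=False)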