From 125b5983a7aa9f571c269aef1dd2c840db8080bf Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 14 Jun 2023 11:55:45 +0000 Subject: [PATCH 01/11] Adding ERA5 download --- ESDC/inputs-collect/download-ERA5.py | 89 ++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 ESDC/inputs-collect/download-ERA5.py diff --git a/ESDC/inputs-collect/download-ERA5.py b/ESDC/inputs-collect/download-ERA5.py new file mode 100644 index 0000000..f683b18 --- /dev/null +++ b/ESDC/inputs-collect/download-ERA5.py @@ -0,0 +1,89 @@ +from tqdm import tqdm +import os +import cdsapi +from multiprocessing import Pool, Manager + +def download_data(args): + pathOut, variable, year, month, progress_queue = args + directory = os.path.join(pathOut, variable, year) + + if not os.path.exists(directory): + try: + os.makedirs(directory) + except FileExistsError: + pass + + filename = f"{variable}.hh.*.era5.{month}.{year}.nc" + filepath = os.path.join(directory, filename) + + c = cdsapi.Client() + c.retrieve( + 'reanalysis-era5-single-levels', + { + 'product_type': 'reanalysis', + 'format': 'netcdf', + 'year': year, + 'month': month, + 'day': [ + '01', '02', '03', + '04', '05', '06', + '07', '08', '09', + '10', '11', '12', + '13', '14', '15', + '16', '17', '18', + '19', '20', '21', + '22', '23', '24', + '25', '26', '27', + '28', '29', '30', + '31', + ], + 'time': [ + '00:00', '01:00', '02:00', + '03:00', '04:00', '05:00', + '06:00', '07:00', '08:00', + '09:00', '10:00', '11:00', + '12:00', '13:00', '14:00', + '15:00', '16:00', '17:00', + '18:00', '19:00', '20:00', + '21:00', '22:00', '23:00', + ], + 'variable': variable, + }, + filepath + ) + + progress_queue.put(1) + return f"Downloaded {filepath}" + +def main(): + pathOut = "~/data/ERA5/source" + pathOut = os.path.expanduser(pathOut) + + if not os.path.exists(pathOut): + os.makedirs(pathOut) + + years = [str(year) for year in range(1971, 2022)] + months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] + 
variables = ['2m_temperature', 'evaporation', 'maximum_2m_temperature_since_previous_post_processing', 'minimum_2m_temperature_since_previous_post_processing', 'total_precipitation', 'surface_solar_radiation_downwards'] + + args_list = [] + for variable in variables: + for year in years: + for month in months: + args_list.append((pathOut, variable, year, month)) + + with Pool() as pool, Manager() as manager: + progress_queue = manager.Queue() + total_tasks = len(args_list) + results = [] + + with tqdm(total=total_tasks) as pbar: + for result in pool.imap_unordered(download_data, [(args + (progress_queue,)) for args in args_list]): + results.append(result) + pbar.update(progress_queue.get()) + + for result in results: + print(result) + +if __name__ == '__main__': + main() From 346bf4d315c736bfb5340099810f003a59293694 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 14 Jun 2023 13:15:18 +0000 Subject: [PATCH 02/11] Update README --- ESDC/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ESDC/README.md b/ESDC/README.md index 0084c74..b7cd785 100644 --- a/ESDC/README.md +++ b/ESDC/README.md @@ -153,7 +153,7 @@ The cube generation process is divided in four phases: ### 1. Downloading required raw datasets -These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contains data with its original configuration. The downloading code for each dataset is found at the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't require to be downloaded. Additional datasets were acquired va ftp (e.g. GLEAM) or sftp (e.g. GFED4) and don't have a download program. Other datasets were provided by their original providers (e.g. FLUXCOM) and don't have a download program neither. +These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contains data with its original configuration. 
The downloading code for each dataset is found at the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't require to be downloaded. Additional datasets were acquired va ftp (e.g. GLEAM) or sftp (e.g. GFED4) and don't have a download program. Other datasets were provided by their original providers (e.g. FLUXCOM) and don't have a download program neither. For some datasets, user accounts are necessary. Please follow the instructions on their websites. For ERA5, refer to the [Copernicus Climate Data Store (CDS) API How-To](https://cds.climate.copernicus.eu/api-how-to) for more information. ``` # MODIS: Download daily .hdf files @@ -168,6 +168,9 @@ inputs-collect/extract-gz-gosif.py # CCI-SM: Download daily .nc files inputs-collect/download-cci-sm.py + +# ERA5: Download hourly .nc files +inpits-collect/download-ERA5.py ``` ### 2. Preprocessing datasets From d0377306c95be90e9e9bd97a74f7f36e567a78cb Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 21 Jun 2023 08:05:39 +0000 Subject: [PATCH 03/11] GLEAM v3.7a --- ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py b/ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py index 4a4fa2e..7394648 100644 --- a/ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py +++ b/ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py @@ -10,7 +10,7 @@ # if not os.path.exists(pathOut): # os.mkdir(pathOut) -pathIn = "path-to-GLEAM-folder" +pathIn = "~/data/GLEAM/source" pathOut = "~/data/GLEAM/preprocess" pathOut = os.path.expanduser(pathOut) @@ -24,7 +24,7 @@ for year in tqdm(years): - files = glob.glob(f"{pathIn}/data/v3.6a/daily/{year}/*.nc") + files = glob.glob(f"{pathIn}/v3.7a/daily/{year}/*.nc") files.sort() datasets = [xr.open_dataset(file,chunks = {'time':512,'lat':128,'lon':128}) for file in files] From fffe689f3c37e1732f8ccd9d5bbb611ee9793cd4 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 21 
Jun 2023 09:27:47 +0000 Subject: [PATCH 04/11] Update README --- ESDC/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ESDC/README.md b/ESDC/README.md index b7cd785..84406b4 100644 --- a/ESDC/README.md +++ b/ESDC/README.md @@ -171,6 +171,9 @@ inputs-collect/download-cci-sm.py # ERA5: Download hourly .nc files inpits-collect/download-ERA5.py + +# GLEAM: Download daily .nc files +inpits-collect/download-GLEAM.py ``` ### 2. Preprocessing datasets From e5f5469c95c66b10e9d2b388e5ec119fbd5bc5f8 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 21 Jun 2023 09:28:44 +0000 Subject: [PATCH 05/11] Adding GLEAM download --- ESDC/inputs-collect/download-GLEAM.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 ESDC/inputs-collect/download-GLEAM.py diff --git a/ESDC/inputs-collect/download-GLEAM.py b/ESDC/inputs-collect/download-GLEAM.py new file mode 100644 index 0000000..4c2788d --- /dev/null +++ b/ESDC/inputs-collect/download-GLEAM.py @@ -0,0 +1,70 @@ +import os +import paramiko +from tqdm import tqdm +from multiprocessing import Pool + +def download_file(args): + host, port, username, password, remote_file, local_file = args + transport = paramiko.Transport((host, port)) + transport.connect(username=username, password=password) + sftp = paramiko.SFTPClient.from_transport(transport) + + sftp.get(remote_file, local_file) + + sftp.close() + transport.close() + +def download_files(host, port, username, password, remote_dir, local_dir): + transport = paramiko.Transport((host, port)) + transport.connect(username=username, password=password) + sftp = paramiko.SFTPClient.from_transport(transport) + + remote_years = sftp.listdir(remote_dir) + + tasks = [] + + # v3.7a also has data for 2022 + for year in range(1980, 2022): + str_year = str(year) + if str_year in remote_years: + remote_year_dir = os.path.join(remote_dir, str_year) + local_year_dir = os.path.join(local_dir, str_year) + + if not os.path.exists(local_year_dir): + 
os.makedirs(local_year_dir) + + remote_files = sftp.listdir(remote_year_dir) + + for file in remote_files: + remote_file = os.path.join(remote_year_dir, file) + local_file = os.path.join(local_year_dir, file) + + tasks.append((host, port, username, password, remote_file, local_file)) + + sftp.close() + transport.close() + + # Server Restriction for 8 simultaneously downloads (?) + with Pool(8) as pool: + with tqdm(total=len(tasks), desc="Downloading files") as pbar: + for _ in pool.imap_unordered(download_file, tasks): + pbar.update(1) + +def main(): + print("Please enter the credentials you got per mail from GLEAM") + host = input("Enter the host (without 'sftp://'): ") + port = int(input("Enter the port: ")) + username = input("Enter your username: ") + password = input("Enter your password: ") + remote_dir = "./data/v3.7a/daily" + local_dir = "~/data/GLEAM/source" + + local_dir = os.path.expanduser(local_dir) + + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + download_files(host, port, username, password, remote_dir, local_dir) + +if __name__ == '__main__': + main() From 74aef9eaa30f7be145c56f2857e25044c713cb67 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Fri, 23 Jun 2023 12:11:12 +0000 Subject: [PATCH 06/11] Adding GFED4 download --- ESDC/inputs-collect/download-GFED4.py | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 ESDC/inputs-collect/download-GFED4.py diff --git a/ESDC/inputs-collect/download-GFED4.py b/ESDC/inputs-collect/download-GFED4.py new file mode 100644 index 0000000..e308807 --- /dev/null +++ b/ESDC/inputs-collect/download-GFED4.py @@ -0,0 +1,76 @@ +import requests +from tqdm import tqdm +import os + +path_out = "~/data/GFED4/source" +path_out = os.path.expanduser(path_out) + +if not os.path.exists(path_out): + os.makedirs(path_out) + +base_url = 'https://daac.ornl.gov/daacdata/global_vegetation/fire_emissions_v4_R1/data/Monthly/' +file_prefix = 'GFED4.0_MQ_' + +years = range(1995, 2017) +months 
= ['{:02d}'.format(m) for m in range(1, 13)] + +session = requests.Session() +auth_url = 'https://urs.earthdata.nasa.gov/oauth/authorize?app_type=401&client_id=QyeRbBJg8YuY_WBh-KBztA&response_type=code&redirect_uri=https%3A%2F%2Fdaac.ornl.gov%2Fdaacdata%2Fdoesntmater&state=aHR0cHM6Ly9kYWFjLm9ybmwuZ292L2RhYWNkYXRhL2dsb2JhbF92ZWdldGF0aW9uL2ZpcmVfZW1pc3Npb25zX3Y0X1IxL2RhdGEvTW9udGhseS8' + +username = input("Enter your username: ") +password = input("Enter your password: ") + +auth_response = session.get(auth_url, auth=(username, password)) +session.auth = (username, password) + +if auth_response.status_code != 200: + print('Authentication failed:', auth_response.status_code) + exit() + + +def handle_data_url_response(response, month, year): + if response.status_code != 200: + if response.status_code == 404: + print(f'Data not available for {month}/{year}') + else: + print('Failed to access the data URL:', response.status_code) + + +def handle_file_url_response(response, file_url): + if response.status_code != 200: + if response.status_code == 404: + print("---") + else: + print(f'Failed to download: {file_url}') + + +for year in years: + for month in months: + file_name = file_prefix + str(year) + month + '_BA.hdf' + data_url = base_url + file_name + + response = session.get(data_url, auth=(username, password), allow_redirects=False) + print(response.status_code) + + handle_data_url_response(response, month, year) + + file_url = base_url + file_name + response = session.get(file_url, auth=(username, password), allow_redirects=False, stream=True) + + handle_file_url_response(response, file_url) + + if response.status_code != 200: + continue + + file_path = os.path.join(path_out, file_name) + total_size = int(response.headers.get('content-length', 0)) + progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, desc=file_name) + + with open(file_path, 'wb') as file: + for data in response.iter_content(chunk_size=1024): + file.write(data) + 
progress_bar.update(len(data)) + + progress_bar.close() + print(f'Downloaded: {file_name}') + print("---") From 796db80f09a3ee30934de9578e9845f08ad0a152 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Fri, 23 Jun 2023 12:21:04 +0000 Subject: [PATCH 07/11] Update README --- ESDC/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ESDC/README.md b/ESDC/README.md index 84406b4..ba235c1 100644 --- a/ESDC/README.md +++ b/ESDC/README.md @@ -153,7 +153,7 @@ The cube generation process is divided in four phases: ### 1. Downloading required raw datasets -These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contains data with its original configuration. The downloading code for each dataset is found at the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't require to be downloaded. Additional datasets were acquired va ftp (e.g. GLEAM) or sftp (e.g. GFED4) and don't have a download program. Other datasets were provided by their original providers (e.g. FLUXCOM) and don't have a download program neither. For some datasets, user accounts are necessary. Please follow the instructions on their websites. For ERA5, refer to the [Copernicus Climate Data Store (CDS) API How-To](https://cds.climate.copernicus.eu/api-how-to) for more information. +These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contain data with its original configuration. The downloading code for each dataset is found at the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't require to be downloaded. Additional datasets were acquired via ftp (e.g. GLEAM) or sftp (e.g. GFED4) and don't have a download program. Other datasets were provided by their original providers (e.g. FLUXCOM) and don't have a download program either. For some datasets, user accounts are necessary. 
Please follow the instructions on their websites. On the Homepage of [GLEAM](https://www.gleam.eu/) you can register for downloading under Downloads. For ERA5, refer to the [Copernicus Climate Data Store (CDS) API How-To](https://cds.climate.copernicus.eu/api-how-to) for more information. For GFED4, use the EOSDIS [Earthdata Login](https://urs.earthdata.nasa.gov/oauth/authorize?client_id=YQOhivHfMTau88rjbMOVyg&response_type=code&redirect_uri=https://daac.ornl.gov/cgi-bin/urs/urs_logon_proc.pl&state=https%3A%2F%2Fdaac.ornl.gov%2F) for registration. ``` # MODIS: Download daily .hdf files @@ -170,10 +170,13 @@ inputs-collect/extract-gz-gosif.py inputs-collect/download-cci-sm.py # ERA5: Download hourly .nc files -inpits-collect/download-ERA5.py +inputs-collect/download-ERA5.py # GLEAM: Download daily .nc files -inpits-collect/download-GLEAM.py +inputs-collect/download-GLEAM.py + +# GFED4: Download monthly .hdf files +inputs-collect/download-GFED4.py ``` ### 2. Preprocessing datasets From 965878e56bcc2037f470256518e7f409f5b380d6 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 28 Jun 2023 07:55:51 +0000 Subject: [PATCH 08/11] Add 2022 --- ESDC/inputs-collect/download-ERA5.py | 2 +- ESDC/inputs-collect/download-GLEAM.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ESDC/inputs-collect/download-ERA5.py b/ESDC/inputs-collect/download-ERA5.py index f683b18..43ec9dd 100644 --- a/ESDC/inputs-collect/download-ERA5.py +++ b/ESDC/inputs-collect/download-ERA5.py @@ -62,7 +62,7 @@ def main(): if not os.path.exists(pathOut): os.makedirs(pathOut) - years = [str(year) for year in range(1971, 2022)] + years = [str(year) for year in range(1971, 2023)] months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] variables = ['2m_temperature', 'evaporation', 'maximum_2m_temperature_since_previous_post_processing', 'minimum_2m_temperature_since_previous_post_processing', 'total_precipitation', 'surface_solar_radiation_downwards'] diff --git 
a/ESDC/inputs-collect/download-GLEAM.py b/ESDC/inputs-collect/download-GLEAM.py index 4c2788d..52a5e47 100644 --- a/ESDC/inputs-collect/download-GLEAM.py +++ b/ESDC/inputs-collect/download-GLEAM.py @@ -23,8 +23,7 @@ def download_files(host, port, username, password, remote_dir, local_dir): tasks = [] - # v3.7a also has data for 2022 - for year in range(1980, 2022): + for year in range(1980, 2023): str_year = str(year) if str_year in remote_years: remote_year_dir = os.path.join(remote_dir, str_year) From c771dedeed861d0ec13a5be63a3eed59aa19069d Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 12 Jul 2023 08:21:51 +0000 Subject: [PATCH 09/11] Update Version & Year --- ESDC/inputs-collect/download-cci-sm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ESDC/inputs-collect/download-cci-sm.py b/ESDC/inputs-collect/download-cci-sm.py index 23f9354..2018203 100644 --- a/ESDC/inputs-collect/download-cci-sm.py +++ b/ESDC/inputs-collect/download-cci-sm.py @@ -45,9 +45,9 @@ def download_file(url): else: print(f"File {filename} already exists!") -years = np.arange(1979,2021) +years = np.arange(1979,2022) for year in tqdm(years): - urls = get_url_paths(f"https://dap.ceda.ac.uk/neodc/esacci/soil_moisture/data/daily_files/COMBINED/v06.1/{year}/","nc") + urls = get_url_paths(f"https://dap.ceda.ac.uk/neodc/esacci/soil_moisture/data/daily_files/COMBINED/v07.1/{year}/","nc") for url in urls: download_file(url) From 51778193c45f031d82b62355892bc8e82c365f5a Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 12 Jul 2023 08:49:52 +0000 Subject: [PATCH 10/11] Update README --- ESDC/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ESDC/README.md b/ESDC/README.md index ba235c1..6b32764 100644 --- a/ESDC/README.md +++ b/ESDC/README.md @@ -166,6 +166,9 @@ inputs-collect/download-GOME2-SIF.py inputs-collect/download-GOSIF.py inputs-collect/extract-gz-gosif.py +# RTSIF: Download 8-days .tif files +inputs-collect/download-RTSIF.py + # CCI-SM: Download daily 
.nc files inputs-collect/download-cci-sm.py From 404a37e8b741293fc8ecefcf90cee6cf772f7049 Mon Sep 17 00:00:00 2001 From: LFT-W47 Date: Wed, 12 Jul 2023 08:59:13 +0000 Subject: [PATCH 11/11] Adding RTSIF download --- ESDC/inputs-collect/download-RTSIF.py | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 ESDC/inputs-collect/download-RTSIF.py diff --git a/ESDC/inputs-collect/download-RTSIF.py b/ESDC/inputs-collect/download-RTSIF.py new file mode 100644 index 0000000..39dce92 --- /dev/null +++ b/ESDC/inputs-collect/download-RTSIF.py @@ -0,0 +1,50 @@ +import requests +import os +import patoolib +from tqdm import tqdm + +### There is no native package for extracting .rar files +### 7-Zip or WinRAR are needed for extracting .rar files + +url = "https://figshare.com/ndownloader/articles/19336346/versions/3" +filename = "cache.zip" +extract_path = os.path.expanduser("~/data/SIF/RTSIF/source")  # expand "~" like the other download scripts; otherwise a literal "~" directory is created in the CWD + +zip_filename = os.path.join(extract_path, filename) + +if not os.path.exists(extract_path): + os.makedirs(extract_path) + print("Extract path created.") + +response = requests.get(url, stream=True) +total_size = int(response.headers.get('content-length', 0)) + +with open(zip_filename, 'wb') as file, tqdm( + desc=filename, + total=total_size, + unit='iB', + unit_scale=True, + unit_divisor=1024, +) as progress_bar: + for data in response.iter_content(chunk_size=1024): + size = file.write(data) + progress_bar.update(size) + +print("File downloaded successfully.") + +patoolib.extract_archive(zip_filename, outdir=extract_path) +print("Extraction completed.") + +for filename in os.listdir(extract_path): + if filename.endswith(".rar"): + rar_file = os.path.join(extract_path, filename) + patoolib.extract_archive(rar_file, outdir=extract_path) + print(f"Extracted {filename} to {extract_path}") + +for file in os.listdir(extract_path): + file_path = os.path.join(extract_path, file) + if file.endswith((".rar", ".zip")): + os.remove(file_path) + print(f"Deleted {file}") + 
+print("All .rar and .zip files deleted.")