14 changes: 13 additions & 1 deletion ESDC/README.md
@@ -153,7 +153,7 @@ The cube generation process is divided into four phases:

### 1. Downloading required raw datasets

These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contain the data in its original configuration. The download code for each dataset is found in the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't need to be downloaded. Additional datasets were acquired via FTP (e.g. GLEAM) or SFTP (e.g. GFED4) and don't have a download program. Other datasets were provided directly by their original providers (e.g. FLUXCOM) and don't have a download program either.
These datasets are the input data. Each dataset can be a set of `.nc`, `.hdf`, or `.tif` files. These files contain the data in its original configuration. The download code for each dataset is found in the `inputs-collect` folder. Note that some datasets can be acquired via `xcube-cci` and don't need to be downloaded. Additional datasets were acquired via FTP (e.g. GLEAM) or SFTP (e.g. GFED4) and don't have a download program. Other datasets were provided directly by their original providers (e.g. FLUXCOM) and don't have a download program either. For some datasets, user accounts are necessary; please follow the instructions on the respective websites. On the homepage of [GLEAM](https://www.gleam.eu/) you can register for download access under Downloads. For ERA5, refer to the [Copernicus Climate Data Store (CDS) API How-To](https://cds.climate.copernicus.eu/api-how-to) for more information. For GFED4, register via the EOSDIS [Earthdata Login](https://urs.earthdata.nasa.gov/oauth/authorize?client_id=YQOhivHfMTau88rjbMOVyg&response_type=code&redirect_uri=https://daac.ornl.gov/cgi-bin/urs/urs_logon_proc.pl&state=https%3A%2F%2Fdaac.ornl.gov%2F).
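
The ERA5 downloader uses the `cdsapi` package, which reads its credentials from a `~/.cdsapirc` file. A minimal sketch of that file, assuming the v2 API described in the How-To above (replace `<UID>` and `<API-KEY>` with the values from your CDS profile page):

```
url: https://cds.climate.copernicus.eu/api/v2
key: <UID>:<API-KEY>
```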

```
# MODIS: Download daily .hdf files
@@ -166,8 +166,20 @@
inputs-collect/download-GOME2-SIF.py
inputs-collect/download-GOSIF.py
inputs-collect/extract-gz-gosif.py

# RTSIF: Download 8-days .tif files
inputs-collect/download-RTSIF.py

# CCI-SM: Download daily .nc files
inputs-collect/download-cci-sm.py

# ERA5: Download hourly .nc files
inputs-collect/download-ERA5.py

# GLEAM: Download daily .nc files
inputs-collect/download-GLEAM.py

# GFED4: Download monthly .hdf files
inputs-collect/download-GFED4.py
```
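
Each downloader is a standalone Python 3 script. A typical invocation, assuming the dependencies the scripts import (e.g. `cdsapi`, `paramiko`, `patoolib`, `requests`, `tqdm`) are installed:

```
python inputs-collect/download-ERA5.py
```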

### 2. Preprocessing datasets
89 changes: 89 additions & 0 deletions ESDC/inputs-collect/download-ERA5.py
@@ -0,0 +1,89 @@
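# Download hourly ERA5 single-level fields from the Copernicus Climate Data
# Store, one NetCDF file per (variable, year, month), using a process pool.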
from tqdm import tqdm
import os
import cdsapi
from multiprocessing import Pool, Manager

def download_data(args):
pathOut, variable, year, month, progress_queue = args
directory = os.path.join(pathOut, variable, year)

if not os.path.exists(directory):
try:
os.makedirs(directory)
except FileExistsError:
pass

filename = f"{variable}.hh.*.era5.{month}.{year}.nc"
filepath = os.path.join(directory, filename)

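    # cdsapi reads its credentials from ~/.cdsapirc (see the CDS API How-To linked in the README)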
c = cdsapi.Client()
c.retrieve(
'reanalysis-era5-single-levels',
{
'product_type': 'reanalysis',
'format': 'netcdf',
'year': year,
'month': month,
'day': [
'01', '02', '03',
'04', '05', '06',
'07', '08', '09',
'10', '11', '12',
'13', '14', '15',
'16', '17', '18',
'19', '20', '21',
'22', '23', '24',
'25', '26', '27',
'28', '29', '30',
'31',
],
'time': [
'00:00', '01:00', '02:00',
'03:00', '04:00', '05:00',
'06:00', '07:00', '08:00',
'09:00', '10:00', '11:00',
'12:00', '13:00', '14:00',
'15:00', '16:00', '17:00',
'18:00', '19:00', '20:00',
'21:00', '22:00', '23:00',
],
'variable': variable,
},
filepath
)

progress_queue.put(1)
return f"Downloaded {filepath}"

def main():
pathOut = "~/data/ERA5/source"
pathOut = os.path.expanduser(pathOut)

if not os.path.exists(pathOut):
os.makedirs(pathOut)

years = [str(year) for year in range(1971, 2023)]
months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
    variables = [
        '2m_temperature',
        'evaporation',
        'maximum_2m_temperature_since_previous_post_processing',
        'minimum_2m_temperature_since_previous_post_processing',
        'total_precipitation',
        'surface_solar_radiation_downwards',
    ]

args_list = []
for variable in variables:
for year in years:
for month in months:
args_list.append((pathOut, variable, year, month))

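    # Pool() defaults to one worker per CPU core; the Manager queue lets the
    # workers signal each completed download to the progress bar.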
with Pool() as pool, Manager() as manager:
progress_queue = manager.Queue()
total_tasks = len(args_list)
results = []

with tqdm(total=total_tasks) as pbar:
for result in pool.imap_unordered(download_data, [(args + (progress_queue,)) for args in args_list]):
results.append(result)
pbar.update(progress_queue.get())

for result in results:
print(result)

if __name__ == '__main__':
main()
76 changes: 76 additions & 0 deletions ESDC/inputs-collect/download-GFED4.py
@@ -0,0 +1,76 @@
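# Download monthly GFED4.0 burned-area HDF files from the ORNL DAAC;
# a free NASA Earthdata account is required.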
import requests
from tqdm import tqdm
import os
import getpass

path_out = "~/data/GFED4/source"
path_out = os.path.expanduser(path_out)

if not os.path.exists(path_out):
os.makedirs(path_out)

base_url = 'https://daac.ornl.gov/daacdata/global_vegetation/fire_emissions_v4_R1/data/Monthly/'
file_prefix = 'GFED4.0_MQ_'

years = range(1995, 2017)
months = ['{:02d}'.format(m) for m in range(1, 13)]

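# Authenticate the session once against the Earthdata login endpoint; the same
# basic-auth credentials are then sent along with every file request.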
session = requests.Session()
auth_url = 'https://urs.earthdata.nasa.gov/oauth/authorize?app_type=401&client_id=QyeRbBJg8YuY_WBh-KBztA&response_type=code&redirect_uri=https%3A%2F%2Fdaac.ornl.gov%2Fdaacdata%2Fdoesntmater&state=aHR0cHM6Ly9kYWFjLm9ybmwuZ292L2RhYWNkYXRhL2dsb2JhbF92ZWdldGF0aW9uL2ZpcmVfZW1pc3Npb25zX3Y0X1IxL2RhdGEvTW9udGhseS8'

username = input("Enter your username: ")
password = getpass.getpass("Enter your password: ")

auth_response = session.get(auth_url, auth=(username, password))
session.auth = (username, password)

if auth_response.status_code != 200:
print('Authentication failed:', auth_response.status_code)
    raise SystemExit(1)


def handle_data_url_response(response, month, year):
if response.status_code != 200:
if response.status_code == 404:
print(f'Data not available for {month}/{year}')
else:
print('Failed to access the data URL:', response.status_code)


def handle_file_url_response(response, file_url):
if response.status_code != 200:
if response.status_code == 404:
print("---")
else:
print(f'Failed to download: {file_url}')


for year in years:
for month in months:
file_name = file_prefix + str(year) + month + '_BA.hdf'
        file_url = base_url + file_name

        # One streamed request per file; the two handlers report missing
        # months (404) and other failures, respectively.
        response = session.get(file_url, auth=(username, password), allow_redirects=False, stream=True)

        handle_data_url_response(response, month, year)
        handle_file_url_response(response, file_url)

if response.status_code != 200:
continue

file_path = os.path.join(path_out, file_name)
total_size = int(response.headers.get('content-length', 0))
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, desc=file_name)

with open(file_path, 'wb') as file:
for data in response.iter_content(chunk_size=1024):
file.write(data)
progress_bar.update(len(data))

progress_bar.close()
print(f'Downloaded: {file_name}')
print("---")
69 changes: 69 additions & 0 deletions ESDC/inputs-collect/download-GLEAM.py
@@ -0,0 +1,69 @@
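# Download daily GLEAM v3.7a NetCDF files over SFTP; the host and
# credentials are e-mailed after registering at https://www.gleam.eu/.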
import os
import getpass
import paramiko
from tqdm import tqdm
from multiprocessing import Pool

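# Each worker opens its own SFTP connection, so files can be fetched in parallel.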
def download_file(args):
host, port, username, password, remote_file, local_file = args
transport = paramiko.Transport((host, port))
transport.connect(username=username, password=password)
sftp = paramiko.SFTPClient.from_transport(transport)

sftp.get(remote_file, local_file)

sftp.close()
transport.close()

def download_files(host, port, username, password, remote_dir, local_dir):
transport = paramiko.Transport((host, port))
transport.connect(username=username, password=password)
sftp = paramiko.SFTPClient.from_transport(transport)

remote_years = sftp.listdir(remote_dir)

tasks = []

for year in range(1980, 2023):
str_year = str(year)
if str_year in remote_years:
remote_year_dir = os.path.join(remote_dir, str_year)
local_year_dir = os.path.join(local_dir, str_year)

if not os.path.exists(local_year_dir):
os.makedirs(local_year_dir)

remote_files = sftp.listdir(remote_year_dir)

for file in remote_files:
remote_file = os.path.join(remote_year_dir, file)
local_file = os.path.join(local_year_dir, file)

tasks.append((host, port, username, password, remote_file, local_file))

sftp.close()
transport.close()

    # The GLEAM server appears to cap concurrent connections, so limit the pool to 8 simultaneous downloads
with Pool(8) as pool:
with tqdm(total=len(tasks), desc="Downloading files") as pbar:
for _ in pool.imap_unordered(download_file, tasks):
pbar.update(1)

def main():
print("Please enter the credentials you got per mail from GLEAM")
host = input("Enter the host (without 'sftp://'): ")
port = int(input("Enter the port: "))
username = input("Enter your username: ")
    password = getpass.getpass("Enter your password: ")
remote_dir = "./data/v3.7a/daily"
local_dir = "~/data/GLEAM/source"

local_dir = os.path.expanduser(local_dir)

if not os.path.exists(local_dir):
os.makedirs(local_dir)

download_files(host, port, username, password, remote_dir, local_dir)

if __name__ == '__main__':
main()
50 changes: 50 additions & 0 deletions ESDC/inputs-collect/download-RTSIF.py
@@ -0,0 +1,50 @@
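# Download the RTSIF 8-day GeoTIFF archive from figshare and unpack the
# nested .zip/.rar archives into the source folder.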
import requests
import os
import patoolib
from tqdm import tqdm

### Python has no built-in support for extracting .rar archives;
### patoolib delegates to an external tool such as 7-Zip, WinRAR, or unrar, which must be installed

url = "https://figshare.com/ndownloader/articles/19336346/versions/3"
filename = "cache.zip"
extract_path = "~/data/SIF/RTSIF/source"

zip_filename = os.path.join(extract_path, filename)

if not os.path.exists(extract_path):
os.makedirs(extract_path)
print("Extract path created.")

response = requests.get(url, stream=True)
total_size = int(response.headers.get('content-length', 0))

with open(zip_filename, 'wb') as file, tqdm(
desc=filename,
total=total_size,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as progress_bar:
for data in response.iter_content(chunk_size=1024):
size = file.write(data)
progress_bar.update(size)

print("File downloaded successfully.")

patoolib.extract_archive(zip_filename, outdir=extract_path)
print("Extraction completed.")

for filename in os.listdir(extract_path):
if filename.endswith(".rar"):
rar_file = os.path.join(extract_path, filename)
patoolib.extract_archive(rar_file, outdir=extract_path)
print(f"Extracted {filename} to {extract_path}")

for file in os.listdir(extract_path):
file_path = os.path.join(extract_path, file)
if file.endswith((".rar", ".zip")):
os.remove(file_path)
print(f"Deleted {file}")

print("All .rar and .zip files deleted.")
4 changes: 2 additions & 2 deletions ESDC/inputs-collect/download-cci-sm.py
@@ -45,9 +45,9 @@ def download_file(url):
else:
print(f"File {filename} already exists!")

years = np.arange(1979,2021)
years = np.arange(1979,2022)

for year in tqdm(years):
urls = get_url_paths(f"https://dap.ceda.ac.uk/neodc/esacci/soil_moisture/data/daily_files/COMBINED/v06.1/{year}/","nc")
urls = get_url_paths(f"https://dap.ceda.ac.uk/neodc/esacci/soil_moisture/data/daily_files/COMBINED/v07.1/{year}/","nc")
for url in urls:
download_file(url)
4 changes: 2 additions & 2 deletions ESDC/inputs-preprocess/GLEAM/gleam-data-cube.py
@@ -10,7 +10,7 @@
# if not os.path.exists(pathOut):
# os.mkdir(pathOut)

pathIn = "path-to-GLEAM-folder"
pathIn = "~/data/GLEAM/source"

pathOut = "~/data/GLEAM/preprocess"
pathOut = os.path.expanduser(pathOut)
@@ -24,7 +24,7 @@

for year in tqdm(years):

files = glob.glob(f"{pathIn}/data/v3.6a/daily/{year}/*.nc")
files = glob.glob(f"{pathIn}/v3.7a/daily/{year}/*.nc")
files.sort()

datasets = [xr.open_dataset(file,chunks = {'time':512,'lat':128,'lon':128}) for file in files]