Adding earthnet2021x download

earthnet2021 · Nov 18, 2022 · c4f457d · c4f457d
1 parent 8410c83
commit c4f457d
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -10,6 +10,16 @@ Find more information on https://www.earthnet.tech.
 pip install earthnet
 ```
 
+# Downloading new dataset EarthNet2021x
+
+Ensure you have enough free disk space! We recommend 1TB.
+```
+import earthnet as en
+en.download(dataset = "earthnet2021x", split = "train", save_directory = "data_dir")
+```
+Where `data_dir` is the directory where EarthNet2021x shall be saved and `split` is `"all"` or one of `["train","iid","ood","extreme","seasonal"]`.
+
+
 # Download
 Ensure you have enough free disk space! We recommend 1TB.
 ```

diff --git a/earthnet/__init__.py b/earthnet/__init__.py
@@ -1,11 +1,12 @@
 """EarthNet2021 Toolkit
 A library for downloading, evaluating and plotting Earth surface forecasts.
 """
-__version__ = "0.2.4"
+__version__ = "0.3.2"
 __author__ = 'Vitus Benson, Christian Requena-Mesa'
 __credits__ = 'Max-Planck-Institute for Biogeochemistry'
 
 from earthnet.parallel_score import EarthNetScore
 from earthnet.download import Downloader
 from earthnet.coords import get_coords_from_cube, get_coords_from_tile
-from earthnet.plot_cube import cube_gallery, cube_ndvi_timeseries
+from earthnet.plot_cube import cube_gallery, cube_ndvi_timeseries
+from earthnet.download_v2 import download, load_minicube
diff --git a/earthnet/download_v2.py b/earthnet/download_v2.py
@@ -0,0 +1,89 @@
+
+
+
+import s3fs
+import xarray as xr
+from pathlib import Path
+from tqdm import tqdm
+
+# Splits available per dataset name; `download` walks this list when called with `split = "all"`.
+SPLITS = {
+    "earthnet2021x": ["train","iid","ood","extreme","seasonal"]
+}
+
+def download(dataset = "earthnet2021x", split = "train", save_directory = "data/", proxy = None, limit = None):
+    """Download one split (or all splits) of a recent EarthNet dataset.
+
+        Before downloading, ensure that you have enough free disk space. We recommend 1 TB.
+
+        Files are fetched anonymously from the S3 bucket `earthnet` and written below
+        `save_directory`, keeping the remote sub-path `<dataset>/<split>/...`.
+
+        All available splits:
+            - For dataset `"earthnet2021x"`: `["train","iid","ood","extreme","seasonal"]`
+
+        You can also pass `"all"` as `split` to download all splits of a particular dataset.
+
+        Args:
+            dataset (str): The dataset you wish to download.
+            split (str): A split of the given dataset, can also be `"all"` to download all splits of a given dataset
+            save_directory (str): The directory where the data shall be saved in, we recommend data/
+            proxy (str, optional): If you need to use a http-proxy to access the internet, you may specify it here.
+            limit (int, optional): Maximum number of files to download per split. `None` (the default) downloads every file.
+
+        Raises:
+            KeyError: If `split` is `"all"` and `dataset` has no entry in `SPLITS`.
+    """
+    if split == "all":
+        # One recursive call per split; `limit` therefore applies to each split separately.
+        for single_split in SPLITS[dataset]:
+            download(dataset = dataset, split = single_split, save_directory = save_directory, proxy = proxy, limit = limit)
+        return
+
+    # Listed paths start with the bucket name; it is stripped again when building local paths.
+    bucket_prefix = "earthnet/"
+
+    # The endpoint is https and the proxy is looked up by URL scheme, so registering it
+    # for 'http' alone would leave it unused. Register it for both schemes.
+    config_kwargs = {"proxies": {"http": proxy, "https": proxy}} if proxy else {}
+
+    s3 = s3fs.S3FileSystem(
+        anon=True,
+        client_kwargs={
+            'endpoint_url': 'https://s3.bgc-jena.mpg.de:9000',
+            'region_name': 'thuringia',
+        },
+        config_kwargs=config_kwargs,
+    )
+
+    print(f"Finding files of {dataset}, split {split} to download.")
+    files = s3.find(f"{bucket_prefix}{dataset}/{split}")
+    if not files:
+        # Do not claim success when the dataset/split combination matched nothing.
+        print(f"No files found for {dataset}, split {split}.")
+        return
+
+    # `is not None` so that limit = 0 means "download nothing" instead of "no limit".
+    if limit is not None:
+        files = files[:limit]
+
+    print(f"Downloading files of {dataset}, split {split}")
+    for file in tqdm(files):
+        savepath = Path(save_directory)/file[len(bucket_prefix):]
+        savepath.parent.mkdir(parents = True, exist_ok = True)
+        s3.download(file, str(savepath))
+    print(f"Downloaded {dataset}, split {split}.")
+
+
+def load_minicube(dataset = "earthnet2021x", split = "train", id = "29SND_2018-09-03_2019-01-30_441_569_2745_2873_6_86_42_122", region = None, proxy = None):
+    """Load a minicube from a recent EarthNet dataset
+
+        Opens the minicube's NetCDF file directly from the cloud bucket and returns it
+        as an xarray dataset; nothing is written to local disk by this function.
+
+        All available splits:
+            - For dataset `"earthnet2021x"`: `["train","iid","ood","extreme","seasonal"]`
+
+        Args:
+            dataset (str): The dataset
+            split (str): The split
+            id (str): The id of the minicube
+            region (str, optional): Sub-folder of the split containing the minicube. If given, the
+                file path is built directly and the bucket-wide search is skipped, which is faster.
+            proxy (str, optional): If you need to use a http-proxy to access the internet, you may specify it here.
+
+        Returns:
+            The dataset returned by `xarray.open_dataset` for the minicube file.
+
+        Raises:
+            FileNotFoundError: If `region` is not given and no file named `<id>.nc` exists in the split.
+    """
+    # The endpoint is https and the proxy is looked up by URL scheme, so registering it
+    # for 'http' alone would leave it unused. Register it for both schemes.
+    config_kwargs = {"proxies": {"http": proxy, "https": proxy}} if proxy else {}
+
+    s3 = s3fs.S3FileSystem(
+        anon=True,
+        client_kwargs={
+            'endpoint_url': 'https://s3.bgc-jena.mpg.de:9000',
+            'region_name': 'thuringia',
+        },
+        config_kwargs=config_kwargs,
+    )
+
+    if region:
+        file = f"earthnet/{dataset}/{split}/{region}/{id}.nc"
+    else:
+        print(f"Searching for {id}...")
+        matches = s3.glob(f"earthnet/{dataset}/{split}/**/{id}.nc")
+        if not matches:
+            # Give a clear error instead of an IndexError from indexing an empty list.
+            raise FileNotFoundError(f"Minicube {id} not found in {dataset}, split {split}.")
+        file = matches[0]
+        print(f"Found {id}.")
+
+    # The handle is deliberately left open: the returned dataset may still read from it
+    # after this function returns (xarray loads data lazily by default).
+    mc = xr.open_dataset(s3.open(file))
+
+    return mc
diff --git a/setup.py b/setup.py
@@ -12,12 +12,14 @@
     "tqdm",
     "fire",
     "pyproj",
-    "pandas"
+    "pandas",
+    "s3fs",
+    "xarray"
 ]
 
 
 setup(name='earthnet', 
-        version='0.2.4',
+        version='0.3.2',
         description="EarthNet2021 Toolkit: Download, Evaluation, Plotting",
         author="Vitus Benson",
         author_email="vbenson@bgc-jena.mpg.de",