From 125adfff16c63244cd150613034d92000f9e231e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 19:52:04 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/drift_removal.ipynb | 148 ++++++++++------ docs/postprocessing.ipynb | 213 +++++++++++++++-------- docs/regionmask.ipynb | 57 ++++-- docs/tutorial.ipynb | 160 ++++++++++------- notebooks/add_more_models.ipynb | 117 ++++++++----- notebooks/maintenance_grids.ipynb | 133 +++++++------- notebooks/metric_parse_improvement.ipynb | 38 ++-- notebooks/parse_area_gn.ipynb | 20 ++- notebooks/testing_various_issues.ipynb | 111 +++++++----- 9 files changed, 620 insertions(+), 377 deletions(-) diff --git a/docs/drift_removal.ipynb b/docs/drift_removal.ipynb index 1809d51f..db578cdc 100644 --- a/docs/drift_removal.ipynb +++ b/docs/drift_removal.ipynb @@ -43,6 +43,7 @@ ], "source": [ "from dask_gateway import Gateway\n", + "\n", "g = Gateway()\n", "running_clusters = g.list_clusters()\n", "print(running_clusters)\n", @@ -77,6 +78,7 @@ "source": [ "from distributed import Client\n", "from dask_gateway import GatewayCluster\n", + "\n", "cluster = GatewayCluster()\n", "cluster.scale(30)\n", "cluster" @@ -255,18 +257,22 @@ } ], "source": [ - "zkwargs = {'consolidated':True, 'use_cftime':True}\n", - "kwargs = {'zarr_kwargs':zkwargs, 'preprocess':combined_preprocessing, 'aggregate':False}\n", + "zkwargs = {\"consolidated\": True, \"use_cftime\": True}\n", + "kwargs = {\n", + " \"zarr_kwargs\": zkwargs,\n", + " \"preprocess\": combined_preprocessing,\n", + " \"aggregate\": False,\n", + "}\n", "\n", "col = google_cmip_col()\n", "\n", "\n", - "cat = col.search(source_id='CanESM5-CanOE', variable_id='thetao')\n", + "cat = col.search(source_id=\"CanESM5-CanOE\", variable_id=\"thetao\")\n", "\n", "\n", - "ddict_historical = cat.search(experiment_id='historical').to_dataset_dict(**kwargs)\n", - "ddict_ssp585 = cat.search(experiment_id='ssp585').to_dataset_dict(**kwargs)\n", - "ddict_picontrol = cat.search(experiment_id='piControl').to_dataset_dict(**kwargs)" + "ddict_historical = cat.search(experiment_id=\"historical\").to_dataset_dict(**kwargs)\n", + "ddict_ssp585 = cat.search(experiment_id=\"ssp585\").to_dataset_dict(**kwargs)\n", + "ddict_picontrol = cat.search(experiment_id=\"piControl\").to_dataset_dict(**kwargs)" ] }, { @@ -286,12 +292,18 @@ "metadata": {}, "outputs": [], "source": [ - "ds_control = ddict_picontrol['CMIP.CCCma.CanESM5-CanOE.piControl.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CCCma/CanESM5-CanOE/piControl/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429']\n", - "ds_historical = ddict_historical['CMIP.CCCma.CanESM5-CanOE.historical.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CCCma/CanESM5-CanOE/historical/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429']\n", - "ds_ssp585 = ddict_ssp585['ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/ScenarioMIP/CCCma/CanESM5-CanOE/ssp585/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429']\n", + "ds_control = ddict_picontrol[\n", + " \"CMIP.CCCma.CanESM5-CanOE.piControl.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CCCma/CanESM5-CanOE/piControl/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429\"\n", + "]\n", + "ds_historical = ddict_historical[\n", + " \"CMIP.CCCma.CanESM5-CanOE.historical.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CCCma/CanESM5-CanOE/historical/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429\"\n", + "]\n", + "ds_ssp585 = ddict_ssp585[\n", + " \"ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.r1i1p2f1.Omon.thetao.gn.gs://cmip6/CMIP6/ScenarioMIP/CCCma/CanESM5-CanOE/ssp585/r1i1p2f1/Omon/thetao/gn/v20190429/.nan.20190429\"\n", + "]\n", "\n", "# Pick a random location in x/y/z space to use as an exmple\n", - "roi = {'x':100,'y':220, 'lev':30}" + "roi = {\"x\": 100, \"y\": 220, \"lev\": 30}" ] }, { @@ -335,8 +347,8 @@ ], "source": [ "# ok lets just plot them together\n", - "ds_control.isel(**roi).thetao.plot(color='0.5')\n", - "ds_historical.isel(**roi).thetao.plot(color='C1')" + "ds_control.isel(**roi).thetao.plot(color=\"0.5\")\n", + "ds_historical.isel(**roi).thetao.plot(color=\"C1\")" ] }, { @@ -373,7 +385,7 @@ } ], "source": [ - "{k:v for k,v in ds_historical.attrs.items() if 'parent' in k}" + "{k: v for k, v in ds_historical.attrs.items() if \"parent\" in k}" ] }, { @@ -428,8 +440,8 @@ ], "source": [ "# ok lets just plot them together\n", - "ds_control_adj.isel(**roi).thetao.plot(color='0.5')\n", - "ds_historical_adj.isel(**roi).thetao.plot(color='C1')" + "ds_control_adj.isel(**roi).thetao.plot(color=\"0.5\")\n", + "ds_historical_adj.isel(**roi).thetao.plot(color=\"C1\")" ] }, { @@ -471,8 +483,8 @@ ], "source": [ "# ok lets just plot them together\n", - "ds_control_adj.isel(**roi, time=slice(0,24)).thetao.plot()\n", - "ds_historical_adj.isel(**roi, time=slice(0,24)).thetao.plot()" + "ds_control_adj.isel(**roi, time=slice(0, 24)).thetao.plot()\n", + "ds_historical_adj.isel(**roi, time=slice(0, 24)).thetao.plot()" ] }, { @@ -491,8 +503,9 @@ "outputs": [], "source": [ "from xmip.drift_removal import replace_time\n", + "\n", "# with the defaults it will just replace the dates with new ones which have time stamps at the beginning of the month.\n", - "ds_historical_adj = replace_time(ds_historical_adj) " + "ds_historical_adj = replace_time(ds_historical_adj)" ] }, { @@ -526,8 +539,8 @@ ], "source": [ "# ok lets just plot them together again\n", - "ds_control_adj.isel(**roi, time=slice(0,24)).thetao.plot()\n", - "ds_historical_adj.isel(**roi, time=slice(0,24)).thetao.plot()" + "ds_control_adj.isel(**roi, time=slice(0, 24)).thetao.plot()\n", + "ds_historical_adj.isel(**roi, time=slice(0, 24)).thetao.plot()" ] }, { @@ -564,7 +577,7 @@ ], "source": [ "for name, ds in ddict_historical.items():\n", - " print(name, ds.attrs['branch_time_in_parent'])" + " print(name, ds.attrs[\"branch_time_in_parent\"])" ] }, { @@ -598,14 +611,18 @@ "# replace the timestamp with the first of the month for the control run and plot\n", "# we will also average the data yearly to remove some of the visual noise\n", "\n", - "plt.figure(figsize=[12,4])\n", - "replace_time(ds_control).isel(**roi).thetao.coarsen(time=3).mean().isel(time=slice(0,150*4)).plot(color='0.5')\n", + "plt.figure(figsize=[12, 4])\n", + "replace_time(ds_control).isel(**roi).thetao.coarsen(time=3).mean().isel(\n", + " time=slice(0, 150 * 4)\n", + ").plot(color=\"0.5\")\n", "\n", - "# now we loop through all the historical members, adjust the time and plot them in the same way, \n", + "# now we loop through all the historical members, adjust the time and plot them in the same way,\n", "# but only for the first 20 years\n", "for name, ds in ddict_historical.items():\n", - " _, ds_adj = unify_time(ds_control, ds, adjust_to='parent')\n", - " ds_adj.isel(**roi).thetao.coarsen(time=3).mean().isel(time=slice(0,30*4)).plot(color='C1')" + " _, ds_adj = unify_time(ds_control, ds, adjust_to=\"parent\")\n", + " ds_adj.isel(**roi).thetao.coarsen(time=3).mean().isel(time=slice(0, 30 * 4)).plot(\n", + " color=\"C1\"\n", + " )" ] }, { @@ -654,9 +671,10 @@ "# setting up the scratch bucket\n", "import os\n", "import fsspec\n", - "PANGEO_SCRATCH = os.environ['PANGEO_SCRATCH']+'cmip6_pp_demo'\n", - "path = f'{PANGEO_SCRATCH}/test_rechunked.zarr'\n", - "temp_path = f'{PANGEO_SCRATCH}/test_rechunked_temp.zarr'\n", + "\n", + "PANGEO_SCRATCH = os.environ[\"PANGEO_SCRATCH\"] + \"cmip6_pp_demo\"\n", + "path = f\"{PANGEO_SCRATCH}/test_rechunked.zarr\"\n", + "temp_path = f\"{PANGEO_SCRATCH}/test_rechunked_temp.zarr\"\n", "mapper = fsspec.get_mapper(path)\n", "mapper_temp = fsspec.get_mapper(temp_path)" ] @@ -1506,27 +1524,30 @@ "source": [ "if not mapper.fs.exists(path):\n", " # recompute the rechunked data into the scratch bucket (is only triggered when the temporary store was erased)\n", - " \n", + "\n", " # Remove the temp store if for some reason that still exists\n", " if mapper.fs.exists(temp_path):\n", " mapper.fs.rm(temp_path, recursive=True)\n", " from rechunker import rechunk\n", + "\n", " target_chunks = {\n", - " 'thetao': {'time':6012, 'lev':1, 'x':3, 'y':291},\n", - " 'x': {'x':3},\n", - " 'y': {'y':291},\n", - " 'lat': {'x':3, 'y':291},\n", - " 'lev': {'lev':1},\n", - " 'lon': {'x':3, 'y':291},\n", - " 'time': {'time':6012}, \n", + " \"thetao\": {\"time\": 6012, \"lev\": 1, \"x\": 3, \"y\": 291},\n", + " \"x\": {\"x\": 3},\n", + " \"y\": {\"y\": 291},\n", + " \"lat\": {\"x\": 3, \"y\": 291},\n", + " \"lev\": {\"lev\": 1},\n", + " \"lon\": {\"x\": 3, \"y\": 291},\n", + " \"time\": {\"time\": 6012},\n", " }\n", - " max_mem = '1GB'\n", + " max_mem = \"1GB\"\n", "\n", - " array_plan = rechunk(ds_control[['thetao']], target_chunks, max_mem, mapper, temp_store=mapper_temp)\n", + " array_plan = rechunk(\n", + " ds_control[[\"thetao\"]], target_chunks, max_mem, mapper, temp_store=mapper_temp\n", + " )\n", " array_plan.execute(retries=10)\n", - " \n", + "\n", "ds_control_rechunked = xr.open_zarr(mapper, use_cftime=True)\n", - "ds_control_rechunked " + "ds_control_rechunked" ] }, { @@ -2061,8 +2082,8 @@ } ], "source": [ - "drift = calculate_drift(ds_control_rechunked, ds_historical, 'thetao') \n", - "drift = drift.load() # This takes a bit, but it is worth loading this small output to avoid repeated computation\n", + "drift = calculate_drift(ds_control_rechunked, ds_historical, \"thetao\")\n", + "drift = drift.load() # This takes a bit, but it is worth loading this small output to avoid repeated computation\n", "drift" ] }, @@ -2116,13 +2137,18 @@ "source": [ "start = drift.trend_time_range.isel(bnds=0).data.tolist()\n", "stop = drift.trend_time_range.isel(bnds=1).data.tolist()\n", - "time = xr.cftime_range(start, stop, freq='1MS')\n", + "time = xr.cftime_range(start, stop, freq=\"1MS\")\n", "\n", "# cut the control it to the time over which the trend was calculated\n", "ds_control_cut = ds_control_rechunked.sel(time=slice(start, stop))\n", "\n", "# use the linear slope from the same point to construct a trendline\n", - "trendline = xr.DataArray((np.arange(len(time)) * drift.thetao.isel(**roi).data) + ds_control_cut.thetao.isel(**roi, time=0).data, dims=['time'], coords={'time':time})" + "trendline = xr.DataArray(\n", + " (np.arange(len(time)) * drift.thetao.isel(**roi).data)\n", + " + ds_control_cut.thetao.isel(**roi, time=0).data,\n", + " dims=[\"time\"],\n", + " coords={\"time\": time},\n", + ")" ] }, { @@ -2205,7 +2231,9 @@ } ], "source": [ - "control_detrended = remove_trend(ds_control, drift, 'thetao', ref_date=str(ds_control.time.data[0]))\n", + "control_detrended = remove_trend(\n", + " ds_control, drift, \"thetao\", ref_date=str(ds_control.time.data[0])\n", + ")\n", "control_detrended.isel(**roi).plot()" ] }, @@ -2247,7 +2275,9 @@ } ], "source": [ - "ds_historical_dedrifted = remove_trend(ds_historical, drift, 'thetao', ref_date=str(ds_historical.time.data[0]))\n", + "ds_historical_dedrifted = remove_trend(\n", + " ds_historical, drift, \"thetao\", ref_date=str(ds_historical.time.data[0])\n", + ")\n", "ds_historical_dedrifted.isel(**roi).plot()" ] }, @@ -2866,7 +2896,7 @@ } ], "source": [ - "ds_historical_dedrifted.attrs['drift_removed']" + "ds_historical_dedrifted.attrs[\"drift_removed\"]" ] }, { @@ -2930,10 +2960,10 @@ "ds_ssp585_dedrifted = remove_trend(\n", " ds_ssp585,\n", " drift,\n", - " 'thetao',\n", - " ref_date=str(ds_historical.time.data[0]) \n", - " # Note that the ref_date is still the first time point of the *historical*run. \n", - " # This ensures that the scenario is treated as an extension of the historical \n", + " \"thetao\",\n", + " ref_date=str(ds_historical.time.data[0]),\n", + " # Note that the ref_date is still the first time point of the *historical*run.\n", + " # This ensures that the scenario is treated as an extension of the historical\n", " # run and the offset is calculated appropriately\n", ")" ] @@ -2968,10 +2998,16 @@ } ], "source": [ - "ds_historical.isel(**roi).thetao.coarsen(time=36, boundary='trim').mean().plot(color='C0', label='raw data')\n", - "ds_ssp585.isel(**roi).thetao.coarsen(time=36, boundary='trim').mean().plot(color='C0')\n", - "ds_historical_dedrifted.isel(**roi).coarsen(time=36, boundary='trim').mean().plot(color='C1', label='control drift removed')\n", - "ds_ssp585_dedrifted.isel(**roi).coarsen(time=36, boundary='trim').mean().plot(color='C1')" + "ds_historical.isel(**roi).thetao.coarsen(time=36, boundary=\"trim\").mean().plot(\n", + " color=\"C0\", label=\"raw data\"\n", + ")\n", + "ds_ssp585.isel(**roi).thetao.coarsen(time=36, boundary=\"trim\").mean().plot(color=\"C0\")\n", + "ds_historical_dedrifted.isel(**roi).coarsen(time=36, boundary=\"trim\").mean().plot(\n", + " color=\"C1\", label=\"control drift removed\"\n", + ")\n", + "ds_ssp585_dedrifted.isel(**roi).coarsen(time=36, boundary=\"trim\").mean().plot(\n", + " color=\"C1\"\n", + ")" ] }, { diff --git a/docs/postprocessing.ipynb b/docs/postprocessing.ipynb index 26538964..91ccc41f 100644 --- a/docs/postprocessing.ipynb +++ b/docs/postprocessing.ipynb @@ -98,23 +98,20 @@ "from xmip.preprocessing import combined_preprocessing\n", "\n", "col = google_cmip_col()\n", - "experiment_id='historical'\n", - "source_id = ['CanESM5-CanOE', 'GFDL-ESM4']\n", + "experiment_id = \"historical\"\n", + "source_id = [\"CanESM5-CanOE\", \"GFDL-ESM4\"]\n", "kwargs = {\n", - " 'zarr_kwargs':{\n", - " 'consolidated':True,\n", - " 'use_cftime':True\n", - " },\n", - " 'aggregate':False,\n", - " 'preprocess':combined_preprocessing\n", + " \"zarr_kwargs\": {\"consolidated\": True, \"use_cftime\": True},\n", + " \"aggregate\": False,\n", + " \"preprocess\": combined_preprocessing,\n", "}\n", "\n", "cat_data = col.search(\n", " source_id=source_id,\n", " experiment_id=experiment_id,\n", - " grid_label='gn',\n", - " table_id='Omon',\n", - " variable_id=['tos', 'zos']\n", + " grid_label=\"gn\",\n", + " table_id=\"Omon\",\n", + " variable_id=[\"tos\", \"zos\"],\n", ")\n", "ddict = cat_data.to_dataset_dict(**kwargs)\n", "list(ddict.keys())" @@ -1348,7 +1345,7 @@ ], "source": [ "# check if the merging worked\n", - "ddict_merged['GFDL-ESM4.gn.historical.Omon.r2i1p1f1']" + "ddict_merged[\"GFDL-ESM4.gn.historical.Omon.r2i1p1f1\"]" ] }, { @@ -2311,7 +2308,7 @@ "\n", "ddict_concat = concat_members(ddict_merged)\n", "print(list(ddict_concat.keys()))\n", - "ddict_concat['GFDL-ESM4.gn.historical.Omon']" + "ddict_concat[\"GFDL-ESM4.gn.historical.Omon\"]" ] }, { @@ -2344,12 +2341,13 @@ ], "source": [ "import matplotlib.pyplot as plt\n", - "plt.figure(figsize=[8,4])\n", + "\n", + "plt.figure(figsize=[8, 4])\n", "for i, (name, ds) in enumerate(ddict_concat.items()):\n", - " data = ds.tos.where(ds.zos<0).mean(['x','y'])\n", - " plt.subplot(2,1,i+1)\n", - " data.coarsen(time=12*5).mean().plot(hue='member_id')\n", - " plt.gca().set_title(ds.attrs['source_id'])" + " data = ds.tos.where(ds.zos < 0).mean([\"x\", \"y\"])\n", + " plt.subplot(2, 1, i + 1)\n", + " data.coarsen(time=12 * 5).mean().plot(hue=\"member_id\")\n", + " plt.gca().set_title(ds.attrs[\"source_id\"])" ] }, { @@ -3541,9 +3539,10 @@ ], "source": [ "from xmip.postprocessing import pick_first_member\n", + "\n", "ddict_single_member = pick_first_member(ddict_merged)\n", "print(list(ddict_single_member.keys()))\n", - "ddict_single_member['GFDL-ESM4.gn.historical.Omon']" + "ddict_single_member[\"GFDL-ESM4.gn.historical.Omon\"]" ] }, { @@ -4500,15 +4499,17 @@ "source": [ "from xmip.postprocessing import combine_datasets\n", "\n", + "\n", "def pick_first_member(ds_list, **kwargs):\n", " return ds_list[0]\n", "\n", + "\n", "ddict_new = combine_datasets(\n", " ddict_merged,\n", " pick_first_member,\n", - " match_attrs=['source_id', 'grid_label', 'experiment_id', 'table_id']\n", + " match_attrs=[\"source_id\", \"grid_label\", \"experiment_id\", \"table_id\"],\n", ")\n", - "ddict_new['CanESM5-CanOE.gn.historical.Omon']" + "ddict_new[\"CanESM5-CanOE.gn.historical.Omon\"]" ] }, { @@ -4635,9 +4636,7 @@ "metadata": {}, "outputs": [], "source": [ - "from xmip.postprocessing import (\n", - " interpolate_grid_label\n", - ")" + "from xmip.postprocessing import interpolate_grid_label" ] }, { @@ -4656,7 +4655,7 @@ } ], "source": [ - "combined_grids_dict = interpolate_grid_label(ddict, target_grid_label='gn')" + "combined_grids_dict = interpolate_grid_label(ddict, target_grid_label=\"gn\")" ] }, { @@ -5724,7 +5723,7 @@ } ], "source": [ - "combined_grids_dict['GFDL-ESM4.historical.Omon.r1i1p1f1']" + "combined_grids_dict[\"GFDL-ESM4.historical.Omon.r1i1p1f1\"]" ] }, { @@ -6686,7 +6685,9 @@ } ], "source": [ - "ddict['CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Omon/thetao/gn/v20190726/.nan.20190726']" + "ddict[\n", + " \"CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Omon/thetao/gn/v20190726/.nan.20190726\"\n", + "]" ] }, { @@ -6793,13 +6794,13 @@ "\n", "for name, ds in combined_grids_dict.items():\n", " ds = ds.isel(lev=0, time=0)\n", - " plt.figure(figsize=[10,12])\n", - " plt.subplot(2,1,1)\n", + " plt.figure(figsize=[10, 12])\n", + " plt.subplot(2, 1, 1)\n", " ds.thetao.plot()\n", - " plt.title(name+' thetao')\n", - " plt.subplot(2,1,2)\n", + " plt.title(name + \" thetao\")\n", + " plt.subplot(2, 1, 2)\n", " ds.o2.plot()\n", - " plt.title(name+' o2')\n", + " plt.title(name + \" o2\")\n", " plt.show()" ] }, @@ -6840,7 +6841,8 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "plt.rcParams['figure.figsize'] = 12, 6\n", + "\n", + "plt.rcParams[\"figure.figsize\"] = 12, 6\n", "%config InlineBackend.figure_format = 'retina'" ] }, @@ -6902,11 +6904,17 @@ "from xmip.preprocessing import combined_preprocessing\n", "\n", "col = google_cmip_col()\n", - "experiment_id='historical'\n", - "source_id = 'MPI-ESM1-2-LR'\n", - "kwargs = {'zarr_kwargs':{'consolidated':True, 'use_cftime':True}, 'aggregate':False, 'preprocess':combined_preprocessing}\n", + "experiment_id = \"historical\"\n", + "source_id = \"MPI-ESM1-2-LR\"\n", + "kwargs = {\n", + " \"zarr_kwargs\": {\"consolidated\": True, \"use_cftime\": True},\n", + " \"aggregate\": False,\n", + " \"preprocess\": combined_preprocessing,\n", + "}\n", "\n", - "cat_data = col.search(source_id=source_id, experiment_id=experiment_id, variable_id='tos')\n", + "cat_data = col.search(\n", + " source_id=source_id, experiment_id=experiment_id, variable_id=\"tos\"\n", + ")\n", "ddict = cat_data.to_dataset_dict(**kwargs)" ] }, @@ -6968,7 +6976,9 @@ } ], "source": [ - "cat_metric = col.search(source_id=source_id, experiment_id=experiment_id, variable_id='areacello')\n", + "cat_metric = col.search(\n", + " source_id=source_id, experiment_id=experiment_id, variable_id=\"areacello\"\n", + ")\n", "ddict_metrics = cat_metric.to_dataset_dict(**kwargs)" ] }, @@ -6988,7 +6998,8 @@ "outputs": [], "source": [ "from xmip.postprocessing import match_metrics\n", - "ddict_matched = match_metrics(ddict, ddict_metrics, ['areacello'])" + "\n", + "ddict_matched = match_metrics(ddict, ddict_metrics, [\"areacello\"])" ] }, { @@ -8014,7 +8025,9 @@ } ], "source": [ - "ddict_matched['CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Oday.tos.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/Oday/tos/gn/v20190710/.nan.20190710']" + "ddict_matched[\n", + " \"CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Oday.tos.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/Oday/tos/gn/v20190710/.nan.20190710\"\n", + "]" ] }, { @@ -8063,9 +8076,11 @@ "for ds in ddict_matched.values():\n", " # calculate the weighted average over the surface level temperatures\n", " area = ds.areacello.fillna(0)\n", - " da = ds.tos.isel(time=slice(0,240)).weighted(area).mean(['x','y']).squeeze().load()\n", - " da.plot(ax=ax, label=ds.attrs['variant_label'])\n", - "ax.legend(bbox_to_anchor=(1, 1), loc='upper left')" + " da = (\n", + " ds.tos.isel(time=slice(0, 240)).weighted(area).mean([\"x\", \"y\"]).squeeze().load()\n", + " )\n", + " da.plot(ax=ax, label=ds.attrs[\"variant_label\"])\n", + "ax.legend(bbox_to_anchor=(1, 1), loc=\"upper left\")" ] }, { @@ -8162,13 +8177,19 @@ } ], "source": [ - "cat_data = col.search(source_id=source_id, experiment_id=experiment_id, variable_id='thetao')\n", + "cat_data = col.search(\n", + " source_id=source_id, experiment_id=experiment_id, variable_id=\"thetao\"\n", + ")\n", "ddict = cat_data.to_dataset_dict(**kwargs)\n", "\n", - "cat_metric = col.search(source_id=source_id, variable_id=['areacello', 'thkcello'], experiment_id='historical')\n", + "cat_metric = col.search(\n", + " source_id=source_id,\n", + " variable_id=[\"areacello\", \"thkcello\"],\n", + " experiment_id=\"historical\",\n", + ")\n", "ddict_metrics = cat_metric.to_dataset_dict(**kwargs)\n", "\n", - "# Matching \n" + "# Matching" ] }, { @@ -9351,8 +9372,10 @@ } ], "source": [ - "ddict_matched_again = match_metrics(ddict, ddict_metrics, ['areacello', 'thkcello'])\n", - "ddict_matched_again['CMIP.MPI-M.MPI-ESM1-2-LR.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r2i1p1f1/Omon/thetao/gn/v20190710/.nan.20190710']" + "ddict_matched_again = match_metrics(ddict, ddict_metrics, [\"areacello\", \"thkcello\"])\n", + "ddict_matched_again[\n", + " \"CMIP.MPI-M.MPI-ESM1-2-LR.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r2i1p1f1/Omon/thetao/gn/v20190710/.nan.20190710\"\n", + "]" ] }, { @@ -9400,10 +9423,16 @@ "fig, ax = plt.subplots()\n", "for i, ds in enumerate(ddict_matched_again.values()):\n", " # calculate the volume weighted mean ocean temperature\n", - " vol = (ds.areacello * ds.thkcello)\n", - " da = ds.thetao.isel(time=slice(-240, None)).weighted(vol.fillna(0)).mean(['x','y', 'lev']).squeeze().load()\n", - " da.plot(ax=ax, color=f'C{i}', label=ds.attrs['variant_label'])\n", - "ax.legend(bbox_to_anchor=(1, 1), loc='upper left')" + " vol = ds.areacello * ds.thkcello\n", + " da = (\n", + " ds.thetao.isel(time=slice(-240, None))\n", + " .weighted(vol.fillna(0))\n", + " .mean([\"x\", \"y\", \"lev\"])\n", + " .squeeze()\n", + " .load()\n", + " )\n", + " da.plot(ax=ax, color=f\"C{i}\", label=ds.attrs[\"variant_label\"])\n", + "ax.legend(bbox_to_anchor=(1, 1), loc=\"upper left\")" ] }, { @@ -9530,9 +9559,20 @@ } ], "source": [ - "cat_data = col.search(source_id='FGOALS-f3-L', variable_id='thetao', experiment_id=experiment_id, grid_label='gn', table_id='Omon')\n", + "cat_data = col.search(\n", + " source_id=\"FGOALS-f3-L\",\n", + " variable_id=\"thetao\",\n", + " experiment_id=experiment_id,\n", + " grid_label=\"gn\",\n", + " table_id=\"Omon\",\n", + ")\n", "ddict = cat_data.to_dataset_dict(**kwargs)\n", - "cat_metric = col.search(source_id='FGOALS-f3-L', variable_id='areacello', experiment_id='historical', grid_label='gn')\n", + "cat_metric = col.search(\n", + " source_id=\"FGOALS-f3-L\",\n", + " variable_id=\"areacello\",\n", + " experiment_id=\"historical\",\n", + " grid_label=\"gn\",\n", + ")\n", "ddict_metrics = cat_metric.to_dataset_dict(**kwargs)" ] }, @@ -9597,7 +9637,7 @@ "metadata": {}, "outputs": [], "source": [ - "ddict_matched = match_metrics(ddict, ddict_metrics, ['areacello'])" + "ddict_matched = match_metrics(ddict, ddict_metrics, [\"areacello\"])" ] }, { @@ -10411,7 +10451,9 @@ } ], "source": [ - "ds = ddict_matched['CMIP.CAS.FGOALS-f3-L.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/Omon/thetao/gn/v20191008/.nan.20191008']\n", + "ds = ddict_matched[\n", + " \"CMIP.CAS.FGOALS-f3-L.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/Omon/thetao/gn/v20191008/.nan.20191008\"\n", + "]\n", "ds" ] }, @@ -11246,9 +11288,16 @@ } ], "source": [ - "ddict_matched_strict = match_metrics(ddict, ddict_metrics, ['areacello'], match_attrs=['source_id', 'grid_label', 'variant_label'])\n", + "ddict_matched_strict = match_metrics(\n", + " ddict,\n", + " ddict_metrics,\n", + " [\"areacello\"],\n", + " match_attrs=[\"source_id\", \"grid_label\", \"variant_label\"],\n", + ")\n", "\n", - "ds_strict = ddict_matched_strict['CMIP.CAS.FGOALS-f3-L.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/Omon/thetao/gn/v20191008/.nan.20191008']\n", + "ds_strict = ddict_matched_strict[\n", + " \"CMIP.CAS.FGOALS-f3-L.historical.r2i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r2i1p1f1/Omon/thetao/gn/v20191008/.nan.20191008\"\n", + "]\n", "ds_strict" ] }, @@ -12068,7 +12117,9 @@ } ], "source": [ - "ds_strict_matched = ddict_matched_strict['CMIP.CAS.FGOALS-f3-L.historical.r1i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Omon/thetao/gn/v20190822/.nan.20190822']\n", + "ds_strict_matched = ddict_matched_strict[\n", + " \"CMIP.CAS.FGOALS-f3-L.historical.r1i1p1f1.Omon.thetao.gn.gs://cmip6/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Omon/thetao/gn/v20190822/.nan.20190822\"\n", + "]\n", "ds_strict_matched" ] }, @@ -12210,24 +12261,36 @@ } ], "source": [ - "experiment_id = 'ssp585'\n", - "cat_data = col.search(variable_id='tos', experiment_id=experiment_id, grid_label='gn', table_id='Omon')\n", + "experiment_id = \"ssp585\"\n", + "cat_data = col.search(\n", + " variable_id=\"tos\", experiment_id=experiment_id, grid_label=\"gn\", table_id=\"Omon\"\n", + ")\n", "\n", "##### remove a single store\n", "# see https://github.com/intake/intake-esm/issues/246 for details on how to modify the dataframe\n", "df = cat_data.df\n", - "drop_idx = cat_data.df.index[cat_data.df['zstore'].str.contains('ScenarioMIP.CCCma.CanESM5.ssp585.r9i1p1f1.Omon.tos.gn')]\n", + "drop_idx = cat_data.df.index[\n", + " cat_data.df[\"zstore\"].str.contains(\n", + " \"ScenarioMIP.CCCma.CanESM5.ssp585.r9i1p1f1.Omon.tos.gn\"\n", + " )\n", + "]\n", "df = df.drop(drop_idx)\n", "cat_data = cat_data.from_df(df=df, esmcol_data=cat_data.esmcol_data)\n", "#####\n", "\n", "ddict = cat_data.to_dataset_dict(**kwargs)\n", - "cat_metric = col.search(variable_id='areacello', experiment_id=experiment_id, grid_label='gn')\n", + "cat_metric = col.search(\n", + " variable_id=\"areacello\", experiment_id=experiment_id, grid_label=\"gn\"\n", + ")\n", "ddict_metrics = cat_metric.to_dataset_dict(**kwargs)\n", - "ddict_matched = match_metrics(ddict, ddict_metrics, ['areacello'], print_statistics=True)\n", + "ddict_matched = match_metrics(\n", + " ddict, ddict_metrics, [\"areacello\"], print_statistics=True\n", + ")\n", "\n", "# remove the datasets where the parsing was unsuccesful\n", - "ddict_matched_filtered = {k:ds for k,ds in ddict_matched.items() if 'areacello' in ds.variables}" + "ddict_matched_filtered = {\n", + " k: ds for k, ds in ddict_matched.items() if \"areacello\" in ds.variables\n", + "}" ] }, { @@ -12264,19 +12327,25 @@ } ], "source": [ - "models = np.sort(cat_metric.df['source_id'].unique())\n", + "models = np.sort(cat_metric.df[\"source_id\"].unique())\n", "fig, axarr = plt.subplots(ncols=6, nrows=5, figsize=[16, 8], sharex=True, sharey=True)\n", "for model, ax in zip(models, axarr.flat):\n", - " ddict_model = {k:ds for k,ds in ddict_matched_filtered.items() if model in k}\n", + " ddict_model = {k: ds for k, ds in ddict_matched_filtered.items() if model in k}\n", " for i, ds in enumerate(ddict_model.values()):\n", " pass\n", " # calculate the area weighted mean surface ocean temperature\n", - " da = ds.tos.sel(time=slice('2000', '2100')).weighted(ds.areacello.fillna(0)).mean(['x','y', 'lev']).squeeze().load()\n", + " da = (\n", + " ds.tos.sel(time=slice(\"2000\", \"2100\"))\n", + " .weighted(ds.areacello.fillna(0))\n", + " .mean([\"x\", \"y\", \"lev\"])\n", + " .squeeze()\n", + " .load()\n", + " )\n", " # resample to 3month averages\n", - " da = da.resample(time='3MS').mean()\n", - " da.plot(ax=ax, color=f'C{1}', label=ds.attrs['variant_label'], alpha=0.5)\n", - " ax.text(0.03,0.97,model,ha='left',va='top', transform=ax.transAxes)\n", - " ax.set_xlabel('')\n", + " da = da.resample(time=\"3MS\").mean()\n", + " da.plot(ax=ax, color=f\"C{1}\", label=ds.attrs[\"variant_label\"], alpha=0.5)\n", + " ax.text(0.03, 0.97, model, ha=\"left\", va=\"top\", transform=ax.transAxes)\n", + " ax.set_xlabel(\"\")\n", " ax.grid()\n", "fig.subplots_adjust(hspace=0, wspace=0)" ] diff --git a/docs/regionmask.ipynb b/docs/regionmask.ipynb index 038837aa..e2256810 100644 --- a/docs/regionmask.ipynb +++ b/docs/regionmask.ipynb @@ -87,7 +87,6 @@ "import intake\n", "import matplotlib.pyplot as plt\n", "from xmip.preprocessing import combined_preprocessing\n", - "import xarray as xr\n", "import numpy as np" ] }, @@ -133,12 +132,32 @@ "# import example cloud datasets\n", "col_url = \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", "col = intake.open_esm_datastore(col_url)\n", - "cat = col.search(source_id=['CAMS-CSM1-0', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'ACCESS-CM2', 'ACCESS-ESM1-5', 'EC-Earth3-Veg',\n", - " 'MIROC-ES2L', 'MIROC6', 'HadGEM3-GC31-LL', 'UKESM1-0-LL', 'MPI-ESM1-2-HR', 'MRI-ESM2-0',\n", - " 'NorCPM1', 'GFDL-CM4', 'GFDL-ESM4', 'NESM3'],\n", - " experiment_id='historical', variable_id='thetao')\n", - "data_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False},\n", - " preprocess=combined_preprocessing)" + "cat = col.search(\n", + " source_id=[\n", + " \"CAMS-CSM1-0\",\n", + " \"CNRM-CM6-1\",\n", + " \"CNRM-ESM2-1\",\n", + " \"ACCESS-CM2\",\n", + " \"ACCESS-ESM1-5\",\n", + " \"EC-Earth3-Veg\",\n", + " \"MIROC-ES2L\",\n", + " \"MIROC6\",\n", + " \"HadGEM3-GC31-LL\",\n", + " \"UKESM1-0-LL\",\n", + " \"MPI-ESM1-2-HR\",\n", + " \"MRI-ESM2-0\",\n", + " \"NorCPM1\",\n", + " \"GFDL-CM4\",\n", + " \"GFDL-ESM4\",\n", + " \"NESM3\",\n", + " ],\n", + " experiment_id=\"historical\",\n", + " variable_id=\"thetao\",\n", + ")\n", + "data_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False},\n", + " preprocess=combined_preprocessing,\n", + ")" ] }, { @@ -445,23 +464,29 @@ "import cartopy.crs as ccrs\n", "\n", "for k, ds in data_dict.items():\n", - " if 'lev' in ds.dims:\n", - " model = ds.attrs['source_id']\n", - " if 'member_id' in ds.dims:\n", + " if \"lev\" in ds.dims:\n", + " model = ds.attrs[\"source_id\"]\n", + " if \"member_id\" in ds.dims:\n", " ds = ds.isel(member_id=0)\n", " ds = ds.thetao.isel(time=0, lev=0).squeeze()\n", "\n", - " mask = merged_mask(basins,ds)\n", + " mask = merged_mask(basins, ds)\n", "\n", - " kwargs = dict(x='lon', y='lat',transform = ccrs.PlateCarree(), infer_intervals=False)\n", - " fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=[20,2], subplot_kw={'projection':ccrs.Robinson(190)})\n", + " kwargs = dict(\n", + " x=\"lon\", y=\"lat\", transform=ccrs.PlateCarree(), infer_intervals=False\n", + " )\n", + " fig, (ax1, ax2, ax3) = plt.subplots(\n", + " ncols=3, figsize=[20, 2], subplot_kw={\"projection\": ccrs.Robinson(190)}\n", + " )\n", " ds.plot(ax=ax1, **kwargs)\n", " ax1.set_title(f\"Raw field {model}\")\n", - " \n", - " ds_masked = ds.where(np.logical_or(np.logical_or(mask == 2, mask==3),mask==4)) # Pacific + Maritime Continent\n", + "\n", + " ds_masked = ds.where(\n", + " np.logical_or(np.logical_or(mask == 2, mask == 3), mask == 4)\n", + " ) # Pacific + Maritime Continent\n", " ds_masked.plot(ax=ax2, **kwargs)\n", " ax2.set_title(f\"Masked Pacific {model}\")\n", - " mask.plot(ax=ax3, cmap='tab20', vmin=0, vmax=19, **kwargs)\n", + " mask.plot(ax=ax3, cmap=\"tab20\", vmin=0, vmax=19, **kwargs)\n", " ax3.set_title(f\"Full Mask {model}\")\n", " for ax in [ax1, ax2, ax3]:\n", " ax.coastlines()" diff --git a/docs/tutorial.ipynb b/docs/tutorial.ipynb index c65b259e..72bed1e1 100644 --- a/docs/tutorial.ipynb +++ b/docs/tutorial.ipynb @@ -30,6 +30,7 @@ "import matplotlib.pyplot as plt\n", "import intake\n", "import dask\n", + "\n", "%matplotlib inline" ] }, @@ -103,16 +104,19 @@ ], "source": [ "# load a few models to illustrate the problem\n", - "query = dict(experiment_id=['piControl'], table_id='Oyr', \n", - " variable_id='o2', grid_label=['gn', 'gr'],\n", - " source_id=['IPSL-CM6A-LR', 'CanESM5', 'GFDL-ESM4']\n", - " )\n", + "query = dict(\n", + " experiment_id=[\"piControl\"],\n", + " table_id=\"Oyr\",\n", + " variable_id=\"o2\",\n", + " grid_label=[\"gn\", \"gr\"],\n", + " source_id=[\"IPSL-CM6A-LR\", \"CanESM5\", \"GFDL-ESM4\"],\n", + ")\n", "cat = col.search(**query)\n", "\n", - "cat.df['source_id'].unique()\n", - "z_kwargs = {'consolidated': True, 'decode_times':False}\n", - "with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " dset_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs)#" + "cat.df[\"source_id\"].unique()\n", + "z_kwargs = {\"consolidated\": True, \"decode_times\": False}\n", + "with dask.config.set(**{\"array.slicing.split_large_chunks\": True}):\n", + " dset_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs) #" ] }, { @@ -208,12 +212,14 @@ "\n", "# load a few models to illustrate the problem\n", "cat = col.search(**query)\n", - "cat.df['source_id'].unique()\n", + "cat.df[\"source_id\"].unique()\n", "\n", "\n", "# pass the preprocessing directly\n", - "with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " dset_dict_renamed = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=rename_cmip6)\n", + "with dask.config.set(**{\"array.slicing.split_large_chunks\": True}):\n", + " dset_dict_renamed = cat.to_dataset_dict(\n", + " zarr_kwargs=z_kwargs, preprocess=rename_cmip6\n", + " )\n", "\n", "for k, ds in dset_dict_renamed.items():\n", " print(k)\n", @@ -1161,7 +1167,7 @@ ], "source": [ "# IPSL data is a bit of a mess\n", - "ds = dset_dict['CMIP.IPSL.IPSL-CM6A-LR.piControl.Oyr.gn']\n", + "ds = dset_dict[\"CMIP.IPSL.IPSL-CM6A-LR.piControl.Oyr.gn\"]\n", "ds = rename_cmip6(ds)\n", "ds" ] @@ -2114,10 +2120,14 @@ } ], "source": [ - "from xmip.preprocessing import promote_empty_dims, broadcast_lonlat, replace_x_y_nominal_lat_lon\n", + "from xmip.preprocessing import (\n", + " promote_empty_dims,\n", + " broadcast_lonlat,\n", + " replace_x_y_nominal_lat_lon,\n", + ")\n", "\n", "# check out the previous datasets\n", - "ds = dset_dict_renamed['CMIP.IPSL.IPSL-CM6A-LR.piControl.Oyr.gn']\n", + "ds = dset_dict_renamed[\"CMIP.IPSL.IPSL-CM6A-LR.piControl.Oyr.gn\"]\n", "ds" ] }, @@ -3862,7 +3872,7 @@ } ], "source": [ - "ds = dset_dict_renamed['CMIP.NOAA-GFDL.GFDL-ESM4.piControl.Oyr.gr']\n", + "ds = dset_dict_renamed[\"CMIP.NOAA-GFDL.GFDL-ESM4.piControl.Oyr.gr\"]\n", "ds" ] }, @@ -4802,7 +4812,7 @@ } ], "source": [ - "ds = dset_dict_renamed['CMIP.CCCma.CanESM5.piControl.Oyr.gn']\n", + "ds = dset_dict_renamed[\"CMIP.CCCma.CanESM5.piControl.Oyr.gn\"]\n", "print(ds.y.data)\n", "\n", "ds = replace_x_y_nominal_lat_lon(ds)\n", @@ -4877,10 +4887,10 @@ " ds = replace_x_y_nominal_lat_lon(ds)\n", " return ds\n", "\n", + "\n", "# pass the preprocessing directly\n", - "with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " dset_dict_processed1 = cat.to_dataset_dict(zarr_kwargs=z_kwargs,\n", - " preprocess=wrapper)" + "with dask.config.set(**{\"array.slicing.split_large_chunks\": True}):\n", + " dset_dict_processed1 = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=wrapper)" ] }, { @@ -4902,11 +4912,11 @@ } ], "source": [ - "fig, axarr = plt.subplots(nrows=3, figsize=[10,15])\n", + "fig, axarr = plt.subplots(nrows=3, figsize=[10, 15])\n", "for ax, (k, ds) in zip(axarr.flat, dset_dict_processed1.items()):\n", - " if 'member_id' in ds.dims:\n", + " if \"member_id\" in ds.dims:\n", " ds = ds.isel(member_id=-1)\n", - " ds.o2.isel(time=0, lev=0).sel(y=slice(-15,15)).plot(ax=ax)\n", + " ds.o2.isel(time=0, lev=0).sel(y=slice(-15, 15)).plot(ax=ax)\n", " ax.set_title(k)\n", " ax.set_aspect(2)" ] @@ -4982,6 +4992,7 @@ "source": [ "from xmip.preprocessing import correct_lon\n", "\n", + "\n", "# same as above\n", "def wrapper(ds):\n", " ds = ds.copy()\n", @@ -4992,10 +5003,10 @@ " ds = replace_x_y_nominal_lat_lon(ds)\n", " return ds\n", "\n", + "\n", "# pass the preprocessing directly\n", - "with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " dset_dict_processed2 = cat.to_dataset_dict(zarr_kwargs=z_kwargs,\n", - " preprocess=wrapper)" + "with dask.config.set(**{\"array.slicing.split_large_chunks\": True}):\n", + " dset_dict_processed2 = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=wrapper)" ] }, { @@ -5017,11 +5028,11 @@ } ], "source": [ - "fig, axarr = plt.subplots(nrows=3, figsize=[10,15])\n", + "fig, axarr = plt.subplots(nrows=3, figsize=[10, 15])\n", "for ax, (k, ds) in zip(axarr.flat, dset_dict_processed2.items()):\n", - " if 'member_id' in ds.dims:\n", + " if \"member_id\" in ds.dims:\n", " ds = ds.isel(member_id=-1)\n", - " ds.o2.isel(time=0, lev=0).sel(y=slice(-15,15)).plot(ax=ax)\n", + " ds.o2.isel(time=0, lev=0).sel(y=slice(-15, 15)).plot(ax=ax)\n", " ax.set_title(k)\n", " ax.set_aspect(2)" ] @@ -5127,13 +5138,21 @@ ], "source": [ "from xmip.preprocessing import correct_units\n", - "query = dict(experiment_id = ['historical'],variable_id='thetao', grid_label=['gn'],source_id=['CESM2', 'CanESM5'], member_id='r1i1p1f1',\n", - " )\n", + "\n", + "query = dict(\n", + " experiment_id=[\"historical\"],\n", + " variable_id=\"thetao\",\n", + " grid_label=[\"gn\"],\n", + " source_id=[\"CESM2\", \"CanESM5\"],\n", + " member_id=\"r1i1p1f1\",\n", + ")\n", "cat = col.search(**query)\n", "# raw data read in\n", "dset_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs)\n", "# fixed units\n", - "dset_dict_fixed_unit = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=correct_units)" + "dset_dict_fixed_unit = cat.to_dataset_dict(\n", + " zarr_kwargs=z_kwargs, preprocess=correct_units\n", + ")" ] }, { @@ -5177,9 +5196,9 @@ } ], "source": [ - "dset_dict['CMIP.NCAR.CESM2.historical.Omon.gn'].lev.plot()\n", + "dset_dict[\"CMIP.NCAR.CESM2.historical.Omon.gn\"].lev.plot()\n", "plt.figure()\n", - "dset_dict_fixed_unit['CMIP.NCAR.CESM2.historical.Omon.gn'].lev.plot()" + "dset_dict_fixed_unit[\"CMIP.NCAR.CESM2.historical.Omon.gn\"].lev.plot()" ] }, { @@ -5208,9 +5227,9 @@ } ], "source": [ - "fig, axarr = plt.subplots(nrows=2, figsize=[10,10])\n", + "fig, axarr = plt.subplots(nrows=2, figsize=[10, 10])\n", "for ax, (k, ds) in zip(axarr.flat, dset_dict_fixed_unit.items()):\n", - " ds.thetao.isel(time=0).sel(lev=5000, method='nearest').plot(ax=ax, vmin=-1, vmax=5)\n", + " ds.thetao.isel(time=0).sel(lev=5000, method=\"nearest\").plot(ax=ax, vmin=-1, vmax=5)\n", " ax.set_title(k)" ] }, @@ -5240,9 +5259,9 @@ } ], "source": [ - "fig, axarr = plt.subplots(nrows=2, figsize=[10,10])\n", + "fig, axarr = plt.subplots(nrows=2, figsize=[10, 10])\n", "for ax, (k, ds) in zip(axarr.flat, dset_dict.items()):\n", - " ds.thetao.isel(time=0).sel(lev=5000, method='nearest').plot(ax=ax, vmin=-1, vmax=5)\n", + " ds.thetao.isel(time=0).sel(lev=5000, method=\"nearest\").plot(ax=ax, vmin=-1, vmax=5)\n", " ax.set_title(k)" ] }, @@ -5341,7 +5360,13 @@ } ], "source": [ - "from xmip.preprocessing import correct_coordinates,parse_lon_lat_bounds, maybe_convert_bounds_to_vertex, maybe_convert_vertex_to_bounds\n", + "from xmip.preprocessing import (\n", + " correct_coordinates,\n", + " parse_lon_lat_bounds,\n", + " maybe_convert_bounds_to_vertex,\n", + " maybe_convert_vertex_to_bounds,\n", + ")\n", + "\n", "\n", "# same as above\n", "def wrapper(ds):\n", @@ -5357,10 +5382,10 @@ " ds = maybe_convert_vertex_to_bounds(ds)\n", " return ds\n", "\n", + "\n", "# pass the preprocessing directly\n", "\n", - "dset_dict_processed3 = cat.to_dataset_dict(zarr_kwargs=z_kwargs,\n", - " preprocess=wrapper)" + "dset_dict_processed3 = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=wrapper)" ] }, { @@ -5599,29 +5624,32 @@ "from xmip.preprocessing import combined_preprocessing\n", "\n", "# lets load a bunch more models this time\n", - "query = dict(experiment_id=['piControl', 'historical'],\n", - " table_id='Oyr', \n", - " source_id=[\n", - " 'GFDL-ESM4',\n", - " 'IPSL-CM6A-LR',\n", - " 'CanESM5',\n", - " 'CanESM5-CanOE',\n", - " 'MPI-ESM-1-2-HAM',\n", - " 'MPI-ESM1-2-HR',\n", - " 'MPI-ESM1-2-LR',\n", - " 'ACCESS-ESM1-5',\n", - " 'MRI-ESM2-0',\n", - " 'IPSL-CM5A2-INCA',\n", - " 'EC-Earth3-CC'\n", - " ],\n", - " variable_id='o2',\n", - " grid_label=['gn', 'gr'])\n", + "query = dict(\n", + " experiment_id=[\"piControl\", \"historical\"],\n", + " table_id=\"Oyr\",\n", + " source_id=[\n", + " \"GFDL-ESM4\",\n", + " \"IPSL-CM6A-LR\",\n", + " \"CanESM5\",\n", + " \"CanESM5-CanOE\",\n", + " \"MPI-ESM-1-2-HAM\",\n", + " \"MPI-ESM1-2-HR\",\n", + " \"MPI-ESM1-2-LR\",\n", + " \"ACCESS-ESM1-5\",\n", + " \"MRI-ESM2-0\",\n", + " \"IPSL-CM5A2-INCA\",\n", + " \"EC-Earth3-CC\",\n", + " ],\n", + " variable_id=\"o2\",\n", + " grid_label=[\"gn\", \"gr\"],\n", + ")\n", "cat = col.search(**query)\n", "\n", - "print(cat.df['source_id'].unique())\n", - "with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " dset_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs,\n", - " preprocess=combined_preprocessing)" + "print(cat.df[\"source_id\"].unique())\n", + "with dask.config.set(**{\"array.slicing.split_large_chunks\": True}):\n", + " dset_dict = cat.to_dataset_dict(\n", + " zarr_kwargs=z_kwargs, preprocess=combined_preprocessing\n", + " )" ] }, { @@ -5643,15 +5671,15 @@ } ], "source": [ - "fig, axarr = plt.subplots(nrows=4, ncols=3, figsize=[25,15])\n", - "for ax,(k, ds) in zip(axarr.flat,dset_dict.items()):\n", - " if 'member_id' in ds.dims:\n", + "fig, axarr = plt.subplots(nrows=4, ncols=3, figsize=[25, 15])\n", + "for ax, (k, ds) in zip(axarr.flat, dset_dict.items()):\n", + " if \"member_id\" in ds.dims:\n", " ds = ds.isel(member_id=0)\n", " da = ds.o2.isel(time=0).interp(lev=2500)\n", - " # this step is necessary to order the longitudes properly for simple plotting. Alternatively you could use a proper map projection \n", + " # this step is necessary to order the longitudes properly for simple plotting. Alternatively you could use a proper map projection\n", " # with e.g. cartopy and would not need this step\n", " da = replace_x_y_nominal_lat_lon(da)\n", - " da = da.sel(x=slice(100, 200), y=slice(-20,20))\n", + " da = da.sel(x=slice(100, 200), y=slice(-20, 20))\n", " try:\n", " da.plot(ax=ax, vmax=0.25, vmin=0.05)\n", " except:\n", diff --git a/notebooks/add_more_models.ipynb b/notebooks/add_more_models.ipynb index a4eda4b6..cb045978 100644 --- a/notebooks/add_more_models.ipynb +++ b/notebooks/add_more_models.ipynb @@ -34,8 +34,6 @@ "%load_ext autoreload\n", "%autoreload 2\n", "import intake\n", - "import pandas as pd\n", - "import xarray as xr\n", "from xmip.preprocessing import cmip6_renaming_dict" ] }, @@ -183,11 +181,11 @@ "# Grab all available ocean output.\n", "url = \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", "col = intake.open_esm_datastore(url)\n", - "query = dict(table_id=['Omon', 'Oyr']) # pick all available ocean fields for now\n", + "query = dict(table_id=[\"Omon\", \"Oyr\"]) # pick all available ocean fields for now\n", "cat = col.search(**query)\n", "\n", "# find unique source_ids\n", - "available_models = cat.df['source_id'].unique()\n", + "available_models = cat.df[\"source_id\"].unique()\n", "# find available models in xmip\n", "models = [k for k in cmip6_renaming_dict().keys()]\n", "# find missing models\n", @@ -211,29 +209,47 @@ } ], "source": [ - "all_variables = cat.df['variable_id'].unique()\n", - "surface_variables = ['tos', 'chl', 'zos', 'chlos', 'fgco2', 'hfds', 'sos',\n", - " 'mlotst', 'tauuo', 'tauvo', 'msftmz', 'intpp']\n", + "all_variables = cat.df[\"variable_id\"].unique()\n", + "surface_variables = [\n", + " \"tos\",\n", + " \"chl\",\n", + " \"zos\",\n", + " \"chlos\",\n", + " \"fgco2\",\n", + " \"hfds\",\n", + " \"sos\",\n", + " \"mlotst\",\n", + " \"tauuo\",\n", + " \"tauvo\",\n", + " \"msftmz\",\n", + " \"intpp\",\n", + "]\n", "remaining_variables = [v for v in all_variables if v not in surface_variables]\n", - "# some models literally have only surface input. Ill deal with that later. For now lets look at the ones that have 3d \n", + "# some models literally have only surface input. Ill deal with that later. For now lets look at the ones that have 3d\n", "# output (if surface values show up below, add them to the list above.)\n", "# now load one dataset for each missing model\n", - "query = dict(table_id=['Omon', 'Oyr'], source_id=missing_models, variable_id=remaining_variables) # pick all available ocean fields for now\n", + "query = dict(\n", + " table_id=[\"Omon\", \"Oyr\"], source_id=missing_models, variable_id=remaining_variables\n", + ") # pick all available ocean fields for now\n", "cat_sub = col.search(**query)\n", - "cat_sub.df = cat_sub.df.drop_duplicates(subset='source_id')\n", - "surface_only_models = [m for m in missing_models if m not in cat_sub.df['source_id'].unique()]\n", + "cat_sub.df = cat_sub.df.drop_duplicates(subset=\"source_id\")\n", + "surface_only_models = [\n", + " m for m in missing_models if m not in cat_sub.df[\"source_id\"].unique()\n", + "]\n", "if len(cat_sub.df) > 0:\n", - " dset_dict = cat_sub.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False})\n", + " dset_dict = cat_sub.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False}\n", + " )\n", " for k, ds in dset_dict.items():\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(k)\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(ds)\n", - " da = ds[ds.attrs['variable_id']]\n", + " da = ds[ds.attrs[\"variable_id\"]]\n", " if len(da.dims) < 5:\n", - " print('!!! This is a surface field!!!')\n", + " print(\"!!! This is a surface field!!!\")\n", "else:\n", - " print('Nice. All models with 3d ocean fields are catalogued')" + " print(\"Nice. All models with 3d ocean fields are catalogued\")" ] }, { @@ -257,20 +273,26 @@ } ], "source": [ - "query = dict(table_id=['Omon', 'Oyr'], source_id=surface_only_models, variable_id=surface_variables)\n", + "query = dict(\n", + " table_id=[\"Omon\", \"Oyr\"],\n", + " source_id=surface_only_models,\n", + " variable_id=surface_variables,\n", + ")\n", "cat_surf = col.search(**query)\n", - "cat_surf.df = cat_surf.df.drop_duplicates(subset='source_id')\n", + "cat_surf.df = cat_surf.df.drop_duplicates(subset=\"source_id\")\n", "\n", "if len(cat_sub.df) > 0:\n", - " dset_dict = cat_surf.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False})\n", + " dset_dict = cat_surf.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False}\n", + " )\n", " for k, ds in dset_dict.items():\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(k)\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(ds)\n", - " da = ds[ds.attrs['variable_id']]\n", + " da = ds[ds.attrs[\"variable_id\"]]\n", "else:\n", - " print('Nice. All ocean models are catalogued')" + " print(\"Nice. All ocean models are catalogued\")" ] }, { @@ -301,7 +323,7 @@ } ], "source": [ - "col.df['table_id'].unique()" + "col.df[\"table_id\"].unique()" ] }, { @@ -346,9 +368,9 @@ } ], "source": [ - "lst = ['a', 'b']\n", + "lst = [\"a\", \"b\"]\n", "\n", - "'a' in lst" + "\"a\" in lst" ] }, { @@ -1356,19 +1378,21 @@ } ], "source": [ - "query = dict(table_id='6hrLev')\n", + "query = dict(table_id=\"6hrLev\")\n", "cat_atmos = col.search(**query)\n", - "cat_atmos.df = cat_atmos.df.drop_duplicates(subset='source_id')\n", + "cat_atmos.df = cat_atmos.df.drop_duplicates(subset=\"source_id\")\n", "print(cat_atmos.df)\n", "print(len(cat_atmos.df))\n", "if len(cat_atmos.df) > 0:\n", - " dset_dict = cat_atmos.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False})\n", + " dset_dict = cat_atmos.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False}\n", + " )\n", " for k, ds in dset_dict.items():\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(k)\n", - " print('=========================================')\n", + " print(\"=========================================\")\n", " print(ds)\n", - " da = ds[ds.attrs['variable_id']]\n", + " da = ds[ds.attrs[\"variable_id\"]]\n", "# else:\n", "# print('Nice. All ocean models are catalogued')" ] @@ -1386,9 +1410,9 @@ "metadata": {}, "outputs": [], "source": [ - "query = dict(table_id=['Omon', 'Oyr']) # pick all available ocean fields for now\n", + "query = dict(table_id=[\"Omon\", \"Oyr\"]) # pick all available ocean fields for now\n", "cat = col.search(**query)\n", - "cat.df = cat.df.drop_duplicates(subset='source_id')" + "cat.df = cat.df.drop_duplicates(subset=\"source_id\")" ] }, { @@ -1398,6 +1422,7 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline" ] }, @@ -3709,27 +3734,25 @@ "# be fixed\n", "\n", "# I should also independently test for gn and gr variables\n", - "test_vars = [v for v in all_variables if v not in ['msftmz']]\n", + "test_vars = [v for v in all_variables if v not in [\"msftmz\"]]\n", "\n", "for model in models:\n", - " if 'AWI' not in model:\n", + " if \"AWI\" not in model:\n", " print(model)\n", - " cat_check = col.search(source_id=model,\n", - " variable_id=test_vars,\n", - " **query)\n", - " cat_check.df = cat_check.df.drop_duplicates(subset='source_id')\n", + " cat_check = col.search(source_id=model, variable_id=test_vars, **query)\n", + " cat_check.df = cat_check.df.drop_duplicates(subset=\"source_id\")\n", " check = cat_check.to_dataset_dict(\n", - " zarr_kwargs={'consolidated': True, 'decode_times':False},\n", - " preprocess=combined_preprocessing\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False},\n", + " preprocess=combined_preprocessing,\n", " )\n", " ds = check[list(check.keys())[0]]\n", - " var = ds.attrs['variable_id']\n", + " var = ds.attrs[\"variable_id\"]\n", " da = ds[var]\n", " print(da)\n", - " for di in ['time', 'member_id', 'lev', 'rho']:\n", + " for di in [\"time\", \"member_id\", \"lev\", \"rho\"]:\n", " if di in da.dims:\n", - " da = da.isel({di:0})\n", - " \n", + " da = da.isel({di: 0})\n", + "\n", " plt.figure()\n", " da.plot(robust=True)\n", " plt.show()" diff --git a/notebooks/maintenance_grids.ipynb b/notebooks/maintenance_grids.ipynb index 931c0f2e..2b96c2e0 100644 --- a/notebooks/maintenance_grids.ipynb +++ b/notebooks/maintenance_grids.ipynb @@ -41,6 +41,7 @@ ], "source": [ "import xmip\n", + "\n", "xmip.__version__" ] }, @@ -78,6 +79,7 @@ "# col = intake.open_esm_datastore(url)\n", "\n", "from xmip.utils import google_cmip_col\n", + "\n", "col = google_cmip_col()" ] }, @@ -251,50 +253,51 @@ " \"\"\"Show which source_id/grid_label combos have any data, and return a df that picks only one dataset for each combo\"\"\"\n", " url = \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", " col = intake.open_esm_datastore(url)\n", - " query = dict(table_id=['Omon', 'Oyr'])\n", + " query = dict(table_id=[\"Omon\", \"Oyr\"])\n", " if variables is not None:\n", - " query['variable_id'] = variables\n", - " \n", - " \n", + " query[\"variable_id\"] = variables\n", + "\n", " # pick all available ocean fields for now\n", " cat = col.search(**query)\n", " print(cat)\n", - " \n", + "\n", " available = []\n", " dataframes = []\n", " df = cat.df.copy()\n", - " groups = df.groupby(['source_id', 'grid_label'])\n", + " groups = df.groupby([\"source_id\", \"grid_label\"])\n", " for group in groups:\n", - " \n", " # add source_id/grid_label combo to list\n", - " label = '.'.join(group[0])\n", + " label = \".\".join(group[0])\n", " # pick only the first index of each group\n", - " line = group[1].iloc[0,:]\n", - " \n", + " line = group[1].iloc[0, :]\n", + "\n", " available.append(label)\n", " dataframes.append(line)\n", "\n", " new_df = pd.concat(dataframes, axis=1).transpose()\n", " cat.df = new_df\n", - " \n", + "\n", " return cat, available\n", "\n", + "\n", "_, all_models = available_output()\n", "print(len(all_models))\n", "\n", - "cat_tracer, tracer_models = available_output(['tos', 'thetao'])\n", + "cat_tracer, tracer_models = available_output([\"tos\", \"thetao\"])\n", "missing_tracer_models = set(tracer_models).symmetric_difference(set(all_models))\n", "print(f\"Did not find tracer data for these models:{missing_tracer_models}\\n\")\n", "\n", - "cat_u, u_models = available_output(['uo'])\n", + "cat_u, u_models = available_output([\"uo\"])\n", "missing_u_models = set(u_models).symmetric_difference(set(all_models))\n", "print(f\"Did not find u data for these models:{missing_u_models}\\n\")\n", "\n", - "cat_v, v_models = available_output(['vo'])\n", + "cat_v, v_models = available_output([\"vo\"])\n", "missing_v_models = set(v_models).symmetric_difference(set(all_models))\n", "print(f\"Did not find v data for these models:{missing_v_models}\\n\")\n", "\n", - "print(f\"Any models that have only u or v:{set(v_models).symmetric_difference(set(u_models))}\")" + "print(\n", + " f\"Any models that have only u or v:{set(v_models).symmetric_difference(set(u_models))}\"\n", + ")" ] }, { @@ -2263,13 +2266,17 @@ "source": [ "# for now load them manually\n", "import fsspec\n", - "import xarray as xr\n", + "\n", "super_dict = {}\n", - "for var, cat in zip(['thetao', 'uo', 'vo'],[cat_tracer, cat_u, cat_v]):\n", - " super_dict[var]={}\n", - " for ri,(rr,row) in enumerate(cat.df.iterrows()):\n", - "# print(ri)\n", - " ds = combined_preprocessing(xr.open_zarr(fsspec.get_mapper(row['zstore']), consolidated=True, decode_times=False))\n", + "for var, cat in zip([\"thetao\", \"uo\", \"vo\"], [cat_tracer, cat_u, cat_v]):\n", + " super_dict[var] = {}\n", + " for ri, (rr, row) in enumerate(cat.df.iterrows()):\n", + " # print(ri)\n", + " ds = combined_preprocessing(\n", + " xr.open_zarr(\n", + " fsspec.get_mapper(row[\"zstore\"]), consolidated=True, decode_times=False\n", + " )\n", + " )\n", " label = f\"{row['source_id']}.{row['grid_label']}\"\n", " super_dict[var][label] = ds" ] @@ -2417,71 +2424,72 @@ ], "source": [ "staggered_grid_dict = {}\n", - "for k in super_dict['thetao'].keys():\n", - " ds_ref = super_dict['thetao'][k]\n", - " s_id = ds_ref.attrs['source_id']\n", - " g_la = ds_ref.attrs['grid_label']\n", - " \n", - " if not ('AWI' in k and 'gn' in k):\n", + "for k in super_dict[\"thetao\"].keys():\n", + " ds_ref = super_dict[\"thetao\"][k]\n", + " s_id = ds_ref.attrs[\"source_id\"]\n", + " g_la = ds_ref.attrs[\"grid_label\"]\n", + "\n", + " if not (\"AWI\" in k and \"gn\" in k):\n", " print(f\"############### {k} #######################\")\n", - " if k in super_dict['uo'].keys() and k in super_dict['vo'].keys():\n", - " \n", - " ds_u = super_dict['uo'][k]\n", - " ds_v = super_dict['vo'][k]\n", - " \n", - " if 'x' not in ds_ref.dims:\n", - " print(f'THIS IS SOME ERROR IN THE PREPROCESSSING. INVESTIGATE {k}')\n", + " if k in super_dict[\"uo\"].keys() and k in super_dict[\"vo\"].keys():\n", + " ds_u = super_dict[\"uo\"][k]\n", + " ds_v = super_dict[\"vo\"][k]\n", + "\n", + " if \"x\" not in ds_ref.dims:\n", + " print(f\"THIS IS SOME ERROR IN THE PREPROCESSSING. INVESTIGATE {k}\")\n", " # a nevermind, these are just the AWI ones...remove them earlier...\n", " else:\n", - " x_shift_u = detect_shift(ds_ref, ds_u, 'X')\n", - " y_shift_u = detect_shift(ds_ref, ds_u, 'Y')\n", + " x_shift_u = detect_shift(ds_ref, ds_u, \"X\")\n", + " y_shift_u = detect_shift(ds_ref, ds_u, \"Y\")\n", "\n", - " x_shift_v = detect_shift(ds_ref, ds_v, 'X')\n", - " y_shift_v = detect_shift(ds_ref, ds_v, 'Y')\n", - " \n", - " \n", + " x_shift_v = detect_shift(ds_ref, ds_v, \"X\")\n", + " y_shift_v = detect_shift(ds_ref, ds_v, \"Y\")\n", "\n", " # check that there is only one left after removing 'center'\n", - " x_shift = set([x_shift_u, x_shift_v]) - set(['center'])\n", - " y_shift = set([y_shift_u, y_shift_v]) - set(['center'])\n", + " x_shift = set([x_shift_u, x_shift_v]) - set([\"center\"])\n", + " y_shift = set([y_shift_u, y_shift_v]) - set([\"center\"])\n", " # if they are all on center default to left\n", "\n", " if len(x_shift) == 0:\n", - " x_shift = 'left'\n", + " x_shift = \"left\"\n", " elif len(x_shift) == 1:\n", " x_shift = list(x_shift)[0]\n", " else:\n", - " print('SCHEISSE X')\n", - " print('x')\n", + " print(\"SCHEISSE X\")\n", + " print(\"x\")\n", " print(x_shift_u)\n", " print(x_shift_v)\n", - " print('y')\n", + " print(\"y\")\n", " print(y_shift_u)\n", " print(y_shift_v)\n", - " x_shift=None\n", + " x_shift = None\n", "\n", " if len(y_shift) == 0:\n", - " y_shift = 'left'\n", + " y_shift = \"left\"\n", " elif len(y_shift) == 1:\n", " y_shift = list(y_shift)[0]\n", " else:\n", - " print('SCHEISSE Y')\n", - " print('u')\n", + " print(\"SCHEISSE Y\")\n", + " print(\"u\")\n", " print(x_shift_u)\n", " print(y_shift_u)\n", - " print('v')\n", + " print(\"v\")\n", " print(x_shift_v)\n", " print(y_shift_v)\n", " y_shift = None\n", " else:\n", - " print(f\"ATTENTION: Setting shift to left for {k}, since there is no velocity data\")\n", - " x_shift = 'left'\n", - " y_shift = 'left'\n", - " \n", + " print(\n", + " f\"ATTENTION: Setting shift to left for {k}, since there is no velocity data\"\n", + " )\n", + " x_shift = \"left\"\n", + " y_shift = \"left\"\n", + "\n", " if x_shift is not None and y_shift is not None:\n", - " if not s_id in staggered_grid_dict.keys():\n", + " if s_id not in staggered_grid_dict.keys():\n", " staggered_grid_dict[s_id] = {}\n", - " staggered_grid_dict[s_id][g_la] = {'axis_shift':{'X': x_shift, 'Y': y_shift}}" + " staggered_grid_dict[s_id][g_la] = {\n", + " \"axis_shift\": {\"X\": x_shift, \"Y\": y_shift}\n", + " }" ] }, { @@ -2498,7 +2506,8 @@ "outputs": [], "source": [ "import yaml\n", - "ff = open('/home/jovyan/xmip/xmip/specs/staggered_grid_config.yaml', \"r\")\n", + "\n", + "ff = open(\"/home/jovyan/xmip/xmip/specs/staggered_grid_config.yaml\", \"r\")\n", "grid_dict = yaml.safe_load(ff)\n", "ff.close()" ] @@ -2519,7 +2528,9 @@ ], "source": [ "# any keys in the old dict that are not in the new one?\n", - "print(f\"Keys in the old grid, which are not in the new one {set(grid_dict.keys())- set(staggered_grid_dict.keys())}\")\n", + "print(\n", + " f\"Keys in the old grid, which are not in the new one {set(grid_dict.keys())- set(staggered_grid_dict.keys())}\"\n", + ")\n", "\n", "print(f\"Newly added models {set(staggered_grid_dict.keys()) - set(grid_dict.keys())}\")" ] @@ -2542,7 +2553,7 @@ } ], "source": [ - "staggered_grid_dict['GFDL-CM4']" + "staggered_grid_dict[\"GFDL-CM4\"]" ] }, { @@ -2551,7 +2562,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open('test.yaml', 'w') as file:\n", + "with open(\"test.yaml\", \"w\") as file:\n", " documents = yaml.dump(staggered_grid_dict, file)" ] } diff --git a/notebooks/metric_parse_improvement.ipynb b/notebooks/metric_parse_improvement.ipynb index 481c4671..95af45c0 100644 --- a/notebooks/metric_parse_improvement.ipynb +++ b/notebooks/metric_parse_improvement.ipynb @@ -482,27 +482,36 @@ ], "source": [ "import pandas as pd\n", - "# get all available ocean models from the cloud. \n", - "url = 'https://storage.googleapis.com/cmip6/pangeo-cmip6.csv'\n", + "\n", + "# get all available ocean models from the cloud.\n", + "url = \"https://storage.googleapis.com/cmip6/pangeo-cmip6.csv\"\n", "df = pd.read_csv(url)\n", - "df_ocean =df[(df.table_id=='Omon') + (df.table_id=='Oyr')]\n", + "df_ocean = df[(df.table_id == \"Omon\") + (df.table_id == \"Oyr\")]\n", "ocean_models = df_ocean.source_id.unique()\n", "print(ocean_models)\n", "print(len(ocean_models))\n", "\n", - "exclude_variables = ['mlotst', 'msftmz', 'intpp'] # at the moment cmip6_pp does not like fields that have only one spatial dimension\n", + "exclude_variables = [\n", + " \"mlotst\",\n", + " \"msftmz\",\n", + " \"intpp\",\n", + "] # at the moment cmip6_pp does not like fields that have only one spatial dimension\n", "\n", "\n", "# get one dataset from each model, shouldnt matter which\n", - "col = intake.open_esm_datastore(\"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\")\n", + "col = intake.open_esm_datastore(\n", + " \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", + ")\n", "# cat = col.search(source_id=ocean_models)\n", - "cat = col.search(table_id=['Omon', 'Oyr'])\n", - "exclude_index = [vid not in exclude_variables for vid in cat.df['variable_id']]\n", + "cat = col.search(table_id=[\"Omon\", \"Oyr\"])\n", + "exclude_index = [vid not in exclude_variables for vid in cat.df[\"variable_id\"]]\n", "cat.df = cat.df[exclude_index]\n", - "cat.df = cat.df.drop_duplicates(subset='source_id')\n", + "cat.df = cat.df.drop_duplicates(subset=\"source_id\")\n", "print(len(cat.df))\n", - "data_dict_raw = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False},\n", - " preprocess=combined_preprocessing)\n", + "data_dict_raw = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False},\n", + " preprocess=combined_preprocessing,\n", + ")\n", "data_dict = parse_metrics(data_dict_raw, col, preprocess=combined_preprocessing)" ] }, @@ -554,7 +563,14 @@ ], "source": [ "import matplotlib.pyplot as plt\n", - "fig, axarr = plt.subplots(ncols=6, nrows=len(data_dict.keys())//6 +1, figsize=[30,30], sharex=True, sharey=True)\n", + "\n", + "fig, axarr = plt.subplots(\n", + " ncols=6,\n", + " nrows=len(data_dict.keys()) // 6 + 1,\n", + " figsize=[30, 30],\n", + " sharex=True,\n", + " sharey=True,\n", + ")\n", "for ax, (k, ds) in zip(axarr.flat, data_dict.items()):\n", " ds.areacello.plot(ax=ax, add_labels=False)\n", " ax.set_title(k)" diff --git a/notebooks/parse_area_gn.ipynb b/notebooks/parse_area_gn.ipynb index dfe1e931..99e1bf62 100644 --- a/notebooks/parse_area_gn.ipynb +++ b/notebooks/parse_area_gn.ipynb @@ -22,8 +22,7 @@ } ], "source": [ - "import intake\n", - "import numpy as np" + "import intake" ] }, { @@ -174,8 +173,12 @@ "metadata": {}, "outputs": [], "source": [ - "query = dict(experiment_id='piControl',\n", - " variable_id=['thetao', 'uo', 'vo'],table_id='Omon', grid_label='gn')" + "query = dict(\n", + " experiment_id=\"piControl\",\n", + " variable_id=[\"thetao\", \"uo\", \"vo\"],\n", + " table_id=\"Omon\",\n", + " grid_label=\"gn\",\n", + ")" ] }, { @@ -311,7 +314,10 @@ "source": [ "# load the same thing with preprocessing\n", "from xmip.preprocessing import read_data\n", - "with warnings.catch_warnings(): # these lines just make sure that the warnings dont clutter your notebook\n", + "\n", + "with (\n", + " warnings.catch_warnings()\n", + "): # these lines just make sure that the warnings dont clutter your notebook\n", " warnings.simplefilter(\"ignore\")\n", " data_dict = read_data(col, **query)" ] @@ -331,7 +337,9 @@ "metadata": {}, "outputs": [], "source": [ - "parse_metrics(data_dict, col, rename=True) #rename is important to get the consistent naming!" + "parse_metrics(\n", + " data_dict, col, rename=True\n", + ") # rename is important to get the consistent naming!" ] }, { diff --git a/notebooks/testing_various_issues.ipynb b/notebooks/testing_various_issues.ipynb index 9f99747e..f2cabb3b 100644 --- a/notebooks/testing_various_issues.ipynb +++ b/notebooks/testing_various_issues.ipynb @@ -26,19 +26,16 @@ "source": [ "import xarray as xr\n", "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", "import numpy as np\n", "import intake\n", "from xmip.preprocessing import combined_preprocessing\n", - "from dask.diagnostics import ProgressBar\n", - "import warnings\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline\n", - "plt.rcParams['figure.figsize'] = 12, 6\n", + "plt.rcParams[\"figure.figsize\"] = 12, 6\n", "%config InlineBackend.figure_format = 'retina'\n", - "xr.set_options(cmap_sequential='cividis', display_style='html')" + "xr.set_options(cmap_sequential=\"cividis\", display_style=\"html\")" ] }, { @@ -47,7 +44,9 @@ "metadata": {}, "outputs": [], "source": [ - "col = intake.open_esm_datastore(\"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\")" + "col = intake.open_esm_datastore(\n", + " \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", + ")" ] }, { @@ -110,15 +109,18 @@ } ], "source": [ - "query = dict(variable_id=['thetao'],\n", - " experiment_id = 'historical',\n", - " table_id=['Omon'], \n", - " source_id=['CanESM5'],\n", - " grid_label=['gn'],\n", - " )\n", + "query = dict(\n", + " variable_id=[\"thetao\"],\n", + " experiment_id=\"historical\",\n", + " table_id=[\"Omon\"],\n", + " source_id=[\"CanESM5\"],\n", + " grid_label=[\"gn\"],\n", + ")\n", "cat = col.search(**query)\n", "cat.df\n", - "data_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False}, aggregate=False)\n", + "data_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False}, aggregate=False\n", + ")\n", "for k, ds in data_dict.items():\n", " print(f\"{k}: {ds.lev_bnds.dims}\")" ] @@ -596,7 +598,7 @@ } ], "source": [ - "ds.lev_bnds.isel(time=0).drop_vars(['time'])" + "ds.lev_bnds.isel(time=0).drop_vars([\"time\"])" ] }, { @@ -606,7 +608,7 @@ "outputs": [], "source": [ "# see if the values are unique with regard to time\n", - "ds = data_dict['CMIP.CCCma.CanESM5.historical.r10i1p1f1.Omon.thetao.gn']\n", + "ds = data_dict[\"CMIP.CCCma.CanESM5.historical.r10i1p1f1.Omon.thetao.gn\"]\n", "np.testing.assert_allclose(ds.lev_bnds.isel(time=0).data, ds.lev_bnds.isel(time=1).data)" ] }, @@ -635,7 +637,7 @@ "source": [ "# query = dict(variable_id=['o2'],\n", "# # experiment_id=['piControl'],\n", - "# table_id=['Omon'], \n", + "# table_id=['Omon'],\n", "# source_id=['GFDL-CM4'],\n", "# # grid_label=['gn']\n", "# )\n", @@ -657,7 +659,7 @@ "metadata": {}, "outputs": [], "source": [ - "# this should go to " + "# this should go to" ] }, { @@ -723,12 +725,16 @@ } ], "source": [ - "query = dict(variable_id=['so'],\n", - " experiment_id=['historical'],\n", - " table_id=['Omon'], \n", - " source_id=['ACCESS-ESM1-5'])\n", + "query = dict(\n", + " variable_id=[\"so\"],\n", + " experiment_id=[\"historical\"],\n", + " table_id=[\"Omon\"],\n", + " source_id=[\"ACCESS-ESM1-5\"],\n", + ")\n", "cat = col.search(**query)\n", - "data_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':True})" + "data_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True}\n", + ")" ] }, { @@ -777,17 +783,20 @@ } ], "source": [ - "\n", - "experiment_ids = ['historical']#,'ssp126',, 'ssp245', 'ssp585'\n", - "query = dict(variable_id=['thetao'],#'uo', 'vo', , 'o2'\n", - " experiment_id=experiment_ids,\n", - " table_id=['Omon'])\n", + "experiment_ids = [\"historical\"] # ,'ssp126',, 'ssp245', 'ssp585'\n", + "query = dict(\n", + " variable_id=[\"thetao\"], #'uo', 'vo', , 'o2'\n", + " experiment_id=experiment_ids,\n", + " table_id=[\"Omon\"],\n", + ")\n", "cat = col.search(**query)\n", - "models = cat.df['source_id'].unique()\n", + "models = cat.df[\"source_id\"].unique()\n", "print(models)\n", - "data_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':True})\n", + "data_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True}\n", + ")\n", "# So this fails with the preprocess argument, Ill have to see for which model exactly\n", - "for k,ds in data_dict.items():\n", + "for k, ds in data_dict.items():\n", " try:\n", " ds_new = combined_preprocessing(ds)\n", " except:\n", @@ -881,7 +890,10 @@ ], "source": [ "# Huh this also works. So it seems that the preprocessing messes something up prior to aggregation. Is this due to time?\n", - "data_dict_new = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':True}, preprocess=combined_preprocessing)" + "data_dict_new = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True},\n", + " preprocess=combined_preprocessing,\n", + ")" ] }, { @@ -2109,12 +2121,15 @@ "# Checking each model to constrain the models that have problems, odly this works?\n", "for model in models:\n", " print(f\"+++++++{model}+++++++++\")\n", - " model_query = {k:v for k,v in query.items()}\n", - " model_query['source_id'] = model\n", + " model_query = {k: v for k, v in query.items()}\n", + " model_query[\"source_id\"] = model\n", " model_cat = col.search(**model_query)\n", " print(model_cat.df)\n", " try:\n", - " model_data_dict = model_cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False}, preprocess=combined_preprocessing)\n", + " model_data_dict = model_cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False},\n", + " preprocess=combined_preprocessing,\n", + " )\n", " except Exception as e:\n", " print(f\"{k} failed to preprocess\")\n", " print(e)" @@ -3719,14 +3734,17 @@ "possible_fuckers = []\n", "for model in models:\n", " print(f\"+++++++{model} excluded+++++++++\")\n", - " model_query = {k:v for k,v in query.items()}\n", - " model_query['source_id'] = [m for m in models if m != model]\n", + " model_query = {k: v for k, v in query.items()}\n", + " model_query[\"source_id\"] = [m for m in models if m != model]\n", " model_cat = col.search(**model_query)\n", " try:\n", - " model_data_dict = model_cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':False}, preprocess=combined_preprocessing)\n", + " model_data_dict = model_cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": False},\n", + " preprocess=combined_preprocessing,\n", + " )\n", " possible_fuckers.append(model)\n", " except:\n", - " pass\n" + " pass" ] }, { @@ -3816,8 +3834,11 @@ ], "source": [ "# Huh this also works. So it seems that the preprocessing messes something up prior to aggregation. Is this due to time?\n", - "picked_models = [m for m in models if ('CanESM' not in m)]\n", - "data_dict_new = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times':True}, preprocess=combined_preprocessing)" + "picked_models = [m for m in models if (\"CanESM\" not in m)]\n", + "data_dict_new = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True},\n", + " preprocess=combined_preprocessing,\n", + ")" ] }, { @@ -3849,8 +3870,14 @@ "source": [ "import xarray as xr\n", "import gcsfs\n", - "gcs = gcsfs.GCSFileSystem(token='anon')\n", - "ds = xr.open_zarr(gcs.get_mapper('gs://cmip6/CMIP/NASA-GISS/GISS-E2-1-G-CC/historical/r1i1p1f1/Omon/dissic/gn/'), consolidated=True)" + "\n", + "gcs = gcsfs.GCSFileSystem(token=\"anon\")\n", + "ds = xr.open_zarr(\n", + " gcs.get_mapper(\n", + " \"gs://cmip6/CMIP/NASA-GISS/GISS-E2-1-G-CC/historical/r1i1p1f1/Omon/dissic/gn/\"\n", + " ),\n", + " consolidated=True,\n", + ")" ] } ],