diff --git a/.github/workflows/check_formatting.yml b/.github/workflows/check_formatting.yml index b19a31fd..b6f8ec27 100644 --- a/.github/workflows/check_formatting.yml +++ b/.github/workflows/check_formatting.yml @@ -19,4 +19,5 @@ jobs: shell: bash -l {0} run: mamba install --quiet --yes --file requirements.txt black && - black tobac --check --diff + black --version && + black tobac --check --diff diff --git a/doc/tobac.rst b/doc/tobac.rst index 87cd45ab..5e651222 100644 --- a/doc/tobac.rst +++ b/doc/tobac.rst @@ -7,7 +7,26 @@ Submodules tobac.analysis module --------------------- -.. automodule:: tobac.analysis +tobac.analysis.cell_analysis module +----------------------------------- + +.. automodule:: tobac.analysis.cell_analysis + :members: + :undoc-members: + :show-inheritance: + +tobac.analysis.feature_analysis module +-------------------------------------- + +.. automodule:: tobac.analysis.feature_analysis + :members: + :undoc-members: + :show-inheritance: + +tobac.analysis.spatial module +----------------------------- + +.. automodule:: tobac.analysis.spatial :members: :undoc-members: :show-inheritance: @@ -71,18 +90,26 @@ tobac.tracking module tobac.utils modules ------------------ -tobac.utils.general module +tobac.utils.bulk_statistics module ------------------ -.. automodule:: tobac.utils.general +.. automodule:: tobac.utils.bulk_statistics :members: :undoc-members: :show-inheritance: -tobac.utils.bulk_statistics module +tobac.utils.decorators module ------------------ -.. automodule:: tobac.utils.bulk_statistics +.. automodule:: tobac.utils.decorators + :members: + :undoc-members: + :show-inheritance: + +tobac.utils.general module +-------------------------- + +.. automodule:: tobac.utils.general :members: :undoc-members: :show-inheritance: @@ -95,6 +122,14 @@ tobac.utils.mask module :undoc-members: :show-inheritance: +tobac.utils.periodic_boundaries module +-------------------------------------- + +.. 
automodule:: tobac.utils.periodic_boundaries + :members: + :undoc-members: + :show-inheritance: + tobac.wrapper module -------------------- diff --git a/examples/Basics/Idealized-Case-1_Tracking-of-a-Test-Blob-in-2D.ipynb b/examples/Basics/Idealized-Case-1_Tracking-of-a-Test-Blob-in-2D.ipynb index 00702937..35c026ab 100644 --- a/examples/Basics/Idealized-Case-1_Tracking-of-a-Test-Blob-in-2D.ipynb +++ b/examples/Basics/Idealized-Case-1_Tracking-of-a-Test-Blob-in-2D.ipynb @@ -36,17 +36,19 @@ "cell_type": "code", "execution_count": 1, "id": "46abd7ad", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:46:46.604370Z", - "iopub.status.busy": "2024-02-17T12:46:46.603803Z", - "iopub.status.idle": "2024-02-17T12:47:02.061962Z", - "shell.execute_reply": "2024-02-17T12:47:02.059167Z" + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "using tobac version 1.5.2\n" + ] } - }, - "outputs": [], + ], "source": [ "import tobac\n", + "print('using tobac version', str(tobac.__version__))\n", "\n", "# we add testing here to create test dataset (typically not needed in standard applications)\n", "import tobac.testing" @@ -64,14 +66,7 @@ "cell_type": "code", "execution_count": 2, "id": "a28f3ba2", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:02.069480Z", - "iopub.status.busy": "2024-02-17T12:47:02.068740Z", - "iopub.status.idle": "2024-02-17T12:47:03.098546Z", - "shell.execute_reply": "2024-02-17T12:47:03.096911Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", @@ -93,14 +88,7 @@ "cell_type": "code", "execution_count": 3, "id": "fd75ee85", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:03.105232Z", - "iopub.status.busy": "2024-02-17T12:47:03.104769Z", - "iopub.status.idle": "2024-02-17T12:47:03.716692Z", - "shell.execute_reply": "2024-02-17T12:47:03.715384Z" - } - }, + "metadata": {}, "outputs": [], "source": [ 
"import seaborn as sns\n", @@ -137,14 +125,7 @@ "cell_type": "code", "execution_count": 4, "id": "1ecce2f7", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:03.720546Z", - "iopub.status.busy": "2024-02-17T12:47:03.720174Z", - "iopub.status.idle": "2024-02-17T12:47:03.806021Z", - "shell.execute_reply": "2024-02-17T12:47:03.805262Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -521,7 +502,7 @@ " latitude (y, x) float64 ...\n", " longitude (y, x) float64 ...\n", "Attributes:\n", - " units: m s-1
  • units :
    m s-1
  • " ], "text/plain": [ "\n", @@ -694,14 +675,7 @@ "cell_type": "code", "execution_count": 5, "id": "cd819867", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:03.810077Z", - "iopub.status.busy": "2024-02-17T12:47:03.809799Z", - "iopub.status.idle": "2024-02-17T12:47:03.818233Z", - "shell.execute_reply": "2024-02-17T12:47:03.817307Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -744,14 +718,7 @@ "cell_type": "code", "execution_count": 6, "id": "b7c04d5a", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:03.822185Z", - "iopub.status.busy": "2024-02-17T12:47:03.821801Z", - "iopub.status.idle": "2024-02-17T12:47:05.221198Z", - "shell.execute_reply": "2024-02-17T12:47:05.218804Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -810,14 +777,7 @@ "cell_type": "code", "execution_count": 7, "id": "454d687c", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:05.235262Z", - "iopub.status.busy": "2024-02-17T12:47:05.232533Z", - "iopub.status.idle": "2024-02-17T12:47:05.463213Z", - "shell.execute_reply": "2024-02-17T12:47:05.461185Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "dxy, dt = tobac.get_spacings(test_data)" @@ -835,14 +795,7 @@ "cell_type": "code", "execution_count": 8, "id": "24b828de", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:05.470570Z", - "iopub.status.busy": "2024-02-17T12:47:05.470170Z", - "iopub.status.idle": "2024-02-17T12:47:05.487186Z", - "shell.execute_reply": "2024-02-17T12:47:05.484994Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -1211,7 +1164,7 @@ " fill: currentColor;\n", "}\n", "
    <xarray.DataArray 'w' ()>\n",
    -       "array(10.)
    " + "array(10.)" ], "text/plain": [ "\n", @@ -1239,14 +1192,7 @@ "cell_type": "code", "execution_count": 9, "id": "93b5659d", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:05.492533Z", - "iopub.status.busy": "2024-02-17T12:47:05.491566Z", - "iopub.status.idle": "2024-02-17T12:47:05.503504Z", - "shell.execute_reply": "2024-02-17T12:47:05.502486Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "threshold = 9" @@ -1264,14 +1210,7 @@ "cell_type": "code", "execution_count": 10, "id": "9c322da7", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:05.508050Z", - "iopub.status.busy": "2024-02-17T12:47:05.507772Z", - "iopub.status.idle": "2024-02-17T12:47:06.072214Z", - "shell.execute_reply": "2024-02-17T12:47:06.071461Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "%%capture\n", @@ -1290,14 +1229,7 @@ "cell_type": "code", "execution_count": 11, "id": "6e8d3cd3", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:06.080011Z", - "iopub.status.busy": "2024-02-17T12:47:06.079289Z", - "iopub.status.idle": "2024-02-17T12:47:06.129931Z", - "shell.execute_reply": "2024-02-17T12:47:06.122788Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -2346,14 +2278,7 @@ "cell_type": "code", "execution_count": 12, "id": "c61c8715", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:06.137629Z", - "iopub.status.busy": "2024-02-17T12:47:06.137341Z", - "iopub.status.idle": "2024-02-17T12:47:07.859565Z", - "shell.execute_reply": "2024-02-17T12:47:07.858711Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -2410,14 +2335,7 @@ "cell_type": "code", "execution_count": 13, "id": "a1c72cca", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:07.867736Z", - "iopub.status.busy": "2024-02-17T12:47:07.867020Z", - "iopub.status.idle": "2024-02-17T12:47:08.071551Z", - "shell.execute_reply": "2024-02-17T12:47:08.069618Z" - } - }, + "metadata": {}, 
"outputs": [ { "name": "stdout", @@ -2451,14 +2369,7 @@ "cell_type": "code", "execution_count": 14, "id": "26b7a9b2", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:08.080735Z", - "iopub.status.busy": "2024-02-17T12:47:08.079561Z", - "iopub.status.idle": "2024-02-17T12:47:08.126057Z", - "shell.execute_reply": "2024-02-17T12:47:08.123918Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -3603,14 +3514,7 @@ "cell_type": "code", "execution_count": 15, "id": "ae3f63dd", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:08.136022Z", - "iopub.status.busy": "2024-02-17T12:47:08.135416Z", - "iopub.status.idle": "2024-02-17T12:47:08.140782Z", - "shell.execute_reply": "2024-02-17T12:47:08.139145Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "track_mask = trajectories[\"cell\"] == 1.0" @@ -3629,14 +3533,7 @@ "cell_type": "code", "execution_count": 16, "id": "25e4c7ae", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:08.150265Z", - "iopub.status.busy": "2024-02-17T12:47:08.149159Z", - "iopub.status.idle": "2024-02-17T12:47:08.162712Z", - "shell.execute_reply": "2024-02-17T12:47:08.159743Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "track = trajectories.where(track_mask).dropna()" @@ -3655,12 +3552,6 @@ "execution_count": 17, "id": "9e6f8c23", "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:08.181028Z", - "iopub.status.busy": "2024-02-17T12:47:08.172748Z", - "iopub.status.idle": "2024-02-17T12:47:09.791381Z", - "shell.execute_reply": "2024-02-17T12:47:09.790072Z" - }, "tags": [ "nbsphinx-thumbnail" ] @@ -3729,7 +3620,7 @@ "\n", "*On further extensions:*\n", "\n", - "- Also, one could actually use the output of the segmentation (`features_test` Dataset in the example below) as input for the tracking with the advantage that information on the area (ncells) is added in the dataframe. 
\n", + "- Also, one could actually use the output of the segmentation (`segments` Dataset in the example below) as input for the tracking with the advantage that information on the area (ncells) is added in the dataframe. \n", "\n", "- One could also use the output of the tracking in the segmentation (`trajectories` Dataset from above) with the advantage that mask will contain only the features that are also linked to trajectories. \n", "\n", @@ -3740,18 +3631,11 @@ "cell_type": "code", "execution_count": 18, "id": "f73bdf63", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:09.797031Z", - "iopub.status.busy": "2024-02-17T12:47:09.796476Z", - "iopub.status.idle": "2024-02-17T12:47:11.737307Z", - "shell.execute_reply": "2024-02-17T12:47:11.736184Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "%%capture\n", - "mask, features_test = tobac.segmentation_2D(features, test_data, dxy, threshold=9)" + "segment_labels, segments = tobac.segmentation_2D(features, test_data, dxy, threshold=9)" ] }, { @@ -3759,21 +3643,14 @@ "id": "8b9f3d3b", "metadata": {}, "source": [ - "As the name implies, the first object returned is a Boolean mask that is true for all segments belonging to features. The second output is again the features of the field." + "As the name implies, the first object returned is an array in which the segmented areas belonging to each feature have the same label value as that feature. 
The second output is a dataframe of the detected features updated with information about their segmented regions (currently only the number of pixels segmented)" ] }, { "cell_type": "code", "execution_count": 19, "id": "7d2926de", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:11.745231Z", - "iopub.status.busy": "2024-02-17T12:47:11.744182Z", - "iopub.status.idle": "2024-02-17T12:47:11.776460Z", - "shell.execute_reply": "2024-02-17T12:47:11.775352Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -4150,7 +4027,7 @@ " latitude (y, x) float64 ...\n", " longitude (y, x) float64 ...\n", "Attributes:\n", - " long_name: segmentation_mask
  • long_name :
    segmentation_mask
  • " ], "text/plain": [ "\n", @@ -4307,7 +4184,7 @@ } ], "source": [ - "mask" + "segment_labels" ] }, { @@ -4322,14 +4199,7 @@ "cell_type": "code", "execution_count": 20, "id": "25d26ebc", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:11.782511Z", - "iopub.status.busy": "2024-02-17T12:47:11.781845Z", - "iopub.status.idle": "2024-02-17T12:47:13.720840Z", - "shell.execute_reply": "2024-02-17T12:47:13.719745Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -4352,7 +4222,7 @@ " test_data.isel(time=itime).plot(ax=axs[i])\n", "\n", " # plot the mask outline\n", - " mask.isel(time=itime).plot.contour(levels=[0.5], ax=axs[i], colors=\"k\")\n", + " segment_labels.isel(time=itime).plot.contour(levels=[0.5], ax=axs[i], colors=\"k\")\n", "\n", " # plot the detected feature as black cross\n", " f = features.loc[[itime]]\n", @@ -4404,17 +4274,10 @@ "cell_type": "code", "execution_count": 21, "id": "4c516c26", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:13.727021Z", - "iopub.status.busy": "2024-02-17T12:47:13.726230Z", - "iopub.status.idle": "2024-02-17T12:47:13.758905Z", - "shell.execute_reply": "2024-02-17T12:47:13.757558Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "vel = tobac.analysis.calculate_velocity(track)" + "vel = tobac.calculate_velocity(track)" ] }, { @@ -4429,14 +4292,7 @@ "cell_type": "code", "execution_count": 22, "id": "887d4338", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:13.766366Z", - "iopub.status.busy": "2024-02-17T12:47:13.765703Z", - "iopub.status.idle": "2024-02-17T12:47:13.771825Z", - "shell.execute_reply": "2024-02-17T12:47:13.770580Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "expected_velocity = np.sqrt(30**2 + 14**2)" @@ -4454,14 +4310,7 @@ "cell_type": "code", "execution_count": 23, "id": "6151a205", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:13.775501Z", - "iopub.status.busy": 
"2024-02-17T12:47:13.775155Z", - "iopub.status.idle": "2024-02-17T12:47:14.037489Z", - "shell.execute_reply": "2024-02-17T12:47:14.036830Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -4499,17 +4348,10 @@ "cell_type": "code", "execution_count": 24, "id": "fdac72d9", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.041186Z", - "iopub.status.busy": "2024-02-17T12:47:14.040814Z", - "iopub.status.idle": "2024-02-17T12:47:14.046777Z", - "shell.execute_reply": "2024-02-17T12:47:14.044844Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "hist, edges = tobac.analysis.velocity_histogram(\n", + "hist, edges = tobac.velocity_histogram(\n", " track,\n", " bin_edges=np.arange(14, 43, 3),\n", ")" @@ -4519,14 +4361,7 @@ "cell_type": "code", "execution_count": 25, "id": "d18089f9", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.053363Z", - "iopub.status.busy": "2024-02-17T12:47:14.052460Z", - "iopub.status.idle": "2024-02-17T12:47:14.364261Z", - "shell.execute_reply": "2024-02-17T12:47:14.359902Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -4565,178 +4400,55 @@ { "cell_type": "code", "execution_count": 26, - "id": "20866a28", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.371374Z", - "iopub.status.busy": "2024-02-17T12:47:14.370983Z", - "iopub.status.idle": "2024-02-17T12:47:14.423818Z", - "shell.execute_reply": "2024-02-17T12:47:14.422823Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " 
\n", - " \n", - "\n", - "
    Segmentation Mask (unknown)timeprojection_y_coordinateprojection_x_coordinate
    Shape10050100
    Dimension coordinates
    \ttimex--
    \tprojection_y_coordinate-x-
    \tprojection_x_coordinate--x
    Auxiliary coordinates
    \tlatitude-xx
    \tlongitude-xx
    \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "id": "33a10697", + "metadata": {}, + "outputs": [], "source": [ - "import xarray as xr\n", - "\n", - "imask = xr.DataArray.to_iris(mask)\n", - "imask" + "area = tobac.calculate_area(features, segment_labels)" ] }, { "cell_type": "code", "execution_count": 27, - "id": "33a10697", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.437699Z", - "iopub.status.busy": "2024-02-17T12:47:14.437388Z", - "iopub.status.idle": "2024-02-17T12:47:14.448235Z", - "shell.execute_reply": "2024-02-17T12:47:14.442773Z" - } - }, + "id": "571cb182", + "metadata": {}, "outputs": [], "source": [ - "# area = tobac.analysis.calculate_area(features, imask)" + "blob_magitude = 10 # also hard-code in the test\n", + "blob_sigma = 10e3\n", + "\n", + "normalized_circle_radius = np.sqrt(np.log(blob_magitude / threshold))\n", + "absolute_circle_radius = np.sqrt(2) * blob_sigma * normalized_circle_radius\n", + "expected_area = np.pi * absolute_circle_radius**2" ] }, { "cell_type": "code", "execution_count": 28, "id": "86be74d4", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.459910Z", - "iopub.status.busy": "2024-02-17T12:47:14.459329Z", - "iopub.status.idle": "2024-02-17T12:47:14.465913Z", - "shell.execute_reply": "2024-02-17T12:47:14.463934Z" + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" } - }, - "outputs": [], + ], "source": [ - "# plt.figure(figsize=(10, 5))\n", - "# plt.tight_layout()\n", - "# plt.plot(area[\"frame\"], area[\"area\"])\n", - "# plt.xlabel(\"timeframe\")\n", - "# plt.ylabel(r\"area [$m^2$]\")\n", - "# plt.grid()" + "plt.figure(figsize=(10, 5))\n", + "plt.tight_layout()\n", + "plt.plot(area[\"frame\"], area[\"area\"])\n", + "plt.xlabel(\"timeframe\")\n", + "plt.ylabel(r\"area [$m^2$]\")\n", + "plt.grid()\n", + "\n", + "plt.axhline(expected_area, color=\"darkgreen\", lw=5, alpha=0.5)\n", + "sns.despine()" ] }, { @@ -4753,17 +4465,10 @@ "cell_type": "code", "execution_count": 29, "id": "4a7b37f1", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.474517Z", - "iopub.status.busy": "2024-02-17T12:47:14.474125Z", - "iopub.status.idle": "2024-02-17T12:47:14.486908Z", - "shell.execute_reply": "2024-02-17T12:47:14.485462Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "hist, bins, centers = tobac.analysis.lifetime_histogram(\n", + "hist, bins, centers = tobac.lifetime_histogram(\n", " track, bin_edges=np.arange(0, 200, 20)\n", ")" ] @@ -4772,14 +4477,7 @@ "cell_type": "code", "execution_count": 30, "id": "36bb6765", - "metadata": { - "execution": { - "iopub.execute_input": "2024-02-17T12:47:14.492981Z", - "iopub.status.busy": "2024-02-17T12:47:14.491513Z", - "iopub.status.idle": "2024-02-17T12:47:14.721691Z", - "shell.execute_reply": "2024-02-17T12:47:14.720229Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -4810,6 +4508,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tobac_RC_v1.5.x]", + "language": "python", + "name": "conda-env-tobac_RC_v1.5.x-py" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/setup.py b/setup.py index 68b13c4d..2339a6ca 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ -from setuptools import setup - """ This code is from the python 
documentation and is designed to read in the version number. See: https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ """ + +from setuptools import setup from pathlib import Path @@ -37,6 +37,14 @@ def get_requirements(requirements_filename): return requirements +def get_packages(package_name): + package = Path(package_name) + packages = [ + str(path.parent).replace("/", ".") for path in package.rglob("__init__.py") + ] + return packages + + PACKAGE_NAME = "tobac" # See classifiers list at: https://pypi.org/classifiers/ @@ -86,7 +94,7 @@ def get_requirements(requirements_filename): "peter.marinescu@colostate.edu", ], license="BSD-3-Clause License", - packages=[PACKAGE_NAME, PACKAGE_NAME + ".utils", PACKAGE_NAME + ".utils.internal"], + packages=get_packages(PACKAGE_NAME), install_requires=get_requirements("requirements.txt"), test_requires=["pytest"], zip_safe=False, diff --git a/tobac/__init__.py b/tobac/__init__.py index 652645e4..1b668bd1 100644 --- a/tobac/__init__.py +++ b/tobac/__init__.py @@ -29,47 +29,53 @@ plot_mask_cell_track_follow, plot_mask_cell_track_static, plot_mask_cell_track_static_timeseries, -) -from .plotting import ( plot_lifetime_histogram, plot_lifetime_histogram_bar, plot_histogram_cellwise, plot_histogram_featurewise, -) -from .plotting import plot_mask_cell_track_3Dstatic, plot_mask_cell_track_2D3Dstatic -from .plotting import ( + plot_mask_cell_track_3Dstatic, + plot_mask_cell_track_2D3Dstatic, plot_mask_cell_individual_static, plot_mask_cell_individual_3Dstatic, + animation_mask_field, + make_map, + map_tracks, ) -from .plotting import animation_mask_field -from .plotting import make_map, map_tracks -from .analysis import ( +from tobac.analysis.cell_analysis import ( cell_statistics, cog_cell, lifetime_histogram, - histogram_featurewise, histogram_cellwise, -) -from .analysis import calculate_velocity, calculate_distance, calculate_area -from .analysis import calculate_nearestneighbordistance -from .analysis 
import ( velocity_histogram, + calculate_overlap, +) +from tobac.analysis.feature_analysis import ( + histogram_featurewise, + calculate_nearestneighbordistance, nearestneighbordistance_histogram, area_histogram, ) -from .analysis import calculate_overlap -from .utils import ( +from tobac.analysis.spatial import ( + calculate_velocity, + calculate_distance, + calculate_area, +) +from .utils.mask import ( mask_cell, mask_cell_surface, mask_cube_cell, mask_cube_untracked, mask_cube, column_mask_from2D, + mask_features, + mask_features_surface, + mask_cube_features, +) +from .utils.general import ( get_bounding_box, + add_coordinates, + get_spacings, ) -from .utils import mask_features, mask_features_surface, mask_cube_features - -from .utils import add_coordinates, get_spacings from .feature_detection import feature_detection_multithreshold from .tracking import linking_trackpy from .wrapper import maketrack diff --git a/tobac/analysis.py b/tobac/analysis.py deleted file mode 100644 index afc11084..00000000 --- a/tobac/analysis.py +++ /dev/null @@ -1,1243 +0,0 @@ -"""Provide tools to analyse and visualize the tracked objects. -This module provides a set of routines that enables performing analyses -and deriving statistics for individual tracks, such as the time series -of integrated properties and vertical profiles. It also provides -routines to calculate summary statistics of the entire population of -tracked features in the field like histograms of areas/volumes -or mass and a detailed cell lifetime analysis. These analysis -routines are all built in a modular manner. Thus, users can reuse the -most basic methods for interacting with the data structure of the -package in their own analysis procedures in Python. This includes -functions performing simple tasks like looping over all identified -objects or trajectories and masking arrays for the analysis of -individual features. 
Plotting routines include both visualizations -for individual convective cells and their properties. [1]_ - -References ----------- -.. Heikenfeld, M., Marinescu, P. J., Christensen, M., - Watson-Parris, D., Senf, F., van den Heever, S. C. - & Stier, P. (2019). tobac 1.2: towards a flexible - framework for tracking and analysis of clouds in - diverse datasets. Geoscientific Model Development, - 12(11), 4551-4570. - -Notes ------ -""" - -import pandas as pd -import numpy as np -import logging -import os -import warnings - -from tobac.centerofgravity import calculate_cog -from .utils import mask_cell, mask_cell_surface, mask_cube_cell, get_bounding_box -from .utils import internal as internal_utils -from .utils import decorators - - -def cell_statistics_all( - input_cubes, - track, - mask, - aggregators, - output_path="./", - cell_selection=None, - output_name="Profiles", - width=10000, - z_coord="model_level_number", - dimensions=["x", "y"], - **kwargs, -): - """ - Parameters - ---------- - input_cubes : iris.cube.Cube - - track : dask.dataframe.DataFrame - - mask : iris.cube.Cube - Cube containing mask (int id for tracked volumes 0 everywhere - else). - - aggregators : list - list of iris.analysis.Aggregator instances - - output_path : str, optional - Default is './'. - - cell_selection : optional - Default is None. - - output_name : str, optional - Default is 'Profiles'. - - width : int, optional - Default is 10000. - - z_coord : str, optional - Name of the vertical coordinate in the cube. Default is - 'model_level_number'. - - dimensions : list of str, optional - Default is ['x', 'y']. 
- - **kwargs - - Returns - ------- - None - """ - warnings.warn( - "cell_statistics_all is depreciated and will be removed or significantly changed in v2.0.", - DeprecationWarning, - ) - - if cell_selection is None: - cell_selection = np.unique(track["cell"]) - for cell in cell_selection: - cell_statistics( - input_cubes=input_cubes, - track=track, - mask=mask, - dimensions=dimensions, - aggregators=aggregators, - cell=cell, - output_path=output_path, - output_name=output_name, - width=width, - z_coord=z_coord, - **kwargs, - ) - - -def cell_statistics( - input_cubes, - track, - mask, - aggregators, - cell, - output_path="./", - output_name="Profiles", - width=10000, - z_coord="model_level_number", - dimensions=["x", "y"], - **kwargs, -): - """ - Parameters - ---------- - input_cubes : iris.cube.Cube - - track : dask.dataframe.DataFrame - - mask : iris.cube.Cube - Cube containing mask (int id for tracked volumes 0 everywhere - else). - - aggregators list - list of iris.analysis.Aggregator instances - - cell : int - Integer id of cell to create masked cube for output. - - output_path : str, optional - Default is './'. - - output_name : str, optional - Default is 'Profiles'. - - width : int, optional - Default is 10000. - - z_coord : str, optional - Name of the vertical coordinate in the cube. Default is - 'model_level_number'. - - dimensions : list of str, optional - Default is ['x', 'y']. 
- - **kwargs - - Returns - ------- - None - """ - - from iris.cube import Cube, CubeList - from iris.coords import AuxCoord - from iris import Constraint, save - - warnings.warn( - "cell_statistics is depreciated and will be removed or significantly changed in v2.0.", - DeprecationWarning, - ) - - # If input is single cube, turn into cubelist - if type(input_cubes) is Cube: - input_cubes = CubeList([input_cubes]) - - logging.debug("Start calculating profiles for cell " + str(cell)) - track_i = track[track["cell"] == cell] - - cubes_profile = {} - for aggregator in aggregators: - cubes_profile[aggregator.name()] = CubeList() - - for time_i in track_i["time"].values: - constraint_time = Constraint(time=time_i) - - mask_i = mask.extract(constraint_time) - mask_cell_i = mask_cell(mask_i, cell, track_i, masked=False) - mask_cell_surface_i = mask_cell_surface( - mask_i, cell, track_i, masked=False, z_coord=z_coord - ) - - x_dim = mask_cell_surface_i.coord_dims("projection_x_coordinate")[0] - y_dim = mask_cell_surface_i.coord_dims("projection_y_coordinate")[0] - x_coord = mask_cell_surface_i.coord("projection_x_coordinate") - y_coord = mask_cell_surface_i.coord("projection_y_coordinate") - - if (mask_cell_surface_i.core_data() > 0).any(): - box_mask_i = get_bounding_box(mask_cell_surface_i.core_data(), buffer=1) - - box_mask = [ - [ - x_coord.points[box_mask_i[x_dim][0]], - x_coord.points[box_mask_i[x_dim][1]], - ], - [ - y_coord.points[box_mask_i[y_dim][0]], - y_coord.points[box_mask_i[y_dim][1]], - ], - ] - else: - box_mask = [[np.nan, np.nan], [np.nan, np.nan]] - - x = track_i[track_i["time"].values == time_i]["projection_x_coordinate"].values[ - 0 - ] - y = track_i[track_i["time"].values == time_i]["projection_y_coordinate"].values[ - 0 - ] - - box_slice = [[x - width, x + width], [y - width, y + width]] - - x_min = np.nanmin([box_mask[0][0], box_slice[0][0]]) - x_max = np.nanmax([box_mask[0][1], box_slice[0][1]]) - y_min = np.nanmin([box_mask[1][0], box_slice[1][0]]) 
- y_max = np.nanmax([box_mask[1][1], box_slice[1][1]]) - - constraint_x = Constraint( - projection_x_coordinate=lambda cell: int(x_min) < cell < int(x_max) - ) - constraint_y = Constraint( - projection_y_coordinate=lambda cell: int(y_min) < cell < int(y_max) - ) - - constraint = constraint_time & constraint_x & constraint_y - # Mask_cell_surface_i=mask_cell_surface(Mask_w_i,cell,masked=False,z_coord='model_level_number') - mask_cell_i = mask_cell_i.extract(constraint) - mask_cell_surface_i = mask_cell_surface_i.extract(constraint) - - input_cubes_i = input_cubes.extract(constraint) - for cube in input_cubes_i: - cube_masked = mask_cube_cell(cube, mask_cell_i, cell, track_i) - coords_remove = [] - for coordinate in cube_masked.coords(dim_coords=False): - if coordinate.name() not in dimensions: - for dim in dimensions: - if set(cube_masked.coord_dims(coordinate)).intersection( - set(cube_masked.coord_dims(dim)) - ): - coords_remove.append(coordinate.name()) - for coordinate in set(coords_remove): - cube_masked.remove_coord(coordinate) - - for aggregator in aggregators: - cube_collapsed = cube_masked.collapsed(dimensions, aggregator, **kwargs) - # remove all collapsed coordinates (x and y dim, scalar now) and keep only time as all these coordinates are useless - for coordinate in cube_collapsed.coords(): - if not cube_collapsed.coord_dims(coordinate): - if coordinate.name() != "time": - cube_collapsed.remove_coord(coordinate) - logging.debug(str(cube_collapsed)) - cubes_profile[aggregator.name()].append(cube_collapsed) - - minutes = (track_i["time_cell"] / pd.Timedelta(minutes=1)).values - latitude = track_i["latitude"].values - longitude = track_i["longitude"].values - minutes_coord = AuxCoord(minutes, long_name="cell_time", units="min") - latitude_coord = AuxCoord(latitude, long_name="latitude", units="degrees") - longitude_coord = AuxCoord(longitude, long_name="longitude", units="degrees") - - for aggregator in aggregators: - cubes_profile[aggregator.name()] = 
cubes_profile[aggregator.name()].merge() - for cube in cubes_profile[aggregator.name()]: - cube.add_aux_coord(minutes_coord, data_dims=cube.coord_dims("time")) - cube.add_aux_coord(latitude_coord, data_dims=cube.coord_dims("time")) - cube.add_aux_coord(longitude_coord, data_dims=cube.coord_dims("time")) - os.makedirs( - os.path.join(output_path, output_name, aggregator.name()), exist_ok=True - ) - savefile = os.path.join( - output_path, - output_name, - aggregator.name(), - output_name + "_" + aggregator.name() + "_" + str(int(cell)) + ".nc", - ) - save(cubes_profile[aggregator.name()], savefile) - - -def cog_cell( - cell, - Tracks=None, - M_total=None, - M_liquid=None, - M_frozen=None, - Mask=None, - savedir=None, -): - """ - Parameters - ---------- - cell : int - Integer id of cell to create masked cube for output. - - Tracks : optional - Default is None. - - M_total : subset of cube, optional - Default is None. - - M_liquid : subset of cube, optional - Default is None. - - M_frozen : subset of cube, optional - Default is None. - - savedir : str - Default is None. 
- - Returns - ------- - None - """ - - warnings.warn( - "cog_cell is depreciated and will be removed or significantly changed in v2.0.", - DeprecationWarning, - ) - - from iris import Constraint - - logging.debug("Start calculating COG for " + str(cell)) - Track = Tracks[Tracks["cell"] == cell] - constraint_time = Constraint( - time=lambda cell: Track.head(1)["time"].values[0] - <= cell - <= Track.tail(1)["time"].values[0] - ) - M_total_i = M_total.extract(constraint_time) - M_liquid_i = M_liquid.extract(constraint_time) - M_frozen_i = M_frozen.extract(constraint_time) - Mask_i = Mask.extract(constraint_time) - - savedir_cell = os.path.join(savedir, "cells", str(int(cell))) - os.makedirs(savedir_cell, exist_ok=True) - savefile_COG_total_i = os.path.join( - savedir_cell, "COG_total" + "_" + str(int(cell)) + ".h5" - ) - savefile_COG_liquid_i = os.path.join( - savedir_cell, "COG_liquid" + "_" + str(int(cell)) + ".h5" - ) - savefile_COG_frozen_i = os.path.join( - savedir_cell, "COG_frozen" + "_" + str(int(cell)) + ".h5" - ) - - Tracks_COG_total_i = calculate_cog(Track, M_total_i, Mask_i) - # Tracks_COG_total_list.append(Tracks_COG_total_i) - logging.debug("COG total loaded for " + str(cell)) - - Tracks_COG_liquid_i = calculate_cog(Track, M_liquid_i, Mask_i) - # Tracks_COG_liquid_list.append(Tracks_COG_liquid_i) - logging.debug("COG liquid loaded for " + str(cell)) - Tracks_COG_frozen_i = calculate_cog(Track, M_frozen_i, Mask_i) - # Tracks_COG_frozen_list.append(Tracks_COG_frozen_i) - logging.debug("COG frozen loaded for " + str(cell)) - - Tracks_COG_total_i.to_hdf(savefile_COG_total_i, "table") - Tracks_COG_liquid_i.to_hdf(savefile_COG_liquid_i, "table") - Tracks_COG_frozen_i.to_hdf(savefile_COG_frozen_i, "table") - logging.debug("individual COG calculated and saved to " + savedir_cell) - - -def lifetime_histogram( - Track, bin_edges=np.arange(0, 200, 20), density=False, return_values=False -): - """Compute the lifetime histogram of linked features. 
- - Parameters - ---------- - Track : pandas.DataFrame - Dataframe of linked features, containing the columns 'cell' - and 'time_cell'. - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of equal-width - bins in the given range. If bins is a ndarray, it defines a - monotonically increasing array of bin edges, including the - rightmost edge. The unit is minutes. - Default is np.arange(0, 200, 20). - - density : bool, optional - If False, the result will contain the number of samples in - each bin. If True, the result is the value of the probability - density function at the bin, normalized such that the integral - over the range is 1. Default is False. - - return_values : bool, optional - Bool determining wether the lifetimes of the features are - returned from this function. Default is False. - - Returns - ------- - hist : ndarray - The values of the histogram. - - bin_edges : ndarray - The edges of the histogram. - - bin_centers : ndarray - The centers of the histogram intervalls. - - minutes, optional : ndarray - Numpy.array of the lifetime of each feature in minutes. - Returned if return_values is True. - - """ - - Track_cell = Track.groupby("cell") - minutes = (Track_cell["time_cell"].max() / pd.Timedelta(minutes=1)).values - hist, bin_edges = np.histogram(minutes, bin_edges, density=density) - bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) - if return_values: - return hist, bin_edges, bin_centers, minutes - else: - return hist, bin_edges, bin_centers - - -def haversine(lat1, lon1, lat2, lon2): - """Computes the Haversine distance in kilometers. - - Calculates the Haversine distance between two points - (based on implementation CIS https://github.com/cedadev/cis). - - Parameters - ---------- - lat1, lon1 : array of latitude, longitude - First point or points as array in degrees. - - lat2, lon2 : array of latitude, longitude - Second point or points as array in degrees. 
- - Returns - ------- - arclen * RADIUS_EARTH : array - Array of Distance(s) between the two points(-arrays) in - kilometers. - - """ - - RADIUS_EARTH = 6378.0 - lat1 = np.radians(lat1) - lat2 = np.radians(lat2) - lon1 = np.radians(lon1) - lon2 = np.radians(lon2) - # print(lat1,lat2,lon1,lon2) - arclen = 2 * np.arcsin( - np.sqrt( - (np.sin((lat2 - lat1) / 2)) ** 2 - + np.cos(lat1) * np.cos(lat2) * (np.sin((lon2 - lon1) / 2)) ** 2 - ) - ) - return arclen * RADIUS_EARTH - - -def calculate_distance(feature_1, feature_2, method_distance=None): - """Compute the distance between two features. It is based on - either lat/lon coordinates or x/y coordinates. - - Parameters - ---------- - feature_1, feature_2 : pandas.DataFrame or pandas.Series - Dataframes containing multiple features or pandas.Series - of one feature. Need to contain either projection_x_coordinate - and projection_y_coordinate or latitude and longitude - coordinates. - - method_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation. 'xy' uses the length of the - vector between the two features, 'latlon' uses the haversine - distance. None checks wether the required coordinates are - present and starts with 'xy'. Default is None. - - Returns - ------- - distance : float or pandas.Series - Float with the distance between the two features in meters if - the input are two pandas.Series containing one feature, - pandas.Series of the distances if one of the inputs contains - multiple features. 
- - """ - if method_distance is None: - if ( - ("projection_x_coordinate" in feature_1) - and ("projection_y_coordinate" in feature_1) - and ("projection_x_coordinate" in feature_2) - and ("projection_y_coordinate" in feature_2) - ): - method_distance = "xy" - elif ( - ("latitude" in feature_1) - and ("longitude" in feature_1) - and ("latitude" in feature_2) - and ("longitude" in feature_2) - ): - method_distance = "latlon" - else: - raise ValueError( - "either latitude/longitude or projection_x_coordinate/projection_y_coordinate have to be present to calculate distances" - ) - - if method_distance == "xy": - distance = np.sqrt( - ( - feature_1["projection_x_coordinate"] - - feature_2["projection_x_coordinate"] - ) - ** 2 - + ( - feature_1["projection_y_coordinate"] - - feature_2["projection_y_coordinate"] - ) - ** 2 - ) - elif method_distance == "latlon": - distance = 1000 * haversine( - feature_1["latitude"], - feature_1["longitude"], - feature_2["latitude"], - feature_2["longitude"], - ) - else: - raise ValueError("method undefined") - return distance - - -def calculate_velocity_individual(feature_old, feature_new, method_distance=None): - """Calculate the mean velocity of a feature between two timeframes. - - Parameters - ---------- - feature_old : pandas.Series - pandas.Series of a feature at a certain timeframe. Needs to - contain a 'time' column and either projection_x_coordinate - and projection_y_coordinate or latitude and longitude coordinates. - - feature_new : pandas.Series - pandas.Series of the same feature at a later timeframe. Needs - to contain a 'time' column and either projection_x_coordinate - and projection_y_coordinate or latitude and longitude coordinates. - - method_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation, used to calculate the velocity. - 'xy' uses the length of the vector between the two features, - 'latlon' uses the haversine distance. 
None checks wether the - required coordinates are present and starts with 'xy'. - Default is None. - - Returns - ------- - velocity : float - Value of the approximate velocity. - - """ - - distance = calculate_distance( - feature_old, feature_new, method_distance=method_distance - ) - diff_time = (feature_new["time"] - feature_old["time"]).total_seconds() - velocity = distance / diff_time - return velocity - - -def calculate_velocity(track, method_distance=None): - """Calculate the velocities of a set of linked features. - - Parameters - ---------- - track : pandas.DataFrame - Dataframe of linked features, containing the columns 'cell', - 'time' and either 'projection_x_coordinate' and - 'projection_y_coordinate' or 'latitude' and 'longitude'. - - method_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation, used to calculate the - velocity. 'xy' uses the length of the vector between the - two features, 'latlon' uses the haversine distance. None - checks wether the required coordinates are present and - starts with 'xy'. Default is None. - - Returns - ------- - track : pandas.DataFrame - DataFrame from the input, with an additional column 'v', - contain the value of the velocity for every feature at - every possible timestep - """ - - for cell_i, track_i in track.groupby("cell"): - index = track_i.index.values - for i, index_i in enumerate(index[:-1]): - velocity = calculate_velocity_individual( - track_i.loc[index[i]], - track_i.loc[index[i + 1]], - method_distance=method_distance, - ) - track.at[index_i, "v"] = velocity - return track - - -def velocity_histogram( - track, - bin_edges=np.arange(0, 30, 1), - density=False, - method_distance=None, - return_values=False, -): - """Create an velocity histogram of the features. If the DataFrame - does not contain a velocity column, the velocities are calculated. 
- - Parameters - ---------- - track: pandas.DataFrame - DataFrame of the linked features, containing the columns 'cell', - 'time' and either 'projection_x_coordinate' and - 'projection_y_coordinate' or 'latitude' and 'longitude'. - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of equal-width - bins in the given range. If bins is a ndarray, it defines a - monotonically increasing array of bin edges, including the - rightmost edge. Default is np.arange(0, 30000, 500). - - density : bool, optional - If False, the result will contain the number of samples in - each bin. If True, the result is the value of the probability - density function at the bin, normalized such that the integral - over the range is 1. Default is False. - - methods_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation, used to calculate the velocity. - 'xy' uses the length of the vector between the two features, - 'latlon' uses the haversine distance. None checks wether the - required coordinates are present and starts with 'xy'. - Default is None. - - return_values : bool, optional - Bool determining wether the velocities of the features are - returned from this function. Default is False. - - Returns - ------- - hist : ndarray - The values of the histogram. - - bin_edges : ndarray - The edges of the histogram. - - velocities , optional : ndarray - Numpy array with the velocities of each feature. - - """ - - if "v" not in track.columns: - logging.info("calculate velocities") - track = calculate_velocity(track) - velocities = track["v"].values - hist, bin_edges = np.histogram( - velocities[~np.isnan(velocities)], bin_edges, density=density - ) - if return_values: - return hist, bin_edges, velocities - else: - return hist, bin_edges - - -def calculate_nearestneighbordistance(features, method_distance=None): - """Calculate the distance between a feature and the nearest other - feature in the same timeframe. 
- - Parameters - ---------- - features : pandas.DataFrame - DataFrame of the features whose nearest neighbor distance is to - be calculated. Needs to contain either projection_x_coordinate - and projection_y_coordinate or latitude and longitude coordinates. - - method_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation. 'xy' uses the length of the vector - between the two features, 'latlon' uses the haversine distance. - None checks wether the required coordinates are present and starts - with 'xy'. Default is None. - - Returns - ------- - features : pandas.DataFrame - DataFrame of the features with a new column 'min_distance', - containing the calculated minimal distance to other features. - - """ - - from itertools import combinations - - features["min_distance"] = np.nan - for time_i, features_i in features.groupby("time"): - logging.debug(str(time_i)) - indeces = combinations(features_i.index.values, 2) - # Loop over combinations to remove features that are closer together than min_distance and keep larger one (either higher threshold or larger area) - distances = [] - for index_1, index_2 in indeces: - if index_1 is not index_2: - distance = calculate_distance( - features_i.loc[index_1], - features_i.loc[index_2], - method_distance=method_distance, - ) - distances.append( - pd.DataFrame( - {"index_1": index_1, "index_2": index_2, "distance": distance}, - index=[0], - ) - ) - if any([x is not None for x in distances]): - distances = pd.concat(distances, ignore_index=True) - for i in features_i.index: - min_distance = distances.loc[ - (distances["index_1"] == i) | (distances["index_2"] == i), - "distance", - ].min() - features.at[i, "min_distance"] = min_distance - return features - - -def nearestneighbordistance_histogram( - features, - bin_edges=np.arange(0, 30000, 500), - density=False, - method_distance=None, - return_values=False, -): - """Create an nearest neighbor distance histogram of the features. 
- If the DataFrame does not contain a 'min_distance' column, the - distances are calculated. - - ---------- - features - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of equal-width - bins in the given range. If bins is a ndarray, it defines a - monotonically increasing array of bin edges, including the - rightmost edge. Default is np.arange(0, 30000, 500). - - density : bool, optional - If False, the result will contain the number of samples in - each bin. If True, the result is the value of the probability - density function at the bin, normalized such that the integral - over the range is 1. Default is False. - - method_distance : {None, 'xy', 'latlon'}, optional - Method of distance calculation. 'xy' uses the length of the - vector between the two features, 'latlon' uses the haversine - distance. None checks wether the required coordinates are - present and starts with 'xy'. Default is None. - - return_values : bool, optional - Bool determining wether the nearest neighbor distance of the - features are returned from this function. Default is False. - - Returns - ------- - hist : ndarray - The values of the histogram. - - bin_edges : ndarray - The edges of the histogram. - - distances, optional : ndarray - A numpy array with the nearest neighbor distances of each - feature. 
- - """ - - if "min_distance" not in features.columns: - logging.debug("calculate nearest neighbor distances") - features = calculate_nearestneighbordistance( - features, method_distance=method_distance - ) - distances = features["min_distance"].values - hist, bin_edges = np.histogram( - distances[~np.isnan(distances)], bin_edges, density=density - ) - if return_values: - return hist, bin_edges, distances - else: - return hist, bin_edges - - -# Treatment of 2D lat/lon coordinates to be added: -def calculate_areas_2Dlatlon(_2Dlat_coord, _2Dlon_coord): - """Calculate an array of cell areas when given two 2D arrays - of latitude and longitude values - - NOTE: This currently assuems that the lat/lon grid is orthogonal, - which is not strictly true! It's close enough for most cases, but - should be updated in future to use the cross product of the - distances to the neighbouring cells. This will require the use - of a more advanced calculation. I would advise using pyproj - at some point in the future to solve this issue and replace - haversine distance. - - Parameters - ---------- - _2Dlat_coord : AuxCoord - Iris auxilliary coordinate containing a 2d grid of latitudes - for each point. - - _2Dlon_coord : AuxCoord - Iris auxilliary coordinate containing a 2d grid of longitudes - for each point. - - Returns - ------- - area : ndarray - A numpy array approximating the area of each cell. 
- - """ - - hdist1 = ( - haversine( - _2Dlat_coord.points[:-1], - _2Dlon_coord.points[:-1], - _2Dlat_coord.points[1:], - _2Dlon_coord.points[1:], - ) - * 1000 - ) - - dists1 = np.zeros(_2Dlat_coord.points.shape) - dists1[0] = hdist1[0] - dists1[-1] = hdist1[-1] - dists1[1:-1] = (hdist1[0:-1] + hdist1[1:]) * 0.5 - - hdist2 = ( - haversine( - _2Dlat_coord.points[:, :-1], - _2Dlon_coord.points[:, :-1], - _2Dlat_coord.points[:, 1:], - _2Dlon_coord.points[:, 1:], - ) - * 1000 - ) - - dists2 = np.zeros(_2Dlat_coord.points.shape) - dists2[:, 0] = hdist2[:, 0] - dists2[:, -1] = hdist2[:, -1] - dists2[:, 1:-1] = (hdist2[:, 0:-1] + hdist2[:, 1:]) * 0.5 - - area = dists1 * dists2 - - return area - - -@decorators.xarray_to_iris -def calculate_area(features, mask, method_area=None): - """Calculate the area of the segments for each feature. - - Parameters - ---------- - features : pandas.DataFrame - DataFrame of the features whose area is to be calculated. - - mask : iris.cube.Cube - Cube containing mask (int for tracked volumes 0 everywhere - else). Needs to contain either projection_x_coordinate and - projection_y_coordinate or latitude and longitude - coordinates. - - method_area : {None, 'xy', 'latlon'}, optional - Flag determining how the area is calculated. 'xy' uses the - areas of the individual pixels, 'latlon' uses the - area_weights method of iris.analysis.cartography, None - checks wether the required coordinates are present and - starts with 'xy'. Default is None. - - Returns - ------- - features : pandas.DataFrame - DataFrame of the features with a new column 'area', - containing the calculated areas. - - Raises - ------ - ValueError - If neither latitude/longitude nor - projection_x_coordinate/projection_y_coordinate are - present in mask_coords. - - If latitude/longitude coordinates are 2D. - - If latitude/longitude shapes are not supported. - - If method is undefined, i.e. method is neither None, - 'xy' nor 'latlon'. 
- - """ - - from tobac.utils import mask_features_surface, mask_features - from iris import Constraint - from iris.analysis.cartography import area_weights - from scipy.ndimage import labeled_comprehension - - features["area"] = np.nan - - mask_coords = [coord.name() for coord in mask.coords()] - if method_area is None: - if ("projection_x_coordinate" in mask_coords) and ( - "projection_y_coordinate" in mask_coords - ): - method_area = "xy" - elif ("latitude" in mask_coords) and ("longitude" in mask_coords): - method_area = "latlon" - else: - raise ValueError( - "either latitude/longitude or projection_x_coordinate/projection_y_coordinate have to be present to calculate distances" - ) - # logging.debug("calculating area using method " + method_area) - if method_area == "xy": - if not ( - mask.coord("projection_x_coordinate").has_bounds() - and mask.coord("projection_y_coordinate").has_bounds() - ): - mask.coord("projection_x_coordinate").guess_bounds() - mask.coord("projection_y_coordinate").guess_bounds() - area = np.outer( - np.diff(mask.coord("projection_x_coordinate").bounds, axis=1), - np.diff(mask.coord("projection_y_coordinate").bounds, axis=1), - ) - elif method_area == "latlon": - if (mask.coord("latitude").ndim == 1) and (mask.coord("latitude").ndim == 1): - if not ( - mask.coord("latitude").has_bounds() - and mask.coord("longitude").has_bounds() - ): - mask.coord("latitude").guess_bounds() - mask.coord("longitude").guess_bounds() - area = area_weights(mask, normalize=False) - elif mask.coord("latitude").ndim == 2 and mask.coord("longitude").ndim == 2: - area = calculate_areas_2Dlatlon( - mask.coord("latitude"), mask.coord("longitude") - ) - else: - raise ValueError("latitude/longitude coordinate shape not supported") - else: - raise ValueError("method undefined") - - feature_areas = labeled_comprehension( - area, mask.data, features["feature"], np.sum, area.dtype, np.nan - ) - - features["area"] = feature_areas - - return features - - -def 
area_histogram( - features, - mask, - bin_edges=np.arange(0, 30000, 500), - density=False, - method_area=None, - return_values=False, - representative_area=False, -): - """Create an area histogram of the features. If the DataFrame - does not contain an area column, the areas are calculated. - - Parameters - ---------- - features : pandas.DataFrame - DataFrame of the features. - - mask : iris.cube.Cube - Cube containing mask (int for tracked volumes 0 - everywhere else). Needs to contain either - projection_x_coordinate and projection_y_coordinate or - latitude and longitude coordinates. The output of a - segmentation should be used here. - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of - equal-width bins in the given range. If bins is a ndarray, - it defines a monotonically increasing array of bin edges, - including the rightmost edge. - Default is np.arange(0, 30000, 500). - - density : bool, optional - If False, the result will contain the number of samples - in each bin. If True, the result is the value of the - probability density function at the bin, normalized such - that the integral over the range is 1. Default is False. - - return_values : bool, optional - Bool determining wether the areas of the features are - returned from this function. Default is False. - - representive_area: bool, optional - If False, no weights will associated to the values. - If True, the weights for each area will be the areas - itself, i.e. each bin count will have the value of - the sum of all areas within the edges of the bin. - Default is False. - - Returns - ------- - hist : ndarray - The values of the histogram. - - bin_edges : ndarray - The edges of the histogram. - - bin_centers : ndarray - The centers of the histogram intervalls. - - areas : ndarray, optional - A numpy array approximating the area of each feature. 
- - """ - - if "area" not in features.columns: - logging.info("calculate area") - features = calculate_area(features, mask, method_area) - areas = features["area"].values - # restrict to non NaN values: - areas = areas[~np.isnan(areas)] - if representative_area: - weights = areas - else: - weights = None - hist, bin_edges = np.histogram(areas, bin_edges, density=density, weights=weights) - bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) - - if return_values: - return hist, bin_edges, bin_centers, areas - else: - return hist, bin_edges, bin_centers - - -def histogram_cellwise( - Track, variable=None, bin_edges=None, quantity="max", density=False -): - """Create a histogram of the maximum, minimum or mean of - a variable for the cells (series of features linked together - over multiple timesteps) of a track. Essentially a wrapper - of the numpy.histogram() method. - - Parameters - ---------- - Track : pandas.DataFrame - The track containing the variable to create the histogram - from. - - variable : string, optional - Column of the DataFrame with the variable on which the - histogram is to be based on. Default is None. - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of - equal-width bins in the given range. If bins is a ndarray, - it defines a monotonically increasing array of bin edges, - including the rightmost edge. - - quantity : {'max', 'min', 'mean'}, optional - Flag determining wether to use maximum, minimum or mean - of a variable from all timeframes the cell covers. - Default is 'max'. - - density : bool, optional - If False, the result will contain the number of samples - in each bin. If True, the result is the value of the - probability density function at the bin, normalized such - that the integral over the range is 1. - Default is False. 
- - Returns - ------- - hist : ndarray - The values of the histogram - - bin_edges : ndarray - The edges of the histogram - - bin_centers : ndarray - The centers of the histogram intervalls - - Raises - ------ - ValueError - If quantity is not 'max', 'min' or 'mean'. - - """ - - Track_cell = Track.groupby("cell") - if quantity == "max": - variable_cell = Track_cell[variable].max().values - elif quantity == "min": - variable_cell = Track_cell[variable].min().values - elif quantity == "mean": - variable_cell = Track_cell[variable].mean().values - else: - raise ValueError("quantity unknown, must be max, min or mean") - hist, bin_edges = np.histogram(variable_cell, bin_edges, density=density) - bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) - - return hist, bin_edges, bin_centers - - -def histogram_featurewise(Track, variable=None, bin_edges=None, density=False): - """Create a histogram of a variable from the features - (detected objects at a single time step) of a track. - Essentially a wrapper of the numpy.histogram() method. - - Parameters - ---------- - Track : pandas.DataFrame - The track containing the variable to create the - histogram from. - - variable : string, optional - Column of the DataFrame with the variable on which the - histogram is to be based on. Default is None. - - bin_edges : int or ndarray, optional - If bin_edges is an int, it defines the number of - equal-width bins in the given range. If bins is - a sequence, it defines a monotonically increasing - array of bin edges, including the rightmost edge. - - density : bool, optional - If False, the result will contain the number of - samples in each bin. If True, the result is the - value of the probability density function at the - bin, normalized such that the integral over the - range is 1. Default is False. 
- - Returns - ------- - hist : ndarray - The values of the histogram - - bin_edges : ndarray - The edges of the histogram - - bin_centers : ndarray - The centers of the histogram intervalls - - """ - - hist, bin_edges = np.histogram(Track[variable].values, bin_edges, density=density) - bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) - - return hist, bin_edges, bin_centers - - -def calculate_overlap( - track_1, track_2, min_sum_inv_distance=None, min_mean_inv_distance=None -): - """Count the number of time frames in which the - individual cells of two tracks are present together - and calculate their mean and summed inverse distance. - - Parameters - ---------- - track_1, track_2 : pandas.DataFrame - The tracks conaining the cells to analyze. - - min_sum_inv_distance : float, optional - Minimum of the inverse net distance for two - cells to be counted as overlapping. - Default is None. - - min_mean_inv_distance : float, optional - Minimum of the inverse mean distance for two cells - to be counted as overlapping. Default is None. - - Returns - ------- - overlap : pandas.DataFrame - DataFrame containing the columns cell_1 and cell_2 - with the index of the cells from the tracks, - n_overlap with the number of frames both cells are - present in, mean_inv_distance with the mean inverse - distance and sum_inv_distance with the summed - inverse distance of the cells. 
- - """ - - cells_1 = track_1["cell"].unique() - # n_cells_1_tot=len(cells_1) - cells_2 = track_2["cell"].unique() - overlap = pd.DataFrame() - for i_cell_1, cell_1 in enumerate(cells_1): - for cell_2 in cells_2: - track_1_i = track_1[track_1["cell"] == cell_1] - track_2_i = track_2[track_2["cell"] == cell_2] - track_1_i = track_1_i[track_1_i["time"].isin(track_2_i["time"])] - track_2_i = track_2_i[track_2_i["time"].isin(track_1_i["time"])] - if not track_1_i.empty: - n_overlap = len(track_1_i) - distances = [] - for i in range(len(track_1_i)): - distance = calculate_distance( - track_1_i.iloc[[i]], track_2_i.iloc[[i]], method_distance="xy" - ) - distances.append(distance) - # mean_distance=np.mean(distances) - mean_inv_distance = np.mean(1 / (1 + np.array(distances) / 1000)) - # mean_inv_squaredistance=np.mean(1/(1+(np.array(distances)/1000)**2)) - sum_inv_distance = np.sum(1 / (1 + np.array(distances) / 1000)) - # sum_inv_squaredistance=np.sum(1/(1+(np.array(distances)/1000)**2)) - overlap = overlap.append( - { - "cell_1": cell_1, - "cell_2": cell_2, - "n_overlap": n_overlap, - # 'mean_distance':mean_distance, - "mean_inv_distance": mean_inv_distance, - # 'mean_inv_squaredistance':mean_inv_squaredistance, - "sum_inv_distance": sum_inv_distance, - # 'sum_inv_squaredistance':sum_inv_squaredistance - }, - ignore_index=True, - ) - if min_sum_inv_distance: - overlap = overlap[(overlap["sum_inv_distance"] >= min_sum_inv_distance)] - if min_mean_inv_distance: - overlap = overlap[(overlap["mean_inv_distance"] >= min_mean_inv_distance)] - - return overlap diff --git a/tobac/analysis/__init__.py b/tobac/analysis/__init__.py new file mode 100644 index 00000000..f56f538f --- /dev/null +++ b/tobac/analysis/__init__.py @@ -0,0 +1,31 @@ +"""Provide tools to analyse and visualize the tracked objects. 
+This module provides a set of routines that enables performing analyses +and deriving statistics for individual tracks, such as the time series +of integrated properties and vertical profiles. It also provides +routines to calculate summary statistics of the entire population of +tracked features in the field like histograms of areas/volumes +or mass and a detailed cell lifetime analysis. These analysis +routines are all built in a modular manner. Thus, users can reuse the +most basic methods for interacting with the data structure of the +package in their own analysis procedures in Python. This includes +functions performing simple tasks like looping over all identified +objects or trajectories and masking arrays for the analysis of +individual features. Plotting routines include both visualizations +for individual convective cells and their properties. [1]_ + +References +---------- +.. Heikenfeld, M., Marinescu, P. J., Christensen, M., + Watson-Parris, D., Senf, F., van den Heever, S. C. + & Stier, P. (2019). tobac 1.2: towards a flexible + framework for tracking and analysis of clouds in + diverse datasets. Geoscientific Model Development, + 12(11), 4551-4570. 
+ +Notes +----- +""" + +from tobac.analysis.cell_analysis import * +from tobac.analysis.feature_analysis import * +from tobac.analysis.spatial import * diff --git a/tobac/analysis/cell_analysis.py b/tobac/analysis/cell_analysis.py new file mode 100644 index 00000000..e8ef39f4 --- /dev/null +++ b/tobac/analysis/cell_analysis.py @@ -0,0 +1,628 @@ +""" +Perform analysis on the properties of tracked cells +""" + +import logging +import os +import warnings + +import numpy as np +import pandas as pd +from iris.cube import Cube, CubeList +from iris.coords import AuxCoord +from iris import Constraint, save + +from tobac.centerofgravity import calculate_cog +from tobac.utils.mask import mask_cell, mask_cell_surface, mask_cube_cell +from tobac.utils.general import get_bounding_box +from tobac.analysis.spatial import ( + calculate_distance, + calculate_velocity, +) + +__all__ = ( + "cell_statistics_all", + "cell_statistics", + "cog_cell", + "lifetime_histogram", + "velocity_histogram", + "histogram_cellwise", + "calculate_overlap", +) + + +def cell_statistics_all( + input_cubes, + track, + mask, + aggregators, + output_path="./", + cell_selection=None, + output_name="Profiles", + width=10000, + z_coord="model_level_number", + dimensions=["x", "y"], + **kwargs, +): + """ + Parameters + ---------- + input_cubes : iris.cube.Cube + + track : dask.dataframe.DataFrame + + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes 0 everywhere + else). + + aggregators : list + list of iris.analysis.Aggregator instances + + output_path : str, optional + Default is './'. + + cell_selection : optional + Default is None. + + output_name : str, optional + Default is 'Profiles'. + + width : int, optional + Default is 10000. + + z_coord : str, optional + Name of the vertical coordinate in the cube. Default is + 'model_level_number'. + + dimensions : list of str, optional + Default is ['x', 'y']. 
+ + **kwargs + + Returns + ------- + None + """ + warnings.warn( + "cell_statistics_all is depreciated and will be removed or significantly changed in v2.0.", + DeprecationWarning, + ) + + if cell_selection is None: + cell_selection = np.unique(track["cell"]) + for cell in cell_selection: + cell_statistics( + input_cubes=input_cubes, + track=track, + mask=mask, + dimensions=dimensions, + aggregators=aggregators, + cell=cell, + output_path=output_path, + output_name=output_name, + width=width, + z_coord=z_coord, + **kwargs, + ) + + +def cell_statistics( + input_cubes, + track, + mask, + aggregators, + cell, + output_path="./", + output_name="Profiles", + width=10000, + z_coord="model_level_number", + dimensions=["x", "y"], + **kwargs, +): + """ + Parameters + ---------- + input_cubes : iris.cube.Cube + + track : dask.dataframe.DataFrame + + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes 0 everywhere + else). + + aggregators list + list of iris.analysis.Aggregator instances + + cell : int + Integer id of cell to create masked cube for output. + + output_path : str, optional + Default is './'. + + output_name : str, optional + Default is 'Profiles'. + + width : int, optional + Default is 10000. + + z_coord : str, optional + Name of the vertical coordinate in the cube. Default is + 'model_level_number'. + + dimensions : list of str, optional + Default is ['x', 'y']. 
+ + **kwargs + + Returns + ------- + None + """ + + warnings.warn( + "cell_statistics is depreciated and will be removed or significantly changed in v2.0.", + DeprecationWarning, + ) + + # If input is single cube, turn into cubelist + if type(input_cubes) is Cube: + input_cubes = CubeList([input_cubes]) + + logging.debug("Start calculating profiles for cell " + str(cell)) + track_i = track[track["cell"] == cell] + + cubes_profile = {} + for aggregator in aggregators: + cubes_profile[aggregator.name()] = CubeList() + + for time_i in track_i["time"].values: + constraint_time = Constraint(time=time_i) + + mask_i = mask.extract(constraint_time) + mask_cell_i = mask_cell(mask_i, cell, track_i, masked=False) + mask_cell_surface_i = mask_cell_surface( + mask_i, cell, track_i, masked=False, z_coord=z_coord + ) + + x_dim = mask_cell_surface_i.coord_dims("projection_x_coordinate")[0] + y_dim = mask_cell_surface_i.coord_dims("projection_y_coordinate")[0] + x_coord = mask_cell_surface_i.coord("projection_x_coordinate") + y_coord = mask_cell_surface_i.coord("projection_y_coordinate") + + if (mask_cell_surface_i.core_data() > 0).any(): + box_mask_i = get_bounding_box(mask_cell_surface_i.core_data(), buffer=1) + + box_mask = [ + [ + x_coord.points[box_mask_i[x_dim][0]], + x_coord.points[box_mask_i[x_dim][1]], + ], + [ + y_coord.points[box_mask_i[y_dim][0]], + y_coord.points[box_mask_i[y_dim][1]], + ], + ] + else: + box_mask = [[np.nan, np.nan], [np.nan, np.nan]] + + x = track_i[track_i["time"].values == time_i]["projection_x_coordinate"].values[ + 0 + ] + y = track_i[track_i["time"].values == time_i]["projection_y_coordinate"].values[ + 0 + ] + + box_slice = [[x - width, x + width], [y - width, y + width]] + + x_min = np.nanmin([box_mask[0][0], box_slice[0][0]]) + x_max = np.nanmax([box_mask[0][1], box_slice[0][1]]) + y_min = np.nanmin([box_mask[1][0], box_slice[1][0]]) + y_max = np.nanmax([box_mask[1][1], box_slice[1][1]]) + + constraint_x = Constraint( + 
projection_x_coordinate=lambda cell: int(x_min) < cell < int(x_max) + ) + constraint_y = Constraint( + projection_y_coordinate=lambda cell: int(y_min) < cell < int(y_max) + ) + + constraint = constraint_time & constraint_x & constraint_y + # Mask_cell_surface_i=mask_cell_surface(Mask_w_i,cell,masked=False,z_coord='model_level_number') + mask_cell_i = mask_cell_i.extract(constraint) + mask_cell_surface_i = mask_cell_surface_i.extract(constraint) + + input_cubes_i = input_cubes.extract(constraint) + for cube in input_cubes_i: + cube_masked = mask_cube_cell(cube, mask_cell_i, cell, track_i) + coords_remove = [] + for coordinate in cube_masked.coords(dim_coords=False): + if coordinate.name() not in dimensions: + for dim in dimensions: + if set(cube_masked.coord_dims(coordinate)).intersection( + set(cube_masked.coord_dims(dim)) + ): + coords_remove.append(coordinate.name()) + for coordinate in set(coords_remove): + cube_masked.remove_coord(coordinate) + + for aggregator in aggregators: + cube_collapsed = cube_masked.collapsed(dimensions, aggregator, **kwargs) + # remove all collapsed coordinates (x and y dim, scalar now) and keep only time as all these coordinates are useless + for coordinate in cube_collapsed.coords(): + if not cube_collapsed.coord_dims(coordinate): + if coordinate.name() != "time": + cube_collapsed.remove_coord(coordinate) + logging.debug(str(cube_collapsed)) + cubes_profile[aggregator.name()].append(cube_collapsed) + + minutes = (track_i["time_cell"] / pd.Timedelta(minutes=1)).values + latitude = track_i["latitude"].values + longitude = track_i["longitude"].values + minutes_coord = AuxCoord(minutes, long_name="cell_time", units="min") + latitude_coord = AuxCoord(latitude, long_name="latitude", units="degrees") + longitude_coord = AuxCoord(longitude, long_name="longitude", units="degrees") + + for aggregator in aggregators: + cubes_profile[aggregator.name()] = cubes_profile[aggregator.name()].merge() + for cube in cubes_profile[aggregator.name()]: + 
cube.add_aux_coord(minutes_coord, data_dims=cube.coord_dims("time")) + cube.add_aux_coord(latitude_coord, data_dims=cube.coord_dims("time")) + cube.add_aux_coord(longitude_coord, data_dims=cube.coord_dims("time")) + os.makedirs( + os.path.join(output_path, output_name, aggregator.name()), exist_ok=True + ) + savefile = os.path.join( + output_path, + output_name, + aggregator.name(), + output_name + "_" + aggregator.name() + "_" + str(int(cell)) + ".nc", + ) + save(cubes_profile[aggregator.name()], savefile) + + +def cog_cell( + cell, + Tracks=None, + M_total=None, + M_liquid=None, + M_frozen=None, + Mask=None, + savedir=None, +): + """ + Parameters + ---------- + cell : int + Integer id of cell to create masked cube for output. + + Tracks : optional + Default is None. + + M_total : subset of cube, optional + Default is None. + + M_liquid : subset of cube, optional + Default is None. + + M_frozen : subset of cube, optional + Default is None. + + savedir : str + Default is None. + + Returns + ------- + None + """ + + warnings.warn( + "cog_cell is depreciated and will be removed or significantly changed in v2.0.", + DeprecationWarning, + ) + + logging.debug("Start calculating COG for " + str(cell)) + Track = Tracks[Tracks["cell"] == cell] + constraint_time = Constraint( + time=lambda cell: Track.head(1)["time"].values[0] + <= cell + <= Track.tail(1)["time"].values[0] + ) + M_total_i = M_total.extract(constraint_time) + M_liquid_i = M_liquid.extract(constraint_time) + M_frozen_i = M_frozen.extract(constraint_time) + Mask_i = Mask.extract(constraint_time) + + savedir_cell = os.path.join(savedir, "cells", str(int(cell))) + os.makedirs(savedir_cell, exist_ok=True) + savefile_COG_total_i = os.path.join( + savedir_cell, "COG_total" + "_" + str(int(cell)) + ".h5" + ) + savefile_COG_liquid_i = os.path.join( + savedir_cell, "COG_liquid" + "_" + str(int(cell)) + ".h5" + ) + savefile_COG_frozen_i = os.path.join( + savedir_cell, "COG_frozen" + "_" + str(int(cell)) + ".h5" + ) + + 
Tracks_COG_total_i = calculate_cog(Track, M_total_i, Mask_i) + # Tracks_COG_total_list.append(Tracks_COG_total_i) + logging.debug("COG total loaded for " + str(cell)) + + Tracks_COG_liquid_i = calculate_cog(Track, M_liquid_i, Mask_i) + # Tracks_COG_liquid_list.append(Tracks_COG_liquid_i) + logging.debug("COG liquid loaded for " + str(cell)) + Tracks_COG_frozen_i = calculate_cog(Track, M_frozen_i, Mask_i) + # Tracks_COG_frozen_list.append(Tracks_COG_frozen_i) + logging.debug("COG frozen loaded for " + str(cell)) + + Tracks_COG_total_i.to_hdf(savefile_COG_total_i, "table") + Tracks_COG_liquid_i.to_hdf(savefile_COG_liquid_i, "table") + Tracks_COG_frozen_i.to_hdf(savefile_COG_frozen_i, "table") + logging.debug("individual COG calculated and saved to " + savedir_cell) + + +def lifetime_histogram( + Track, bin_edges=np.arange(0, 200, 20), density=False, return_values=False +): + """Compute the lifetime histogram of tracked cells. + + Parameters + ---------- + Track : pandas.DataFrame + Dataframe of linked features, containing the columns 'cell' + and 'time_cell'. + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of equal-width + bins in the given range. If bins is a ndarray, it defines a + monotonically increasing array of bin edges, including the + rightmost edge. The unit is minutes. + Default is np.arange(0, 200, 20). + + density : bool, optional + If False, the result will contain the number of samples in + each bin. If True, the result is the value of the probability + density function at the bin, normalized such that the integral + over the range is 1. Default is False. + + return_values : bool, optional + Bool determining wether the lifetimes of the features are + returned from this function. Default is False. + + Returns + ------- + hist : ndarray + The values of the histogram. + + bin_edges : ndarray + The edges of the histogram. + + bin_centers : ndarray + The centers of the histogram intervalls. 
+ + minutes, optional : ndarray + Numpy.array of the lifetime of each feature in minutes. + Returned if return_values is True. + + """ + + Track_cell = Track.groupby("cell") + minutes = (Track_cell["time_cell"].max() / pd.Timedelta(minutes=1)).values + hist, bin_edges = np.histogram(minutes, bin_edges, density=density) + bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) + if return_values: + return hist, bin_edges, bin_centers, minutes + else: + return hist, bin_edges, bin_centers + + +def velocity_histogram( + track, + bin_edges=np.arange(0, 30, 1), + density=False, + method_distance=None, + return_values=False, +): + """Create an velocity histogram of the tracked cells. If the DataFrame + does not contain a velocity column, the velocities are calculated. + + Parameters + ---------- + track: pandas.DataFrame + DataFrame of the linked features, containing the columns 'cell', + 'time' and either 'projection_x_coordinate' and + 'projection_y_coordinate' or 'latitude' and 'longitude'. + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of equal-width + bins in the given range. If bins is a ndarray, it defines a + monotonically increasing array of bin edges, including the + rightmost edge. Default is np.arange(0, 30000, 500). + + density : bool, optional + If False, the result will contain the number of samples in + each bin. If True, the result is the value of the probability + density function at the bin, normalized such that the integral + over the range is 1. Default is False. + + methods_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation, used to calculate the velocity. + 'xy' uses the length of the vector between the two features, + 'latlon' uses the haversine distance. None checks wether the + required coordinates are present and starts with 'xy'. + Default is None. + + return_values : bool, optional + Bool determining wether the velocities of the features are + returned from this function. 
Default is False. + + Returns + ------- + hist : ndarray + The values of the histogram. + + bin_edges : ndarray + The edges of the histogram. + + velocities , optional : ndarray + Numpy array with the velocities of each feature. + + """ + + if "v" not in track.columns: + logging.info("calculate velocities") + track = calculate_velocity(track) + velocities = track["v"].values + hist, bin_edges = np.histogram( + velocities[~np.isnan(velocities)], bin_edges, density=density + ) + if return_values: + return hist, bin_edges, velocities + else: + return hist, bin_edges + + +def histogram_cellwise( + Track, variable=None, bin_edges=None, quantity="max", density=False +): + """Create a histogram of the maximum, minimum or mean of + a variable for the cells (series of features linked together + over multiple timesteps) of a track. Essentially a wrapper + of the numpy.histogram() method. + + Parameters + ---------- + Track : pandas.DataFrame + The track containing the variable to create the histogram + from. + + variable : string, optional + Column of the DataFrame with the variable on which the + histogram is to be based on. Default is None. + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of + equal-width bins in the given range. If bins is a ndarray, + it defines a monotonically increasing array of bin edges, + including the rightmost edge. + + quantity : {'max', 'min', 'mean'}, optional + Flag determining wether to use maximum, minimum or mean + of a variable from all timeframes the cell covers. + Default is 'max'. + + density : bool, optional + If False, the result will contain the number of samples + in each bin. If True, the result is the value of the + probability density function at the bin, normalized such + that the integral over the range is 1. + Default is False. 
+ + Returns + ------- + hist : ndarray + The values of the histogram + + bin_edges : ndarray + The edges of the histogram + + bin_centers : ndarray + The centers of the histogram intervalls + + Raises + ------ + ValueError + If quantity is not 'max', 'min' or 'mean'. + + """ + + Track_cell = Track.groupby("cell") + if quantity == "max": + variable_cell = Track_cell[variable].max().values + elif quantity == "min": + variable_cell = Track_cell[variable].min().values + elif quantity == "mean": + variable_cell = Track_cell[variable].mean().values + else: + raise ValueError("quantity unknown, must be max, min or mean") + hist, bin_edges = np.histogram(variable_cell, bin_edges, density=density) + bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) + + return hist, bin_edges, bin_centers + + +def calculate_overlap( + track_1, track_2, min_sum_inv_distance=None, min_mean_inv_distance=None +): + """Count the number of time frames in which the + individual cells of two tracks are present together + and calculate their mean and summed inverse distance. + + Parameters + ---------- + track_1, track_2 : pandas.DataFrame + The tracks conaining the cells to analyze. + + min_sum_inv_distance : float, optional + Minimum of the inverse net distance for two + cells to be counted as overlapping. + Default is None. + + min_mean_inv_distance : float, optional + Minimum of the inverse mean distance for two cells + to be counted as overlapping. Default is None. + + Returns + ------- + overlap : pandas.DataFrame + DataFrame containing the columns cell_1 and cell_2 + with the index of the cells from the tracks, + n_overlap with the number of frames both cells are + present in, mean_inv_distance with the mean inverse + distance and sum_inv_distance with the summed + inverse distance of the cells. 
+ + """ + + cells_1 = track_1["cell"].unique() + # n_cells_1_tot=len(cells_1) + cells_2 = track_2["cell"].unique() + overlap = pd.DataFrame() + for i_cell_1, cell_1 in enumerate(cells_1): + for cell_2 in cells_2: + track_1_i = track_1[track_1["cell"] == cell_1] + track_2_i = track_2[track_2["cell"] == cell_2] + track_1_i = track_1_i[track_1_i["time"].isin(track_2_i["time"])] + track_2_i = track_2_i[track_2_i["time"].isin(track_1_i["time"])] + if not track_1_i.empty: + n_overlap = len(track_1_i) + distances = [] + for i in range(len(track_1_i)): + distance = calculate_distance( + track_1_i.iloc[[i]], track_2_i.iloc[[i]], method_distance="xy" + ) + distances.append(distance) + # mean_distance=np.mean(distances) + mean_inv_distance = np.mean(1 / (1 + np.array(distances) / 1000)) + # mean_inv_squaredistance=np.mean(1/(1+(np.array(distances)/1000)**2)) + sum_inv_distance = np.sum(1 / (1 + np.array(distances) / 1000)) + # sum_inv_squaredistance=np.sum(1/(1+(np.array(distances)/1000)**2)) + overlap = overlap.append( + { + "cell_1": cell_1, + "cell_2": cell_2, + "n_overlap": n_overlap, + # 'mean_distance':mean_distance, + "mean_inv_distance": mean_inv_distance, + # 'mean_inv_squaredistance':mean_inv_squaredistance, + "sum_inv_distance": sum_inv_distance, + # 'sum_inv_squaredistance':sum_inv_squaredistance + }, + ignore_index=True, + ) + if min_sum_inv_distance: + overlap = overlap[(overlap["sum_inv_distance"] >= min_sum_inv_distance)] + if min_mean_inv_distance: + overlap = overlap[(overlap["mean_inv_distance"] >= min_mean_inv_distance)] + + return overlap diff --git a/tobac/analysis/feature_analysis.py b/tobac/analysis/feature_analysis.py new file mode 100644 index 00000000..7366cbdd --- /dev/null +++ b/tobac/analysis/feature_analysis.py @@ -0,0 +1,212 @@ +""" +Perform analysis on the properties of detected features +""" + +import logging +import numpy as np + +from tobac.analysis.spatial import ( + calculate_nearestneighbordistance, + calculate_area, +) + +__all__ = ( + 
"nearestneighbordistance_histogram", + "area_histogram", + "histogram_featurewise", +) + + +def nearestneighbordistance_histogram( + features, + bin_edges=np.arange(0, 30000, 500), + density=False, + method_distance=None, + return_values=False, +): + """Create an nearest neighbor distance histogram of the features. + If the DataFrame does not contain a 'min_distance' column, the + distances are calculated. + + ---------- + features + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of equal-width + bins in the given range. If bins is a ndarray, it defines a + monotonically increasing array of bin edges, including the + rightmost edge. Default is np.arange(0, 30000, 500). + + density : bool, optional + If False, the result will contain the number of samples in + each bin. If True, the result is the value of the probability + density function at the bin, normalized such that the integral + over the range is 1. Default is False. + + method_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation. 'xy' uses the length of the + vector between the two features, 'latlon' uses the haversine + distance. None checks wether the required coordinates are + present and starts with 'xy'. Default is None. + + return_values : bool, optional + Bool determining wether the nearest neighbor distance of the + features are returned from this function. Default is False. + + Returns + ------- + hist : ndarray + The values of the histogram. + + bin_edges : ndarray + The edges of the histogram. + + distances, optional : ndarray + A numpy array with the nearest neighbor distances of each + feature. 
+ + """ + + if "min_distance" not in features.columns: + logging.debug("calculate nearest neighbor distances") + features = calculate_nearestneighbordistance( + features, method_distance=method_distance + ) + distances = features["min_distance"].values + hist, bin_edges = np.histogram( + distances[~np.isnan(distances)], bin_edges, density=density + ) + if return_values: + return hist, bin_edges, distances + else: + return hist, bin_edges + + +def area_histogram( + features, + mask, + bin_edges=np.arange(0, 30000, 500), + density=False, + method_area=None, + return_values=False, + representative_area=False, +): + """Create an area histogram of the features. If the DataFrame + does not contain an area column, the areas are calculated. + + Parameters + ---------- + features : pandas.DataFrame + DataFrame of the features. + + mask : iris.cube.Cube + Cube containing mask (int for tracked volumes 0 + everywhere else). Needs to contain either + projection_x_coordinate and projection_y_coordinate or + latitude and longitude coordinates. The output of a + segmentation should be used here. + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of + equal-width bins in the given range. If bins is a ndarray, + it defines a monotonically increasing array of bin edges, + including the rightmost edge. + Default is np.arange(0, 30000, 500). + + density : bool, optional + If False, the result will contain the number of samples + in each bin. If True, the result is the value of the + probability density function at the bin, normalized such + that the integral over the range is 1. Default is False. + + return_values : bool, optional + Bool determining wether the areas of the features are + returned from this function. Default is False. + + representive_area: bool, optional + If False, no weights will associated to the values. + If True, the weights for each area will be the areas + itself, i.e. 
each bin count will have the value of + the sum of all areas within the edges of the bin. + Default is False. + + Returns + ------- + hist : ndarray + The values of the histogram. + + bin_edges : ndarray + The edges of the histogram. + + bin_centers : ndarray + The centers of the histogram intervalls. + + areas : ndarray, optional + A numpy array approximating the area of each feature. + + """ + + if "area" not in features.columns: + logging.info("calculate area") + features = calculate_area(features, mask, method_area) + areas = features["area"].values + # restrict to non NaN values: + areas = areas[~np.isnan(areas)] + if representative_area: + weights = areas + else: + weights = None + hist, bin_edges = np.histogram(areas, bin_edges, density=density, weights=weights) + bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) + + if return_values: + return hist, bin_edges, bin_centers, areas + else: + return hist, bin_edges, bin_centers + + +def histogram_featurewise(Track, variable=None, bin_edges=None, density=False): + """Create a histogram of a variable from the features + (detected objects at a single time step) of a track. + Essentially a wrapper of the numpy.histogram() method. + + Parameters + ---------- + Track : pandas.DataFrame + The track containing the variable to create the + histogram from. + + variable : string, optional + Column of the DataFrame with the variable on which the + histogram is to be based on. Default is None. + + bin_edges : int or ndarray, optional + If bin_edges is an int, it defines the number of + equal-width bins in the given range. If bins is + a sequence, it defines a monotonically increasing + array of bin edges, including the rightmost edge. + + density : bool, optional + If False, the result will contain the number of + samples in each bin. If True, the result is the + value of the probability density function at the + bin, normalized such that the integral over the + range is 1. Default is False. 
+ + Returns + ------- + hist : ndarray + The values of the histogram + + bin_edges : ndarray + The edges of the histogram + + bin_centers : ndarray + The centers of the histogram intervalls + + """ + + hist, bin_edges = np.histogram(Track[variable].values, bin_edges, density=density) + bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges) + + return hist, bin_edges, bin_centers diff --git a/tobac/analysis/spatial.py b/tobac/analysis/spatial.py new file mode 100644 index 00000000..8f1caa6d --- /dev/null +++ b/tobac/analysis/spatial.py @@ -0,0 +1,449 @@ +""" +Calculate spatial properties (distances, velocities, areas, volumes) of tracked objects +""" + +import logging +from itertools import combinations + +import numpy as np +import pandas as pd +import xarray as xr +from iris.analysis.cartography import area_weights + +from tobac.utils.bulk_statistics import get_statistics_from_mask +from tobac.utils.internal.basic import find_vertical_axis_from_coord +from tobac.utils import decorators + +__all__ = ( + "haversine", + "calculate_distance", + "calculate_velocity", + "calculate_velocity_individual", + "calculate_areas_2Dlatlon", + "calculate_area", +) + + +def haversine(lat1, lon1, lat2, lon2): + """Computes the Haversine distance in kilometers. + + Calculates the Haversine distance between two points + (based on implementation CIS https://github.com/cedadev/cis). + + Parameters + ---------- + lat1, lon1 : array of latitude, longitude + First point or points as array in degrees. + + lat2, lon2 : array of latitude, longitude + Second point or points as array in degrees. + + Returns + ------- + arclen * RADIUS_EARTH : array + Array of Distance(s) between the two points(-arrays) in + kilometers. 
+ + """ + + RADIUS_EARTH = 6378.0 + lat1 = np.radians(lat1) + lat2 = np.radians(lat2) + lon1 = np.radians(lon1) + lon2 = np.radians(lon2) + # print(lat1,lat2,lon1,lon2) + arclen = 2 * np.arcsin( + np.sqrt( + (np.sin((lat2 - lat1) / 2)) ** 2 + + np.cos(lat1) * np.cos(lat2) * (np.sin((lon2 - lon1) / 2)) ** 2 + ) + ) + return arclen * RADIUS_EARTH + + +def calculate_distance(feature_1, feature_2, method_distance=None): + """Compute the distance between two features. It is based on + either lat/lon coordinates or x/y coordinates. + + Parameters + ---------- + feature_1, feature_2 : pandas.DataFrame or pandas.Series + Dataframes containing multiple features or pandas.Series + of one feature. Need to contain either projection_x_coordinate + and projection_y_coordinate or latitude and longitude + coordinates. + + method_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation. 'xy' uses the length of the + vector between the two features, 'latlon' uses the haversine + distance. None checks wether the required coordinates are + present and starts with 'xy'. Default is None. + + Returns + ------- + distance : float or pandas.Series + Float with the distance between the two features in meters if + the input are two pandas.Series containing one feature, + pandas.Series of the distances if one of the inputs contains + multiple features. 
+ + """ + if method_distance is None: + if ( + ("projection_x_coordinate" in feature_1) + and ("projection_y_coordinate" in feature_1) + and ("projection_x_coordinate" in feature_2) + and ("projection_y_coordinate" in feature_2) + ): + method_distance = "xy" + elif ( + ("latitude" in feature_1) + and ("longitude" in feature_1) + and ("latitude" in feature_2) + and ("longitude" in feature_2) + ): + method_distance = "latlon" + else: + raise ValueError( + "either latitude/longitude or projection_x_coordinate/projection_y_coordinate have to be present to calculate distances" + ) + + if method_distance == "xy": + distance = np.sqrt( + ( + feature_1["projection_x_coordinate"] + - feature_2["projection_x_coordinate"] + ) + ** 2 + + ( + feature_1["projection_y_coordinate"] + - feature_2["projection_y_coordinate"] + ) + ** 2 + ) + elif method_distance == "latlon": + distance = 1000 * haversine( + feature_1["latitude"], + feature_1["longitude"], + feature_2["latitude"], + feature_2["longitude"], + ) + else: + raise ValueError("method undefined") + return distance + + +def calculate_velocity_individual(feature_old, feature_new, method_distance=None): + """Calculate the mean velocity of a feature between two timeframes. + + Parameters + ---------- + feature_old : pandas.Series + pandas.Series of a feature at a certain timeframe. Needs to + contain a 'time' column and either projection_x_coordinate + and projection_y_coordinate or latitude and longitude coordinates. + + feature_new : pandas.Series + pandas.Series of the same feature at a later timeframe. Needs + to contain a 'time' column and either projection_x_coordinate + and projection_y_coordinate or latitude and longitude coordinates. + + method_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation, used to calculate the velocity. + 'xy' uses the length of the vector between the two features, + 'latlon' uses the haversine distance. 
None checks wether the + required coordinates are present and starts with 'xy'. + Default is None. + + Returns + ------- + velocity : float + Value of the approximate velocity. + + """ + + distance = calculate_distance( + feature_old, feature_new, method_distance=method_distance + ) + diff_time = (feature_new["time"] - feature_old["time"]).total_seconds() + velocity = distance / diff_time + return velocity + + +def calculate_velocity(track, method_distance=None): + """Calculate the velocities of a set of linked features. + + Parameters + ---------- + track : pandas.DataFrame + Dataframe of linked features, containing the columns 'cell', + 'time' and either 'projection_x_coordinate' and + 'projection_y_coordinate' or 'latitude' and 'longitude'. + + method_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation, used to calculate the + velocity. 'xy' uses the length of the vector between the + two features, 'latlon' uses the haversine distance. None + checks wether the required coordinates are present and + starts with 'xy'. Default is None. + + Returns + ------- + track : pandas.DataFrame + DataFrame from the input, with an additional column 'v', + contain the value of the velocity for every feature at + every possible timestep + """ + + for cell_i, track_i in track.groupby("cell"): + index = track_i.index.values + for i, index_i in enumerate(index[:-1]): + velocity = calculate_velocity_individual( + track_i.loc[index[i]], + track_i.loc[index[i + 1]], + method_distance=method_distance, + ) + track.at[index_i, "v"] = velocity + return track + + +def calculate_nearestneighbordistance(features, method_distance=None): + """Calculate the distance between a feature and the nearest other + feature in the same timeframe. + + Parameters + ---------- + features : pandas.DataFrame + DataFrame of the features whose nearest neighbor distance is to + be calculated. 
Needs to contain either projection_x_coordinate + and projection_y_coordinate or latitude and longitude coordinates. + + method_distance : {None, 'xy', 'latlon'}, optional + Method of distance calculation. 'xy' uses the length of the vector + between the two features, 'latlon' uses the haversine distance. + None checks wether the required coordinates are present and starts + with 'xy'. Default is None. + + Returns + ------- + features : pandas.DataFrame + DataFrame of the features with a new column 'min_distance', + containing the calculated minimal distance to other features. + + """ + + features["min_distance"] = np.nan + for time_i, features_i in features.groupby("time"): + logging.debug(str(time_i)) + indeces = combinations(features_i.index.values, 2) + # Loop over combinations to remove features that are closer together than min_distance and keep larger one (either higher threshold or larger area) + distances = [] + for index_1, index_2 in indeces: + if index_1 is not index_2: + distance = calculate_distance( + features_i.loc[index_1], + features_i.loc[index_2], + method_distance=method_distance, + ) + distances.append( + pd.DataFrame( + {"index_1": index_1, "index_2": index_2, "distance": distance}, + index=[0], + ) + ) + if any([x is not None for x in distances]): + distances = pd.concat(distances, ignore_index=True) + for i in features_i.index: + min_distance = distances.loc[ + (distances["index_1"] == i) | (distances["index_2"] == i), + "distance", + ].min() + features.at[i, "min_distance"] = min_distance + return features + + +def calculate_areas_2Dlatlon(_2Dlat_coord, _2Dlon_coord): + """Calculate an array of cell areas when given two 2D arrays + of latitude and longitude values + + NOTE: This currently assuems that the lat/lon grid is orthogonal, + which is not strictly true! It's close enough for most cases, but + should be updated in future to use the cross product of the + distances to the neighbouring cells. 
This will require the use + of a more advanced calculation. I would advise using pyproj + at some point in the future to solve this issue and replace + haversine distance. + + Parameters + ---------- + _2Dlat_coord : AuxCoord + Iris auxilliary coordinate containing a 2d grid of latitudes + for each point. + + _2Dlon_coord : AuxCoord + Iris auxilliary coordinate containing a 2d grid of longitudes + for each point. + + Returns + ------- + area : ndarray + A numpy array approximating the area of each cell. + + """ + + hdist1 = ( + haversine( + _2Dlat_coord.points[:-1], + _2Dlon_coord.points[:-1], + _2Dlat_coord.points[1:], + _2Dlon_coord.points[1:], + ) + * 1000 + ) + + dists1 = np.zeros(_2Dlat_coord.points.shape) + dists1[0] = hdist1[0] + dists1[-1] = hdist1[-1] + dists1[1:-1] = (hdist1[0:-1] + hdist1[1:]) * 0.5 + + hdist2 = ( + haversine( + _2Dlat_coord.points[:, :-1], + _2Dlon_coord.points[:, :-1], + _2Dlat_coord.points[:, 1:], + _2Dlon_coord.points[:, 1:], + ) + * 1000 + ) + + dists2 = np.zeros(_2Dlat_coord.points.shape) + dists2[:, 0] = hdist2[:, 0] + dists2[:, -1] = hdist2[:, -1] + dists2[:, 1:-1] = (hdist2[:, 0:-1] + hdist2[:, 1:]) * 0.5 + + area = dists1 * dists2 + + return area + + +@decorators.xarray_to_iris() +def calculate_area(features, mask, method_area=None, vertical_coord=None): + """Calculate the area of the segments for each feature. + + Parameters + ---------- + features : pandas.DataFrame + DataFrame of the features whose area is to be calculated. + + mask : iris.cube.Cube + Cube containing mask (int for tracked volumes 0 everywhere + else). Needs to contain either projection_x_coordinate and + projection_y_coordinate or latitude and longitude + coordinates. + + method_area : {None, 'xy', 'latlon'}, optional + Flag determining how the area is calculated. 
'xy' uses the + areas of the individual pixels, 'latlon' uses the + area_weights method of iris.analysis.cartography, None + checks wether the required coordinates are present and + starts with 'xy'. Default is None. + + vertical_coord: None | str, optional (default: None) + Name of the vertical coordinate. If None, tries to auto-detect. + It looks for the coordinate or the dimension name corresponding + to the string. + + Returns + ------- + features : pandas.DataFrame + DataFrame of the features with a new column 'area', + containing the calculated areas. + + Raises + ------ + ValueError + If neither latitude/longitude nor + projection_x_coordinate/projection_y_coordinate are + present in mask_coords. + + If latitude/longitude coordinates are 2D. + + If latitude/longitude shapes are not supported. + + If method is undefined, i.e. method is neither None, + 'xy' nor 'latlon'. + + """ + + features["area"] = np.nan + + # Get the first time step of mask to remove time dimension of calculated areas + mask_slice = next(mask.slices_over("time")) + is_3d = len(mask_slice.core_data().shape) == 3 + if is_3d: + vertical_coord_name = find_vertical_axis_from_coord(mask_slice, vertical_coord) + # Need to get var_name as xarray uses this to label dims + collapse_dim = mask_slice.coords(vertical_coord_name)[0].var_name + else: + collapse_dim = None + + mask_coords = [coord.name() for coord in mask_slice.coords()] + if method_area is None: + if ("projection_x_coordinate" in mask_coords) and ( + "projection_y_coordinate" in mask_coords + ): + method_area = "xy" + elif ("latitude" in mask_coords) and ("longitude" in mask_coords): + method_area = "latlon" + else: + raise ValueError( + "either latitude/longitude or projection_x_coordinate/projection_y_coordinate have to be present to calculate distances" + ) + # logging.debug("calculating area using method " + method_area) + if method_area == "xy": + if not ( + mask_slice.coord("projection_x_coordinate").has_bounds() + and 
mask_slice.coord("projection_y_coordinate").has_bounds() + ): + mask_slice.coord("projection_x_coordinate").guess_bounds() + mask_slice.coord("projection_y_coordinate").guess_bounds() + area = np.outer( + np.diff(mask_slice.coord("projection_y_coordinate").bounds, axis=1), + np.diff(mask_slice.coord("projection_x_coordinate").bounds, axis=1), + ) + elif method_area == "latlon": + if (mask_slice.coord("latitude").ndim == 1) and ( + mask_slice.coord("longitude").ndim == 1 + ): + if not ( + mask_slice.coord("latitude").has_bounds() + and mask_slice.coord("longitude").has_bounds() + ): + mask_slice.coord("latitude").guess_bounds() + mask_slice.coord("longitude").guess_bounds() + area = area_weights(mask_slice, normalize=False) + elif ( + mask_slice.coord("latitude").ndim == 2 + and mask_slice.coord("longitude").ndim == 2 + ): + area = calculate_areas_2Dlatlon( + mask_slice.coord("latitude"), mask_slice.coord("longitude") + ) + else: + raise ValueError("latitude/longitude coordinate shape not supported") + else: + raise ValueError("method undefined") + + # Area needs to be a dataarray for get_statistics from mask, but otherwise dims/coords don't actually matter + area = xr.DataArray(area, dims=("a", "b")) + + features = get_statistics_from_mask( + features, + mask, + area, + statistic={"area": np.sum}, + default=np.nan, + collapse_dim=collapse_dim, + ) + + return features diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index d98f9ed0..a21d8db1 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -16,6 +16,7 @@ diverse datasets. Geoscientific Model Development, 12(11), 4551-4570. 
""" + from __future__ import annotations from typing import Union, Callable import warnings @@ -1127,7 +1128,7 @@ def feature_detection_multithreshold_timestep( return features_thresholds -@decorators.xarray_to_iris +@decorators.xarray_to_iris() def feature_detection_multithreshold( field_in: iris.cube.Cube, dxy: float = None, diff --git a/tobac/plotting.py b/tobac/plotting.py index d4e1d72e..63cc6c09 100644 --- a/tobac/plotting.py +++ b/tobac/plotting.py @@ -14,13 +14,17 @@ 12(11), 4551-4570. """ -import matplotlib as mpl import warnings import logging -from .analysis import lifetime_histogram -from .analysis import histogram_cellwise, histogram_featurewise import numpy as np +import matplotlib as mpl + +from tobac.analysis.cell_analysis import ( + lifetime_histogram, + histogram_cellwise, +) +from tobac.analysis.feature_analysis import histogram_featurewise def plot_tracks_mask_field_loop( diff --git a/tobac/segmentation.py b/tobac/segmentation.py index cfb3d8cd..486100df 100644 --- a/tobac/segmentation.py +++ b/tobac/segmentation.py @@ -29,6 +29,7 @@ diverse datasets. Geoscientific Model Development, 12(11), 4551-4570. 
""" + import copy import logging @@ -330,7 +331,7 @@ def segmentation_2D( ) -@decorators.xarray_to_iris +@decorators.xarray_to_iris() def segmentation_timestep( field_in: iris.cube.Cube, features_in: pd.DataFrame, @@ -1117,7 +1118,7 @@ def check_add_unseeded_across_bdrys( return markers_out -@decorators.xarray_to_iris +@decorators.xarray_to_iris() def segmentation( features: pd.DataFrame, field: iris.cube.Cube, diff --git a/tobac/tests/test_analysis_spatial.py b/tobac/tests/test_analysis_spatial.py new file mode 100644 index 00000000..0ed2c16a --- /dev/null +++ b/tobac/tests/test_analysis_spatial.py @@ -0,0 +1,588 @@ +""" +Test spatial analysis functions +""" + +from datetime import datetime +import pytest +import numpy as np +import pandas as pd +import xarray as xr +from iris.analysis.cartography import area_weights + +from tobac.analysis.spatial import ( + calculate_distance, + calculate_velocity_individual, + calculate_velocity, + calculate_nearestneighbordistance, + calculate_area, + calculate_areas_2Dlatlon, +) + + +def test_calculate_distance(): + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + with pytest.raises(ValueError): + calculate_distance(test_features.iloc[0], test_features.iloc[1]) + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + "projection_x_coordinate": [0, 1000], + "projection_y_coordinate": [0, 0], + } + ) + + assert calculate_distance(test_features.iloc[0], test_features.iloc[1]) == 1000 + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + "longitude": [0, 1], + "latitude": [0, 0], + } + ) + + assert calculate_distance( + test_features.iloc[0], test_features.iloc[1] + ) == pytest.approx(1.11e5, rel=1e4) + + with pytest.raises(ValueError): + 
calculate_distance( + test_features.iloc[0], + test_features.iloc[1], + method_distance="invalid_method", + ) + + +def test_calculate_velocity_individual(): + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 1], + "time": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 1, 0, 10), + ], + "projection_x_coordinate": [0, 6000], + "projection_y_coordinate": [0, 0], + } + ) + + assert ( + calculate_velocity_individual(test_features.iloc[0], test_features.iloc[1]) + == 10 + ) + + +def test_calculate_velocity(): + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 1], + "time": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 1, 0, 10), + ], + "projection_x_coordinate": [0, 6000], + "projection_y_coordinate": [0, 0], + "cell": [1, 1], + } + ) + + assert calculate_velocity(test_features).at[0, "v"] == 10 + + +def test_calculate_nearestneighbordistance(): + test_features = pd.DataFrame( + { + "feature": [1, 2, 3, 4], + "frame": [0, 0, 1, 1], + "time": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 1, 0, 10), + datetime(2000, 1, 1, 0, 10), + ], + "projection_x_coordinate": [0, 1000, 0, 2000], + "projection_y_coordinate": [0, 0, 0, 0], + "cell": [1, 2, 1, 2], + } + ) + + assert calculate_nearestneighbordistance(test_features)[ + "min_distance" + ].to_list() == [1000, 1000, 2000, 2000] + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 1], + "time": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 1, 0, 10), + ], + "projection_x_coordinate": [0, 6000], + "projection_y_coordinate": [0, 0], + "cell": [1, 1], + } + ) + + assert np.all( + np.isnan(calculate_nearestneighbordistance(test_features)["min_distance"]) + ) + + +def test_calculate_area(): + """ + Test the calculate_area function for 2D and 3D masks + """ + + test_labels = np.array( + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + ], + dtype=int, + ) + + 
test_labels = xr.DataArray( + test_labels, + dims=("time", "projection_y_coordinate", "projection_x_coordinate"), + coords={ + "time": [datetime(2000, 1, 1)], + "projection_y_coordinate": np.arange(5), + "projection_x_coordinate": np.arange(5), + }, + ) + + # We need to do this to avoid round trip bug with xarray to iris conversion + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + expected_areas = np.array([3, 2]) + + area = calculate_area(test_features, test_cube) + + assert np.all(area["area"] == expected_areas) + + test_labels = np.array( + [ + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 1, 0, 3, 0], + [0, 1, 0, 3, 0], + [0, 0, 0, 0, 0], + ], + ], + ], + dtype=int, + ) + + test_labels = xr.DataArray( + test_labels, + dims=( + "time", + "model_level_number", + "projection_y_coordinate", + "projection_x_coordinate", + ), + coords={ + "time": [datetime(2000, 1, 1)], + "model_level_number": np.arange(2), + "projection_y_coordinate": np.arange(5), + "projection_x_coordinate": np.arange(5), + }, + ) + + # We need to do this to avoid round trip bug with xarray to iris conversion + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + + test_features = pd.DataFrame( + { + "feature": [1, 2, 3], + "frame": [0, 0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + expected_areas = np.array([3, 2, 2]) + + area = calculate_area(test_features, test_cube) + + assert np.all(area["area"] == expected_areas) + + test_labels = xr.DataArray( + test_labels, + dims=( + "time", + "model_level_number", + "hdim_0", + "hdim_1", + ), + coords={ + "time": [datetime(2000, 1, 1)], + 
"model_level_number": np.arange(2), + }, + ) + + # Test failure to find valid coordinates + with pytest.raises(ValueError): + calculate_area(test_features, test_labels) + + # Test failure for invalid method + with pytest.raises(ValueError): + calculate_area(test_features, test_labels, method_area="invalid_method") + + +def test_calculate_area_latlon(): + # Test with latitude/longitude + test_labels = np.array( + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + [ + [0, 0, 0, 0, 0], + [0, 4, 0, 0, 0], + [0, 4, 0, 3, 0], + [0, 4, 0, 3, 0], + [0, 0, 0, 0, 0], + ], + ], + dtype=int, + ) + + test_labels = xr.DataArray( + test_labels, + dims=( + "time", + "latitude", + "longitude", + ), + coords={ + "time": [datetime(2000, 1, 1), datetime(2000, 1, 1, 1)], + "latitude": xr.DataArray( + np.arange(5), dims="latitude", attrs={"units": "degrees"} + ), + "longitude": xr.DataArray( + np.arange(5), dims="longitude", attrs={"units": "degrees"} + ), + }, + ) + + test_features = pd.DataFrame( + { + "feature": [1, 2, 3, 4], + "frame": [0, 0, 1, 1], + "time": [ + datetime(2000, 1, 1, 0), + datetime(2000, 1, 1, 0), + datetime(2000, 1, 1, 1), + datetime(2000, 1, 1, 1), + ], + } + ) + + area = calculate_area(test_features, test_labels) + + expected_areas = np.array([3, 2, 2, 3]) * 1.11e5**2 + + assert np.all(np.isclose(area["area"], expected_areas, atol=1e8)) + + # Test invalid lat/lon dimensions + + # Test 1D lat but 2D lon + test_labels = xr.DataArray( + test_labels.values, + dims=( + "time", + "y_dim", + "x_dim", + ), + coords={ + "time": [datetime(2000, 1, 1), datetime(2000, 1, 1, 1)], + "latitude": xr.DataArray( + np.arange(5), dims="y_dim", attrs={"units": "degrees"} # 1D lat + ), + "longitude": xr.DataArray( + np.tile(np.arange(5), (5, 1)), + dims=("y_dim", "x_dim"), # 2D lon + attrs={"units": "degrees"}, + ), + }, + ) + + with pytest.raises(ValueError): + calculate_area(test_features, test_labels, method_area="latlon") + 
+ # Test 3D lat/lon + test_labels = xr.DataArray( + np.tile(test_labels.values[:, np.newaxis, ...], (1, 2, 1, 1)), + dims=( + "time", + "z_dim", + "y_dim", + "x_dim", + ), + coords={ + "time": [datetime(2000, 1, 1), datetime(2000, 1, 1, 1)], + "latitude": xr.DataArray( + np.tile(np.arange(5)[:, np.newaxis], (2, 1, 5)), + dims=("z_dim", "y_dim", "x_dim"), + attrs={"units": "degrees"}, + ), + "longitude": xr.DataArray( + np.tile(np.arange(5), (2, 5, 1)), + dims=("z_dim", "y_dim", "x_dim"), + attrs={"units": "degrees"}, + ), + }, + ) + + with pytest.raises(ValueError): + calculate_area(test_features, test_labels, method_area="latlon") + + +def test_calculate_area_1D_latlon(): + """ + Test area calculation using 1D lat/lon coords + """ + test_labels = np.array( + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + ], + dtype=int, + ) + + test_labels = xr.DataArray( + test_labels, + dims=("time", "latitude", "longitude"), + coords={ + "time": [datetime(2000, 1, 1)], + "latitude": xr.DataArray( + np.arange(5), dims=("latitude",), attrs={"units": "degrees"} + ), + "longitude": xr.DataArray( + np.arange(5), dims=("longitude",), attrs={"units": "degrees"} + ), + }, + ) + + # We need to do this to avoid round trip bug with xarray to iris conversion + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + # Calculate expected areas + copy_of_test_cube = test_cube.copy() + copy_of_test_cube.coord("latitude").guess_bounds() + copy_of_test_cube.coord("longitude").guess_bounds() + area_array = area_weights(copy_of_test_cube, normalize=False) + + expected_areas = np.array( + [np.sum(area_array[test_labels.data == i]) for i in [1, 2]] + ) + + area = calculate_area(test_features, test_cube) + + assert np.all(area["area"] == 
expected_areas) + + +def test_calculate_areas_2Dlatlon(): + """ + Test calculation of area array from 2D lat/lon coords + Note, in future this needs to be updated to account for non-orthogonal lat/lon arrays + """ + + test_labels = np.ones([1, 5, 5], dtype=int) + + test_labels = xr.DataArray( + test_labels, + dims=("time", "latitude", "longitude"), + coords={ + "time": [datetime(2000, 1, 1)], + "latitude": xr.DataArray( + np.arange(5), dims=("latitude",), attrs={"units": "degrees"} + ), + "longitude": xr.DataArray( + np.arange(5), dims=("longitude",), attrs={"units": "degrees"} + ), + }, + ) + + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + copy_of_test_cube = test_cube.copy() + copy_of_test_cube.coord("latitude").guess_bounds() + copy_of_test_cube.coord("longitude").guess_bounds() + area_array = area_weights(copy_of_test_cube, normalize=False) + + lat_2d = xr.DataArray( + np.stack([np.arange(5)] * 5, axis=1), + dims=("y", "x"), + attrs={"units": "degrees"}, + ) + + lon_2d = xr.DataArray( + np.stack([np.arange(5)] * 5, axis=0), + dims=("y", "x"), + attrs={"units": "degrees"}, + ) + + test_labels = xr.DataArray( + test_labels, + dims=("time", "y", "x"), + coords={ + "time": [datetime(2000, 1, 1)], + "latitude": lat_2d, + "longitude": lon_2d, + }, + ) + + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + + assert np.allclose( + calculate_areas_2Dlatlon( + test_cube.coord("latitude"), test_cube.coord("longitude") + ), + area_array, + rtol=0.01, + ) + + +def test_calculate_area_2D_latlon(): + """ + Test area calculation using 2D lat/lon coords + """ + + test_labels = np.array( + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + ], + dtype=int, + ) + + lat_2d = xr.DataArray( + np.stack([np.arange(5)] * 5, axis=1), + dims=("y", "x"), + attrs={"units": "degrees"}, + ) + + lon_2d = xr.DataArray( + 
np.stack([np.arange(5)] * 5, axis=0), + dims=("y", "x"), + attrs={"units": "degrees"}, + ) + + test_labels = xr.DataArray( + test_labels, + dims=("time", "y", "x"), + coords={ + "time": [datetime(2000, 1, 1)], + "latitude": lat_2d, + "longitude": lon_2d, + }, + ) + + test_cube = test_labels.to_iris() + test_cube = test_cube.copy(test_cube.core_data().filled()) + + area_array = calculate_areas_2Dlatlon( + test_cube.coord("latitude"), test_cube.coord("longitude") + ) + + expected_areas = np.array( + [np.sum(area_array[test_labels[0].data == i]) for i in [1, 2]] + ) + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + area = calculate_area(test_features, test_cube) + + assert np.all(area["area"] == expected_areas) diff --git a/tobac/tests/test_decorators.py b/tobac/tests/test_decorators.py new file mode 100644 index 00000000..01a3a0ad --- /dev/null +++ b/tobac/tests/test_decorators.py @@ -0,0 +1,147 @@ +""" +Tests for tobac.utils.decorators +""" +import numpy as np +import pandas as pd +import xarray as xr +import iris + +from tobac.utils import decorators + + +def test_convert_cube_to_dataarray(): + test_da_float = xr.DataArray(np.arange(15, dtype=float).reshape(3, 5) + 0.5) + test_da_int = xr.DataArray(np.arange(15, dtype=int).reshape(3, 5)) + + assert np.all( + decorators.convert_cube_to_dataarray(test_da_float.to_iris()) + == test_da_float.values + ) + assert np.all( + decorators.convert_cube_to_dataarray(test_da_int.to_iris()) + == test_da_int.values + ) + + +def test_conv_kwargs_iris_to_xarray(): + assert decorators._conv_kwargs_iris_to_xarray({}) == {} + assert decorators._conv_kwargs_iris_to_xarray(dict(test_int=1)) == dict(test_int=1) + + test_da = xr.DataArray(np.arange(5)) + + test_xr_kwarg = decorators._conv_kwargs_iris_to_xarray(dict(test_xr=test_da)) + assert isinstance(test_xr_kwarg["test_xr"], xr.DataArray) + + test_iris_kwarg = 
decorators._conv_kwargs_iris_to_xarray( + dict(test_iris=test_da.to_iris()) + ) + assert isinstance(test_iris_kwarg["test_iris"], xr.DataArray) + + +def test_conv_kwargs_irispandas_to_xarray(): + assert decorators._conv_kwargs_irispandas_to_xarray({}) == {} + assert decorators._conv_kwargs_irispandas_to_xarray(dict(test_int=1)) == dict( + test_int=1 + ) + + test_da = xr.DataArray(np.arange(5)) + + test_xr_kwarg = decorators._conv_kwargs_irispandas_to_xarray(dict(test_xr=test_da)) + assert isinstance(test_xr_kwarg["test_xr"], xr.DataArray) + + test_iris_kwarg = decorators._conv_kwargs_irispandas_to_xarray( + dict(test_iris=test_da.to_iris()) + ) + assert isinstance(test_iris_kwarg["test_iris"], xr.DataArray) + + test_ds = xr.Dataset({"test": test_da}) + test_ds_kwarg = decorators._conv_kwargs_irispandas_to_xarray(dict(test_xr=test_ds)) + assert isinstance(test_ds_kwarg["test_xr"], xr.Dataset) + + test_pd_kwarg = decorators._conv_kwargs_irispandas_to_xarray( + dict(test_pd=test_ds.to_pandas()) + ) + assert isinstance(test_pd_kwarg["test_pd"], xr.Dataset) + + +def test_conv_kwargs_xarray_to_iris(): + assert decorators._conv_kwargs_xarray_to_iris({}) == {} + assert decorators._conv_kwargs_xarray_to_iris(dict(test_int=1)) == dict(test_int=1) + + test_da = xr.DataArray(np.arange(5)) + + test_xr_kwarg = decorators._conv_kwargs_xarray_to_iris(dict(test_xr=test_da)) + assert isinstance(test_xr_kwarg["test_xr"], iris.cube.Cube) + + test_iris_kwarg = decorators._conv_kwargs_xarray_to_iris( + dict(test_iris=test_da.to_iris()) + ) + assert isinstance(test_iris_kwarg["test_iris"], iris.cube.Cube) + + +def test_conv_kwargs_xarray_to_irispandas(): + assert decorators._conv_kwargs_xarray_to_irispandas({}) == {} + assert decorators._conv_kwargs_xarray_to_irispandas(dict(test_int=1)) == dict( + test_int=1 + ) + + test_da = xr.DataArray(np.arange(5)) + + test_xr_kwarg = decorators._conv_kwargs_xarray_to_irispandas(dict(test_xr=test_da)) + assert isinstance(test_xr_kwarg["test_xr"], 
iris.cube.Cube) + + test_iris_kwarg = decorators._conv_kwargs_xarray_to_irispandas( + dict(test_iris=test_da.to_iris()) + ) + assert isinstance(test_iris_kwarg["test_iris"], iris.cube.Cube) + + test_ds = xr.Dataset({"test": test_da}) + test_ds_kwarg = decorators._conv_kwargs_xarray_to_irispandas(dict(test_xr=test_ds)) + assert isinstance(test_ds_kwarg["test_xr"], pd.DataFrame) + + test_pd_kwarg = decorators._conv_kwargs_xarray_to_irispandas( + dict(test_pd=test_ds.to_pandas()) + ) + assert isinstance(test_pd_kwarg["test_pd"], pd.DataFrame) + + +@decorators.iris_to_xarray(save_iris_info=True) +def _test_iris_to_xarray(*args, **kwargs): + return kwargs["converted_from_iris"] + + +def test_iris_to_xarray(): + test_da = xr.DataArray(np.arange(5)) + + assert _test_iris_to_xarray(test_da) == False + assert _test_iris_to_xarray(kwarg_xr=test_da) == False + + assert _test_iris_to_xarray(test_da.to_iris()) == True + assert _test_iris_to_xarray(kwarg_ir=test_da.to_iris()) == True + + +@decorators.irispandas_to_xarray(save_iris_info=True) +def _test_irispandas_to_xarray(*args, **kwargs): + return kwargs["converted_from_iris"] + + +def test_irispandas_to_xarray(): + test_da = xr.DataArray(np.arange(5)) + + assert _test_irispandas_to_xarray(test_da) == False + assert _test_irispandas_to_xarray(kwarg_xr=test_da) == False + + assert _test_irispandas_to_xarray(test_da.to_iris()) == True + assert _test_irispandas_to_xarray(kwarg_ir=test_da.to_iris()) == True + + +@decorators.xarray_to_irispandas() +def _test_xarray_to_irispandas(*args, **kwargs): + return args, kwargs + + +def test_xarray_to_irispandas(): + test_da = xr.DataArray(np.arange(5, dtype=float)) + + assert isinstance(_test_xarray_to_irispandas(test_da)[0][0], iris.cube.Cube) + assert _test_xarray_to_irispandas(test_da)[1] == {} diff --git a/tobac/tests/test_sample_data.py b/tobac/tests/test_sample_data.py index 42f60344..bd395742 100644 --- a/tobac/tests/test_sample_data.py +++ b/tobac/tests/test_sample_data.py @@ -1,6 
+1,7 @@ """ Tests for tobac based on simple sample datasets with moving blobs. These tests should be adapted to be more modular in the future. """ + from tobac.testing import ( make_sample_data_2D_3blobs, make_sample_data_2D_3blobs_inv, diff --git a/tobac/tests/test_testing.py b/tobac/tests/test_testing.py index 2dd0577b..282c1d03 100644 --- a/tobac/tests/test_testing.py +++ b/tobac/tests/test_testing.py @@ -2,6 +2,7 @@ Audit of the testing functions that produce our test data. Who's watching the watchmen, basically. """ + import pytest from tobac.testing import ( generate_single_feature, diff --git a/tobac/tests/test_tracking.py b/tobac/tests/test_tracking.py index 1266b210..94f410c3 100644 --- a/tobac/tests/test_tracking.py +++ b/tobac/tests/test_tracking.py @@ -1,6 +1,7 @@ """ Test for the trackpy tracking functions """ + import datetime import pytest diff --git a/tobac/tests/test_utils_bulk_statistics.py b/tobac/tests/test_utils_bulk_statistics.py index 0c2ad190..4967a779 100644 --- a/tobac/tests/test_utils_bulk_statistics.py +++ b/tobac/tests/test_utils_bulk_statistics.py @@ -1,6 +1,7 @@ from datetime import datetime import numpy as np import pandas as pd +import pytest import xarray as xr import tobac import tobac.utils as tb_utils @@ -396,3 +397,214 @@ def test_bulk_statistics_broadcasting(): assert np.all( bulk_statistics_output["weighted_sum"] == expected_weighted_sum_result ) + + +def test_get_statistics_collapse_axis(): + """ + Test the collapse_axis keyword of get_statistics + """ + test_labels = np.array( + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 0, 0, 0, 0], + ], + dtype=int, + ) + + test_values = np.array([0.25, 0.5, 0.75, 1, 1]) + + test_features = pd.DataFrame( + { + "feature": [1, 2], + "frame": [0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + statistics_sum = {"sum": np.sum} + + expected_sum_result_axis0 = np.array([0.5, 1]) + output_collapse_axis0 = 
tb_utils.get_statistics( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_axis=0, + ) + assert np.all(output_collapse_axis0["sum"] == expected_sum_result_axis0) + + expected_sum_result_axis1 = np.array([2.25, 1.75]) + output_collapse_axis1 = tb_utils.get_statistics( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_axis=1, + ) + assert np.all(output_collapse_axis1["sum"] == expected_sum_result_axis1) + + # Check that attempting broadcast raises a ValueError + with pytest.raises(ValueError): + _ = tb_utils.get_statistics( + test_features, + test_labels, + test_values.reshape([5, 1]), + statistic=statistics_sum, + collapse_axis=0, + ) + + # Check that attempting to collapse all axes raises a ValueError: + with pytest.raises(ValueError): + _ = tb_utils.get_statistics( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_axis=[0, 1], + ) + + # Test with collapsing multiple axes + test_labels = np.array( + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 0, 0, 0, 0], + ], + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + ], + dtype=int, + ) + test_values = np.array([0.5, 1]) + expected_sum_result_axis12 = np.array([1.5, 0.5]) + output_collapse_axis12 = tb_utils.get_statistics( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_axis=[1, 2], + ) + assert np.all(output_collapse_axis12["sum"] == expected_sum_result_axis12) + + +def test_get_statistics_from_mask_collapse_dim(): + """ + Test the collapse_dim keyword of get_statistics_from_mask + """ + + test_labels = np.array( + [ + [ + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 2, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + [ + [0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 1, 0, 3, 0], + [0, 1, 0, 3, 0], + [0, 0, 0, 0, 0], + ], + ], + ], + dtype=int, + ) + + test_labels =
xr.DataArray( + test_labels, + dims=("time", "z", "y", "x"), + coords={ + "time": [datetime(2000, 1, 1)], + "z": np.arange(2), + "y": np.arange(5), + "x": np.arange(5), + }, + ) + + test_values = np.ones([5, 5]) + + test_values = xr.DataArray( + test_values, + dims=("x", "y"), + coords={ + "y": np.arange(5), + "x": np.arange(5), + }, + ) + + test_features = pd.DataFrame( + { + "feature": [1, 2, 3], + "frame": [0, 0, 0], + "time": [ + datetime(2000, 1, 1), + datetime(2000, 1, 1), + datetime(2000, 1, 1), + ], + } + ) + + statistics_sum = {"sum": np.sum} + + expected_sum_result = np.array([3, 2, 2]) + + # Test over a single dim + statistics_output = tb_utils.get_statistics_from_mask( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_dim="z", + ) + + assert np.all(statistics_output["sum"] == expected_sum_result) + + test_values = np.ones([2]) + + test_values = xr.DataArray( + test_values, + dims=("z",), + coords={ + "z": np.arange(2), + }, + ) + + expected_sum_result = np.array([2, 1, 1]) + + # Test over multiple dims + statistics_output = tb_utils.get_statistics_from_mask( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_dim=("x", "y"), + ) + + assert np.all(statistics_output["sum"] == expected_sum_result) + + # Test that collapse_dim not in labels raises an error + with pytest.raises(ValueError): + _ = statistics_output = tb_utils.get_statistics_from_mask( + test_features, + test_labels, + test_values, + statistic=statistics_sum, + collapse_dim="not_a_dim", + ) diff --git a/tobac/tracking.py b/tobac/tracking.py index 362ead05..558dd85e 100644 --- a/tobac/tracking.py +++ b/tobac/tracking.py @@ -385,7 +385,7 @@ def linking_trackpy( link_strategy="auto", adaptive_step=adaptive_step, adaptive_stop=adaptive_stop, - dist_func=dist_func + dist_func=dist_func, # copy_features=False, diagnostics=False, # hash_size=None, box_size=None, verify_integrity=True, # retain_index=False diff --git 
a/tobac/utils/bulk_statistics.py b/tobac/utils/bulk_statistics.py index e933f373..cb9ce0cf 100644 --- a/tobac/utils/bulk_statistics.py +++ b/tobac/utils/bulk_statistics.py @@ -3,16 +3,24 @@ or within feature detection or segmentation. """ + import logging import warnings -from . import internal as internal_utils -from . import decorators -from typing import Callable, Union from functools import partial +from typing import Callable, Union + import numpy as np + +# numpy renamed core to _core recently +try: + from numpy._core import multiarray as mu +except ModuleNotFoundError: + from numpy.core import multiarray as mu import pandas as pd import xarray as xr +from tobac.utils import decorators + def get_statistics( features: pd.DataFrame, @@ -24,6 +32,7 @@ def get_statistics( index: Union[None, list[int]] = None, default: Union[None, float] = None, id_column: str = "feature", + collapse_axis: Union[None, int, list[int]] = None, ) -> pd.DataFrame: """Get bulk statistics for objects (e.g. features or segmented features) given a labelled mask of the objects and any input field with the same @@ -66,6 +75,12 @@ def get_statistics( Name of the column in feature dataframe that contains IDs that match with the labels in mask. The default is the column "feature". + collapse_axis: None | int | list[int], optional (default: None): + Index or indices of axes of labels to collapse. This will reduce the dimensionality of labels + while allowing labelled features to overlap. 
This can be used, for example, to calculate the + footprint area (2D) of 3D labels + + Returns ------- features: pd.DataFrame @@ -74,16 +89,38 @@ def get_statistics( """ # if mask and input data dimensions do not match we can broadcast using numpy broadcasting rules - for field in fields: - if labels.shape != field.shape: + if collapse_axis is not None: + # Test if iterable and if not make a list + try: + collapse_axis = list(iter(collapse_axis)) + except TypeError: + collapse_axis = [collapse_axis] + + # Normalise axes to handle negative axis number conventions + ndim = len(labels.shape) + collapse_axis = [mu.normalize_axis_index(axis, ndim) for axis in collapse_axis] + uncollapsed_axes = [ + i for i, _ in enumerate(labels.shape) if i not in collapse_axis + ] + if not len(uncollapsed_axes): + raise ValueError("Cannot collapse all axes of labels") + collapsed_shape = tuple( + [s for i, s in enumerate(labels.shape) if i not in collapse_axis] + ) + broadcast_flag = any([collapsed_shape != field.shape for field in fields]) + if broadcast_flag: + raise ValueError("Broadcasting not supported with collapse_axis") + + else: + broadcast_flag = any([labels.shape != field.shape for field in fields]) + if broadcast_flag: # Broadcast input labels and fields to ensure they work according to numpy broadcasting rules broadcast_fields = np.broadcast_arrays(labels, *fields) labels = broadcast_fields[0] fields = broadcast_fields[1:] - break # mask must contain positive values to calculate statistics - if labels[labels > 0].size > 0: + if np.any(labels > 0): if index is None: index = features.feature.to_numpy() else: @@ -98,6 +135,20 @@ def get_statistics( bins = np.cumsum(np.bincount(np.maximum(labels.ravel(), 0))) argsorted = np.argsort(labels.ravel()) + # Create lambdas to get (ravelled) label locations using argsorted and bins + if collapse_axis is None: + label_locs = lambda i: argsorted[bins[i - 1] : bins[i]] + else: + # Collapse ravelled locations to the remaining axes + 
label_locs = lambda i: np.unique( + np.ravel_multi_index( + np.array( + np.unravel_index(argsorted[bins[i - 1] : bins[i]], labels.shape) + )[uncollapsed_axes], + collapsed_shape, + ) + ) + # apply each function given per statistic parameter for the labeled regions sorted in ascending order for stats_name in statistic.keys(): # if function is given as a tuple, take the input parameters provided @@ -119,14 +170,11 @@ def get_statistics( stats = np.array( [ - func( - *( - field.ravel()[argsorted[bins[i - 1] : bins[i]]] - for field in fields - ) + ( + func(*(field.ravel()[label_locs(i)] for field in fields)) + if i < bins.size and bins[i] > bins[i - 1] + else default ) - if i < bins.size and bins[i] > bins[i - 1] - else default for i in index ] ) @@ -172,6 +220,7 @@ def get_statistics_from_mask( index: Union[None, list[int]] = None, default: Union[None, float] = None, id_column: str = "feature", + collapse_dim: Union[None, str, list[str]] = None, ) -> pd.DataFrame: """Derives bulk statistics for each object in the segmentation mask, and returns a features Dataframe with these properties for each feature. @@ -206,28 +255,36 @@ def get_statistics_from_mask( default value to return in a region that has no values id_column: str, optional (default: "feature") - Name of the column in feature dataframe that contains IDs that match - with the labels in mask. The default is the column "feature". - - Returns - ------- - features: pd.DataFrame - Updated feature dataframe with bulk statistics for each feature saved in a new column + Name of the column in feature dataframe that contains IDs that match with the labels in mask. The default is the column "feature". + collapse_dim: None | str | list[str], optional (default: None) + Dimension names of labels to collapse, allowing, e.g.
calculation of statistics on 2D + fields for the footprint of 3D objects + + Returns: + ------- + features: pd.DataFrame + Updated feature dataframe with bulk statistics for each feature saved in a new column """ - - # check that mask and input data have the same dimensions - for field in fields: - if segmentation_mask.shape != field.shape: - warnings.warn( - "One or more field does not have the same shape as segmentation_mask. Numpy broadcasting rules will be applied" - ) - # warning when feature labels are not unique in dataframe if not features.feature.is_unique: raise logging.warning( "Feature labels are not unique which may cause unexpected results for the computation of bulk statistics." ) + if collapse_dim is not None: + if isinstance(collapse_dim, str): + collapse_dim = [collapse_dim] + non_time_dims = [dim for dim in segmentation_mask.dims if dim != "time"] + collapse_axis = [ + i for i, dim in enumerate(non_time_dims) if dim in collapse_dim + ] + if len(collapse_dim) != len(collapse_axis): + raise ValueError( + "One or more of collapse_dim not found in dimensions of segmentation_mask" + ) + else: + collapse_axis = None + # get bulk statistics for each timestep step_statistics = [] @@ -259,6 +316,7 @@ def get_statistics_from_mask( default=default, index=index, id_column=id_column, + collapse_axis=collapse_axis, ) ) diff --git a/tobac/utils/decorators.py b/tobac/utils/decorators.py index afe90d65..8bc6657f 100644 --- a/tobac/utils/decorators.py +++ b/tobac/utils/decorators.py @@ -3,9 +3,37 @@ import functools import warnings -import iris.cube + +import numpy as np +from numpy import ma import pandas as pd import xarray as xr +import iris.cube + + +def convert_cube_to_dataarray(cube): + """ + Convert an iris cube to an xarray dataarray, averting error for integer dtype cubes in xarray