From 93e88ad07c6eb36eeb63156bf1eb9537f652db1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20H=C3=B8xbro=20Hansen?= Date: Wed, 3 Jul 2024 17:39:42 +0200 Subject: [PATCH] Compatibility with geopandas 1.0 and dask-geopandas 0.4.0 (#1347) Co-authored-by: Demetris Roumis --- .github/workflows/test.yaml | 49 +---- .gitignore | 1 + datashader/core.py | 23 ++- datashader/tests/test_geopandas.py | 45 ++++- doc/conf.py | 3 + examples/user_guide/8_Polygons.ipynb | 269 ++++++++++++++++++++------- scripts/download_data.py | 1 + setup.py | 38 +--- 8 files changed, 261 insertions(+), 168 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 783a2a339..11a486f8f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -66,20 +66,9 @@ jobs: MATRIX=$(jq -nsc '{ "os": ["ubuntu-latest", "macos-latest", "windows-latest"], "python-version": ["3.9", "3.10", "3.11", "3.12"], - "numpy-version": ["1"], "exclude": [{ "os": "windows-latest", "python-version": "3.9" - }], - "include": [{ - "os": "ubuntu-latest", - "python-version": "3.12", - "numpy-version": "2" - }, - { - "os": "macos-latest", - "python-version": "3.12", - "numpy-version": "2" }] }') # Won't solve on Windows + Python 3.9 @@ -90,20 +79,9 @@ jobs: MATRIX=$(jq -nsc '{ "os": ["ubuntu-latest", "macos-latest", "windows-latest"], "python-version": ["3.9", "3.10", "3.11", "3.12"], - "numpy-version": ["1"], "exclude": [{ "os": "windows-latest", "python-version": "3.9" - }], - "include": [{ - "os": "ubuntu-latest", - "python-version": "3.12", - "numpy-version": "2" - }, - { - "os": "macos-latest", - "python-version": "3.12", - "numpy-version": "2" }] }') # Won't solve on Windows + Python 3.9 @@ -113,13 +91,12 @@ jobs: run: | MATRIX=$(jq -nsc '{ "os": ["ubuntu-latest"], - "numpy-version": ["1"], "python-version": ["3.11"] }') echo "MATRIX=$MATRIX" >> $GITHUB_ENV test_suite: - name: Tests on ${{ matrix.os }} with Python ${{ matrix.python-version }}, numpy ${{ matrix.numpy-version }} + name: Tests on ${{ matrix.os }} with Python ${{ matrix.python-version }} needs: [pre_commit, setup] runs-on: ${{ matrix.os }} strategy: @@ -132,32 +109,16 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - name: Set channels and envs - run: | - if [[ ${{ matrix.numpy-version }} == "2" ]]; then - channels="pyviz/label/dev,conda-forge/label/numpy_rc,numba/label/dev,conda-forge,nodefaults" - envs="-o numpy2" - else - channels="pyviz/label/dev,numba,conda-forge,nodefaults" - envs="-o tests -o examples" - fi - echo "CHANNELS=$channels" >> $GITHUB_ENV - echo "ENVS=$envs" >> $GITHUB_ENV - uses: holoviz-dev/holoviz_tasks/install@v0 with: - name: unit_test_suite_np${{ matrix.numpy-version }} + name: unit_test_suite python-version: ${{ matrix.python-version }} channel-priority: flexible - channels: ${{ env.CHANNELS }} - envs: ${{ env.ENVS }} + channels: "pyviz/label/dev,numba,conda-forge,nodefaults" + envs: "-o tests -o examples" cache: ${{ github.event.inputs.cache || github.event.inputs.cache == '' }} conda-update: true id: install - - name: check version - run: | - conda activate test-environment - python -c "import numba; print('Numba', numba.__version__)" - python -c "import numpy; print('Numpy', numpy.__version__)" - name: download data run: | conda activate test-environment @@ -174,7 +135,7 @@ jobs: NUMBA_DISABLE_JIT: 1 - name: doit test_examples env: - DASK_DATAFRAME__QUERY_PLANNING: false + DASK_DATAFRAME__QUERY_PLANNING: False run: | conda activate test-environment doit test_examples diff --git a/.gitignore 
b/.gitignore index 28c4fad3d..0da983948 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,7 @@ examples/export examples/tiles_output_directory examples/user_guide/export examples/user_guide/df_world.parq +examples/user_guide/sgeodf.parq **.org .doit* diff --git a/datashader/core.py b/datashader/core.py index 0c80a5421..3473050e4 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -3,11 +3,13 @@ from numbers import Number from math import log10 import warnings +import contextlib import numpy as np import pandas as pd import dask.dataframe as dd import dask.array as da +from packaging.version import Version from xarray import DataArray, Dataset from .utils import Dispatcher, ngjit, calc_res, calc_bbox, orient_array, \ @@ -1274,20 +1276,23 @@ def _source_from_geopandas(self, source): If so, spatially filter the source and return it. If not, return None. """ - try: + dfs = [] + with contextlib.suppress(ImportError): import geopandas - except ImportError: - geopandas = None + dfs.append(geopandas.GeoDataFrame) - try: + with contextlib.suppress(ImportError): import dask_geopandas - except ImportError: - dask_geopandas = None + if Version(dask_geopandas.__version__) >= Version("0.4.0"): + from dask_geopandas.core import GeoDataFrame as gdf1 + from dask_geopandas.expr import GeoDataFrame as gdf2 - if ((geopandas and isinstance(source, geopandas.GeoDataFrame)) or - (dask_geopandas and isinstance(source, dask_geopandas.GeoDataFrame))): + dfs.extend([gdf1, gdf2]) + else: + dfs.append(dask_geopandas.GeoDataFrame) + + if isinstance(source, tuple(dfs)): # Explicit shapely version check as cannot continue unless shapely >= 2 - from packaging.version import Version from shapely import __version__ as shapely_version if Version(shapely_version) < Version('2.0.0'): raise ImportError("Use of GeoPandas in Datashader requires Shapely >= 2.0.0") diff --git a/datashader/tests/test_geopandas.py b/datashader/tests/test_geopandas.py index 6a9126b85..e7eafdd06 100644 --- a/datashader/tests/test_geopandas.py +++ b/datashader/tests/test_geopandas.py @@ -1,4 +1,6 @@ # Testing GeoPandas and SpatialPandas +import contextlib + import dask.dataframe as dd import datashader as ds from datashader.tests.test_pandas import assert_eq_ndarray @@ -6,11 +8,27 @@ from numpy import nan import pytest from datashader.tests.utils import dask_switcher +from packaging.version import Version + +_backends = [ + pytest.param(False, id="dask"), +] + +with contextlib.suppress(ImportError): + import dask_geopandas + + if Version(dask_geopandas.__version__) >= Version("0.4.0"): + _backends.append(pytest.param(True, id="dask-expr")) -@pytest.fixture(autouse=True) -def _classic_dd(): - with dask_switcher(query=False, extras=["spatialpandas.dask", "dask_geopandas"]): ... +@pytest.fixture(params=_backends) +def dask_both(request): + with dask_switcher(query=request.param, extras=["spatialpandas.dask", "dask_geopandas.backends", "dask_geopandas"]): ... + return request.param + +@pytest.fixture +def dask_classic(request): + with dask_switcher(query=False, extras=["spatialpandas.dask", "dask_geopandas.backends", "dask_geopandas"]): ... 
try: import dask_geopandas @@ -105,6 +123,15 @@ def _classic_dd(): ]) +@pytest.mark.skipif(not dask_geopandas, reason="dask_geopandas not installed") +def test_dask_geopandas_switcher(dask_both): + import dask_geopandas + if dask_both: + assert dask_geopandas.expr.GeoDataFrame == dask_geopandas.GeoDataFrame + else: + assert dask_geopandas.core.GeoDataFrame == dask_geopandas.GeoDataFrame + + @pytest.mark.skipif(not geodatasets, reason="geodatasets not installed") @pytest.mark.skipif(not geopandas, reason="geopandas not installed") @pytest.mark.parametrize("geom_type, explode, use_boundary", @@ -144,7 +171,7 @@ def test_lines_geopandas(geom_type, explode, use_boundary): ("linestring", True, True), ], ) -def test_lines_dask_geopandas(geom_type, explode, use_boundary, npartitions): +def test_lines_dask_geopandas(geom_type, explode, use_boundary, npartitions, dask_both): df = geopandas.read_file(geodatasets.get_path("nybb")) df["col"] = np.arange(len(df)) # Extra column for aggregation. geometry = "boundary" if use_boundary else "geometry" @@ -176,7 +203,7 @@ def test_lines_dask_geopandas(geom_type, explode, use_boundary, npartitions): ("linestring", True, True), ], ) -def test_lines_spatialpandas(geom_type, explode, use_boundary, npartitions): +def test_lines_spatialpandas(geom_type, explode, use_boundary, npartitions, dask_classic): df = geopandas.read_file(geodatasets.get_path("nybb")) df["col"] = np.arange(len(df)) # Extra column for aggregation. geometry = "boundary" if use_boundary else "geometry" @@ -219,7 +246,7 @@ def test_points_geopandas(geom_type): @pytest.mark.skipif(not geopandas, reason="geopandas not installed") @pytest.mark.parametrize('npartitions', [1, 2, 5]) @pytest.mark.parametrize("geom_type", ["multipoint", "point"]) -def test_points_dask_geopandas(geom_type, npartitions): +def test_points_dask_geopandas(geom_type, npartitions, dask_both): df = geopandas.read_file(geodatasets.get_path("nybb")) df["geometry"] = df["geometry"].sample_points(100, rng=93814) # multipoint @@ -241,7 +268,7 @@ def test_points_dask_geopandas(geom_type, npartitions): @pytest.mark.skipif(not spatialpandas, reason="spatialpandas not installed") @pytest.mark.parametrize('npartitions', [0, 1, 2, 5]) @pytest.mark.parametrize("geom_type", ["multipoint", "point"]) -def test_points_spatialpandas(geom_type, npartitions): +def test_points_spatialpandas(geom_type, npartitions, dask_classic): df = geopandas.read_file(geodatasets.get_path("nybb")) df["geometry"] = df["geometry"].sample_points(100, rng=93814) # multipoint @@ -282,7 +309,7 @@ def test_polygons_geopandas(geom_type): @pytest.mark.skipif(not geopandas, reason="geopandas not installed") @pytest.mark.parametrize('npartitions', [1, 2, 5]) @pytest.mark.parametrize("geom_type", ["multipolygon", "polygon"]) -def test_polygons_dask_geopandas(geom_type, npartitions): +def test_polygons_dask_geopandas(geom_type, npartitions, dask_both): df = geopandas.read_file(geodatasets.get_path("nybb")) df["col"] = np.arange(len(df)) @@ -305,7 +332,7 @@ def test_polygons_dask_geopandas(geom_type, npartitions): @pytest.mark.skipif(not spatialpandas, reason="spatialpandas not installed") @pytest.mark.parametrize('npartitions', [0, 1, 2, 5]) @pytest.mark.parametrize("geom_type", ["multipolygon", "polygon"]) -def test_polygons_spatialpandas(geom_type, npartitions): +def test_polygons_spatialpandas(geom_type, npartitions, dask_classic): df = geopandas.read_file(geodatasets.get_path("nybb")) df["col"] = np.arange(len(df)) diff --git a/doc/conf.py b/doc/conf.py index 
69ef3bc4d..964772945 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -58,8 +58,11 @@ 'sphinx.ext.autosummary', 'numpydoc', 'nbsite.analytics', + 'sphinxcontrib.mermaid', ] +myst_fence_as_directive = ["mermaid"] + nbsite_analytics = { 'goatcounter_holoviz': True, } diff --git a/examples/user_guide/8_Polygons.ipynb b/examples/user_guide/8_Polygons.ipynb index fddd00691..9e7436fcd 100644 --- a/examples/user_guide/8_Polygons.ipynb +++ b/examples/user_guide/8_Polygons.ipynb @@ -4,20 +4,101 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In addition to points, lines, areas, rasters, and trimeshes, Datashader can quickly render large collections of polygons (filled polylines). Datashader's polygon support depends on data structures provided by the separate [spatialpandas](nbviewer.org/github/holoviz/spatialpandas/blob/main/examples/Overview.ipynb) library, which extends Pandas and Parquet to support efficient storage and manipulation of \"ragged\" (variable length) data like polygons. \n", + "In addition to points, lines, areas, rasters, and trimeshes, Datashader can quickly render large collections of polygons (filled polylines). Datashader's polygon support depends on data structures provided by the [spatialpandas](https://nbviewer.org/github/holoviz/spatialpandas/blob/main/examples/Overview.ipynb) library. SpatialPandas extends Pandas and Parquet to support efficient storage and manipulation of \"ragged\" (variable length) data like polygons. Instructions for installing spatialpandas are given at the bottom of this page.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SpatialPandas Data Structures\n", + "Pandas supports custom column types using an \"ExtensionArray\" interface. SpatialPandas provides two Pandas ExtensionArrays that support polygons:\n", "\n", - "Before running these examples, you will need spatialpandas installed with pip:\n", + "- `spatialpandas.geometry.PolygonArray`: Each row in the column is a single `Polygon` instance. As with shapely and geopandas, each Polygon may contain zero or more holes. \n", + " \n", + "- `spatialpandas.geometry.MultiPolygonArray`: Each row in the column is a `MultiPolygon` instance, each of which can store one or more polygons, with each polygon containing zero or more holes.\n", "\n", - "```\n", - "$ pip install spatialpandas\n", - "```\n", + "Datashader assumes that the vertices of the outer filled polygon will be listed as `x1`, `y1`, `x2`, `y2`, etc. in counter-clockwise (CCW) order around the polygon edge, while the holes will be in clockwise (CW) order. All polygons (both filled and holes) must be \"closed\", with the first vertex of each polygon repeated as the last vertex." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```mermaid\n", + "graph TD;\n", + " subgraph SP[ ]\n", + " A[Polygon]:::spatialpandas -->|used in| E[PolygonArray]:::spatialpandas\n", + " B[MultiPolygon]:::spatialpandas -->|used in| F[MultiPolygonArray]:::spatialpandas\n", + " E -->|used in| G[GeoDataFrame or
<br> GeoSeries]:::spatialpandas\n",
    "        spatialpandas(SpatialPandas):::spatialpandas_title\n",
    "        F -->|used in| G\n",
    "    end\n",
    "\n",
    "    subgraph SH[ ]\n",
    "        shapely(Shapely):::shapely_title\n",
    "        H[Polygon]:::shapely -->|converts to| A\n",
    "        I[MultiPolygon]:::shapely -->|converts to| B\n",
    "    end\n",
    "\n",
    "    subgraph GP[ ]\n",
    "        geopandas(GeoPandas):::geopandas_title\n",
    "        J[GeoDataFrame or <br>
GeoSeries]:::geopandas\n",
    "    end\n",
    "\n",
    "    subgraph PD[ ]\n",
    "        pandas(Pandas):::pandas_title\n",
    "        K[DataFrame or <br>
Series]:::pandas\n",
    "    end\n",
    "\n",
    "    subgraph SPD[ ]\n",
    "        spatialpandas.dask(SpatialPandas.Dask):::dask_title\n",
    "        L[DaskGeoDataFrame or <br>
DaskGeoSeries]:::dask\n", + " end\n", + "\n", + " M(Datashader):::datashader_title\n", + "\n", + " F -->|usable in| K\n", + " E -->|usable in| K\n", + " \n", + " G -->|converts to| L\n", + "\n", + " G <-->|converts to| J\n", + "\n", + " G -->|usable by| M\n", + " L -->|usable by| M\n", + " J -->|usable by| M\n", + " K -->|usable by| M\n", + "\n", + " classDef spatialpandas fill:#4e79a7,stroke:#000,stroke-width:0px,color:black;\n", + " classDef shapely fill:#f28e2b,stroke:#000,stroke-width:0px,color:black;\n", + " classDef geopandas fill:#59a14f,stroke:#000,stroke-width:0px,color:black;\n", + " classDef dask fill:#76b7b2,stroke:#000,stroke-width:0px,color:black;\n", + " classDef datashader fill:#b07aa1,stroke:#000,stroke-width:0px,color:black;\n", + " classDef pandas fill:#edc948,stroke:#000,stroke-width:0px,color:black;\n", + " classDef spatialpandas_title fill:#fff,stroke:#4e79a7,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef shapely_title fill:#fff,stroke:#f28e2b,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef geopandas_title fill:#fff,stroke:#59a14f,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef dask_title fill:#fff,stroke:#76b7b2,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef pandas_title fill:#fff,stroke:#edc948,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef datashader_title fill:#fff,stroke:#b07aa1,stroke-width:7px,color:black,fill-opacity:.9,font-weight: bold;\n", + " classDef subgraph_style fill:#fff;\n", + "\n", + " style SP fill:grey,stroke:#fff,stroke-width:0px\n", + " style SH fill:grey,stroke:#fff,stroke-width:0px\n", + " style GP fill:grey,stroke:#fff,stroke-width:0px\n", + " style PD fill:grey,stroke:#fff,stroke-width:0px\n", + " style SPD fill:grey,stroke:#fff,stroke-width:0px\n", "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing Required Libraries" + ] + }, { "cell_type": "code", "execution_count": null, @@ -35,19 +116,6 @@ "import spatialpandas.dask" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pandas supports custom column types using an \"ExtensionArray\" interface. Spatialpandas provides two Pandas ExtensionArrays that support polygons:\n", - "\n", - "- `spatialpandas.geometry.PolygonArray`: Each row in the column is a single `Polygon` instance. As with shapely and geopandas, each Polygon may contain zero or more holes. \n", - " \n", - "- `spatialpandas.geometry.MultiPolygonArray`: Each row in the column is a `MultiPolygon` instance, each of which can store one or more polygons, with each polygon containing zero or more holes.\n", - "\n", - "Datashader assumes that the vertices of the outer filled polygon will be listed as x1, y1, x2, y2, etc. in counter clockwise (CCW) order around the polygon edge, while the holes will be in clockwise (CW) order. All polygons (both filled and holes) must be \"closed\", with the first vertex of each polygon repeated as the last vertex." 
- ] - }, { "cell_type": "markdown", "metadata": {}, @@ -94,6 +162,13 @@ "df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rasterizing Polygons\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -257,8 +332,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Geopandas import\n", - "The `.from_geopandas` static method on each `spatialpandas` ExtensionArray can be used to import a geopandas `GeoSeries` of `Polygon`/`MultiPolygon` objects:" + "## Using GeoPandas and Converting to SpatialPandas\n", + "\n", + "You can utilize a GeoPandas `GeoSeries` of `Polygon`/`MultiPolygon` objects with Datashader:\n" ] }, { @@ -267,20 +343,21 @@ "metadata": {}, "outputs": [], "source": [ + "import geodatasets as gds\n", "import geopandas\n", "\n", - "world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))\n", - "world = world.to_crs(epsg=4087) # simple cylindrical projection\n", - "world['boundary'] = world.geometry.boundary\n", - "world['centroid'] = world.geometry.centroid\n", - "world.head()" + "geodf = geopandas.read_file(gds.get_path('geoda health'))\n", + "geodf = geodf.to_crs(epsg=4087) # simple cylindrical projection\n", + "geodf['boundary'] = geodf.geometry.boundary\n", + "geodf['centroid'] = geodf.geometry.centroid\n", + "geodf.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Convert the geopandas GeoDataFrame to a spatialpandas GeoDataFrame for Datashader to use:" + "Optionally, you can convert the GeoPandas GeoDataFrame to a SpatialPandas GeoDataFrame. Since version 0.16, Datashader supports direct use of `geopandas` `GeoDataFrame`s without having to convert them to `spatialpandas`. See [GeoPandas](13_Geopandas.ipynb).\n" ] }, { @@ -290,22 +367,15 @@ "outputs": [], "source": [ "%%time\n", - "df_world = sp.GeoDataFrame(world)\n", - "df_world.head()" + "sgeodf = sp.GeoDataFrame(geodf)\n", + "sgeodf.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Since version 0.16, Datashader supports direct use of `geopandas` `GeoDataFrame`s without having to convert them to `spatialpandas`. See [GeoPandas](13_Geopandas.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Geopandas/shapely export\n", + "## Converting from SpatialPandas to GeoPandas/Shapely\n", "A `MultiPolygonArray` can be converted to a geopandas `GeometryArray` using the `to_geopandas` method." ] }, @@ -316,7 +386,7 @@ "outputs": [], "source": [ "%%time\n", - "pd.Series(df_world.boundary.array.to_geopandas())" + "pd.Series(sgeodf.boundary.array.to_geopandas())" ] }, { @@ -332,14 +402,28 @@ "metadata": {}, "outputs": [], "source": [ - "df_world.geometry.array[3].to_shapely()" + "sgeodf.geometry.array[3].to_shapely()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting Polygons with Datashader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plotting as filled polygons" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Plotting as filled polygons" + "Datashader can also plot polygons as filled shapes. Here’s an example using a `spatialpandas` `GeoDataFrame`." 
] }, { @@ -349,7 +433,7 @@ "outputs": [], "source": [ "# Discard the output to avoid measuring Numba compilation times\n", - "tf.shade(cvs.polygons(df_world, geometry='geometry', agg=ds.mean('pop_est')));" + "tf.shade(cvs.polygons(sgeodf, geometry='geometry', agg=ds.mean('cty_pop200')));" ] }, { @@ -360,7 +444,7 @@ "source": [ "%%time\n", "cvs = ds.Canvas(plot_width=650, plot_height=400)\n", - "agg = cvs.polygons(df_world, geometry='geometry', agg=ds.mean('pop_est'))\n", + "agg = cvs.polygons(sgeodf, geometry='geometry', agg=ds.mean('cty_pop200'))\n", "tf.shade(agg)" ] }, @@ -368,7 +452,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Plotting as centroid points" + "### Plotting as centroid points" ] }, { @@ -378,7 +462,7 @@ "outputs": [], "source": [ "# Discard the output to avoid measuring Numba compilation times\n", - "cvs.points(df_world, geometry='centroid', agg=ds.mean('pop_est'));" + "cvs.points(sgeodf, geometry='centroid', agg=ds.mean('cty_pop200'));" ] }, { @@ -388,7 +472,7 @@ "outputs": [], "source": [ "%%time\n", - "agg = cvs.points(df_world, geometry='centroid', agg=ds.mean('pop_est'))\n", + "agg = cvs.points(sgeodf, geometry='centroid', agg=ds.mean('cty_pop200'))\n", "tf.spread(tf.shade(agg), 2)" ] }, @@ -407,10 +491,10 @@ "metadata": {}, "outputs": [], "source": [ - "{\"MultiPolygon2dArray length\": df_world.geometry.array.length[:4],\n", - " \"GeoPandas length\": world.geometry.array.length[:4],\n", - " \"MultiPolygonArray area\": df_world.geometry.array.area[:4],\n", - " \"GeoPandas area\": world.geometry.array.area[:4],}" + "{\"MultiPolygon2dArray length\": sgeodf.geometry.array.length[:4],\n", + " \"GeoPandas length\": geodf.geometry.array.length[:4],\n", + " \"MultiPolygonArray area\": sgeodf.geometry.array.area[:4],\n", + " \"GeoPandas area\": geodf.geometry.array.area[:4],}" ] }, { @@ -426,9 +510,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Duplicate world 1000 times\n", - "df_world_large = pd.concat([df_world.geometry] * 1000)\n", - "world_large = pd.concat([world.geometry] * 1000)" + "# Duplicate 500 times\n", + "sgeodf_large = pd.concat([sgeodf.geometry] * 500)\n", + "geodf_large = pd.concat([geodf.geometry] * 500)" ] }, { @@ -437,8 +521,8 @@ "metadata": {}, "outputs": [], "source": [ - "length_ds = %timeit -o world_large.array.length\n", - "length_gp = %timeit -o df_world_large.array.length\n", + "length_ds = %timeit -o -n 3 -r 1 geodf_large.array.length\n", + "length_gp = %timeit -o -n 3 -r 1 sgeodf_large.array.length\n", "print(\"\\nMultiPolygonArray.length speedup: %.2f\" % (length_ds.average / length_gp.average))" ] }, @@ -448,8 +532,8 @@ "metadata": {}, "outputs": [], "source": [ - "area_ds = %timeit -o world_large.array.area\n", - "area_gp = %timeit -o df_world_large.array.area\n", + "area_ds = %timeit -o -n 3 -r 1 geodf_large.array.area\n", + "area_gp = %timeit -o -n 3 -r 1 sgeodf_large.array.area\n", "print(\"\\nMultiPolygonArray.area speedup: %.2f\" % (area_ds.average / area_gp.average))" ] }, @@ -466,7 +550,7 @@ "source": [ "## Parquet support\n", "\n", - "spatialpandas geometry arrays can be stored in Parquet files, which support efficient chunked columnar access that is particularly important when working with Dask for large files. To create such a file, use `.to_parquet`:" + "SpatialPandas geometry arrays can be stored in Parquet files, which support efficient chunked columnar access that is particularly important when working with Dask for large files. 
To create such a file, use `.to_parquet`:" ] }, { @@ -475,7 +559,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_world.to_parquet('df_world.parq')" + "sgeodf.to_parquet('sgeodf.parq')" ] }, { @@ -492,7 +576,7 @@ "outputs": [], "source": [ "from spatialpandas.io import read_parquet\n", - "read_parquet('df_world.parq').head(2)" + "read_parquet('sgeodf.parq').head(2)" ] }, { @@ -501,7 +585,16 @@ "source": [ "## Dask support\n", "\n", - "For large collections of polygons, you can use [Dask](https://dask.org) to parallelize the rendering. If you are starting with a Pandas dataframe with a geometry column, just use the standard `dask.dataframe.from_pandas` method:" + "For large collections of polygons, you can use [Dask](https://dask.org) to parallelize the rendering. If you are starting with a Pandas dataframe with a geometry column or a SpatialPandas `GeoDataFrame`, just use the standard `dask.dataframe.from_pandas` method:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " If you are seeing an error when running this example, ensure that you have configured Dask for SpatialPandas compatibility, as described at the bottom of this page.\n", + "
" ] }, { @@ -510,9 +603,9 @@ "metadata": {}, "outputs": [], "source": [ - "ddf = dd.from_pandas(df_world, npartitions=2).pack_partitions(npartitions=100).persist()\n", + "ddf = dd.from_pandas(sgeodf, npartitions=2).pack_partitions(npartitions=100).persist()\n", "\n", - "tf.shade(cvs.polygons(ddf, geometry='geometry', agg=ds.mean('gdp_md_est')), cmap=cc.kg)" + "tf.shade(cvs.polygons(ddf, geometry='geometry', agg=ds.mean('cty_pop200')))" ] }, { @@ -528,7 +621,7 @@ "source": [ "## Interactive example using HoloViews\n", "\n", - "As you can see above, HoloViews can easily invoke Datashader on polygons using `rasterize`, with full interactive redrawing at each new zoom level as long as you have a live Python process running. The code for the world population example would be:" + "As you can see above, HoloViews can easily invoke Datashader on polygons using `rasterize`, with full interactive redrawing at each new zoom level **as long as you have a live Python process running**. The code for the world population example would be:" ] }, { @@ -537,7 +630,7 @@ "metadata": {}, "outputs": [], "source": [ - "out = rasterize(hv.Polygons(ddf, vdims=['pop_est']), aggregator=ds.sum('pop_est'))\n", + "out = rasterize(hv.Polygons(ddf, vdims=['cty_pop200']), aggregator=ds.sum('cty_pop200'))\n", "out.opts(width=700, height=500, tools=[\"hover\"]);" ] }, @@ -556,9 +649,9 @@ "source": [ "def compute_partitions(el):\n", " n = ddf.cx_partitions[slice(*el.range('x')), slice(*el.range('y'))].npartitions\n", - " return el.opts(title=f'Population by country (npartitions: {n})')\n", + " return el.opts(title=f'Population by county (npartitions: {n})')\n", "\n", - "out.apply(compute_partitions).opts(width=700, height=500, tools=[\"hover\"], clim=(0, 1.3e9))" + "out.apply(compute_partitions).opts(frame_width=700, frame_height=500, tools=[\"hover\"])" ] }, { @@ -571,6 +664,44 @@ "\n", "\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installing SpatialPandas\n", + "\n", + "\n", + "Before running these examples, you will need spatialpandas installed with pip:\n", + "\n", + "```\n", + "$ pip install spatialpandas\n", + "```\n", + "\n", + "or conda:\n", + "```\n", + "$ conda install -c pyviz/label/dev spatialpandas\n", + "```\n", + "\n", + "## Configuring Dask for SpatialPandas Compatibility\n", + "\n", + "By default, Dask version [2024.3.0](https://docs.dask.org/en/stable/changelog.html#:~:text=2024.3.0-,%C2%B6,-Released%20on%20March) enabled a new query-planning (`dask-expr`) version of their DataFrame API. SpatialPandas v0.4.11 does not yet support query-planning. 
Until SpatialPandas supports query planning, you can use one of the following steps to use the classic Dask DataFrame API:\n", + "\n", + "#### Method 1: Dask Configuration\n", + "\n", + "Run the following command in your terminal:\n", + "\n", + "```bash\n", + "$ dask config set dataframe.query-planning False\n", + "```\n", + "\n", + "#### Method 2: Environment Variable\n", + "Set the environment variable by running:\n", + "\n", + "```bash\n", + "$ export DASK_DATAFRAME__QUERY_PLANNING=False\n", + "```" + ] } ], "metadata": { @@ -580,5 +711,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/scripts/download_data.py b/scripts/download_data.py index 9b7a494a9..586548243 100644 --- a/scripts/download_data.py +++ b/scripts/download_data.py @@ -21,3 +21,4 @@ gds.get_path("geoda.natregimes") gds.get_path("nybb") + gds.get_path('geoda health') diff --git a/setup.py b/setup.py index 379ac43e5..d728613fa 100644 --- a/setup.py +++ b/setup.py @@ -46,42 +46,6 @@ 'bokeh_sampledata', ] -# Numpy 2 packages, should be removed when all commented out packages works with Numpy 2 -numpy2 = [ - 'numba ==0.60.0rc1', - 'numpy ==2.0.0rc2', - - # [geopandas] - # 'dask-geopandas', - # 'geopandas', - # 'shapely >=2.0.0', - - # [examples] - 'bokeh >3.1', - 'geodatasets', - 'holoviews', - 'matplotlib >=3.3', - 'panel >1.1', - # 'scikit-image', - # 'spatialpandas', - - # [tests] - 'geodatasets', - 'netcdf4', - 'nbval', - 'psutil', - 'pytest-xdist', - # 'pyarrow', - 'pytest', - 'pytest-benchmark', - 'pytest-cov', - # 'rasterio', - # 'rioxarray', # rasterio - # 'scikit-image', - # 'spatialpandas', - # 'dask-expr', # pyarrow -] - extras_require = { 'tests': geopandas + [ 'geodatasets', @@ -111,13 +75,13 @@ 'rasterio', ], 'geopandas': geopandas, - 'numpy2': numpy2, } extras_require['doc'] = extras_require['examples_extra'] + [ 'nbsite >=0.8.4,<0.9.0', 'numpydoc', + 'sphinxcontrib-mermaid', ] extras_require['all'] = sorted(set(sum(extras_require.values(), [])))
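A few reviewer-oriented sketches follow; none of this code is part of the patch itself. First, the version gate that the `datashader/core.py` hunk adds to `_source_from_geopandas` can be exercised standalone. A minimal sketch, assuming only `packaging` is available; the helper name `geodataframe_classes` is illustrative, not datashader API (datashader builds the list inline):

```python
import contextlib

from packaging.version import Version


def geodataframe_classes():
    """Collect every GeoDataFrame type the current environment can supply."""
    classes = []
    with contextlib.suppress(ImportError):
        import geopandas
        classes.append(geopandas.GeoDataFrame)
    with contextlib.suppress(ImportError):
        import dask_geopandas
        if Version(dask_geopandas.__version__) >= Version("0.4.0"):
            # dask-geopandas 0.4.0 ships both the legacy frame and the
            # query-planning (dask-expr) frame; accept either one.
            from dask_geopandas.core import GeoDataFrame as gdf_legacy
            from dask_geopandas.expr import GeoDataFrame as gdf_expr
            classes.extend([gdf_legacy, gdf_expr])
        else:
            classes.append(dask_geopandas.GeoDataFrame)
    return tuple(classes)


# isinstance accepts a (possibly empty) tuple of types, so the check in the
# hunk reduces to: isinstance(source, geodataframe_classes())
```

Collecting the classes into a tuple keeps both dependencies optional: if neither package imports, the tuple is empty and the `isinstance` check is simply `False`.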
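Second, the end-to-end behavior the PR targets: a GeoDataFrame read with geopandas 1.0 goes straight into `Canvas.polygons`, with no spatialpandas conversion. This sketch reuses the `geoda health` dataset, projection, and `cty_pop200` column from the notebook cells above:

```python
import datashader as ds
import datashader.transfer_functions as tf
import geodatasets as gds
import geopandas

# Same dataset and aggregation column as the updated notebook.
geodf = geopandas.read_file(gds.get_path('geoda health'))
geodf = geodf.to_crs(epsg=4087)  # simple cylindrical projection

cvs = ds.Canvas(plot_width=650, plot_height=400)
agg = cvs.polygons(geodf, geometry='geometry', agg=ds.mean('cty_pop200'))
img = tf.shade(agg)
```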
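Third, the winding-order rules the notebook states (outer ring counter-clockwise, holes clockwise, every ring closed) are easiest to check against a tiny literal. A sketch, assuming the flat interleaved `[x1, y1, x2, y2, ...]` ring encoding that spatialpandas geometry arrays use; the column names are arbitrary:

```python
from spatialpandas import GeoDataFrame
from spatialpandas.geometry import PolygonArray

# One 4x4 square with a 2x2 square hole. Each ring is a flat
# [x1, y1, x2, y2, ...] list with the first vertex repeated at the end.
polygons = PolygonArray([
    [
        [0, 0, 4, 0, 4, 4, 0, 4, 0, 0],  # outer ring: counter-clockwise, closed
        [1, 1, 1, 3, 3, 3, 3, 1, 1, 1],  # hole: clockwise, closed
    ],
])
df = GeoDataFrame({'geometry': polygons, 'v': [1.0]})
```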
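Finally, besides the `dask config set` command and the `DASK_DATAFRAME__QUERY_PLANNING` environment variable that the notebook documents (and that this PR's workflow also sets), query planning can be disabled in-process with `dask.config.set`, provided the installed Dask still ships the classic DataFrame implementation and the option is set before `dask.dataframe` is first imported:

```python
import dask

# Must run before the first `import dask.dataframe`, or the
# query-planning setting will not take effect for this session.
dask.config.set({"dataframe.query-planning": False})

import dask.dataframe as dd  # noqa: E402
```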