From f2171599fc00a77f60f9a1aead7d488dcf474dba Mon Sep 17 00:00:00 2001 From: linogaliana Date: Thu, 4 Apr 2024 20:33:37 +0000 Subject: [PATCH] Style --- post/polars/polars-tuto.ipynb | 1149 ++++++++------------------------- 1 file changed, 287 insertions(+), 862 deletions(-) diff --git a/post/polars/polars-tuto.ipynb b/post/polars/polars-tuto.ipynb index 49b18b18..008b8824 100644 --- a/post/polars/polars-tuto.ipynb +++ b/post/polars/polars-tuto.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "5f0f384d-6dd7-4525-ac0c-4e192e14e896", "metadata": {}, "source": [ "# Prise en main de Polars\n", @@ -30,11 +29,11 @@ "`utilitr`](https://www.book.utilitr.org/03_fiches_thematiques/fiche_tidyverse)),\n", "ce *notebook* exploite la [Base Permanente des Equipements\n", "(BPE)](https://www.insee.fr/fr/metadonnees/source/serie/s1161)." - ] + ], + "id": "858b86d0-6258-4aa6-9e0c-bd8914b750e0" }, { "cell_type": "markdown", - "id": "aecba529-b842-4372-ab5e-86b7b37490bc", "metadata": {}, "source": [ "
\n", @@ -55,80 +54,39 @@ "requêtes SQL pour la manipulation de données.\n", "\n", "
" - ] + ], + "id": "3bf049dd-2a49-4b26-8ab8-84b301c0eee1" }, { "cell_type": "markdown", - "id": "c58f0db9-a7aa-40ae-b4cc-c4336e73dd84", "metadata": {}, "source": [ "Pour pouvoir installer les packages utilisés par ce tutoriel, il est\n", "nécessaire d’exécuter la cellule suivante:" - ] + ], + "id": "b42c4e86-aed1-480c-91f0-63f59a7e9d06" }, { "cell_type": "code", "execution_count": 1, - "id": "a502d937", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: polars in /opt/mamba/lib/python3.11/site-packages (0.20.17)\n", - "Requirement already satisfied: s3fs in /opt/mamba/lib/python3.11/site-packages (2024.3.1)\n", - "Requirement already satisfied: duckdb in /opt/mamba/lib/python3.11/site-packages (0.10.1)\n", - "Requirement already satisfied: pynsee[full] in /opt/mamba/lib/python3.11/site-packages (0.1.7)\n", - "Requirement already satisfied: pandas>=0.24.2 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (2.2.1)\n", - "Requirement already satisfied: tqdm>=4.56.0 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (4.66.2)\n", - "Requirement already satisfied: requests>=2.23 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (2.31.0)\n", - "Requirement already satisfied: appdirs>=1.4.4 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (1.4.4)\n", - "Requirement already satisfied: unidecode>=1.1.0 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (1.3.8)\n", - "Requirement already satisfied: shapely>=1.8.0 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (2.0.3)\n", - "Requirement already satisfied: urllib3 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (1.26.18)\n", - "Requirement already satisfied: openpyxl<=3.1.0 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (3.1.0)\n", - "Requirement already satisfied: xlrd>=2.0.1 in /opt/mamba/lib/python3.11/site-packages (from pynsee[full]) (2.0.1)\n", - "Requirement already satisfied: aiobotocore<3.0.0,>=2.5.4 in /opt/mamba/lib/python3.11/site-packages (from s3fs) (2.12.1)\n", - "Requirement already satisfied: fsspec==2024.3.1 in /opt/mamba/lib/python3.11/site-packages (from s3fs) (2024.3.1)\n", - "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /opt/mamba/lib/python3.11/site-packages (from s3fs) (3.9.3)\n", - "Requirement already satisfied: botocore<1.34.52,>=1.34.41 in /opt/mamba/lib/python3.11/site-packages (from aiobotocore<3.0.0,>=2.5.4->s3fs) (1.34.51)\n", - "Requirement already satisfied: wrapt<2.0.0,>=1.10.10 in /opt/mamba/lib/python3.11/site-packages (from aiobotocore<3.0.0,>=2.5.4->s3fs) (1.16.0)\n", - "Requirement already satisfied: aioitertools<1.0.0,>=0.5.1 in /opt/mamba/lib/python3.11/site-packages (from aiobotocore<3.0.0,>=2.5.4->s3fs) (0.11.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/mamba/lib/python3.11/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/mamba/lib/python3.11/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/mamba/lib/python3.11/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/mamba/lib/python3.11/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/mamba/lib/python3.11/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs) (1.9.4)\n", - "Requirement already satisfied: et-xmlfile in /opt/mamba/lib/python3.11/site-packages (from openpyxl<=3.1.0->pynsee[full]) (1.1.0)\n", - "Requirement already satisfied: numpy<2,>=1.23.2 in /opt/mamba/lib/python3.11/site-packages (from pandas>=0.24.2->pynsee[full]) (1.26.4)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/mamba/lib/python3.11/site-packages (from pandas>=0.24.2->pynsee[full]) (2.9.0)\n", - "Requirement already satisfied: pytz>=2020.1 in /opt/mamba/lib/python3.11/site-packages (from pandas>=0.24.2->pynsee[full]) (2024.1)\n", - "Requirement already satisfied: tzdata>=2022.7 in /opt/mamba/lib/python3.11/site-packages (from pandas>=0.24.2->pynsee[full]) (2024.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/mamba/lib/python3.11/site-packages (from requests>=2.23->pynsee[full]) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/mamba/lib/python3.11/site-packages (from requests>=2.23->pynsee[full]) (3.6)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/mamba/lib/python3.11/site-packages (from requests>=2.23->pynsee[full]) (2024.2.2)\n", - "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/mamba/lib/python3.11/site-packages (from botocore<1.34.52,>=1.34.41->aiobotocore<3.0.0,>=2.5.4->s3fs) (1.0.1)\n", - "Requirement already satisfied: six>=1.5 in /opt/mamba/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=0.24.2->pynsee[full]) (1.16.0)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install polars pynsee[full] s3fs duckdb" - ] + ], + "id": "8233ee8e" }, { "cell_type": "markdown", - "id": "eb816a52-d832-487e-a9bb-0306a4412736", "metadata": {}, "source": [ "Nous aurons besoin d’importer les packages suivants dans ce *notebook*:" - ] + ], + "id": "ffc029bd-6c9c-444a-9c57-d7d77fd1b50f" }, { "cell_type": "code", "execution_count": 2, - "id": "b5724bcf", "metadata": {}, "outputs": [], "source": [ @@ -136,11 +94,11 @@ "import polars as pl\n", "import s3fs\n", "from pynsee.download import download_file" - ] + ], + "id": "65b85c9f" }, { "cell_type": "markdown", - "id": "71d2f426-7931-4b14-be71-a93d124362e6", "metadata": {}, "source": [ "# Lecture de données\n", @@ -170,62 +128,50 @@ "\n", "Pour récupérer des données via `Pynsee`, la fonction de référence est\n", "`download_file`:" - ] + ], + "id": "c91d6a02-4f92-4488-b799-17b93530eafb" }, { "cell_type": "code", "execution_count": 3, - "id": "532ca923", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 98%|█████████▊| 5.76M/5.88M [00:01<00:00, 4.81MiB/s]\n", - "/opt/mamba/lib/python3.11/site-packages/pynsee/download/_download_store_file.py:68: UserWarning: File in insee.fr modified or corrupted during download\n", - " warnings.warn(\"File in insee.fr modified or corrupted during download\")\n", - "Extracting: 100%|██████████| 74.6M/74.6M [00:00<00:00, 110MB/s] \n" - ] - } - ], + "outputs": [], "source": [ "pandas_df_bpe = download_file(\"BPE_ENS\", update = True)" - ] + ], + "id": "233d0df6" }, { "cell_type": "markdown", - "id": "03720560-effe-4a8b-89d2-05286ecb7a60", "metadata": {}, "source": [ "`Pynsee` renvoie un `DataFrame` `Pandas` correspondant à la source\n", "désiré :" - ] + ], + "id": "bd704bf4-8c47-43a9-b5ea-10b322505760" }, { "cell_type": "code", "execution_count": 4, - "id": "f50bd98a", "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "pandas_df_bpe.__class__" - ] + ], + "id": "62d4163e" }, { "cell_type": "markdown", - "id": "871ba342-305b-48dc-9bde-8a2232feebd7", "metadata": {}, "source": [ "
\n", @@ -236,206 +182,110 @@ "choisi. Cependant le typage des données pourrait ne pas être optimal.\n", "\n", "
" - ] + ], + "id": "2de08c35-24d4-4734-881a-2234ad6e2278" }, { "cell_type": "markdown", - "id": "40f9f82b-d2cc-47ff-b273-5ed5200e6ac4", "metadata": {}, "source": [ "La conversion d’un objet `Pandas` en `Polars` se fait via la méthode\n", "`from_pandas`:" - ] + ], + "id": "bb6fa5a1-58b0-454f-ac7d-a532bf697cf7" }, { "cell_type": "code", "execution_count": 5, - "id": "4cc28ea4", "metadata": {}, "outputs": [], "source": [ "df = pl.from_pandas(pandas_df_bpe)" - ] + ], + "id": "af405774" }, { "cell_type": "markdown", - "id": "b811771b-ee03-440c-a66d-8317bd6b7a87", "metadata": {}, "source": [ "Les `DataFrame` `Polars` apparaissent de manière différente des\n", "`DataFrame` `Pandas` dans la console ou dans le *display* de `Jupyter`:\n", "\n", "- `pd.DataFrame`:" - ] + ], + "id": "19f3bfbb-4ae3-4deb-a82a-9802bfe1ec58" }, { "cell_type": "code", "execution_count": 6, - "id": "c7319056", "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
05242021010930101001A20006919301001000084A1A129CSZ1
15242021010930101001A20006919301001000084A4A401CSZ2
\n", "
" - ], - "text/plain": [ - " AAV2020 AN BV2012 DEP DEPCOM DOM EPCI DCIRIS REG SDOM TYPEQU \\\n", - "0 524 2021 01093 01 01001 A 200069193 010010000 84 A1 A129 \n", - "1 524 2021 01093 01 01001 A 200069193 010010000 84 A4 A401 \n", - "\n", - " UU2020 NB_EQUIP \n", - "0 CSZ 1 \n", - "1 CSZ 2 " ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "pandas_df_bpe.head(2)" - ] + ], + "id": "ef4172eb" }, { "cell_type": "markdown", - "id": "6c255e13-2b9e-4260-8aa4-57e0bf8db666", "metadata": {}, "source": [ "- `pl.DataFrame`:" - ] + ], + "id": "158d64ea-7c0c-4c73-ba8d-281017ac14bf" }, { "cell_type": "code", "execution_count": 7, - "id": "5a3bca26", "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (2, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ""2"
" - ], - "text/plain": [ - "shape: (2, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A1 ┆ A129 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A401 ┆ CSZ ┆ 2 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df.head(2)" - ] + ], + "id": "a708ac5a" }, { "cell_type": "markdown", - "id": "110685a8-955d-415c-b464-a341d78b7bc6", "metadata": {}, "source": [ "On va écrire au format\n", "[Parquet](https://pythonds.linogaliana.fr/reads3/) ce `DataFrame` pour\n", "ensuite illustrer la lecture à partir de ce format, plus performant." - ] + ], + "id": "8ddbc178-43ea-40e3-943e-c72cb2088e04" }, { "cell_type": "code", "execution_count": 8, - "id": "ee2c5286", "metadata": {}, "outputs": [], "source": [ "df.write_parquet(\"bpe.parquet\")" - ] + ], + "id": "180c727e" }, { "cell_type": "markdown", - "id": "3d2a0bab-1399-4e62-8a92-6f91adc2a7ef", "metadata": {}, "source": [ "## En lecture directe depuis un CSV\n", @@ -446,12 +296,12 @@ "copie de la version 2019 disponible sur le site de l’Insee est mise à\n", "disposition, prête à l’emploi. Elle est disponible depuis l’URL :\n", "https://minio.lab.sspcloud.fr/donnees-insee/diffusion/BPE/2019/BPE_ENS.csv" - ] + ], + "id": "674fb782-e2f3-48e1-9f6c-cf48f50a830c" }, { "cell_type": "code", "execution_count": 9, - "id": "099ce4ea", "metadata": {}, "outputs": [], "source": [ @@ -462,11 +312,11 @@ " \"DEP\": pl.Categorical,\n", " \"DEPCOM\": pl.Categorical\n", " })" - ] + ], + "id": "9948fc23" }, { "cell_type": "markdown", - "id": "ba7b1476-89a0-4c9d-99c4-2d6e67b9bd08", "metadata": {}, "source": [ "L’option `dtypes` est ici nécessaire sous peine d’une erreur qu’on ne\n", @@ -475,11 +325,11 @@ "français (qui sont tous numériques sauf les numéros Corse *“2A”* et\n", "*“2B”*) et demande à l’utilisateur de données de fixer le type de ces\n", "variables." - ] + ], + "id": "0dd067a0-260a-46e7-b3df-4095af8c1ce6" }, { "cell_type": "markdown", - "id": "bf8aa3f4-f0dd-449e-84a2-cc764981db65", "metadata": {}, "source": [ "
\n", @@ -521,11 +371,11 @@ " └─────┴─────┴────────┴────────┴──────┴────────┴──────────┘\n", "\n", "
" - ] + ], + "id": "7f6dd695-b23b-4822-b556-fe3f0cd426eb" }, { "cell_type": "markdown", - "id": "45fbcb9d-e7d6-4b91-affa-8c69e2d76c0f", "metadata": {}, "source": [ "# Comment utiliser Polars ?\n", @@ -533,21 +383,21 @@ "Dans la suite de ce tutoriel, on va privilégier l’import depuis un\n", "fichier `Parquet`, plus performant et plus fiable grâce au typage des\n", "colonnes." - ] + ], + "id": "f5f5b745-4c4b-47ac-83ec-7dfb4215e83c" }, { "cell_type": "code", - "execution_count": 10, - "id": "b98d96b5", + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df_bpe = pl.read_parquet(\"bpe.parquet\")" - ] + ], + "id": "da416ca2" }, { "cell_type": "markdown", - "id": "393d0eca-2c04-4574-999e-4d96fcc66465", "metadata": {}, "source": [ "A l’instar d’autres outils modernes d’exploitation des données, `Polars`\n", @@ -571,54 +421,35 @@ "3. On compte le nombre d’occurrences pour chaque département via `agg`\n", "4. Le dernier appel - `collect()` - indique que le traitement peut être\n", " lancé (et donc optimisé et parallelisé par `Polars`)." - ] + ], + "id": "4d82d88d-de67-42c2-a7f8-fd9b647eab5b" }, { "cell_type": "code", - "execution_count": 11, - "id": "e02e402a", + "execution_count": 12, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "/tmp/ipykernel_83272/1940419069.py:3: DeprecationWarning: `groupby` is deprecated. It has been renamed to `group_by`.\n", - " ).groupby( # 2.\n", - "/tmp/ipykernel_83272/1940419069.py:6: DeprecationWarning: `pl.count()` is deprecated. Please use `pl.len()` instead.\n", - " pl.count().alias(\"NB_STATION_SERVICE\")\n" + "/tmp/ipykernel_84584/1940419069.py:3: DeprecationWarning:\n", + "\n", + "`groupby` is deprecated. It has been renamed to `group_by`.\n", + "\n", + "/tmp/ipykernel_84584/1940419069.py:6: DeprecationWarning:\n", + "\n", + "`pl.count()` is deprecated. Please use `pl.len()` instead.\n" ] }, { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 2)
DEPNB_STATION_SERVICE
stru32
"01"91
"13"192
"10"48
"03"56
"09"24
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬────────────────────┐\n", - "│ DEP ┆ NB_STATION_SERVICE │\n", - "│ --- ┆ --- │\n", - "│ str ┆ u32 │\n", - "╞═════╪════════════════════╡\n", - "│ 01 ┆ 91 │\n", - "│ 13 ┆ 192 │\n", - "│ 10 ┆ 48 │\n", - "│ 03 ┆ 56 │\n", - "│ 09 ┆ 24 │\n", - "└─────┴────────────────────┘" + "" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -631,11 +462,11 @@ ").collect() # 4.\n", "\n", "df_stations_service.head(5)" - ] + ], + "id": "e11e44ad" }, { "cell_type": "markdown", - "id": "1c1327b5-f70c-4dae-8fd6-89c561d0173f", "metadata": {}, "source": [ "Du point de vue du code, la complexité induite par l’approche\n", @@ -707,258 +538,148 @@ "## Sélection de variables\n", "\n", "Commençons par sélectionner des variables en utilisant leurs noms :" - ] + ], + "id": "34b17217-a28e-4bcc-8f98-ce20db89fdf8" }, { "cell_type": "code", - "execution_count": 12, - "id": "0ed5fc4f", + "execution_count": 13, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 3)
DEPCOMTYPEQUNB_EQUIP
strstrstr
"01001""A129""1"
"01001""A401""2"
"01001""A402""1"
"01001""A404""2"
"01001""A405""2"
" - ], - "text/plain": [ - "shape: (5, 3)\n", - "┌────────┬────────┬──────────┐\n", - "│ DEPCOM ┆ TYPEQU ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str │\n", - "╞════════╪════════╪══════════╡\n", - "│ 01001 ┆ A129 ┆ 1 │\n", - "│ 01001 ┆ A401 ┆ 2 │\n", - "│ 01001 ┆ A402 ┆ 1 │\n", - "│ 01001 ┆ A404 ┆ 2 │\n", - "│ 01001 ┆ A405 ┆ 2 │\n", - "└────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.select(\n", " [\"DEPCOM\", \"TYPEQU\", \"NB_EQUIP\"]\n", ").head(5)" - ] + ], + "id": "f9b9ed08" }, { "cell_type": "markdown", - "id": "e5e53b42-00cf-4208-a971-dfc745ed2fbe", "metadata": {}, "source": [ "Bien que cette méthode ne soit pas conseillée, il est bon de noter qu’il\n", "est également possible de sélectionner via les positions des colonnes,\n", "comme le permet `Pandas` :" - ] + ], + "id": "2558cbc7-8966-4714-bce8-f85539ea1999" }, { "cell_type": "code", - "execution_count": 13, - "id": "7b38c930", + "execution_count": 14, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 4)
ANBV2012DEPDEPCOM
strstrstrstr
"2021""01093""01""01001"
"2021""01093""01""01001"
"2021""01093""01""01001"
"2021""01093""01""01001"
"2021""01093""01""01001"
" - ], - "text/plain": [ - "shape: (5, 4)\n", - "┌──────┬────────┬─────┬────────┐\n", - "│ AN ┆ BV2012 ┆ DEP ┆ DEPCOM │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str │\n", - "╞══════╪════════╪═════╪════════╡\n", - "│ 2021 ┆ 01093 ┆ 01 ┆ 01001 │\n", - "│ 2021 ┆ 01093 ┆ 01 ┆ 01001 │\n", - "│ 2021 ┆ 01093 ┆ 01 ┆ 01001 │\n", - "│ 2021 ┆ 01093 ┆ 01 ┆ 01001 │\n", - "│ 2021 ┆ 01093 ┆ 01 ┆ 01001 │\n", - "└──────┴────────┴─────┴────────┘" + "" ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe[:, 1:5].head(5)" - ] + ], + "id": "17a9a4fb" }, { "cell_type": "markdown", - "id": "e128370c-ddcd-45cd-b72a-a2e4a649573c", "metadata": {}, "source": [ "On peut également s’appuyer sur des motifs de sélection des noms de\n", "colonnes mobilisant des expressions régulières (ici `^DEP.*$` signifiant\n", "*“débute par DEP”*):" - ] + ], + "id": "b2820037-1580-458f-8596-e3cbd28b801c" }, { "cell_type": "code", - "execution_count": 14, - "id": "8b775d0e", + "execution_count": 15, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 2)
DEPDEPCOM
strstr
"01""01001"
"01""01001"
"01""01001"
"01""01001"
"01""01001"
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬────────┐\n", - "│ DEP ┆ DEPCOM │\n", - "│ --- ┆ --- │\n", - "│ str ┆ str │\n", - "╞═════╪════════╡\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "└─────┴────────┘" + "" ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.select(\n", " pl.col(\"^DEP.*$\")\n", ").head(5)" - ] + ], + "id": "b3f4fc6a" }, { "cell_type": "markdown", - "id": "46f5f113-0845-4e40-849c-19e1db99ef26", "metadata": {}, "source": [ "La fonction `select` acceptant des `list` Python, on peut construire des\n", "sélecteurs assez puissants :" - ] + ], + "id": "3a378db3-ce81-4ba0-9b33-a3df56e44cff" }, { "cell_type": "code", - "execution_count": 15, - "id": "b1622027", + "execution_count": 16, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 2)
DEPDEPCOM
strstr
"01""01001"
"01""01001"
"01""01001"
"01""01001"
"01""01001"
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬────────┐\n", - "│ DEP ┆ DEPCOM │\n", - "│ --- ┆ --- │\n", - "│ str ┆ str │\n", - "╞═════╪════════╡\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "│ 01 ┆ 01001 │\n", - "└─────┴────────┘" + "" ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "dep_cols = [cols for cols in df_bpe.columns if cols.startswith(\"DEP\")] \n", "\n", "df_bpe.select(dep_cols).head(5)" - ] + ], + "id": "5909f465" }, { "cell_type": "markdown", - "id": "2a6b560e-c8ee-4fce-a1d8-8cdae2108b94", "metadata": {}, "source": [ "## Sélection d’observations\n", "\n", "La sélection d’observations (de lignes) se fera grâce à la fonction\n", "`filter`." - ] + ], + "id": "0965467e-8b32-476f-9d4e-9e1fa9f57870" }, { "cell_type": "code", - "execution_count": 16, - "id": "5a5a9d93", + "execution_count": 17, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"001""2021""75056""75""75101""B""200054781""751010101""11""B3""B304""00851""2"
"001""2021""75056""75""75101""B""200054781""751010201""11""B3""B304""00851""17"
"001""2021""75056""75""75101""B""200054781""751010202""11""B3""B304""00851""3"
"001""2021""75056""75""75101""B""200054781""751010203""11""B3""B304""00851""6"
"001""2021""75056""75""75101""B""200054781""751010204""11""B3""B304""00851""7"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ B3 ┆ B304 ┆ 00851 ┆ 2 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ B3 ┆ B304 ┆ 00851 ┆ 17 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ B3 ┆ B304 ┆ 00851 ┆ 3 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ B3 ┆ B304 ┆ 00851 ┆ 6 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ B3 ┆ B304 ┆ 00851 ┆ 7 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -966,11 +687,11 @@ " (pl.col(\"DEPCOM\") == \"75101\") &\n", " (pl.col(\"TYPEQU\") == \"B304\")\n", ").head(5)" - ] + ], + "id": "d004847d" }, { "cell_type": "markdown", - "id": "3e11c520-e996-49eb-a960-68463b50f49d", "metadata": {}, "source": [ "De nombreux opérateurs sont disponibles pour faciliter cette sélection\n", @@ -981,44 +702,22 @@ " *“EPCI”*.\n", "\n", "Par exemple, pour sélectionner les départements 75 et 92:" - ] + ], + "id": "d97b7fe8-0cbd-47c2-9515-94fbd0e1c020" }, { "cell_type": "code", - "execution_count": 17, - "id": "6b103b9f", + "execution_count": 18, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"001""2021""75056""75""75101""A""200054781""751010201""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010301""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010102""11""A1""A105""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A120""00851""1"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A105 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A120 ┆ 00851 ┆ 1 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1026,48 +725,31 @@ "df_bpe.filter(\n", " pl.col(\"DEP\").is_in([\"75\", \"92\"])\n", ").head(5)" - ] + ], + "id": "bdb6d081" }, { "cell_type": "markdown", - "id": "173afba0-ec8e-4136-879f-27e085ef4111", "metadata": {}, "source": [ "Pour sélectionner les observations où la variable EPCI a une valeur\n", "manquante:" - ] + ], + "id": "5184ed98-7097-4f76-82b5-2dd597dfc5da" }, { "cell_type": "code", - "execution_count": 18, - "id": "8e24c831", + "execution_count": 19, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (0, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
" - ], - "text/plain": [ - "shape: (0, 13)\n", - "┌─────────┬─────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪═════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "└─────────┴─────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1075,70 +757,42 @@ "df_bpe.filter(\n", " pl.col(\"EPCI\").is_null()\n", ")" - ] + ], + "id": "3bdb3719" }, { "cell_type": "markdown", - "id": "ba1144ff-6883-4a15-9482-d0d6eb5aa03a", "metadata": {}, "source": [ "A l’inverse, si on désire ne sélectionner que les observations où la\n", "colonne *“EPCI”* n’est pas manquante :" - ] + ], + "id": "e519fbda-fdc4-4410-a1eb-e491e8b9acf3" }, { "cell_type": "code", - "execution_count": 19, - "id": "d9d57997", + "execution_count": 20, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (1_056_914, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A402""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A404""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A405""CSZ""2"
"9F1""2021""CSZ""976""97617""F""200059871""97617_IND""06""F1""F114""9F304""1"
"9F1""2021""CSZ""976""97617""F""200059871""97617_IND""06""F1""F120""9F304""1"
"9F1""2021""CSZ""976""97617""F""200059871""97617_IND""06""F1""F121""9F304""3"
"9F1""2021""CSZ""976""97617""F""200059871""97617_IND""06""F3""F307""9F304""1"
"9F1""2021""CSZ""976""97617""G""200059871""97617_IND""06""G1""G104""9F304""1"
" - ], - "text/plain": [ - "shape: (1_056_914, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A1 ┆ A129 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A401 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A402 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A404 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A405 ┆ CSZ ┆ 2 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ F1 ┆ F114 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ F1 ┆ F120 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ F1 ┆ F121 ┆ 9F304 ┆ 3 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ F3 ┆ F307 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ G1 ┆ G104 ┆ 9F304 ┆ 1 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.filter(\n", " pl.col(\"EPCI\").is_not_null()\n", ")" - ] + ], + "id": "5795f97e" }, { "cell_type": "markdown", - "id": "9436c0f7-af5e-492e-9de4-61ccce893140", "metadata": {}, "source": [ "Comme avec `Pandas`, il existe énormément de méthodes pratiques pour la\n", @@ -1148,57 +802,35 @@ "\n", "La fonction `rename` permet de lister les colonnes à renommer via un\n", "dictionnaire Python :" - ] + ], + "id": "b75b3876-c07b-462f-a4d5-6e346310cd70" }, { "cell_type": "code", - "execution_count": 20, - "id": "49b40922", + "execution_count": 21, "metadata": { "tags": [] }, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPcode_communeDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A402""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A404""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A405""CSZ""2"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A1 ┆ A129 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A401 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A402 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A404 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A405 ┆ CSZ ┆ 2 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.rename({\n", " \"DEPCOM\" : \"code_commune\"\n", "}).head(5)" - ] + ], + "id": "9bab9af3" }, { "cell_type": "markdown", - "id": "aa4c2c8c-26cf-49f2-8802-65a375839b81", "metadata": {}, "source": [ "
\n", @@ -1209,172 +841,106 @@ "`{\"ancienne_colonne\": \"nouvelle_colonne\"}`\n", "\n", "
" - ] + ], + "id": "304074ae-e5d1-461b-8dd7-c6cac2861d25" }, { "cell_type": "markdown", - "id": "07235d72-9294-4474-a24a-19451b3c0d6c", "metadata": {}, "source": [ "Comme vu plus haut, construire des expressions de renommage plus\n", "complexes pourra se faire en pur `Python` :" - ] + ], + "id": "bb93677d-5070-42f0-b094-4c53a8903060" }, { "cell_type": "code", - "execution_count": 21, - "id": "229b5e04", + "execution_count": 22, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
aav2020anbv2012depdepcomdomepcidcirisregsdomtypequuu2020nb_equip
strstrstrstrstrstrstrstrstrstrstrstrstr
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A402""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A404""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A405""CSZ""2"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ aav2020 ┆ an ┆ bv2012 ┆ dep ┆ … ┆ sdom ┆ typequ ┆ uu2020 ┆ nb_equip │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A1 ┆ A129 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A401 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A402 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A404 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A405 ┆ CSZ ┆ 2 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "cols_minuscules = {cols: cols.lower() for cols in df_bpe.columns}\n", "\n", "df_bpe.rename(cols_minuscules).head(5)" - ] + ], + "id": "4744d4d6" }, { "cell_type": "markdown", - "id": "c9636adb-e40f-43ea-9ccc-93cbdf632e85", "metadata": {}, "source": [ "## Trier une table\n", "\n", "La fonction `sort` permet de trier la table sur une ou plusieurs\n", "variables." - ] + ], + "id": "1f39520d-ed06-4eca-83b2-0e70668d151b" }, { "cell_type": "code", - "execution_count": 22, - "id": "eb7dbe75", + "execution_count": 23, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A402""CSZ""1"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A404""CSZ""2"
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A405""CSZ""2"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A1 ┆ A129 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A401 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A402 ┆ CSZ ┆ 1 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A404 ┆ CSZ ┆ 2 │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ A4 ┆ A405 ┆ CSZ ┆ 2 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.lazy()\\\n", " .sort(\"DEPCOM\", \"TYPEQU\")\\\n", " .head(5).collect()" - ] + ], + "id": "d7c0c50e" }, { "cell_type": "markdown", - "id": "9c652242-954c-42d3-8960-50e504b16ab4", "metadata": {}, "source": [ "L’ordre de tri - croissant par défaut - peut-être précisé pour chaque\n", "variable." - ] + ], + "id": "f7c46c6d-c921-4d80-a859-9c7219ee44f6" }, { "cell_type": "code", - "execution_count": 23, - "id": "8ea06506", + "execution_count": 24, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"9F1""2021""CSZ""976""97617""A""200059871""97617_IND""06""A1""A128""9F304""1"
"9F1""2021""CSZ""976""97617""A""200059871""97617_IND""06""A1""A129""9F304""1"
"9F1""2021""CSZ""976""97617""A""200059871""97617_IND""06""A2""A203""9F304""1"
"9F1""2021""CSZ""976""97617""A""200059871""97617_IND""06""A2""A205""9F304""1"
"9F1""2021""CSZ""976""97617""A""200059871""97617_IND""06""A2""A206""9F304""1"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ A1 ┆ A128 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ A1 ┆ A129 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ A2 ┆ A203 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ A2 ┆ A205 ┆ 9F304 ┆ 1 │\n", - "│ 9F1 ┆ 2021 ┆ CSZ ┆ 976 ┆ … ┆ A2 ┆ A206 ┆ 9F304 ┆ 1 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.lazy()\\\n", " .sort(\"DEPCOM\", \"TYPEQU\", descending=[True, False])\\\n", " .head(5).collect()" - ] + ], + "id": "289522fa" }, { "cell_type": "markdown", - "id": "c701cdfd-21aa-4912-b449-e409e3776087", "metadata": {}, "source": [ "## Création de nouvelles variables\n", @@ -1385,52 +951,31 @@ "- on convertit en entier numérique la variable `NB_EQUIP`\n", "- on calcule la somme cumulée (avec `cumsum`)\n", "- on nomme la nouvelle colonne `NB_EQUIP_SUM`" - ] + ], + "id": "b5e38f52-0c17-41a8-afcc-7f7f3b136e1c" }, { "cell_type": "code", - "execution_count": 24, - "id": "48205193", + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "/tmp/ipykernel_83272/2362134496.py:4: DeprecationWarning: `cumsum` is deprecated. It has been renamed to `cum_sum`.\n", - " pl.col(\"NB_EQUIP\").cumsum().alias(\"NB_EQUIP_SUM\"),\n" + "/tmp/ipykernel_84584/2362134496.py:4: DeprecationWarning:\n", + "\n", + "`cumsum` is deprecated. It has been renamed to `cum_sum`.\n" ] }, { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 15)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIPNB_EQUIP_SUMNB_EQUIP_3PLUS
strstrstrstrstrstrstrstrstrstrstrstri64i64bool
"524""2021""01093""01""01001""A""200069193""010010000""84""A1""A129""CSZ"11false
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A401""CSZ"23false
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A402""CSZ"14false
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A404""CSZ"26false
"524""2021""01093""01""01001""A""200069193""010010000""84""A4""A405""CSZ"28false
" - ], - "text/plain": [ - "shape: (5, 15)\n", - "┌─────────┬──────┬────────┬─────┬───┬────────┬──────────┬──────────────┬────────────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ UU2020 ┆ NB_EQUIP ┆ NB_EQUIP_SUM ┆ NB_EQUIP_3PLUS │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ i64 ┆ i64 ┆ bool │\n", - "╞═════════╪══════╪════════╪═════╪═══╪════════╪══════════╪══════════════╪════════════════╡\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ CSZ ┆ 1 ┆ 1 ┆ false │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ CSZ ┆ 2 ┆ 3 ┆ false │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ CSZ ┆ 1 ┆ 4 ┆ false │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ CSZ ┆ 2 ┆ 6 ┆ false │\n", - "│ 524 ┆ 2021 ┆ 01093 ┆ 01 ┆ … ┆ CSZ ┆ 2 ┆ 8 ┆ false │\n", - "└─────────┴──────┴────────┴─────┴───┴────────┴──────────┴──────────────┴────────────────┘" + "" ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1440,51 +985,33 @@ " pl.col(\"NB_EQUIP\").cumsum().alias(\"NB_EQUIP_SUM\"),\n", " pl.when(pl.col(\"NB_EQUIP\") > 3).then(True).otherwise(False).alias(\"NB_EQUIP_3PLUS\")\n", " ).head(5).collect()" - ] + ], + "id": "632956ae" }, { "cell_type": "markdown", - "id": "221cce93-b340-47c8-8e6a-519c9325cbeb", "metadata": {}, "source": [ "## Production de synthèses et d’agrégats\n", "\n", "On peut produire des statistiques synthétiques sur notre jeu de données\n", "avec la fonction `select`." - ] + ], + "id": "01faa85a-dcb2-44b9-9d66-ff5797dd015f" }, { "cell_type": "code", - "execution_count": 25, - "id": "ed75badb", + "execution_count": 26, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (1, 1)
NB_EQUIP_TOT
i64
2399536
" - ], - "text/plain": [ - "shape: (1, 1)\n", - "┌──────────────┐\n", - "│ NB_EQUIP_TOT │\n", - "│ --- │\n", - "│ i64 │\n", - "╞══════════════╡\n", - "│ 2399536 │\n", - "└──────────────┘" + "" ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1493,72 +1020,46 @@ " .select(\n", " pl.col(\"NB_EQUIP\").sum().alias(\"NB_EQUIP_TOT\")\n", " ).head(5).collect()" - ] + ], + "id": "a90edcde" }, { "cell_type": "markdown", - "id": "8c6876b1-1b5e-44a8-b514-dd33b27c47f7", "metadata": {}, "source": [ "Les fonctions `describe` et `glimpse` proposent un aperçu synthétique de\n", "la table:" - ] + ], + "id": "a429d0c5-9b33-4d93-a9bd-a6c64d43fee2" }, { "cell_type": "code", - "execution_count": 26, - "id": "bcfc0578", + "execution_count": 27, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (9, 14)
statisticAAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstrstr
"count""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914""1056914"
"null_count""0""0""0""0""0""0""0""0""0""0""0""0""0"
"mean"nullnullnullnullnullnullnullnullnullnullnullnullnull
"std"nullnullnullnullnullnullnullnullnullnullnullnullnull
"min""001""2021""01004""01""01001""A""200000172""010010000""01""A1""A101""00151""1"
"25%"nullnullnullnullnullnullnullnullnullnullnullnullnull
"50%"nullnullnullnullnullnullnullnullnullnullnullnullnull
"75%"nullnullnullnullnullnullnullnullnullnullnullnullnull
"max""SAR""2021""CSZ""976""97617""G""CSZ""97617_IND""94""G1""G104""CSZ""99"
" - ], - "text/plain": [ - "shape: (9, 14)\n", - "┌────────────┬─────────┬─────────┬─────────┬───┬─────────┬─────────┬─────────┬──────────┐\n", - "│ statistic ┆ AAV2020 ┆ AN ┆ BV2012 ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞════════════╪═════════╪═════════╪═════════╪═══╪═════════╪═════════╪═════════╪══════════╡\n", - "│ count ┆ 1056914 ┆ 1056914 ┆ 1056914 ┆ … ┆ 1056914 ┆ 1056914 ┆ 1056914 ┆ 1056914 │\n", - "│ null_count ┆ 0 ┆ 0 ┆ 0 ┆ … ┆ 0 ┆ 0 ┆ 0 ┆ 0 │\n", - "│ mean ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", - "│ std ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", - "│ min ┆ 001 ┆ 2021 ┆ 01004 ┆ … ┆ A1 ┆ A101 ┆ 00151 ┆ 1 │\n", - "│ 25% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", - "│ 50% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", - "│ 75% ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null │\n", - "│ max ┆ SAR ┆ 2021 ┆ CSZ ┆ … ┆ G1 ┆ G104 ┆ CSZ ┆ 99 │\n", - "└────────────┴─────────┴─────────┴─────────┴───┴─────────┴─────────┴─────────┴──────────┘" + "" ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.describe()" - ] + ], + "id": "c84c90a1" }, { "cell_type": "code", - "execution_count": 27, - "id": "5fd98198", + "execution_count": 28, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Rows: 1056914\n", "Columns: 13\n", @@ -1574,18 +1075,17 @@ "$ SDOM 'A1', 'A4', 'A4', 'A4', 'A4', 'A5', 'A5', 'A5', 'B2', 'C1'\n", "$ TYPEQU 'A129', 'A401', 'A402', 'A404', 'A405', 'A504', 'A505', 'A507', 'B203', 'C104'\n", "$ UU2020 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ', 'CSZ'\n", - "$ NB_EQUIP '1', '2', '1', '2', '2', '1', '1', '2', '1', '1'\n", - "\n" + "$ NB_EQUIP '1', '2', '1', '2', '2', '1', '1', '2', '1', '1'\n" ] } ], "source": [ "df_bpe.glimpse()" - ] + ], + "id": "be2f60be" }, { "cell_type": "markdown", - "id": "8e958bfc-d4c3-439f-be4f-b96cbfef18f6", "metadata": {}, "source": [ "Les fonctions `groupby` et `agg` sont mobilisées pour créer des\n", @@ -1597,63 +1097,42 @@ " (`pl.col(\"TYPEQU\") == \"B203\").sum()`) ;\n", "3. On donne un nom à la colonne portant la somme\n", " (`alias(\"NB_BOULANGERIES_TOT\")`)" - ] + ], + "id": "b19d2ee3-44b7-487c-9731-b4ef021882a9" }, { "cell_type": "code", - "execution_count": 28, - "id": "4f4345b5", + "execution_count": 29, "metadata": {}, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "/tmp/ipykernel_83272/2211860743.py:1: DeprecationWarning: `groupby` is deprecated. It has been renamed to `group_by`.\n", - " df_bpe.lazy().groupby(\"DEP\").agg(\n" + "/tmp/ipykernel_84584/2211860743.py:1: DeprecationWarning:\n", + "\n", + "`groupby` is deprecated. It has been renamed to `group_by`.\n" ] }, { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 2)
DEPNB_BOULANGERIES_TOT
stru32
"75"731
"59"731
"13"576
"69"507
"62"440
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────────────────────┐\n", - "│ DEP ┆ NB_BOULANGERIES_TOT │\n", - "│ --- ┆ --- │\n", - "│ str ┆ u32 │\n", - "╞═════╪═════════════════════╡\n", - "│ 75 ┆ 731 │\n", - "│ 59 ┆ 731 │\n", - "│ 13 ┆ 576 │\n", - "│ 69 ┆ 507 │\n", - "│ 62 ┆ 440 │\n", - "└─────┴─────────────────────┘" + "" ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.lazy().groupby(\"DEP\").agg(\n", " (pl.col(\"TYPEQU\") == \"B203\").sum().alias(\"NB_BOULANGERIES_TOT\")\n", ").sort(\"NB_BOULANGERIES_TOT\", descending=True).head(5).collect()" - ] + ], + "id": "5cec98bd" }, { "cell_type": "markdown", - "id": "71eb02f4-c2e9-4372-af2e-cfc2b604958a", "metadata": {}, "source": [ "# Explorer le plan d’exécution\n", @@ -1663,28 +1142,24 @@ "\n", "Pour cela, il suffit pour cela de ne pas effectuer d’action comme\n", "`collect` ou `head`." - ] + ], + "id": "3926f8b5-0898-457d-b9a7-333d286e7eab" }, { "cell_type": "code", - "execution_count": 29, - "id": "7188e5d3", + "execution_count": 30, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ "naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)\n", "

\n", "
SELECT [col(\"NB_EQUIP\").sum().alias(\"NB_EQUIP_TOT\")] FROM

WITH_COLUMNS:

[col(\"NB_EQUIP\").cast(Int64)]

DF [\"AAV2020\", \"AN\", \"BV2012\", \"DEP\"]; PROJECT */13 COLUMNS; SELECTION: \"None\"
" - ], - "text/plain": [ - "" ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1693,11 +1168,11 @@ " .select(\n", " pl.col(\"NB_EQUIP\").sum().alias(\"NB_EQUIP_TOT\")\n", " )" - ] + ], + "id": "0a9c4c74" }, { "cell_type": "markdown", - "id": "5b76073f-1050-4f87-b5d3-1fbb2dfaefc3", "metadata": {}, "source": [ "
\n", @@ -1708,11 +1183,11 @@ "retardée en attente d’une action n’ayant pas lieu.\n", "\n", "
" - ] + ], + "id": "d07badf8-168a-4f1e-a3a0-d9bec01c73c9" }, { "cell_type": "markdown", - "id": "da3ececc-c4e6-4461-967d-95574b43261e", "metadata": {}, "source": [ "# Une interconnexion avec `DuckDB` pour faire des requêtes SQL\n", @@ -1733,68 +1208,48 @@ "Pour illustrer le premier cas, reprenons un exemple précédent qui\n", "illustrait la sélection d’observations en se restreignant au choix de\n", "deux départements:" - ] + ], + "id": "419bf3de-278e-4208-946f-d12f3d5afadf" }, { "cell_type": "code", - "execution_count": 30, - "id": "80d68416", + "execution_count": 31, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (5, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"001""2021""75056""75""75101""A""200054781""751010201""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010301""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010102""11""A1""A105""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A120""00851""1"
" - ], - "text/plain": [ - "shape: (5, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A105 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A120 ┆ 00851 ┆ 1 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "df_bpe.filter(\n", " pl.col(\"DEP\").is_in([\"75\", \"92\"])\n", ").head(5)" - ] + ], + "id": "703fed73" }, { "cell_type": "markdown", - "id": "e8b88f2b-ae33-4be7-b67e-1f946662cab0", "metadata": {}, "source": [ "Avec `DuckDB`, pour effectuer la même opération, nommée *filtre* en SQL,\n", "une requête SQL s’écrira de la manière suivante:" - ] + ], + "id": "f2264c67-1f8e-4bd1-adc8-84961f596128" }, { "cell_type": "code", - "execution_count": 31, - "id": "5a216651", + "execution_count": 32, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/plain": [ "┌─────────┬─────────┬─────────┬─────────┬─────────┬───┬───────────┬─────────┬─────────┬─────────┬─────────┬──────────┐\n", @@ -1828,10 +1283,7 @@ "│ ? rows (>9999 rows, 20 shown) 13 columns (11 shown) │\n", "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘" ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ @@ -1839,11 +1291,11 @@ "\n", "df_duckdb = duckdb.sql(\"SELECT * FROM df_bpe WHERE CAST(DEP AS VARCHAR) in ('75', '92')\")\n", "df_duckdb" - ] + ], + "id": "8bacfe89" }, { "cell_type": "markdown", - "id": "0f90a456-2659-40d9-ae97-f8251cac83f7", "metadata": {}, "source": [ "
\n", @@ -1854,11 +1306,11 @@ "montré dans le prochain exemple\n", "\n", "
" - ] + ], + "id": "c2681d7f-12f6-40c2-92ba-77de68fe6e5f" }, { "cell_type": "markdown", - "id": "311942c1-b34a-4eee-8980-5b1becec2e2a", "metadata": {}, "source": [ "La seconde approche implique quant à elle d’utiliser `DuckDB` pour lire\n", @@ -1870,86 +1322,59 @@ " effectuer le filtre\n", "- Le résultat de cette requête est transformé en `DataFrame` `Polars`\n", " avec la méthode `pl`:" - ] + ], + "id": "c301b72f-5c8a-496f-a17e-367df8a639aa" }, { "cell_type": "code", - "execution_count": 32, - "id": "83f7910f", + "execution_count": 33, "metadata": {}, "outputs": [ { + "output_type": "display_data", + "metadata": {}, "data": { "text/html": [ - "
\n", - "shape: (53_280, 13)
AAV2020ANBV2012DEPDEPCOMDOMEPCIDCIRISREGSDOMTYPEQUUU2020NB_EQUIP
strstrstrstrstrstrstrstrstrstrstrstrstr
"001""2021""75056""75""75101""A""200054781""751010201""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010301""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A101""00851""1"
"001""2021""75056""75""75101""A""200054781""751010102""11""A1""A105""00851""1"
"001""2021""75056""75""75101""A""200054781""751010402""11""A1""A120""00851""1"
"001""2021""75056""92""92078""F""200054781""920780109""11""F3""F314""00851""1"
"001""2021""75056""92""92078""G""200054781""920780107""11""G1""G101""00851""2"
"001""2021""75056""92""92078""G""200054781""920780103""11""G1""G102""00851""1"
"001""2021""75056""92""92078""G""200054781""920780102""11""G1""G104""00851""1"
"001""2021""75056""75""93045""A""200054781""930450108""11""A3""A303""00851""1"
" - ], - "text/plain": [ - "shape: (53_280, 13)\n", - "┌─────────┬──────┬────────┬─────┬───┬──────┬────────┬────────┬──────────┐\n", - "│ AAV2020 ┆ AN ┆ BV2012 ┆ DEP ┆ … ┆ SDOM ┆ TYPEQU ┆ UU2020 ┆ NB_EQUIP │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ ┆ str ┆ str ┆ str ┆ str │\n", - "╞═════════╪══════╪════════╪═════╪═══╪══════╪════════╪════════╪══════════╡\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A101 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A105 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A1 ┆ A120 ┆ 00851 ┆ 1 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 92 ┆ … ┆ F3 ┆ F314 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 92 ┆ … ┆ G1 ┆ G101 ┆ 00851 ┆ 2 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 92 ┆ … ┆ G1 ┆ G102 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 92 ┆ … ┆ G1 ┆ G104 ┆ 00851 ┆ 1 │\n", - "│ 001 ┆ 2021 ┆ 75056 ┆ 75 ┆ … ┆ A3 ┆ A303 ┆ 00851 ┆ 1 │\n", - "└─────────┴──────┴────────┴─────┴───┴──────┴────────┴────────┴──────────┘" + "" ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" + } } ], "source": [ "duckdb.sql(\"SELECT * FROM read_parquet(\\\"bpe.parquet\\\") WHERE CAST(DEP AS VARCHAR) in ('75', '92')\").pl()" - ] + ], + "id": "f57a845c" }, { "cell_type": "markdown", - "id": "d137416a-5ecf-4a50-b2bf-0a44048fcae0", "metadata": {}, "source": [ "Cette approche est intéressante pour effectuer les opérations de\n", "sélection d’observations ou de variables le plus tôt possible afin de ne\n", "pas avoir en mémoire des données inutilisées." - ] + ], + "id": "b3374e3c-2eae-41c8-b81c-7b5d8daaac3f" } ], + "nbformat": 4, + "nbformat_minor": 5, "metadata": { "kernelspec": { + "name": "python3", "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "language": "python" }, "language_info": { + "name": "python", "codemirror_mode": { "name": "ipython", - "version": 3 + "version": "3" }, "file_extension": ".py", "mimetype": "text/x-python", - "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + } +} \ No newline at end of file