Skip to content

Commit

Permalink
improve tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
jgarciab committed Jun 20, 2024
1 parent 158cef9 commit f65a95f
Showing 1 changed file with 61 additions and 69 deletions.
130 changes: 61 additions & 69 deletions tutorial_netCBS.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import polars as pl\n",
"\n",
"import netcbs"
"import netcbs as net"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand All @@ -28,8 +28,8 @@
],
"source": [
"# Print contexts and codebook\n",
"print(netcbs.context2types)\n",
"print(netcbs.codebook)"
"print(net.context2types)\n",
"print(net.codebook)"
]
},
{
Expand All @@ -45,23 +45,23 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Create df_sample example\n",
"df_sample = pl.DataFrame(\n",
" {\n",
" \"RINPERSOON\": range(100_000_000, 100_010_000),\n",
" \"RINPERSOON\": [str(_) for _ in range(100_000_000, 100_010_000)],\n",
" \"RINPERSOONS\": [\"R\"]*10_000\n",
" }\n",
" \n",
")\n",
"\n",
"df_agg = pl.LazyFrame(\n",
" {\n",
" \"RINPERSOON\": range(100_000_000, 101_000_000),\n",
" \"RINPERSOON\": [str(_) for _ in range(100_000_000, 101_000_000)],\n",
" \"RINPERSOONS\": [\"R\"]*1_000_000,\n",
" \"Income\": [random.normalvariate(30000, 5000) for _ in range(1_000_000)],\n",
" \"Age\": [random.normalvariate(30, 10) for _ in range(1_000_000)]\n",
Expand All @@ -81,15 +81,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:netcbs.netcbs:The dataframe may contain duplicated entries. Use a non-lazy dataframe to find if this is the case\n",
"INFO:netcbs.netcbs:The dataframe may contain duplicated entries. Use a non-lazy dataframe to find if this is the case\n"
"INFO:netcbs.netcbs:Dropping duplicated entries (if any). Check this before submitting the query or set lazy==False\n",
"INFO:netcbs.netcbs:Dropping duplicated entries (if any). Check this before submitting the query or set lazy==False\n"
]
},
{
Expand All @@ -102,38 +102,41 @@
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (10_000, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>RINPERSOON</th><th>RINPERSOONS</th><th>mean_Income</th><th>mean_Age</th><th>sum_Income</th><th>sum_Age</th><th>max_Income</th><th>max_Age</th></tr><tr><td>i64</td><td>str</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td></tr></thead><tbody><tr><td>100000000</td><td>&quot;R&quot;</td><td>31380.1757</td><td>28.972624</td><td>62760.351401</td><td>57.945248</td><td>34687.57934</td><td>29.336526</td></tr><tr><td>100000001</td><td>&quot;R&quot;</td><td>32879.284118</td><td>37.345571</td><td>131517.136471</td><td>149.382285</td><td>35894.169057</td><td>44.11918</td></tr><tr><td>100000002</td><td>&quot;R&quot;</td><td>27460.960294</td><td>30.001971</td><td>137304.80147</td><td>150.009855</td><td>37925.004174</td><td>38.651416</td></tr><tr><td>100000003</td><td>&quot;R&quot;</td><td>29677.4618</td><td>28.657164</td><td>504516.850594</td><td>487.171792</td><td>34707.643278</td><td>44.786676</td></tr><tr><td>100000004</td><td>&quot;R&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>100009995</td><td>&quot;R&quot;</td><td>28769.819413</td><td>31.496589</td><td>115079.277653</td><td>125.986358</td><td>34641.216281</td><td>44.699332</td></tr><tr><td>100009996</td><td>&quot;R&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>100009997</td><td>&quot;R&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>100009998</td><td>&quot;R&quot;</td><td>28584.240395</td><td>30.853813</td><td>171505.442372</td><td>185.122875</td><td>37256.265964</td><td>36.488158</td></tr><tr><td>100009999</td><td>&quot;R&quot;</td><td>27008.856403</td><td>28.560398</td><td>135044.282017</td><td>142.801989
</td><td>34038.645475</td><td>34.666268</td></tr></tbody></table></div>"
"<small>shape: (10_000, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>RINPERSOON</th><th>RINPERSOONS</th><th>mean_Income</th><th>mean_Age</th><th>sum_Income</th><th>sum_Age</th><th>max_Income</th><th>max_Age</th></tr><tr><td>str</td><td>str</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td></tr></thead><tbody><tr><td>&quot;100000000&quot;</td><td>&quot;R&quot;</td><td>28321.320778</td><td>31.688816</td><td>226570.566225</td><td>253.510529</td><td>35314.662122</td><td>50.746184</td></tr><tr><td>&quot;100000001&quot;</td><td>&quot;R&quot;</td><td>31915.268944</td><td>29.239601</td><td>191491.613665</td><td>175.437603</td><td>37542.068247</td><td>34.754215</td></tr><tr><td>&quot;100000002&quot;</td><td>&quot;R&quot;</td><td>27835.622318</td><td>28.88142</td><td>111342.489273</td><td>115.525682</td><td>29769.59044</td><td>48.629047</td></tr><tr><td>&quot;100000003&quot;</td><td>&quot;R&quot;</td><td>31834.957176</td><td>43.231485</td><td>127339.828702</td><td>172.925941</td><td>34207.911626</td><td>54.516692</td></tr><tr><td>&quot;100000004&quot;</td><td>&quot;R&quot;</td><td>28058.979387</td><td>29.276037</td><td>140294.896933</td><td>146.380185</td><td>42740.930614</td><td>36.350667</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;100009995&quot;</td><td>&quot;R&quot;</td><td>30907.152506</td><td>27.924227</td><td>154535.76253</td><td>139.621134</td><td>34128.946573</td><td>40.61732</td></tr><tr><td>&quot;100009996&quot;</td><td>&quot;R&quot;</td><td>28838.149166</td><td>26.783768</td><td>230705.193326</td><td>214.270146</td><td>35071.174766</td><td>35.482094</td></tr><tr><td>&quot;100009997&quot;</td><td>&quot;R&quot;</td><td>29566.960626</td><td>31.629272</td><td>88700.881877</td><td>94.887815</td><td>36388.549702</td><td>47.795576</td></tr><tr><td>&quot;100009998&quot;</td><td>&quot;R&quot;</td><td>28922
.907062</td><td>30.984691</td><td>260306.163561</td><td>278.862223</td><td>37701.643059</td><td>45.215123</td></tr><tr><td>&quot;100009999&quot;</td><td>&quot;R&quot;</td><td>29045.115959</td><td>30.887665</td><td>203315.811714</td><td>216.213658</td><td>35919.352833</td><td>38.384273</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (10_000, 8)\n",
"┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐\n",
"│ RINPERSOON ┆ RINPERSOON ┆ mean_Incom ┆ mean_Age ┆ sum_Incom ┆ sum_Age ┆ max_Incom ┆ max_Age │\n",
"│ --- ┆ S ┆ e ┆ --- ┆ e ┆ --- ┆ e ┆ --- │\n",
"│ i64 ┆ --- ┆ --- ┆ f64 ┆ --- ┆ f64 ┆ --- ┆ f64 │\n",
"│ str ┆ --- ┆ --- ┆ f64 ┆ --- ┆ f64 ┆ --- ┆ f64 │\n",
"│ ┆ str ┆ f64 ┆ ┆ f64 ┆ ┆ f64 ┆ │\n",
"╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
"│ 100000000 ┆ R ┆ 31380.1757 ┆ 28.972624 ┆ 62760.351 ┆ 57.945248 ┆ 34687.579 ┆ 29.336526 │\n",
"│ ┆ ┆ ┆ ┆ 401 ┆ ┆ 34 ┆ │\n",
"│ 100000001 ┆ R ┆ 32879.2841 ┆ 37.345571 ┆ 131517.13 ┆ 149.38228 ┆ 35894.169 ┆ 44.11918 │\n",
"│ ┆ ┆ 18 ┆ ┆ 6471 ┆ 5 ┆ 057 ┆ │\n",
"│ 100000002 ┆ R ┆ 27460.9602 ┆ 30.001971 ┆ 137304.80 ┆ 150.00985 ┆ 37925.004 ┆ 38.651416 │\n",
"│ ┆ ┆ 94 ┆ ┆ 147 ┆ 5 ┆ 174 ┆ │\n",
"│ 100000003 ┆ R ┆ 29677.4618 ┆ 28.657164 ┆ 504516.85 ┆ 487.17179 ┆ 34707.643 ┆ 44.786676 │\n",
"│ ┆ ┆ ┆ ┆ 0594 ┆ 2 ┆ 278 ┆ │\n",
"│ 100000004 ┆ R ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 100000000 ┆ R ┆ 28321.3207 ┆ 31.688816 ┆ 226570.56 ┆ 253.51052 ┆ 35314.662 ┆ 50.746184 │\n",
"│ ┆ ┆ 78 ┆ ┆ 6225 ┆ 9 ┆ 122 ┆ │\n",
"│ 100000001 ┆ R ┆ 31915.2689 ┆ 29.239601 ┆ 191491.61 ┆ 175.43760 ┆ 37542.068 ┆ 34.754215 │\n",
"│ ┆ ┆ 44 ┆ ┆ 3665 ┆ 3 ┆ 247 ┆ │\n",
"│ 100000002 ┆ R ┆ 27835.6223 ┆ 28.88142 ┆ 111342.48 ┆ 115.52568 ┆ 29769.590 ┆ 48.629047 │\n",
"│ ┆ ┆ 18 ┆ ┆ 9273 ┆ 2 ┆ 44 ┆ │\n",
"│ 100000003 ┆ R ┆ 31834.9571 ┆ 43.231485 ┆ 127339.82 ┆ 172.92594 ┆ 34207.911 ┆ 54.516692 │\n",
"│ ┆ ┆ 76 ┆ ┆ 8702 ┆ 1 ┆ 626 ┆ │\n",
"│ 100000004 ┆ R ┆ 28058.9793 ┆ 29.276037 ┆ 140294.89 ┆ 146.38018 ┆ 42740.930 ┆ 36.350667 │\n",
"│ ┆ ┆ 87 ┆ ┆ 6933 ┆ 5 ┆ 614 ┆ │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ 100009995 ┆ R ┆ 28769.8194 ┆ 31.496589 ┆ 115079.27 ┆ 125.98635 ┆ 34641.216 ┆ 44.699332 │\n",
"│ ┆ ┆ 13 ┆ ┆ 7653 ┆ 8 ┆ 281 ┆ │\n",
"│ 100009996 ┆ R ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 100009997 ┆ R ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 100009998 ┆ R ┆ 28584.2403 ┆ 30.853813 ┆ 171505.44 ┆ 185.12287 ┆ 37256.265 ┆ 36.488158 │\n",
"│ ┆ ┆ 95 ┆ ┆ 2372 ┆ 5 ┆ 964 ┆ │\n",
"│ 100009999 ┆ R ┆ 27008.8564 ┆ 28.560398 ┆ 135044.28 ┆ 142.80198 ┆ 34038.645 ┆ 34.666268 │\n",
"│ ┆ ┆ 03 ┆ ┆ 2017 ┆ 9 ┆ 475 ┆ │\n",
"│ 100009995 ┆ R ┆ 30907.1525 ┆ 27.924227 ┆ 154535.76 ┆ 139.62113 ┆ 34128.946 ┆ 40.61732 │\n",
"│ ┆ ┆ 06 ┆ ┆ 253 ┆ 4 ┆ 573 ┆ │\n",
"│ 100009996 ┆ R ┆ 28838.1491 ┆ 26.783768 ┆ 230705.19 ┆ 214.27014 ┆ 35071.174 ┆ 35.482094 │\n",
"│ ┆ ┆ 66 ┆ ┆ 3326 ┆ 6 ┆ 766 ┆ │\n",
"│ 100009997 ┆ R ┆ 29566.9606 ┆ 31.629272 ┆ 88700.881 ┆ 94.887815 ┆ 36388.549 ┆ 47.795576 │\n",
"│ ┆ ┆ 26 ┆ ┆ 877 ┆ ┆ 702 ┆ │\n",
"│ 100009998 ┆ R ┆ 28922.9070 ┆ 30.984691 ┆ 260306.16 ┆ 278.86222 ┆ 37701.643 ┆ 45.215123 │\n",
"│ ┆ ┆ 62 ┆ ┆ 3561 ┆ 3 ┆ 059 ┆ │\n",
"│ 100009999 ┆ R ┆ 29045.1159 ┆ 30.887665 ┆ 203315.81 ┆ 216.21365 ┆ 35919.352 ┆ 38.384273 │\n",
"│ ┆ ┆ 59 ┆ ┆ 1714 ┆ 8 ┆ 833 ┆ │\n",
"└────────────┴────────────┴────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘"
]
},
"execution_count": 11,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -164,12 +167,12 @@
"# Example\n",
"query = \"[Income, Age] -> Family[all] -> Schoolmates[all] -> Sample\"\n",
"\n",
"df = netcbs.transform(query, \n",
"df = net.transform(query, \n",
" df_sample = df_sample, \n",
" df_agg = df_agg, \n",
" year=2021,\n",
" cbsdata_path='cbsdata/Bevolking', # Path to the CBS data (\"G:/Bevolking\"); this example uses local synthetic data \n",
" agg_func=[pl.mean, pl.sum, pl.max], \n",
" agg_funcs=[pl.mean, pl.sum, pl.max], \n",
" return_pandas=False, \n",
" lazy=True)\n",
"\n",
Expand All @@ -178,29 +181,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "DataFrame constructor called with unsupported type 'LazyFrame' for the `data` parameter",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/var/folders/hx/nz98f65j615c4ygz7xt694700000gp/T/ipykernel_23424/390691059.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_sample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/miniforge3/envs/st/lib/python3.10/site-packages/polars/dataframe/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, schema, schema_overrides, strict, orient, infer_schema_length, nan_to_null)\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[0;34m\" for the `data` parameter\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 418\u001b[0m )\n\u001b[0;32m--> 419\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 420\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 421\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: DataFrame constructor called with unsupported type 'LazyFrame' for the `data` parameter"
]
}
],
"source": [
"pl.DataFrame(df_sample"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 27,
"metadata": {},
"outputs": [
{
Expand All @@ -209,15 +190,15 @@
"['Sample', 'Schoolmates[all]', 'Family[301,302,303]', 'Income']"
]
},
"execution_count": 6,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can also validate the query before running it\n",
"query = \"Income -> Family[301,302,303] -> Schoolmates[all] -> Sample\"\n",
"netcbs.validate_query(query, \n",
"net.validate_query(query, \n",
" df_sample = df_sample, \n",
" df_agg = df_agg, \n",
" year=2021,\n",
Expand All @@ -227,23 +208,23 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('cbsdata/Bevolking/FAMILIENETWERKTAB/FAMILIENETWERKTAB2010V1.csv', {301})"
"('cbsdata/Bevolking/FAMILIENETWERKTAB/FAMILIENETWERK2010TABV1.csv', {301})"
]
},
"execution_count": 7,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create path to latest version of CBS data\n",
"netcbs.format_path(context='Family[301]', year=2010, cbsdata_path='cbsdata/Bevolking')"
"net.format_path(context='Family[301]', year=2010, cbsdata_path='cbsdata/Bevolking')"
]
},
{
Expand All @@ -270,11 +251,11 @@
"metadata": {},
"outputs": [],
"source": [
"netcbs.create_synthetic_data(\"Family\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"netcbs.create_synthetic_data(\"Colleagues\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"netcbs.create_synthetic_data(\"Neighbors\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"netcbs.create_synthetic_data(\"Schoolmates\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"netcbs.create_synthetic_data(\"Housemates\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n"
"net.create_synthetic_data(\"Family\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"net.create_synthetic_data(\"Colleagues\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"net.create_synthetic_data(\"Neighbors\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"net.create_synthetic_data(\"Schoolmates\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n",
"net.create_synthetic_data(\"Housemates\", 2021, 1_000_000, outpath=\"cbsdata/Bevolking\")\n"
]
},
{
Expand Down Expand Up @@ -352,7 +333,7 @@
"\n",
"query = \"[Income, Age] -> Family[all] -> Schoolmates[all] -> Sample\"\n",
"\n",
"df = netcbs.transform(query, \n",
"df = net.transform(query, \n",
" df_sample = df_sample, \n",
" df_agg = df_agg, \n",
" year=2020,\n",
Expand All @@ -364,12 +345,23 @@
"df "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading data directly with polars"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import polars as pl\n",
"pl.read_csv(\"cbsdata/Bevolking/FAMILIENETWERKTAB/FAMILIENETWERK2021TABV1.csv\", \n",
" n_rows=10, separator=\";\", dtypes={\"RINPERSOON\": str})"
]
}
],
"metadata": {
Expand Down

0 comments on commit f65a95f

Please sign in to comment.