From 6f18dc20df01f6c4b14d9e8cec26eea2e3ea42af Mon Sep 17 00:00:00 2001 From: Yennie Jun Date: Tue, 15 Dec 2020 16:52:34 +0900 Subject: [PATCH] Adding example for cleaning and exploring survey data --- data-plots/survey-data.ipynb | 1287 ++++++++++++++++++++++++++++++++++ 1 file changed, 1287 insertions(+) create mode 100644 data-plots/survey-data.ipynb diff --git a/data-plots/survey-data.ipynb b/data-plots/survey-data.ipynb new file mode 100644 index 0000000..652fb5c --- /dev/null +++ b/data-plots/survey-data.ipynb @@ -0,0 +1,1287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import datetime\n", + "import matplotlib.pyplot as plt\n", + "import re\n", + "import psycopg2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Connect to database\n", + "conn = psycopg2.connect(\n", + " host='covid19db.org',\n", + " port=5432,\n", + " dbname='covid19',\n", + " user='covid19',\n", + " password='covid19')\n", + "cur = conn.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch data using SQL Query\n", + "sql_command = \"\"\"SELECT * FROM world_bank\"\"\"\n", + "df_wb = pd.read_sql(sql_command, conn)\n", + "\n", + "sql_command = \"\"\"SELECT * FROM surveys\"\"\"\n", + "df_surveys = pd.read_sql(sql_command, conn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## World Bank Table" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valueyear
countryindicator_name
AfghanistanARI treatment (% of children under 5 taken to a health provider)61.5000002015
Access to clean fuels and technologies for cooking (% of population)32.4400002016
Access to electricity (% of population)98.7132032018
Access to electricity, rural (% of rural population)98.2728722018
Access to electricity, urban (% of urban population)100.0000002018
............
ZimbabweWomen who believe a husband is justified in beating his wife when she neglects the children (%)21.4000002015
Women who believe a husband is justified in beating his wife when she refuses sex with him (%)14.5000002015
Women who were first married by age 15 (% of women ages 20-24)3.7000002015
Women who were first married by age 18 (% of women ages 20-24)32.4000002015
Women's share of population ages 15+ living with HIV (%)59.8000002018
\n", + "

307665 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " value \\\n", + "country indicator_name \n", + "Afghanistan ARI treatment (% of children under 5 taken to a... 61.500000 \n", + " Access to clean fuels and technologies for cook... 32.440000 \n", + " Access to electricity (% of population) 98.713203 \n", + " Access to electricity, rural (% of rural popula... 98.272872 \n", + " Access to electricity, urban (% of urban popula... 100.000000 \n", + "... ... \n", + "Zimbabwe Women who believe a husband is justified in bea... 21.400000 \n", + " Women who believe a husband is justified in bea... 14.500000 \n", + " Women who were first married by age 15 (% of wo... 3.700000 \n", + " Women who were first married by age 18 (% of wo... 32.400000 \n", + " Women's share of population ages 15+ living wit... 59.800000 \n", + "\n", + " year \n", + "country indicator_name \n", + "Afghanistan ARI treatment (% of children under 5 taken to a... 2015 \n", + " Access to clean fuels and technologies for cook... 2016 \n", + " Access to electricity (% of population) 2018 \n", + " Access to electricity, rural (% of rural popula... 2018 \n", + " Access to electricity, urban (% of urban popula... 2018 \n", + "... ... \n", + "Zimbabwe Women who believe a husband is justified in bea... 2015 \n", + " Women who believe a husband is justified in bea... 2015 \n", + " Women who were first married by age 15 (% of wo... 2015 \n", + " Women who were first married by age 18 (% of wo... 2015 \n", + " Women's share of population ages 15+ living wit... 
2018 \n", + "\n", + "[307665 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# To group by country and indicator_name (survey question) and look at values\n", + "df_wb.groupby([\"country\", \"indicator_name\"])\\\n", + " [[\"country\", \"indicator_name\", \"value\", \"year\"]].mean(\"value\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Just looking at survey results from 2018 (the filter below selects year 2018, despite the df_wb_2019 name)\n", + "df_wb_2019 = df_wb[df_wb.year==2018].dropna(subset=[\"value\"])[\n", + " [\"country\", \"indicator_name\", \"value\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryvalue
indicator_name
Mammal species, threatened215215
Bird species, threatened215215
Plant species (higher), threatened215215
Fish species, threatened215215
Adjusted savings: mineral depletion (current US$)214214
Access to electricity (% of population)214214
Access to electricity, urban (% of urban population)212212
Terrestrial protected areas (% of total land area)211211
Terrestrial and marine protected areas (% of total territorial area)210210
Surface area (sq. km)209209
\n", + "
" + ], + "text/plain": [ + " country value\n", + "indicator_name \n", + "Mammal species, threatened 215 215\n", + "Bird species, threatened 215 215\n", + "Plant species (higher), threatened 215 215\n", + "Fish species, threatened 215 215\n", + "Adjusted savings: mineral depletion (current US$) 214 214\n", + "Access to electricity (% of population) 214 214\n", + "Access to electricity, urban (% of urban popula... 212 212\n", + "Terrestrial protected areas (% of total land area) 211 211\n", + "Terrestrial and marine protected areas (% of to... 210 210\n", + "Surface area (sq. km) 209 209" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Showing the top questions that have been asked by the most number of countries\n", + "\n", + "df_wb_2019.groupby(\"indicator_name\").count()\\\n", + " .sort_values(by=\"value\", ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryvalue
indicator_name
Adjusted savings: education expenditure (% of GNI)198198
Compulsory education, duration (years)178178
Adjusted savings: education expenditure (current US$)178178
Preprimary education, duration (years)177177
Primary education, pupils (% female)8989
Primary education, teachers8989
Primary education, teachers (% female)8989
Primary education, pupils8989
Secondary education, general pupils8686
Secondary education, general pupils (% female)8585
\n", + "
" + ], + "text/plain": [ + " country value\n", + "indicator_name \n", + "Adjusted savings: education expenditure (% of GNI) 198 198\n", + "Compulsory education, duration (years) 178 178\n", + "Adjusted savings: education expenditure (curren... 178 178\n", + "Preprimary education, duration (years) 177 177\n", + "Primary education, pupils (% female) 89 89\n", + "Primary education, teachers 89 89\n", + "Primary education, teachers (% female) 89 89\n", + "Primary education, pupils 89 89\n", + "Secondary education, general pupils 86 86\n", + "Secondary education, general pupils (% female) 85 85" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Showing the top questions that have been asked by the most number of countries\n", + "# that include certain keywords \n", + "\n", + "df_wb_2019[df_wb_2019.indicator_name.str\\\n", + " .contains(\"education\")]\\\n", + " .groupby(\"indicator_name\").count()\\\n", + " .sort_values(by=\"value\", ascending=False).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Survey Table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Currently, the survey properties are in one long JSON. We can explode each property in the JSON into its own line so that each survey question-answer pair gets its own line" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcewavegidcountrycountrycodeadm_area_1adm_area_2adm_area_3samplesizeproperties
0EVS1981-1984[BEL]BelgiumBELNoneNoneNone1145{'A001': {'Label': 'Important in life: Family'...
1EVS1981-1984[CAN]CanadaCANNoneNoneNone1254{'A001': {'Label': 'Important in life: Family'...
\n", + "
" + ], + "text/plain": [ + " source wave gid country countrycode adm_area_1 adm_area_2 \\\n", + "0 EVS 1981-1984 [BEL] Belgium BEL None None \n", + "1 EVS 1981-1984 [CAN] Canada CAN None None \n", + "\n", + " adm_area_3 samplesize properties \n", + "0 None 1145 {'A001': {'Label': 'Important in life: Family'... \n", + "1 None 1254 {'A001': {'Label': 'Important in life: Family'... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_surveys.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "def explode_survey_props(props):\n", + " q_label, label, category, frequency = [], [], [], []\n", + "\n", + " for line in props:\n", + " if \"X023\" in line: continue # this one too complicated to clean\n", + " if \"X051\" in line: continue # ethnic group; this one has no categories\n", + " if \"original_region_code\" in line: break #this one is not related\n", + " \n", + " for freq in props[line]['Frequencies']: \n", + " if re.search(\"[A-Z]\\d+_\\d+_(-*\\d+)\", freq):\n", + " q_val = re.search(\"[A-Z]\\d+_\\d+_(-*\\d+)\", freq).group(1)\n", + " else:\n", + " q_val = re.search(\"[A-Z]\\d\\d\\d[A-Z]*_(-*\\d+)\", freq).group(1)\n", + " q_label.append(line)\n", + " label.append(props[line]['Label'])\n", + " frequency.append(props[line]['Frequencies'][freq]) \n", + "\n", + " if \"X002\" in freq or \"X003\" in freq: #special case of birth year/age\n", + " category.append(freq)\n", + " else:\n", + " category.append(props[line]['Categories'][q_val])\n", + "\n", + " return q_label, label, category, frequency" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Build up in arrays\n", + "waves, gids, countries, samplesizes = [],[],[],[]\n", + "q_labels, labels, categories, frequencies = [],[],[],[]\n", + "\n", + "# Iterate through the survey to explode each json\n", + "for 
i in range(len(df_surveys)):\n", + " props = df_surveys.properties[i]\n", + " q_label, label, category, frequency = explode_survey_props(props)\n", + " \n", + " # These are copied over from the original dataframe\n", + " source = [df_surveys.source[i]] * len(q_label)\n", + " wave = [df_surveys.wave[i]] * len(q_label)\n", + " gid = [df_surveys.gid[i]] * len(q_label)\n", + " country = [df_surveys.country[i]] * len(q_label)\n", + " samplesize = [df_surveys.samplesize[i]] * len(q_label)\n", + "\n", + " waves.extend(wave)\n", + " gids.extend(gid)\n", + " countries.extend(country)\n", + " samplesizes.extend(samplesize)\n", + " q_labels.extend(q_label)\n", + " labels.extend(label)\n", + " categories.extend(category)\n", + " frequencies.extend(frequency)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the dataframe... this takes some time because the arrays are long\n", + "\n", + "cols = ['wave', 'gid', 'country', 'samplesize', \n", + " 'q_label', 'label', 'category', 'frequency']\n", + "survey_explode_df = pd.DataFrame([waves, gids, countries, samplesizes, \n", + " q_labels, labels, categories, frequencies]).T\n", + "survey_explode_df.columns=cols\n", + "survey_explode_df.frequency = survey_explode_df.frequency.apply(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "833664\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wavegidcountrysamplesizeq_labellabelcategoryfrequency
6611092010-2014[ESP.2_1]Spain33X028Employment statusSelf employed0.121212
2728212005-2009[MAR]Morocco1200E037Government responsibility40.033333
1038802008-2010[NOR]Norway1090X003AgeX003_230.019688
7903822008-2010[PRT.3_1, PRT.8_1, PRT.12.4_1, PRT.14_1, PRT.1...Portugal183X036Profession/job\"Missing; Unkown\"0.000000
2261091999-2004[PRI]Puerto Rico720X003AgeX003_-10.000000
\n", + "
" + ], + "text/plain": [ + " wave gid \\\n", + "661109 2010-2014 [ESP.2_1] \n", + "272821 2005-2009 [MAR] \n", + "103880 2008-2010 [NOR] \n", + "790382 2008-2010 [PRT.3_1, PRT.8_1, PRT.12.4_1, PRT.14_1, PRT.1... \n", + "226109 1999-2004 [PRI] \n", + "\n", + " country samplesize q_label label \\\n", + "661109 Spain 33 X028 Employment status \n", + "272821 Morocco 1200 E037 Government responsibility \n", + "103880 Norway 1090 X003 Age \n", + "790382 Portugal 183 X036 Profession/job \n", + "226109 Puerto Rico 720 X003 Age \n", + "\n", + " category frequency \n", + "661109 Self employed 0.121212 \n", + "272821 4 0.033333 \n", + "103880 X003_23 0.019688 \n", + "790382 \"Missing; Unkown\" 0.000000 \n", + "226109 X003_-1 0.000000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(len(survey_explode_df))\n", + "display(survey_explode_df.sample(5))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "74\n", + "['Important in life: Family' 'Important in life: Friends'\n", + " 'Important in life: Work' 'State of health (subjective)'\n", + " 'Important child qualities: independence'\n", + " 'Important child qualities: feeling of responsibility'\n", + " 'Important child qualities: obedience'\n", + " 'Member: Belong to religious organization'\n", + " 'Member: Belong to sports or recreation'\n", + " 'Active/Inactive membership of church or religious organization'\n", + " 'Active/Inactive membership of sport or recreation'\n", + " 'Most people can be trusted' 'Satisfaction with your life'\n", + " 'How much freedom of choice and control'\n", + " 'Schwartz: It is important to this person living in secure surroundings'\n", + " 'Schwartz: It is important to this person to have a good time'\n", + " 'Schwartz: It is important to this person to always behave properly'\n", + " 'Schwartz: It is important to this person to do something for the 
good of society'\n", + " 'Social position: People in their 20s'\n", + " 'Social position: People in their 40s']\n" + ] + } + ], + "source": [ + "# Showing 20 of the unique questions asked in the survey\n", + "survey_questions = survey_explode_df.label.unique()\n", + "print(len(survey_questions))\n", + "print(survey_questions[:20])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['1981-1984', '1990-1993', '1999-2001', '2008-2010', '1989-1993',\n", + " '1994-1998', '1999-2004', '2005-2009', '2010-2014'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The unique waves of years\n", + "survey_explode_df.wave.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
frequency
countrycategory
AlgeriaDon´t know0.003333
Missing; Unknown0.000000
No answer0.000833
Not asked in survey0.000000
Not at all important0.011667
Not very important0.011667
Rather important0.041667
Very important0.930833
ArgentinaDon´t know0.000000
Missing; Unknown0.000000
No answer0.002467
Not asked in survey0.000000
Not at all important0.002577
Not very important0.004394
Rather important0.101518
Very important0.889044
ArmeniaDon´t know0.001592
Missing; Unknown0.002474
No answer0.000573
Not asked in survey0.000000
\n", + "
" + ], + "text/plain": [ + " frequency\n", + "country category \n", + "Algeria Don´t know 0.003333\n", + " Missing; Unknown 0.000000\n", + " No answer 0.000833\n", + " Not asked in survey 0.000000\n", + " Not at all important 0.011667\n", + " Not very important 0.011667\n", + " Rather important 0.041667\n", + " Very important 0.930833\n", + "Argentina Don´t know 0.000000\n", + " Missing; Unknown 0.000000\n", + " No answer 0.002467\n", + " Not asked in survey 0.000000\n", + " Not at all important 0.002577\n", + " Not very important 0.004394\n", + " Rather important 0.101518\n", + " Very important 0.889044\n", + "Armenia Don´t know 0.001592\n", + " Missing; Unknown 0.002474\n", + " No answer 0.000573\n", + " Not asked in survey 0.000000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# How do countries respond to Family Values in 2010-2014?\n", + "mask = (survey_explode_df.wave=='2010-2014')\\\n", + " &(survey_explode_df.label==\"Important in life: Family\")\n", + "survey_explode_df[mask].groupby([\"country\", \"category\"]).sum().head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrywave
label
Important child qualities: feeling of responsibility1079
Important child qualities: independence1079
Important child qualities: obedience1079
Important in life: Family1079
Important in life: Friends1079
Important in life: Work1079
It is not important for me to know about science in my daily life1079
Most important: first choice1079
Most important: second choice1079
Schwartz: It is important to this person living in secure surroundings1079
Schwartz: It is important to this person to always behave properly1079
Schwartz: It is important to this person to do something for the good of society1079
Schwartz: It is important to this person to have a good time1079
\n", + "
" + ], + "text/plain": [ + " country wave\n", + "label \n", + "Important child qualities: feeling of responsib... 107 9\n", + "Important child qualities: independence 107 9\n", + "Important child qualities: obedience 107 9\n", + "Important in life: Family 107 9\n", + "Important in life: Friends 107 9\n", + "Important in life: Work 107 9\n", + "It is not important for me to know about scienc... 107 9\n", + "Most important: first choice 107 9\n", + "Most important: second choice 107 9\n", + "Schwartz: It is important to this person living... 107 9\n", + "Schwartz: It is important to this person to alw... 107 9\n", + "Schwartz: It is important to this person to do ... 107 9\n", + "Schwartz: It is important to this person to hav... 107 9" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# How do countries respond to survey questions that include the word \"important\"?\n", + "survey_explode_df[survey_explode_df.label.str.contains(\"Important|important\")]\\\n", + " .groupby(\"label\").agg({\"country\": \"nunique\", \"wave\":\"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
frequency
countrywavelabelcategory
Albania1994-1998Active/Inactive membership of church or religious organizationActive member0.048048
Don´t know0.000000
Inactive member0.162162
Missing; Unknown0.000000
No answer0.003003
...............
Zimbabwe2010-2014Year of birthX002_19940.006580
X002_19950.000000
X002_19960.000000
X002_19970.000000
X002_19990.000000
\n", + "

354576 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " frequency\n", + "country wave label category \n", + "Albania 1994-1998 Active/Inactive membership of church or religio... Active member 0.048048\n", + " Don´t know 0.000000\n", + " Inactive member 0.162162\n", + " Missing; Unknown 0.000000\n", + " No answer 0.003003\n", + "... ...\n", + "Zimbabwe 2010-2014 Year of birth X002_1994 0.006580\n", + " X002_1995 0.000000\n", + " X002_1996 0.000000\n", + " X002_1997 0.000000\n", + " X002_1999 0.000000\n", + "\n", + "[354576 rows x 1 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Group survey questions by country, wave, label, category, then sum\n", + "survey_explode_df.groupby([\"country\", \"wave\", \"label\", \"category\"])\\\n", + " [[\"samplesize\", \"frequency\"]].sum()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}