diff --git a/data-plots/survey-data.ipynb b/data-plots/survey-data.ipynb
new file mode 100644
index 0000000..652fb5c
--- /dev/null
+++ b/data-plots/survey-data.ipynb
@@ -0,0 +1,1287 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "import os\n",
+ "import re\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "import psycopg2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetching data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Connect to the database. Every setting can be overridden through an\n",
+ "# environment variable so credentials need not be edited into the notebook.\n",
+ "# NOTE(review): the fallbacks are the values that were hard-coded here before;\n",
+ "# presumably they are public read-only credentials - confirm before sharing.\n",
+ "conn = psycopg2.connect(\n",
+ "    host=os.environ.get('COVID19DB_HOST', 'covid19db.org'),\n",
+ "    port=int(os.environ.get('COVID19DB_PORT', 5432)),\n",
+ "    dbname=os.environ.get('COVID19DB_NAME', 'covid19'),\n",
+ "    user=os.environ.get('COVID19DB_USER', 'covid19'),\n",
+ "    password=os.environ.get('COVID19DB_PASSWORD', 'covid19'))\n",
+ "cur = conn.cursor()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the two source tables in full into pandas\n",
+ "WORLD_BANK_QUERY = \"SELECT * FROM world_bank\"\n",
+ "SURVEYS_QUERY = \"SELECT * FROM surveys\"\n",
+ "\n",
+ "df_wb = pd.read_sql(WORLD_BANK_QUERY, conn)\n",
+ "df_surveys = pd.read_sql(SURVEYS_QUERY, conn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## World Bank Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " value | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " | country | \n",
+ " indicator_name | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Afghanistan | \n",
+ " ARI treatment (% of children under 5 taken to a health provider) | \n",
+ " 61.500000 | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " | Access to clean fuels and technologies for cooking (% of population) | \n",
+ " 32.440000 | \n",
+ " 2016 | \n",
+ "
\n",
+ " \n",
+ " | Access to electricity (% of population) | \n",
+ " 98.713203 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | Access to electricity, rural (% of rural population) | \n",
+ " 98.272872 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | Access to electricity, urban (% of urban population) | \n",
+ " 100.000000 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | Zimbabwe | \n",
+ " Women who believe a husband is justified in beating his wife when she neglects the children (%) | \n",
+ " 21.400000 | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " | Women who believe a husband is justified in beating his wife when she refuses sex with him (%) | \n",
+ " 14.500000 | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " | Women who were first married by age 15 (% of women ages 20-24) | \n",
+ " 3.700000 | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " | Women who were first married by age 18 (% of women ages 20-24) | \n",
+ " 32.400000 | \n",
+ " 2015 | \n",
+ "
\n",
+ " \n",
+ " | Women's share of population ages 15+ living with HIV (%) | \n",
+ " 59.800000 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
307665 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value \\\n",
+ "country indicator_name \n",
+ "Afghanistan ARI treatment (% of children under 5 taken to a... 61.500000 \n",
+ " Access to clean fuels and technologies for cook... 32.440000 \n",
+ " Access to electricity (% of population) 98.713203 \n",
+ " Access to electricity, rural (% of rural popula... 98.272872 \n",
+ " Access to electricity, urban (% of urban popula... 100.000000 \n",
+ "... ... \n",
+ "Zimbabwe Women who believe a husband is justified in bea... 21.400000 \n",
+ " Women who believe a husband is justified in bea... 14.500000 \n",
+ " Women who were first married by age 15 (% of wo... 3.700000 \n",
+ " Women who were first married by age 18 (% of wo... 32.400000 \n",
+ " Women's share of population ages 15+ living wit... 59.800000 \n",
+ "\n",
+ " year \n",
+ "country indicator_name \n",
+ "Afghanistan ARI treatment (% of children under 5 taken to a... 2015 \n",
+ " Access to clean fuels and technologies for cook... 2016 \n",
+ " Access to electricity (% of population) 2018 \n",
+ " Access to electricity, rural (% of rural popula... 2018 \n",
+ " Access to electricity, urban (% of urban popula... 2018 \n",
+ "... ... \n",
+ "Zimbabwe Women who believe a husband is justified in bea... 2015 \n",
+ " Women who believe a husband is justified in bea... 2015 \n",
+ " Women who were first married by age 15 (% of wo... 2015 \n",
+ " Women who were first married by age 18 (% of wo... 2015 \n",
+ " Women's share of population ages 15+ living wit... 2018 \n",
+ "\n",
+ "[307665 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Average value (and year) per country and indicator_name (survey question).\n",
+ "# Only the numeric columns are selected: the grouping keys already form the\n",
+ "# index, and mean() does not take a column name as its argument.\n",
+ "df_wb.groupby([\"country\", \"indicator_name\"])[[\"value\", \"year\"]].mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the rows for year 2018 that actually have a value.\n",
+ "# NOTE(review): the variable is called df_wb_2019 but the filter selects\n",
+ "# year 2018 - confirm which year is intended before renaming either one.\n",
+ "is_2018 = df_wb.year == 2018\n",
+ "df_wb_2019 = (\n",
+ "    df_wb.loc[is_2018, [\"country\", \"indicator_name\", \"value\"]]\n",
+ "    .dropna(subset=[\"value\"])\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " | indicator_name | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Mammal species, threatened | \n",
+ " 215 | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ " | Bird species, threatened | \n",
+ " 215 | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ " | Plant species (higher), threatened | \n",
+ " 215 | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ " | Fish species, threatened | \n",
+ " 215 | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ " | Adjusted savings: mineral depletion (current US$) | \n",
+ " 214 | \n",
+ " 214 | \n",
+ "
\n",
+ " \n",
+ " | Access to electricity (% of population) | \n",
+ " 214 | \n",
+ " 214 | \n",
+ "
\n",
+ " \n",
+ " | Access to electricity, urban (% of urban population) | \n",
+ " 212 | \n",
+ " 212 | \n",
+ "
\n",
+ " \n",
+ " | Terrestrial protected areas (% of total land area) | \n",
+ " 211 | \n",
+ " 211 | \n",
+ "
\n",
+ " \n",
+ " | Terrestrial and marine protected areas (% of total territorial area) | \n",
+ " 210 | \n",
+ " 210 | \n",
+ "
\n",
+ " \n",
+ " | Surface area (sq. km) | \n",
+ " 209 | \n",
+ " 209 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country value\n",
+ "indicator_name \n",
+ "Mammal species, threatened 215 215\n",
+ "Bird species, threatened 215 215\n",
+ "Plant species (higher), threatened 215 215\n",
+ "Fish species, threatened 215 215\n",
+ "Adjusted savings: mineral depletion (current US$) 214 214\n",
+ "Access to electricity (% of population) 214 214\n",
+ "Access to electricity, urban (% of urban popula... 212 212\n",
+ "Terrestrial protected areas (% of total land area) 211 211\n",
+ "Terrestrial and marine protected areas (% of to... 210 210\n",
+ "Surface area (sq. km) 209 209"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Top 10 indicators ranked by the number of rows in df_wb_2019 that report them\n",
+ "indicator_counts = df_wb_2019.groupby(\"indicator_name\").count()\n",
+ "indicator_counts.sort_values(by=\"value\", ascending=False).head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " | indicator_name | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Adjusted savings: education expenditure (% of GNI) | \n",
+ " 198 | \n",
+ " 198 | \n",
+ "
\n",
+ " \n",
+ " | Compulsory education, duration (years) | \n",
+ " 178 | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " | Adjusted savings: education expenditure (current US$) | \n",
+ " 178 | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " | Preprimary education, duration (years) | \n",
+ " 177 | \n",
+ " 177 | \n",
+ "
\n",
+ " \n",
+ " | Primary education, pupils (% female) | \n",
+ " 89 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ " | Primary education, teachers | \n",
+ " 89 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ " | Primary education, teachers (% female) | \n",
+ " 89 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ " | Primary education, pupils | \n",
+ " 89 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ " | Secondary education, general pupils | \n",
+ " 86 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " | Secondary education, general pupils (% female) | \n",
+ " 85 | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country value\n",
+ "indicator_name \n",
+ "Adjusted savings: education expenditure (% of GNI) 198 198\n",
+ "Compulsory education, duration (years) 178 178\n",
+ "Adjusted savings: education expenditure (curren... 178 178\n",
+ "Preprimary education, duration (years) 177 177\n",
+ "Primary education, pupils (% female) 89 89\n",
+ "Primary education, teachers 89 89\n",
+ "Primary education, teachers (% female) 89 89\n",
+ "Primary education, pupils 89 89\n",
+ "Secondary education, general pupils 86 86\n",
+ "Secondary education, general pupils (% female) 85 85"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Top 10 indicators by number of rows in df_wb_2019, restricted to the\n",
+ "# indicators whose name contains a given keyword\n",
+ "has_keyword = df_wb_2019.indicator_name.str.contains(\"education\")\n",
+ "keyword_counts = df_wb_2019[has_keyword].groupby(\"indicator_name\").count()\n",
+ "keyword_counts.sort_values(by=\"value\", ascending=False).head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Survey Table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Each survey row currently stores all of its questions and answers in a single nested JSON object (the `properties` column). Below we explode that object so that every survey question-answer pair gets its own row."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " source | \n",
+ " wave | \n",
+ " gid | \n",
+ " country | \n",
+ " countrycode | \n",
+ " adm_area_1 | \n",
+ " adm_area_2 | \n",
+ " adm_area_3 | \n",
+ " samplesize | \n",
+ " properties | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " EVS | \n",
+ " 1981-1984 | \n",
+ " [BEL] | \n",
+ " Belgium | \n",
+ " BEL | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 1145 | \n",
+ " {'A001': {'Label': 'Important in life: Family'... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " EVS | \n",
+ " 1981-1984 | \n",
+ " [CAN] | \n",
+ " Canada | \n",
+ " CAN | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " 1254 | \n",
+ " {'A001': {'Label': 'Important in life: Family'... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source wave gid country countrycode adm_area_1 adm_area_2 \\\n",
+ "0 EVS 1981-1984 [BEL] Belgium BEL None None \n",
+ "1 EVS 1981-1984 [CAN] Canada CAN None None \n",
+ "\n",
+ " adm_area_3 samplesize properties \n",
+ "0 None 1145 {'A001': {'Label': 'Important in life: Family'... \n",
+ "1 None 1254 {'A001': {'Label': 'Important in life: Family'... "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Peek at the first two survey rows\n",
+ "df_surveys.head(n=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "code_folding": []
+ },
+ "outputs": [],
+ "source": [
+ "def explode_survey_props(props):\n",
+ "    \"\"\"Flatten one survey `properties` mapping into four parallel lists.\n",
+ "\n",
+ "    Every key of `props` is a question code whose value provides 'Label',\n",
+ "    'Frequencies' and (except for the birth year/age questions X002/X003)\n",
+ "    'Categories'. One entry is produced per key of 'Frequencies'.\n",
+ "\n",
+ "    Returns:\n",
+ "        (q_label, label, category, frequency): equally long lists holding the\n",
+ "        question code, the question label, the answer category (the raw\n",
+ "        frequency key for X002/X003) and the stored frequency value.\n",
+ "\n",
+ "    Raises:\n",
+ "        AttributeError: if a frequency key matches neither key pattern.\n",
+ "        KeyError: if an extracted answer value is missing from 'Categories'.\n",
+ "    \"\"\"\n",
+ "    # Raw strings keep the regex escapes from being read as string escapes.\n",
+ "    # A frequency key is either <code>_<digits>_<value> or <code>_<value>;\n",
+ "    # group 1 is the answer value, which may carry leading minus signs.\n",
+ "    two_part_key = re.compile(r\"[A-Z]\\d+_\\d+_(-*\\d+)\")\n",
+ "    one_part_key = re.compile(r\"[A-Z]\\d\\d\\d[A-Z]*_(-*\\d+)\")\n",
+ "\n",
+ "    q_label, label, category, frequency = [], [], [], []\n",
+ "\n",
+ "    for line in props:\n",
+ "        if \"X023\" in line:\n",
+ "            continue  # this one too complicated to clean\n",
+ "        if \"X051\" in line:\n",
+ "            continue  # ethnic group; this one has no categories\n",
+ "        if \"original_region_code\" in line:\n",
+ "            # this one is not related\n",
+ "            # NOTE(review): `break` also drops every key that follows this one;\n",
+ "            # switch to `continue` if only this key should be skipped - confirm.\n",
+ "            break\n",
+ "\n",
+ "        question = props[line]\n",
+ "        for freq, share in question['Frequencies'].items():\n",
+ "            # Try the longer key form first, then fall back to the short one\n",
+ "            match = two_part_key.search(freq) or one_part_key.search(freq)\n",
+ "            q_val = match.group(1)\n",
+ "\n",
+ "            q_label.append(line)\n",
+ "            label.append(question['Label'])\n",
+ "            frequency.append(share)\n",
+ "\n",
+ "            if \"X002\" in freq or \"X003\" in freq:  # special case of birth year/age\n",
+ "                category.append(freq)\n",
+ "            else:\n",
+ "                category.append(question['Categories'][q_val])\n",
+ "\n",
+ "    return q_label, label, category, frequency"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build up in arrays: one entry per exploded question-answer pair\n",
+ "waves, gids, countries, samplesizes = [], [], [], []\n",
+ "q_labels, labels, categories, frequencies = [], [], [], []\n",
+ "\n",
+ "# Walk the survey rows by value instead of by index label, so the loop does\n",
+ "# not depend on df_surveys having a default 0..n-1 index\n",
+ "survey_rows = zip(df_surveys.wave, df_surveys.gid, df_surveys.country,\n",
+ "                  df_surveys.samplesize, df_surveys.properties)\n",
+ "for wave, gid, country, samplesize, props in survey_rows:\n",
+ "    q_label, label, category, frequency = explode_survey_props(props)\n",
+ "    n_answers = len(q_label)\n",
+ "\n",
+ "    # Row-level fields are repeated once per exploded answer\n",
+ "    waves.extend([wave] * n_answers)\n",
+ "    gids.extend([gid] * n_answers)\n",
+ "    countries.extend([country] * n_answers)\n",
+ "    samplesizes.extend([samplesize] * n_answers)\n",
+ "\n",
+ "    q_labels.extend(q_label)\n",
+ "    labels.extend(label)\n",
+ "    categories.extend(category)\n",
+ "    frequencies.extend(frequency)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the dataframe from a column-name -> values mapping. This avoids\n",
+ "# transposing a very wide 8-row frame and lets pandas infer a dtype per\n",
+ "# column instead of leaving every column as object.\n",
+ "cols = ['wave', 'gid', 'country', 'samplesize',\n",
+ "        'q_label', 'label', 'category', 'frequency']\n",
+ "column_values = [waves, gids, countries, samplesizes,\n",
+ "                 q_labels, labels, categories, frequencies]\n",
+ "survey_explode_df = pd.DataFrame(dict(zip(cols, column_values)))\n",
+ "survey_explode_df[\"frequency\"] = survey_explode_df[\"frequency\"].astype(float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "833664\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " wave | \n",
+ " gid | \n",
+ " country | \n",
+ " samplesize | \n",
+ " q_label | \n",
+ " label | \n",
+ " category | \n",
+ " frequency | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 661109 | \n",
+ " 2010-2014 | \n",
+ " [ESP.2_1] | \n",
+ " Spain | \n",
+ " 33 | \n",
+ " X028 | \n",
+ " Employment status | \n",
+ " Self employed | \n",
+ " 0.121212 | \n",
+ "
\n",
+ " \n",
+ " | 272821 | \n",
+ " 2005-2009 | \n",
+ " [MAR] | \n",
+ " Morocco | \n",
+ " 1200 | \n",
+ " E037 | \n",
+ " Government responsibility | \n",
+ " 4 | \n",
+ " 0.033333 | \n",
+ "
\n",
+ " \n",
+ " | 103880 | \n",
+ " 2008-2010 | \n",
+ " [NOR] | \n",
+ " Norway | \n",
+ " 1090 | \n",
+ " X003 | \n",
+ " Age | \n",
+ " X003_23 | \n",
+ " 0.019688 | \n",
+ "
\n",
+ " \n",
+ " | 790382 | \n",
+ " 2008-2010 | \n",
+ " [PRT.3_1, PRT.8_1, PRT.12.4_1, PRT.14_1, PRT.1... | \n",
+ " Portugal | \n",
+ " 183 | \n",
+ " X036 | \n",
+ " Profession/job | \n",
+ " \"Missing; Unkown\" | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 226109 | \n",
+ " 1999-2004 | \n",
+ " [PRI] | \n",
+ " Puerto Rico | \n",
+ " 720 | \n",
+ " X003 | \n",
+ " Age | \n",
+ " X003_-1 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " wave gid \\\n",
+ "661109 2010-2014 [ESP.2_1] \n",
+ "272821 2005-2009 [MAR] \n",
+ "103880 2008-2010 [NOR] \n",
+ "790382 2008-2010 [PRT.3_1, PRT.8_1, PRT.12.4_1, PRT.14_1, PRT.1... \n",
+ "226109 1999-2004 [PRI] \n",
+ "\n",
+ " country samplesize q_label label \\\n",
+ "661109 Spain 33 X028 Employment status \n",
+ "272821 Morocco 1200 E037 Government responsibility \n",
+ "103880 Norway 1090 X003 Age \n",
+ "790382 Portugal 183 X036 Profession/job \n",
+ "226109 Puerto Rico 720 X003 Age \n",
+ "\n",
+ " category frequency \n",
+ "661109 Self employed 0.121212 \n",
+ "272821 4 0.033333 \n",
+ "103880 X003_23 0.019688 \n",
+ "790382 \"Missing; Unkown\" 0.000000 \n",
+ "226109 X003_-1 0.000000 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of exploded rows, followed by a random sample of five of them\n",
+ "print(survey_explode_df.shape[0])\n",
+ "display(survey_explode_df.sample(n=5))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "74\n",
+ "['Important in life: Family' 'Important in life: Friends'\n",
+ " 'Important in life: Work' 'State of health (subjective)'\n",
+ " 'Important child qualities: independence'\n",
+ " 'Important child qualities: feeling of responsibility'\n",
+ " 'Important child qualities: obedience'\n",
+ " 'Member: Belong to religious organization'\n",
+ " 'Member: Belong to sports or recreation'\n",
+ " 'Active/Inactive membership of church or religious organization'\n",
+ " 'Active/Inactive membership of sport or recreation'\n",
+ " 'Most people can be trusted' 'Satisfaction with your life'\n",
+ " 'How much freedom of choice and control'\n",
+ " 'Schwartz: It is important to this person living in secure surroundings'\n",
+ " 'Schwartz: It is important to this person to have a good time'\n",
+ " 'Schwartz: It is important to this person to always behave properly'\n",
+ " 'Schwartz: It is important to this person to do something for the good of society'\n",
+ " 'Social position: People in their 20s'\n",
+ " 'Social position: People in their 40s']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Count the distinct question labels and list the first 20 of them\n",
+ "survey_questions = survey_explode_df[\"label\"].unique()\n",
+ "print(len(survey_questions))\n",
+ "print(survey_questions[:20])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['1981-1984', '1990-1993', '1999-2001', '2008-2010', '1989-1993',\n",
+ " '1994-1998', '1999-2004', '2005-2009', '2010-2014'], dtype=object)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Distinct survey waves (year ranges) present in the exploded data\n",
+ "survey_explode_df[\"wave\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " frequency | \n",
+ "
\n",
+ " \n",
+ " | country | \n",
+ " category | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Algeria | \n",
+ " Don´t know | \n",
+ " 0.003333 | \n",
+ "
\n",
+ " \n",
+ " | Missing; Unknown | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | No answer | \n",
+ " 0.000833 | \n",
+ "
\n",
+ " \n",
+ " | Not asked in survey | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | Not at all important | \n",
+ " 0.011667 | \n",
+ "
\n",
+ " \n",
+ " | Not very important | \n",
+ " 0.011667 | \n",
+ "
\n",
+ " \n",
+ " | Rather important | \n",
+ " 0.041667 | \n",
+ "
\n",
+ " \n",
+ " | Very important | \n",
+ " 0.930833 | \n",
+ "
\n",
+ " \n",
+ " | Argentina | \n",
+ " Don´t know | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | Missing; Unknown | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | No answer | \n",
+ " 0.002467 | \n",
+ "
\n",
+ " \n",
+ " | Not asked in survey | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | Not at all important | \n",
+ " 0.002577 | \n",
+ "
\n",
+ " \n",
+ " | Not very important | \n",
+ " 0.004394 | \n",
+ "
\n",
+ " \n",
+ " | Rather important | \n",
+ " 0.101518 | \n",
+ "
\n",
+ " \n",
+ " | Very important | \n",
+ " 0.889044 | \n",
+ "
\n",
+ " \n",
+ " | Armenia | \n",
+ " Don´t know | \n",
+ " 0.001592 | \n",
+ "
\n",
+ " \n",
+ " | Missing; Unknown | \n",
+ " 0.002474 | \n",
+ "
\n",
+ " \n",
+ " | No answer | \n",
+ " 0.000573 | \n",
+ "
\n",
+ " \n",
+ " | Not asked in survey | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " frequency\n",
+ "country category \n",
+ "Algeria Don´t know 0.003333\n",
+ " Missing; Unknown 0.000000\n",
+ " No answer 0.000833\n",
+ " Not asked in survey 0.000000\n",
+ " Not at all important 0.011667\n",
+ " Not very important 0.011667\n",
+ " Rather important 0.041667\n",
+ " Very important 0.930833\n",
+ "Argentina Don´t know 0.000000\n",
+ " Missing; Unknown 0.000000\n",
+ " No answer 0.002467\n",
+ " Not asked in survey 0.000000\n",
+ " Not at all important 0.002577\n",
+ " Not very important 0.004394\n",
+ " Rather important 0.101518\n",
+ " Very important 0.889044\n",
+ "Armenia Don´t know 0.001592\n",
+ " Missing; Unknown 0.002474\n",
+ " No answer 0.000573\n",
+ " Not asked in survey 0.000000"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# How do countries respond to Family Values in 2010-2014?\n",
+ "mask = (\n",
+ "    (survey_explode_df.wave == '2010-2014')\n",
+ "    & (survey_explode_df.label == \"Important in life: Family\")\n",
+ ")\n",
+ "# Restrict the sum to `frequency`: summing the remaining columns would fail or\n",
+ "# concatenate their values on pandas versions where numeric_only defaults to False.\n",
+ "survey_explode_df[mask].groupby([\"country\", \"category\"])[[\"frequency\"]].sum().head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " wave | \n",
+ "
\n",
+ " \n",
+ " | label | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Important child qualities: feeling of responsibility | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Important child qualities: independence | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Important child qualities: obedience | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Important in life: Family | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Important in life: Friends | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Important in life: Work | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | It is not important for me to know about science in my daily life | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Most important: first choice | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Most important: second choice | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Schwartz: It is important to this person living in secure surroundings | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Schwartz: It is important to this person to always behave properly | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Schwartz: It is important to this person to do something for the good of society | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | Schwartz: It is important to this person to have a good time | \n",
+ " 107 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country wave\n",
+ "label \n",
+ "Important child qualities: feeling of responsib... 107 9\n",
+ "Important child qualities: independence 107 9\n",
+ "Important child qualities: obedience 107 9\n",
+ "Important in life: Family 107 9\n",
+ "Important in life: Friends 107 9\n",
+ "Important in life: Work 107 9\n",
+ "It is not important for me to know about scienc... 107 9\n",
+ "Most important: first choice 107 9\n",
+ "Most important: second choice 107 9\n",
+ "Schwartz: It is important to this person living... 107 9\n",
+ "Schwartz: It is important to this person to alw... 107 9\n",
+ "Schwartz: It is important to this person to do ... 107 9\n",
+ "Schwartz: It is important to this person to hav... 107 9"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# For every question whose label mentions \"important\", count the distinct\n",
+ "# countries and the distinct waves in which it appears\n",
+ "mentions_important = survey_explode_df.label.str.contains(\"Important|important\")\n",
+ "important_rows = survey_explode_df[mentions_important]\n",
+ "important_rows.groupby(\"label\").agg({\"country\": \"nunique\", \"wave\": \"nunique\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " frequency | \n",
+ "
\n",
+ " \n",
+ " | country | \n",
+ " wave | \n",
+ " label | \n",
+ " category | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Albania | \n",
+ " 1994-1998 | \n",
+ " Active/Inactive membership of church or religious organization | \n",
+ " Active member | \n",
+ " 0.048048 | \n",
+ "
\n",
+ " \n",
+ " | Don´t know | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | Inactive member | \n",
+ " 0.162162 | \n",
+ "
\n",
+ " \n",
+ " | Missing; Unknown | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | No answer | \n",
+ " 0.003003 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | Zimbabwe | \n",
+ " 2010-2014 | \n",
+ " Year of birth | \n",
+ " X002_1994 | \n",
+ " 0.006580 | \n",
+ "
\n",
+ " \n",
+ " | X002_1995 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | X002_1996 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | X002_1997 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | X002_1999 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
354576 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " frequency\n",
+ "country wave label category \n",
+ "Albania 1994-1998 Active/Inactive membership of church or religio... Active member 0.048048\n",
+ " Don´t know 0.000000\n",
+ " Inactive member 0.162162\n",
+ " Missing; Unknown 0.000000\n",
+ " No answer 0.003003\n",
+ "... ...\n",
+ "Zimbabwe 2010-2014 Year of birth X002_1994 0.006580\n",
+ " X002_1995 0.000000\n",
+ " X002_1996 0.000000\n",
+ " X002_1997 0.000000\n",
+ " X002_1999 0.000000\n",
+ "\n",
+ "[354576 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sum samplesize and frequency within each (country, wave, label, category) group\n",
+ "group_keys = [\"country\", \"wave\", \"label\", \"category\"]\n",
+ "survey_explode_df.groupby(group_keys)[[\"samplesize\", \"frequency\"]].sum()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}