diff --git a/.gitignore b/.gitignore index 3c777f6..b242258 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ altdoc/freeze.rds _quarto/* !_quarto/_freeze/ *_files/ +_quarto/ diff --git a/README.md b/README.md index ea8faee..8928dcb 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ The `unitdid` package provides a set of functions for the analysis of -the unit-level difference-in-differences (Arkhangelsky, Yanagimoto, and -Zohar, 2024) +the unit-level difference-in-differences ([Arkhangelsky, Yanagimoto, and +Zohar 2024](https://arxiv.org/abs/2403.19563)). ## Installation @@ -158,3 +158,10 @@ sum_eage |> ``` + +## References + +Arkhangelsky, Dmitry, Kazuharu Yanagimoto, and Tom Zohar. 2024. +“Flexible Analysis of Individual Heterogeneity in Event Studies: +Application to the Child Penalty.” arXiv. +<https://arxiv.org/abs/2403.19563>. diff --git a/README.qmd b/README.qmd index 28ae929..8431a11 100644 --- a/README.qmd +++ b/README.qmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( The `unitdid` package provides a set of functions for the analysis of -the unit-level difference-in-differences (Arkhangelsky, Yanagimoto, and Zohar, 2024) +the unit-level difference-in-differences ([Arkhangelsky, Yanagimoto, and Zohar 2024](https://arxiv.org/abs/2403.19563)). ## Installation @@ -130,3 +130,6 @@ sum_eage |> panel.grid.minor = element_blank()) ``` +## References + +Arkhangelsky, Dmitry, Kazuharu Yanagimoto, and Tom Zohar. 2024. "Flexible Analysis of Individual Heterogeneity in Event Studies: Application to the Child Penalty." arXiv. [https://arxiv.org/abs/2403.19563](https://arxiv.org/abs/2403.19563). 
\ No newline at end of file diff --git a/_quarto/_freeze/index/execute-results/html.json b/_quarto/_freeze/index/execute-results/html.json index 836fef5..9cb8e16 100644 --- a/_quarto/_freeze/index/execute-results/html.json +++ b/_quarto/_freeze/index/execute-results/html.json @@ -1,9 +1,11 @@ { - "hash": "ca7f64c46b9360786d1547d80d5d3800", + "hash": "ca9eb4cbd691a4666e4e735949aff977", "result": { "engine": "knitr", - "markdown": "\n\n\n\n\n\n# unitdid\n\n\n\n\nThe `unitdid` package provides a set of functions for the analysis of\nthe unit-level difference-in-differences (Arkhangelsky, Yanagimoto, and Zohar, 2024)\n\n## Installation\n\nYou can install the development version of unitdid from [GitHub](https://github.com/kazuyanagimoto/unitdid) with:\n\n``` r\n# install.packages(\"remotes\")\nremotes::install_github(\"kazuyanagimoto/unitdid\")\n```\n\n## Example\n\nThis is a basic example with the simulated `base_heterocp` data set:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(unitdid)\nlibrary(dplyr)\nlibrary(ggplot2)\n\nbase_heterocp |>\n head()\n#> # A tibble: 6 × 5\n#> id year byear cyear y\n#> \n#> 1 1 1999 1955 1985 -0.848\n#> 2 1 2000 1955 1985 0.759\n#> 3 1 2001 1955 1985 -1.03 \n#> 4 1 2002 1955 1985 0.858\n#> 5 1 2003 1955 1985 -0.866\n#> 6 1 2004 1955 1985 -0.651\n```\n:::\n\n\nIndividual-level child penalties are estimated by `unitdid()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmdl_base <- base_heterocp |>\n unitdid(yname = \"y\",\n iname = \"id\",\n tname = \"year\",\n ename = \"cyear\",\n bname = \"byear\")\n\n# Estimated individual-level child penalties (y_tilde)\nget_unitdid(mdl_base)\n#> # A tibble: 32,257 × 6\n#> id year byear cyear y y_tilde\n#> \n#> 1 705 2000 1957 2000 0.138 -0.0287 \n#> 2 997 2000 1958 2000 0.138 0.0849 \n#> 3 998 2000 1958 2000 0.119 -0.104 \n#> 4 1013 2000 1958 2000 0.115 -0.0000709\n#> 5 1082 2000 1958 2000 0.0362 0.00549 \n#> 6 1127 2000 1958 2000 0.386 0.125 \n#> 7 1225 2001 1959 2001 0.158 -0.118 \n#> 8 1228 2000 1959 
2000 0.241 -0.0937 \n#> 9 1228 2001 1959 2000 0.443 0.0226 \n#> 10 1230 2000 1959 2000 0.143 -0.0266 \n#> # ℹ 32,247 more rows\n```\n:::\n\n\n### Aggregation of Individual-level Child Penalties\n\nThey can be aggregated to the `full`,\n`event` (year at event (treatment). Mainly for staggered DiD design),\n`event_age` (age at event. Mainly for child penalties) levels:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(mdl_base) # default agg = \"full\"\n#> # A tibble: 6 × 3\n#> mean rel_time n\n#> \n#> 1 -0.0653 0 4357\n#> 2 -0.193 1 4357\n#> 3 -0.307 2 4357\n#> 4 -0.310 3 4357\n#> 5 -0.350 4 4357\n#> 6 -0.349 5 4357\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsum_eage <- summary(mdl_base, agg = \"event_age\")\n\nsum_eage |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1,\n mean = 0) |>\n bind_rows(sum_eage) |>\n filter(between(event_age, 25, 34)) |>\n mutate(lbl_facet = paste0(\"Age \", event_age)) |>\n ggplot(aes(x = rel_time, y = mean)) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet, ncol = 5) +\n labs(x = \"Time to First Childbirth\",\n y = \"Child Penalties on y\") +\n theme_minimal() +\n theme(panel.grid.major.x = element_blank(),\n panel.grid.minor = element_blank())\n```\n\n::: {.cell-output-display}\n![](man/figures/README-agg_cage-1.svg){width=100%}\n:::\n:::\n\n\n### Variance of Individual-level Child Penalties\n\nSince the individual-level child penalties are estimated with measurement errors,\nthe variance of the `y_tilde` is not equal to the variance of\nthe individual-level child penalties.\n\nThe `compute_varcov = \"var\"` option of the `unitdid` estimates the variance of\nthe measurement errors and the variance of the individual-level child penalties\nby subtracting the variance of the measurement errors from the variance of `y_tilde`\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmdl_base <- base_heterocp |>\n unitdid(yname = \"y\",\n iname = \"id\",\n 
tname = \"year\",\n ename = \"cyear\",\n bname = \"byear\",\n compute_varcov = \"var\")\n\nsum_eage <- summary(mdl_base, agg = \"event_age\")\n\nsum_eage |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1,\n var = 0) |>\n bind_rows(sum_eage) |>\n filter(between(event_age, 25, 34)) |>\n mutate(lbl_facet = paste0(\"Age \", event_age)) |>\n ggplot(aes(x = rel_time, y = sqrt(var))) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet, ncol = 5) +\n labs(x = \"Time to First Childbirth\",\n y = \"S.D. of Child Penalties\") +\n theme_minimal() +\n theme(panel.grid.major.x = element_blank(),\n panel.grid.minor = element_blank())\n```\n\n::: {.cell-output-display}\n![](man/figures/README-var_cage-1.svg){width=100%}\n:::\n:::\n", - "supporting": [], + "markdown": "\n\n\n\n\n\n# unitdid\n\n\n\n\nThe `unitdid` package provides a set of functions for the analysis of\nthe unit-level difference-in-differences ([Arkhangelsky, Yanagimoto, and Zohar 2024](https://arxiv.org/abs/2403.19563)).\n\n## Installation\n\nYou can install the development version of unitdid from [GitHub](https://github.com/kazuyanagimoto/unitdid) with:\n\n``` r\n# install.packages(\"remotes\")\nremotes::install_github(\"kazuyanagimoto/unitdid\")\n```\n\n## Example\n\nThis is a basic example with the simulated `base_heterocp` data set:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(unitdid)\nlibrary(dplyr)\nlibrary(ggplot2)\n\nbase_heterocp |>\n head()\n#> # A tibble: 6 × 5\n#> id year byear cyear y\n#> \n#> 1 1 1999 1955 1985 -0.848\n#> 2 1 2000 1955 1985 0.759\n#> 3 1 2001 1955 1985 -1.03 \n#> 4 1 2002 1955 1985 0.858\n#> 5 1 2003 1955 1985 -0.866\n#> 6 1 2004 1955 1985 -0.651\n```\n:::\n\n\nIndividual-level child penalties are estimated by `unitdid()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmdl_base <- base_heterocp |>\n unitdid(yname = \"y\",\n iname = \"id\",\n tname = \"year\",\n ename = \"cyear\",\n bname = 
\"byear\")\n\n# Estimated individual-level child penalties (y_tilde)\nget_unitdid(mdl_base)\n#> # A tibble: 32,257 × 6\n#> id year byear cyear y y_tilde\n#> \n#> 1 705 2000 1957 2000 0.138 -0.0287 \n#> 2 997 2000 1958 2000 0.138 0.0849 \n#> 3 998 2000 1958 2000 0.119 -0.104 \n#> 4 1013 2000 1958 2000 0.115 -0.0000709\n#> 5 1082 2000 1958 2000 0.0362 0.00549 \n#> 6 1127 2000 1958 2000 0.386 0.125 \n#> 7 1225 2001 1959 2001 0.158 -0.118 \n#> 8 1228 2000 1959 2000 0.241 -0.0937 \n#> 9 1228 2001 1959 2000 0.443 0.0226 \n#> 10 1230 2000 1959 2000 0.143 -0.0266 \n#> # ℹ 32,247 more rows\n```\n:::\n\n\n### Aggregation of Individual-level Child Penalties\n\nThey can be aggregated to the `full`,\n`event` (year at event (treatment). Mainly for staggered DiD design),\n`event_age` (age at event. Mainly for child penalties) levels:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(mdl_base) # default agg = \"full\"\n#> # A tibble: 6 × 3\n#> mean rel_time n\n#> \n#> 1 -0.0653 0 4357\n#> 2 -0.193 1 4357\n#> 3 -0.307 2 4357\n#> 4 -0.310 3 4357\n#> 5 -0.350 4 4357\n#> 6 -0.349 5 4357\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsum_eage <- summary(mdl_base, agg = \"event_age\")\n\nsum_eage |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1,\n mean = 0) |>\n bind_rows(sum_eage) |>\n filter(between(event_age, 25, 34)) |>\n mutate(lbl_facet = paste0(\"Age \", event_age)) |>\n ggplot(aes(x = rel_time, y = mean)) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet, ncol = 5) +\n labs(x = \"Time to First Childbirth\",\n y = \"Child Penalties on y\") +\n theme_minimal() +\n theme(panel.grid.major.x = element_blank(),\n panel.grid.minor = element_blank())\n```\n\n::: {.cell-output-display}\n![](man/figures/README-agg_cage-1.svg){width=100%}\n:::\n:::\n\n\n### Variance of Individual-level Child Penalties\n\nSince the individual-level child penalties are estimated with measurement errors,\nthe 
variance of the `y_tilde` is not equal to the variance of\nthe individual-level child penalties.\n\nThe `compute_varcov = \"var\"` option of the `unitdid` estimates the variance of\nthe measurement errors and the variance of the individual-level child penalties\nby subtracting the variance of the measurement errors from the variance of `y_tilde`\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmdl_base <- base_heterocp |>\n unitdid(yname = \"y\",\n iname = \"id\",\n tname = \"year\",\n ename = \"cyear\",\n bname = \"byear\",\n compute_varcov = \"var\")\n\nsum_eage <- summary(mdl_base, agg = \"event_age\")\n\nsum_eage |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1,\n var = 0) |>\n bind_rows(sum_eage) |>\n filter(between(event_age, 25, 34)) |>\n mutate(lbl_facet = paste0(\"Age \", event_age)) |>\n ggplot(aes(x = rel_time, y = sqrt(var))) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet, ncol = 5) +\n labs(x = \"Time to First Childbirth\",\n y = \"S.D. of Child Penalties\") +\n theme_minimal() +\n theme(panel.grid.major.x = element_blank(),\n panel.grid.minor = element_blank())\n```\n\n::: {.cell-output-display}\n![](man/figures/README-var_cage-1.svg){width=100%}\n:::\n:::\n\n\n## References\n\nArkhangelsky, Dmitry, Kazuharu Yanagimoto, and Tom Zohar. 2024. \"Flexible Analysis of Individual Heterogeneity in Event Studies: Application to the Child Penalty.\" arXiv. 
[https://arxiv.org/abs/2403.19563](https://arxiv.org/abs/2403.19563).", + "supporting": [ + "index_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_quarto/_freeze/vignettes/example_psid/execute-results/html.json b/_quarto/_freeze/vignettes/example_psid/execute-results/html.json index 4ae58b8..924514c 100644 --- a/_quarto/_freeze/vignettes/example_psid/execute-results/html.json +++ b/_quarto/_freeze/vignettes/example_psid/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "1d65ca6a1cd522ba3d642c37bb4b4ede", + "hash": "9a657b08153f611bd2803f52446b3027", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Individual-level Child Penalties with PSID\"\nauthor: Kazuharu Yanagimoto\ndate: today\nbibliography: references.bib\n---\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(unitdid)\nlibrary(here)\nlibrary(dplyr)\nlibrary(ggplot2)\nlibrary(purrr)\n```\n:::\n\n\n### Panel Study of Income Dynamics (PSID)\n\nThe PSID data is a longitudinal household survey of the United States and one of\nthe longest and most famous panel surveys in the world. The study began in 1968\nwith a nationally representative sample of over 18,000 individuals living in\n5,000 families. The data is available [here](https://psidonline.isr.umich.edu)\nfor the registered users.\n\nTo avoid the convoluted cleaning tasks, I mostly use the cleaning codes provided\nby the [Comparative Panel File (CPF)](https://www.cpfdata.com). Their cleaning\n[codes](https://www.cpfdata.com) are developed for creating a harmonized panel\ndata from seven countries (Australia, Germany, the UK, South Korea, Russia,\nSwitzerland, and the US).\n\nTo replicate this article, you need to run\n\n1. Run the code of the CPF only for the PSID\n - In step 4 (Add vars constant across all waves) of the\n `CPF-Code/03_PSID/us_01_3_GEt_vars.do`, add `ER32024` (birth year of the\n first child)\n - `us_01.dta`-`us_03.dta` will be produced\n2. 
Run the following code to create a dataset for the child penalties\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(haven)\ncpf_us01 <- read_dta(here(\"vignettes/us_01.dta\"))\ncpf_us03 <- read_dta(here(\"vignettes/us_03_CPF.dta\"))\n\never_birth <- cpf_us01 |>\n select(pid, cyear = ER32024) |>\n filter(cyear != 9999) |>\n distinct()\n\ncp_psid <- cpf_us03 |>\n filter(country == 3) |>\n select(id = pid,\n year = intyear,\n byear = yborn,\n gender = female,\n earn = incjobs_yg,\n hours = whweek,\n years_edu = eduy) |>\n mutate(gender = if_else(gender == 0, \"Men\", \"Women\"),\n particip = if_else(hours > 0, 1, 0),\n wage = if_else(particip == 1, earn / hours, 0),\n years_edu = if_else(years_edu < 0, NA_integer_, years_edu)) |>\n right_join(ever_birth, by = c(\"id\" = \"pid\")) |>\n mutate(rel_time = year - cyear) |>\n filter(!is.na(earn))\n\nsave(cp_psid, file = here(\"vignettes/cp_psid.rds\"))\n```\n:::\n\n\n## Analysis of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r .cell-code}\nload(here(\"vignettes/cp_psid.rds\"))\n\ncp_psid <- cp_psid |>\n mutate(eage = cyear - byear) |>\n filter(cyear >= 1968,\n year <= 1997, # The last year of the annual data\n between(eage, 22, 40)) # The Age range of the first childbirth\n\nmdl_particip <- cp_psid |>\n unitdid(yname = \"particip\",\n iname = \"id\",\n tname = \"year\",\n ename = \"cyear\",\n bname = \"byear\",\n by = c(\"gender\"),\n compute_varcov = \"var\")\n\nsum_particip <- summary(mdl_particip, agg = \"event_age\") |>\n filter(between(event_age, 26, 32))\n\ndf_plot <- sum_particip |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1, mean = 0,\n var = 0) |>\n bind_rows(sum_particip) |>\n mutate(lbl_facet = paste0(\"Age \", event_age))\n```\n:::\n\n\n### Mean of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\ndf_plot |>\n ggplot(aes(x = rel_time, y = mean,\n color = gender, shape = gender)) +\n geom_point() +\n geom_line() +\n 
geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet) +\n scale_color_manual(values = c(\"#009F8C\", \"#B75C9D\")) +\n labs(x = \"Time to First Childbirth\",\n y = NULL,\n color = NULL, shape = NULL) +\n theme_minimal() +\n theme(panel.grid.minor = element_blank(),\n panel.grid.major.x = element_blank(),\n legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/mean_cage-1.svg){width=672}\n:::\n:::\n\n\n### Standard Deviation of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\ndf_plot |>\n ggplot(aes(x = rel_time, y = sqrt(var),\n color = gender, shape = gender)) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet) +\n scale_color_manual(values = c(\"#009F8C\", \"#B75C9D\")) +\n labs(x = \"Time to First Childbirth\",\n y = NULL, color = NULL, shape = NULL) +\n theme_minimal() +\n theme(panel.grid.minor = element_blank(),\n panel.grid.major.x = element_blank(),\n legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/sd_cage-1.svg){width=672}\n:::\n:::\n\n\n\n## Scatter Plot with Covariates\n\nTo highlight the flexibility of the individual child penalties, let's plot the\nrelationship between the years of education and the child penalties.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nunitdid_particip <- get_unitdid(mdl_particip)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(patchwork)\n\nplot_scatter <- function(gender) {\n clr <- ifelse(gender == \"Men\", \"#009F8C\", \"#B75C9D\")\n\n unitdid_particip |>\n filter(gender == {{gender}}, between(rel_time, 0, 5)) |>\n mutate(lbl_facet = paste0(\"k = \", rel_time)) |>\n ggplot(aes(x = years_edu, y = particip_tilde)) +\n geom_point(color = clr, alpha = 0.1) +\n geom_smooth(method = \"lm\", color = clr, fill = clr) +\n 
facet_wrap(~lbl_facet, nrow = 1) +\n labs(x = \"Years of Education\",\n y = \"CP on Participation\",\n title = gender) +\n coord_cartesian(xlim = c(6, NA), ylim = c(-1.25, 0.5)) +\n theme_minimal()\n}\n\np1 <- plot_scatter(\"Men\")\np2 <- plot_scatter(\"Women\")\n\np1 / p2\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/unnamed-chunk-3-1.svg){width=672}\n:::\n:::\n\n\n### On Binscatter\n\nAnother way to visualize the relationship is to use the binscatter [@cattaneo2023].\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(binsreg)\n\ncompute_binsreg <- function(gender) {\n est <- unitdid_particip |>\n filter(gender == {{ gender }}) |>\n as.data.frame() |>\n binsreg(x = years_edu, y = particip_tilde, data = _, by = rel_time)\n\n map(0:5, ~{est$data.plot[[paste0(\"Group \", .x)]]$data.dots |>\n select(x, fit) |>\n mutate(rel_time = .x)}) |>\n list_rbind() |>\n mutate(gender = gender)\n}\n\nbs <- map(c(\"Men\", \"Women\"), compute_binsreg) |> list_rbind()\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nplot_binsreg <- function(gender) {\n clr <- ifelse(gender == \"Men\", \"#009F8C\", \"#B75C9D\")\n\n bs |>\n filter(gender == {{ gender }}) |>\n mutate(lbl_facet = paste0(\"k = \", rel_time)) |>\n ggplot(aes(x = x, y = fit)) +\n geom_point(color = clr) +\n facet_wrap(~lbl_facet, nrow = 1) +\n labs(x = \"Years of Education\",\n y = \"CP on Participation\",\n title = gender) +\n theme_minimal()\n}\n\np1 <- plot_binsreg(\"Men\")\np2 <- plot_binsreg(\"Women\")\n\np1 / p2\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/unnamed-chunk-6-1.svg){width=672}\n:::\n:::\n\n\n## References\n", + "markdown": "---\ntitle: \"Individual-level Child Penalties with PSID\"\nauthor: Kazuharu Yanagimoto\ndate: today\n---\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(unitdid)\nlibrary(here)\nlibrary(dplyr)\nlibrary(ggplot2)\nlibrary(purrr)\n```\n:::\n\n\n### Panel Study of Income Dynamics (PSID)\n\nThe PSID data is a 
longitudinal household survey of the United States and one of\nthe longest and most famous panel surveys in the world. The study began in 1968\nwith a nationally representative sample of over 18,000 individuals living in\n5,000 families. The data is available [here](https://psidonline.isr.umich.edu)\nfor the registered users.\n\nTo avoid the convoluted cleaning tasks, I mostly use the cleaning codes provided\nby the [Comparative Panel File (CPF)](https://www.cpfdata.com). Their cleaning\n[codes](https://www.cpfdata.com) are developed for creating a harmonized panel\ndata from seven countries (Australia, Germany, the UK, South Korea, Russia,\nSwitzerland, and the US).\n\nTo replicate this article, you need to run\n\n1. Run the code of the CPF only for the PSID\n - In step 4 (Add vars constant across all waves) of the\n `CPF-Code/03_PSID/us_01_3_GEt_vars.do`, add `ER32024` (birth year of the\n first child)\n - `us_01.dta`-`us_03.dta` will be produced\n2. Run the following code to create a dataset for the child penalties\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(haven)\ncpf_us01 <- read_dta(here(\"vignettes/us_01.dta\"))\ncpf_us03 <- read_dta(here(\"vignettes/us_03_CPF.dta\"))\n\never_birth <- cpf_us01 |>\n select(pid, cyear = ER32024) |>\n filter(cyear != 9999) |>\n distinct()\n\ncp_psid <- cpf_us03 |>\n filter(country == 3) |>\n select(id = pid,\n year = intyear,\n byear = yborn,\n gender = female,\n earn = incjobs_yg,\n hours = whweek,\n years_edu = eduy) |>\n mutate(gender = if_else(gender == 0, \"Men\", \"Women\"),\n particip = if_else(hours > 0, 1, 0),\n wage = if_else(particip == 1, earn / hours, 0),\n years_edu = if_else(years_edu < 0, NA_integer_, years_edu)) |>\n right_join(ever_birth, by = c(\"id\" = \"pid\")) |>\n mutate(rel_time = year - cyear) |>\n filter(!is.na(earn))\n\nsave(cp_psid, file = here(\"vignettes/cp_psid.rds\"))\n```\n:::\n\n\n## Analysis of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r 
.cell-code}\nload(here(\"vignettes/cp_psid.rds\"))\n\ncp_psid <- cp_psid |>\n mutate(eage = cyear - byear) |>\n filter(cyear >= 1968,\n year <= 1997, # The last year of the annual data\n between(eage, 22, 40))\n\nmdl_particip <- cp_psid |>\n unitdid(yname = \"particip\",\n iname = \"id\",\n tname = \"year\",\n ename = \"cyear\",\n bname = \"byear\",\n by = c(\"gender\"),\n compute_varcov = \"var\")\n\nsum_particip <- summary(mdl_particip, agg = \"event_age\") |>\n filter(between(event_age, 26, 32))\n\ndf_plot <- sum_particip |>\n filter(rel_time == 0) |>\n mutate(rel_time = -1, mean = 0,\n var = 0) |>\n bind_rows(sum_particip) |>\n mutate(lbl_facet = paste0(\"Age \", event_age))\n```\n:::\n\n\n### Mean of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\ndf_plot |>\n ggplot(aes(x = rel_time, y = mean,\n color = gender, shape = gender)) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet) +\n scale_color_manual(values = c(\"#009F8C\", \"#B75C9D\")) +\n labs(x = \"Time to First Childbirth\",\n y = NULL,\n color = NULL, shape = NULL) +\n theme_minimal() +\n theme(panel.grid.minor = element_blank(),\n panel.grid.major.x = element_blank(),\n legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/mean_cage-1.svg){width=672}\n:::\n:::\n\n\n### Standard Deviation of Child Penalties on Participation\n\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\ndf_plot |>\n ggplot(aes(x = rel_time, y = sqrt(var),\n color = gender, shape = gender)) +\n geom_point() +\n geom_line() +\n geom_vline(xintercept = -1, linetype = \"dashed\") +\n geom_hline(yintercept = 0) +\n facet_wrap(~lbl_facet) +\n scale_color_manual(values = c(\"#009F8C\", \"#B75C9D\")) +\n labs(x = \"Time to First Childbirth\",\n y = NULL, color = NULL, shape = NULL) +\n theme_minimal() +\n theme(panel.grid.minor = element_blank(),\n 
panel.grid.major.x = element_blank(),\n legend.position = \"bottom\")\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/sd_cage-1.svg){width=672}\n:::\n:::\n\n\n\n## Scatter Plot with Covariates\n\nTo highlight the flexibility of the individual child penalties, let's plot the\nrelationship between the years of education and the child penalties.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nunitdid_particip <- get_unitdid(mdl_particip)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(patchwork)\n\nplot_scatter <- function(gender) {\n clr <- ifelse(gender == \"Men\", \"#009F8C\", \"#B75C9D\")\n\n unitdid_particip |>\n filter(gender == {{gender}}, between(rel_time, 0, 5)) |>\n mutate(lbl_facet = paste0(\"k = \", rel_time)) |>\n ggplot(aes(x = years_edu, y = particip_tilde)) +\n geom_point(color = clr, alpha = 0.1) +\n geom_smooth(method = \"lm\", color = clr, fill = clr) +\n facet_wrap(~lbl_facet, nrow = 1) +\n labs(x = \"Years of Education\",\n y = \"CP on Participation\",\n title = gender) +\n coord_cartesian(xlim = c(6, NA), ylim = c(-1.25, 0.5)) +\n theme_minimal()\n}\n\np1 <- plot_scatter(\"Men\")\np2 <- plot_scatter(\"Women\")\n\np1 / p2\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/unnamed-chunk-3-1.svg){width=672}\n:::\n:::\n\n\n### On Binscatter\n\nAnother way to visualize the relationship is to use the binscatter [@cattaneo2023].\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(binsreg)\n\ncompute_binsreg <- function(gender) {\n est <- unitdid_particip |>\n filter(gender == {{ gender }}) |>\n as.data.frame() |>\n binsreg(x = years_edu, y = particip_tilde,\n data = _, by = rel_time)\n\n map(0:5, ~{est$data.plot[[paste0(\"Group \", .x)]]$data.dots |>\n select(x, fit) |>\n mutate(rel_time = .x)}) |>\n list_rbind() |>\n mutate(gender = gender)\n}\n\nbs <- map(c(\"Men\", \"Women\"), compute_binsreg) |> list_rbind()\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-fold=\"true\"}\nplot_binsreg 
<- function(gender) {\n clr <- ifelse(gender == \"Men\", \"#009F8C\", \"#B75C9D\")\n\n bs |>\n filter(gender == {{ gender }}) |>\n mutate(lbl_facet = paste0(\"k = \", rel_time)) |>\n ggplot(aes(x = x, y = fit)) +\n geom_point(color = clr) +\n facet_wrap(~lbl_facet, nrow = 1) +\n labs(x = \"Years of Education\",\n y = \"CP on Participation\",\n title = gender) +\n theme_minimal()\n}\n\np1 <- plot_binsreg(\"Men\")\np2 <- plot_binsreg(\"Women\")\n\np1 / p2\n```\n\n::: {.cell-output-display}\n![](example_psid_files/figure-html/unnamed-chunk-6-1.svg){width=672}\n:::\n:::\n\n\n## References\n", "supporting": [ "example_psid_files" ], diff --git a/altdoc/quarto_website.yml b/altdoc/quarto_website.yml index 671472f..62e0b2b 100644 --- a/altdoc/quarto_website.yml +++ b/altdoc/quarto_website.yml @@ -44,4 +44,6 @@ format: - custom.scss highlight-style: nord code-copy: true - code-overflow: scroll \ No newline at end of file + code-overflow: scroll + +bibliography: ../references.bib \ No newline at end of file diff --git a/references.bib b/references.bib new file mode 100644 index 0000000..9ce9f6a --- /dev/null +++ b/references.bib @@ -0,0 +1,28 @@ +@misc{cattaneo2023, + title = {On {{Binscatter}}}, + author = {Cattaneo, Matias D. and Crump, Richard K. and Farrell, Max H. 
and Feng, Yingjie}, + year = {2023}, + month = nov, + number = {arXiv:1902.09608}, + eprint = {1902.09608}, + primaryclass = {econ, stat}, + publisher = {{arXiv}}, + doi = {10.48550/arXiv.1902.09608}, + urldate = {2024-02-05}, + archiveprefix = {arxiv}, + keywords = {Economics - Econometrics,Statistics - Machine Learning,Statistics - Methodology} +} +@misc{arkhangelsky2024, + title = {Flexible {{Analysis}} of {{Individual Heterogeneity}} in {{Event Studies}}: {{Application}} to the {{Child Penalty}}}, + shorttitle = {Flexible {{Analysis}} of {{Individual Heterogeneity}} in {{Event Studies}}}, + author = {Arkhangelsky, Dmitry and Yanagimoto, Kazuharu and Zohar, Tom}, + year = {2024}, + month = mar, + number = {arXiv:2403.19563}, + eprint = {2403.19563}, + primaryclass = {econ, q-fin}, + publisher = {arXiv}, + urldate = {2024-03-29}, + archiveprefix = {arxiv}, + keywords = {Economics - General Economics} +} diff --git a/vignettes/example_psid.qmd b/vignettes/example_psid.qmd index ef53539..5c962ec 100644 --- a/vignettes/example_psid.qmd +++ b/vignettes/example_psid.qmd @@ -2,7 +2,6 @@ title: "Individual-level Child Penalties with PSID" author: Kazuharu Yanagimoto date: today -bibliography: references.bib --- ```{r, include = FALSE} @@ -89,7 +88,7 @@ cp_psid <- cp_psid |> mutate(eage = cyear - byear) |> filter(cyear >= 1968, year <= 1997, # The last year of the annual data - between(eage, 22, 40)) # The Age range of the first childbirth + between(eage, 22, 40)) mdl_particip <- cp_psid |> unitdid(yname = "particip", @@ -224,7 +223,8 @@ compute_binsreg <- function(gender) { est <- unitdid_particip |> filter(gender == {{ gender }}) |> as.data.frame() |> - binsreg(x = years_edu, y = particip_tilde, data = _, by = rel_time) + binsreg(x = years_edu, y = particip_tilde, + data = _, by = rel_time) map(0:5, ~{est$data.plot[[paste0("Group ", .x)]]$data.dots |> select(x, fit) |> diff --git a/vignettes/references.bib b/vignettes/references.bib deleted file mode 100644 index 
16fb2b4..0000000 --- a/vignettes/references.bib +++ /dev/null @@ -1,14 +0,0 @@ -@misc{cattaneo2023, - title = {On {{Binscatter}}}, - author = {Cattaneo, Matias D. and Crump, Richard K. and Farrell, Max H. and Feng, Yingjie}, - year = {2023}, - month = nov, - number = {arXiv:1902.09608}, - eprint = {1902.09608}, - primaryclass = {econ, stat}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.1902.09608}, - urldate = {2024-02-05}, - archiveprefix = {arxiv}, - keywords = {Economics - Econometrics,Statistics - Machine Learning,Statistics - Methodology} -}