## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  fig.width = 8,
  fig.height = 8,
  message = FALSE,
  warning = FALSE
)
library(ggplot2); library(dplyr); library(tidyr)
data.table::setDTthreads(1L)
options(dplyr.summarise.inform = FALSE, scipen = 999, digits = 5)
theme_set(theme_bw(base_size = 12))

## ----dgp, class.source = "fold-show"------------------------------------------
# Parameters
baseline    <- 500
ratio_bias  <- 3    # late vs early slope multiplier
ratio_apos  <- 3    # men vs women slope multiplier
ages        <- 20:30
d_early     <- 25
d_late      <- 30

# Counterfactual earning slopes
s_f_25 <- baseline
s_f_30 <- baseline * ratio_bias
s_m_25 <- baseline * ratio_apos
s_m_30 <- baseline * ratio_apos * ratio_bias

# Treatment effects (ATT)
att_f  <- -6000
att_m  <- -2000

# Build each group
make_group <- function(sex, D, slope, att_level) {
  tibble(
    age    = ages,
    female = sex,
    D      = D,
    y_0    = slope * age,           # counterfactual (no treatment)
    y_1    = y_0 + ifelse(age >= D, att_level, 0),
    y      = y_1                     # observed
  )
}

data <- bind_rows(
  make_group(1, d_early, s_f_25, att_f),
  make_group(1, d_late,  s_f_30, att_f),
  make_group(0, d_early, s_m_25, att_m),
  make_group(0, d_late,  s_m_30, att_m)
)

## ----verify_pt_levels---------------------------------------------------------
# Calculate trend from pre-treatment (age D-1) to each subsequent age
trends <- function(g, d, a) {
  pre <- data |> filter(D == d, female == g, age == d-1) |> pull(y_0)
  post <- data |> filter(D == d, female == g, age == a) |> pull(y_0)
  post - pre
}

# Difference in trends between early and late cohorts
diff_trends <- function(g, a) trends(g, 25, a) - trends(g, 30, a)

expand_grid(
  female = c(0, 1),
  age    = ages
) |>
  mutate(
    diff_trend = purrr::map2_dbl(female, age, diff_trends),
    gender = factor(if_else(female == 1, "Women", "Men"), levels = c("Women", "Men"))
  ) |> 
  ggplot(aes(x = age, y = diff_trend, color = gender)) + 
  geom_line(linewidth = 1.1) + 
  labs(
    title = "Parallel Trends Violation in Levels by Gender",
    x = "Age", 
    y = "Difference in counterfactual trends", 
    color = "Gender"
  ) +
  theme(legend.position = "bottom")

## ----verify_normalized_pt-----------------------------------------------------
# Get counterfactual earnings for early cohort (APO)
get_apo <- function(g, a) {
  data |>
    dplyr::filter(D == 25, female == g, age == a) |>
    dplyr::pull(y_0)
}

expand_grid(
  female = c(0, 1),
  age    = ages
) |>
  mutate(
    diff_trend = purrr::map2_dbl(female, age, diff_trends),
    y_0 = purrr::map2_dbl(female, age, get_apo),
    norm_diff_trend = diff_trend / y_0,
    gender = factor(if_else(female == 1, "Women", "Men"), levels = c("Women", "Men"))
  ) |> 
  ggplot(aes(x = age, y = norm_diff_trend, color = gender)) + 
  geom_line(linewidth = 1.1) + 
  labs(
    title = "Normalized Parallel Trends Violation",
    x = "Age", 
    y = "Normalized difference in counterfactual trends", 
    color = "Gender"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme(legend.position = "bottom")

## ----fig1_setup, fig.height=4-------------------------------------------------
plot_df <- bind_rows(
  # Early cohort: observed and counterfactual
  data %>%
    filter(D == d_early) %>%
    pivot_longer(cols = c(y_0, y), names_to = "series", values_to = "value"),
  # Late cohort: counterfactual only (not yet treated at these ages for early cohort)
  data %>%
    filter(D == d_late) %>%
    transmute(age, female, D, series = "y_0", value = y_0)
) %>%
  mutate(
    gender = factor(if_else(female == 1, "Women", "Men"), levels = c("Women", "Men")),
    cohort = factor(if_else(D == d_early, "Early (D=25)", "Late (D=30)"),
                    levels = c("Early (D=25)", "Late (D=30)")),
    series_label = case_when(
      cohort == "Early (D=25)" & series == "y"   ~ "Early — Observed",
      cohort == "Early (D=25)" & series == "y_0" ~ "Early — Counterfactual",
      cohort == "Late (D=30)"  & series == "y_0" ~ "Late — Counterfactual"
    ),
    linetype = if_else(series_label == "Early — Observed", "solid", "dashed"),
    color = case_when(
      series_label == "Early — Observed"       ~ "Obs.",
      series_label == "Early — Counterfactual" ~ "CF Early",
      TRUE                                     ~ "CF Late"
    )
  )

ggplot(plot_df, aes(x = age, y = value, group = series_label)) +
  geom_line(aes(linetype = linetype, color = color), linewidth = 1.1) +
  facet_wrap(~ gender, nrow = 1) +
  scale_linetype_identity() +
  scale_color_manual(
    values = c("Obs." = "#1f77b4", "CF Early" = "#9ecae1", "CF Late" = "#fdae6b")
  ) +
  labs(
    title = "Figure 1: Observed vs. Counterfactual Earnings",
    x = "Age", y = "Earnings", color = NULL
  ) +
  theme(legend.position = "bottom")

## ----fig2_did-----------------------------------------------------------------
# Calculate DID-imputed counterfactual
did_cf <- bind_rows(lapply(c(1, 0), function(g) {
  # Early cohort's pre-treatment level
  early_pre <- data %>%
    filter(female == g, D == d_early, age == d_early - 1) %>%
    pull(y)
  
  # Late cohort trend
  late_cf <- data %>% filter(female == g, D == d_late) %>% select(age, y_0)
  late_pre <- late_cf %>% filter(age == d_early - 1) %>% pull(y_0)
  
  # DID imputation: shift late trend to match early pre-treatment level
  tibble(
    age    = late_cf$age,
    female = g,
    D      = d_early,
    series = "y_cf_did",
    value  = early_pre + (late_cf$y_0 - late_pre)
  )
})) %>%
  mutate(
    gender = factor(if_else(female == 1, "Women", "Men"), levels = c("Women", "Men")),
    cohort = factor("Early (D=25)", levels = c("Early (D=25)", "Late (D=30)")),
    series_label = "Early — DID-imputed CF",
    linetype = "dotdash",
    color = "CF DID"
  )

plot_df_plus <- bind_rows(plot_df, did_cf)

ggplot(plot_df_plus, aes(x = age, y = value, group = series_label)) +
  geom_line(aes(linetype = linetype, color = color), linewidth = 1.1) +
  facet_wrap(~ gender, nrow = 1) +
  scale_linetype_identity() +
  scale_color_manual(
    values = c(
      "Obs." = "#1f77b4", "CF Early" = "#9ecae1", 
      "CF Late" = "#fdae6b", "CF DID" = "#2ca02c"
    )
  ) +
  labs(
    title = "Figure 2: DID-Imputed Counterfactual",
    subtitle = "Green line = what DID thinks would have happened without treatment",
    x = "Age", y = "Earnings", color = NULL
  ) +
  theme(legend.position = "bottom")

## ----fig3_decomposition-------------------------------------------------------
# Calculate true APO and ATE
early_true_age <- data %>%
  filter(D == d_early) %>%
  group_by(female, age) %>%
  summarise(APO_true = y_0, APO_obs = y, .groups = "drop") |> 
  mutate(ATE_true = APO_obs - APO_true)

# Calculate DID APO
early_did_age <- did_cf %>%
  group_by(female, age) %>%
  summarise(APO_did = value, .groups = "drop")

# Combine and calculate bias
summary_age <- early_true_age %>%
  left_join(early_did_age, by = c("female", "age")) %>%
  mutate(
    PT_bias = APO_did - APO_true,  # parallel trends bias
    ATE_did = APO_obs - APO_did,  # DID ATE = truth - PT bias
    gender  = factor(if_else(female == 1, "Women", "Men"), levels = c("Women", "Men"))
  )

# Prepare stacked bar chart data
apo_stack_age <- summary_age %>%
  transmute(
    gender, age = factor(age),
    measure = factor("APO", levels = c("APO", "ATE")),
    total = APO_did, comp_true = APO_true, comp_bias = PT_bias
  ) %>%
  pivot_longer(c(comp_true, comp_bias), names_to = "component", values_to = "value") %>%
  mutate(component = if_else(component == "comp_true", "Truth", "PT bias"))

ate_stack_age <- summary_age %>%
  transmute(
    gender, age = factor(age),
    measure = factor("ATE", levels = c("APO", "ATE")),
    total = ATE_did, comp_true = ATE_true, comp_bias = -PT_bias
  ) %>%
  pivot_longer(c(comp_true, comp_bias), names_to = "component", values_to = "value") %>%
  mutate(component = if_else(component == "comp_true", "Truth", "PT bias"))

bar_age_df <- bind_rows(apo_stack_age, ate_stack_age)

ggplot(bar_age_df, aes(x = age, y = value, fill = component)) +
  geom_col(width = 0.7) +
  facet_grid(rows = vars(measure), cols = vars(gender), scales = "free_y") +
  scale_fill_manual(
    values = c("Truth" = "#1f77b4", "PT bias" = "#fdae6b"),
    breaks = c("Truth", "PT bias")
  ) +
  labs(
    title = "Decomposing DID into effect and bias",
    subtitle = "Top row: DID APO | Bottom row: DID ATE",
    x = "Age", y = NULL, fill = NULL
  ) +
  theme(legend.position = "bottom")

## ----fig4_pt_ratio------------------------------------------------------------
pt_ratio_df <- summary_age %>%
  transmute(
    gender, age = factor(age),
    measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    component = "PT / DID APO",
    value = -PT_bias / APO_did
  )

bar_age_three <- bind_rows(
  apo_stack_age %>% mutate(measure = factor("APO", levels = c("APO","ATE","Ratios"))),
  ate_stack_age %>% mutate(measure = factor("ATE", levels = c("APO","ATE","Ratios"))),
  pt_ratio_df
)

ggplot(bar_age_three, aes(x = age, y = value, fill = component)) +
  geom_col(width = 0.7) +
  facet_grid(rows = vars(measure), cols = vars(gender), scales = "free_y") +
  scale_fill_manual(
    values = c(
      "Truth" = "#1f77b4", "PT bias" = "#fdae6b", "PT / DID APO" = "#99d8c9"
    )
  ) +
  labs(
    title = "Figure 4: Normalizing the PT Bias",
    subtitle = "Bottom row shows PT bias / DID APO",
    x = "Age", y = NULL, fill = NULL
  ) +
  theme(legend.position = "bottom")

## ----fig5_full_ratio----------------------------------------------------------
ratios_df <- summary_age %>%
  transmute(
    gender, age = factor(age),
    measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    `PT ratio`  = -PT_bias / APO_did,
    `ATE ratio` = ATE_true / APO_did
  ) %>%
  pivot_longer(
    cols = c(`PT ratio`, `ATE ratio`), 
    names_to = "component", 
    values_to = "value"
  )

bar_age_three2 <- bind_rows(
  apo_stack_age %>% mutate(measure = factor("APO", levels = c("APO","ATE","Ratios"))),
  ate_stack_age %>% mutate(measure = factor("ATE", levels = c("APO","ATE","Ratios"))),
  ratios_df
)

ggplot(bar_age_three2, aes(x = age, y = value, fill = component)) +
  geom_col(width = 0.7) +
  facet_grid(rows = vars(measure), cols = vars(gender), scales = "free_y") +
  scale_fill_manual(
    values = c(
      "Truth" = "#1f77b4", "PT bias" = "#fdae6b",
      "PT ratio" = "#99d8c9", "ATE ratio" = "#2ca25f"
    )
  ) +
  labs(
    title = "Figure 5: Complete Decomposition of the Normalized Ratio",
    subtitle = "Bottom row: Light teal (drops out when differencing) + Dark green (what remains)",
    x = "Age", y = NULL, fill = NULL
  ) +
  theme(legend.position = "bottom")

## ----fig6_multiplicative------------------------------------------------------

## sanity check
summary_age %>%
  transmute(
    gender, age = factor(age),
    measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    `ATE ratio` = ATE_true / APO_did,
    `ATE ratio check` = (ATE_true / APO_true) - (ATE_true / APO_true) * (PT_bias / APO_did)
  )

theta_df <- summary_age %>%
  transmute(
    gender, age = factor(age),
    APO_true, ATE_true, APO_did, PT_bias,
    theta_true = ATE_true / APO_true,
    bias_A = -PT_bias / APO_did,
    bias_B = - theta_true * (PT_bias / APO_did)
  )

theta_stack <- bind_rows(
  theta_df %>% transmute(
    gender, age, measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    component = "Truth", value = theta_true
  ),
  theta_df %>% transmute(
    gender, age, measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    component = "Bias A", value = bias_A
  ),
  theta_df %>% transmute(
    gender, age, measure = factor("Ratios", levels = c("APO","ATE","Ratios")),
    component = "Bias B", value = bias_B
  )
)

bar_age_three3 <- bind_rows(
  apo_stack_age %>% mutate(measure = factor("APO", levels = c("APO","ATE","Ratios"))),
  ate_stack_age %>% mutate(measure = factor("ATE", levels = c("APO","ATE","Ratios"))),
  theta_stack
)

ggplot(bar_age_three3, aes(x = age, y = value, fill = component)) +
  geom_col(width = 0.7) +
  facet_grid(rows = vars(measure), cols = vars(gender), scales = "free_y") +
  scale_fill_manual(
    values = c(
      "Truth" = "#1f77b4", "PT bias" = "#fdae6b",
      "Bias A" = "#99d8c9", "Bias B" = "#2ca25f"
    ),
    breaks = c("Truth", "PT bias", "Bias A", "Bias B"),
    labels = c("Truth", "PT bias", "PT / DID APO", "Truth × Bias")
  ) +
  labs(
    title = "Figure 6: The Multiplicative Bias Structure",
    subtitle = "Bottom row: Blue (true θ) + Teal (drops out) + Green (multiplicative bias)",
    x = "Age", y = NULL, fill = NULL
  ) +
  theme(legend.position = "bottom")