STAT151A Homework 5 problem 2 solution

Author

Your name here

Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(mvtnorm)

# Ensure reproducibility
set.seed(130)

# Draw n_obs residuals.
# If normal is true, draw normal.
# If normal is false, draw mean zero log-normal.
# If x is not null, multiply the residual by the provided x.
DrawEps <- function(n_obs, normal, x=NULL) {
  eps_var <- 1
  if (normal) {
    eps <- rnorm(n_obs, 0, sd=sqrt(eps_var))
  } else {
    z <- rnorm(n_obs, 0, sd=sqrt(eps_var))
    # E[exp(z)] = exp(eps_var / 2), so subtracting it centers the draws at zero.
    eps <- exp(z) - exp(eps_var / 2)
  }
  
  if (!is.null(x)) {
    # Induce heteroskedasticity 
    eps <- eps * x
  }
  
  return(eps)
}


DrawX <- function(n_obs) {
  rnorm(n_obs) %>% matrix(ncol=1)
}

if (FALSE) {
  # Sanity check
  x <- DrawX(10000)
  mean(DrawEps(10000, TRUE))
  mean(DrawEps(10000, FALSE))
  mean(DrawEps(10000, TRUE, x=x))
  mean(DrawEps(10000, FALSE, x=x))
}
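
# Illustrative moment check (not part of the original solution): the shifted
# log-normal residuals are mean zero but have variance (exp(1) - 1) * exp(1),
# roughly 4.67, much larger than the unit-variance normal residuals, which
# helps explain the wider confidence intervals for the log-normal cases below.
# The draw size of 1e5 is arbitrary.
if (FALSE) {
  eps_normal <- DrawEps(1e5, normal=TRUE)
  eps_lognormal <- DrawEps(1e5, normal=FALSE)
  c(mean(eps_normal), var(eps_normal))        # approximately (0, 1)
  c(mean(eps_lognormal), var(eps_lognormal))  # approximately (0, 4.67)
  (exp(1) - 1) * exp(1)                       # theoretical log-normal variance
}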


# Return a dataframe with y, x1, and the true epsilon and ey.
# DrawEpsFun() should take a single argument, n_obs, and return a draw
# of n_obs residuals.
DrawData <- function(x_mat, DrawEpsFun, beta_true) {
  n_obs <- nrow(x_mat)
  eps <- DrawEpsFun(n_obs)
  ey <- cbind(rep(1, n_obs), x_mat) %*% beta_true
  y <- ey + eps
  df <- data.frame(y=y, x1=x_mat[,1], eps_true=eps, ey_true=ey)
  return(df)
}


if (FALSE) {
  # Sanity check
  n_obs <- 100
  x_mat <- DrawX(n_obs)
  beta_true <- c("(Intercept)"=1, "x1"=2)
  DrawEpsFun <- \(n_obs) { DrawEps(n_obs, normal=TRUE, x=x_mat[, 1])}
  df_sim <- DrawData(x_mat, DrawEpsFun, beta_true)
  plot(df_sim$x1, df_sim$eps_true)
}


# Run num_sims simulations with the provided DrawDataFun, which should take
# a single argument, beta_true.  The argument `level` specifies the confidence
# level of the interval for the slope.
RunExperiment <- function(DrawDataFun, level, num_sims, beta_true) {
  #pb <- txtProgressBar(max=num_sims, style=3) # qmd doesn't like this
  sim_df <- data.frame()
  for (n_sim in 1:num_sims) {
    #setTxtProgressBar(pb, n_sim)
    df_sim <- DrawDataFun(beta_true)
    reg_fit_sim <- lm(y ~ 1 + x1, df_sim)
    ci <- confint(reg_fit_sim, parm="x1", level=level)
    covered <- (beta_true["x1"] > ci[1]) && (beta_true["x1"] < ci[2])
    betahat <- coefficients(reg_fit_sim)["x1"]
    sim_df <- bind_rows(
      sim_df, 
      data.frame(sim=n_sim, covered=covered, cil=ci[1], ciu=ci[2], betahat=betahat))
  }
  #close(pb)
  return(sim_df)
}
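
The coverage check above leans on `confint()`, which for an `lm` fit builds the usual t interval: the slope estimate plus or minus the t quantile times its standard error. Below is a quick illustrative check of that equivalence on a single simulated dataset; the `demo_*` objects are introduced here only for demonstration and are not used in the experiment.

Code
# Illustrative only: confint() for lm reproduces the textbook t interval.
demo_x <- DrawX(50)
demo_df <- DrawData(demo_x, \(n) DrawEps(n, normal=TRUE), beta_true=c(1, 2))
demo_fit <- lm(y ~ 1 + x1, demo_df)
demo_se <- summary(demo_fit)$coefficients["x1", "Std. Error"]
demo_tq <- qt(1 - (1 - 0.8) / 2, df=demo_fit$df.residual)
coef(demo_fit)["x1"] + c(-1, 1) * demo_tq * demo_se
confint(demo_fit, parm="x1", level=0.8)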
Code
level <- 0.8
n_sims <- 5000
beta_true <- c("(Intercept)"=1, "x1"=2)

sim_df <- data.frame() # Simulation results
data_df <- data.frame() # Collect an example dataset 
for (n_obs in c(10, 500)) {
  x_mat <- DrawX(n_obs)
  for (heteroskedastic in c(FALSE, TRUE)) {
    for (normal in c(FALSE, TRUE)) {
      if (heteroskedastic) {
        x_arg <- x_mat[, 1]
      } else {
        x_arg <- NULL
      }
      DrawEpsFun <- \(n_obs) { DrawEps(n_obs, normal=normal, x=x_arg) }
      DrawDataFun <- \(beta_true) { DrawData(x_mat, DrawEpsFun, beta_true) }
      this_sim_df <- RunExperiment(
        DrawDataFun, num_sims=n_sims, level=level, beta_true=beta_true)
      data_df <- bind_rows(
        data_df,
        DrawDataFun(beta_true) %>% mutate(
          heteroskedastic=ifelse(heteroskedastic, "hetero", "homo"), 
          normal=ifelse(normal, "normal", "lognormal"),
          n_obs=n_obs)
      )
      sim_df <- bind_rows(
        sim_df,
        this_sim_df %>% mutate(
          heteroskedastic=ifelse(heteroskedastic, "hetero", "homo"), 
          normal=ifelse(normal, "normal", "lognormal"),
          n_obs=n_obs))
    }
  }
}

if (FALSE) {
  data_df %>%
    group_by(normal, n_obs, heteroskedastic) %>%
    summarize(eps_mean=mean(eps_true), eps_sd=sd(eps_true))
  
  ggplot(data_df) + 
    geom_histogram(aes(x=eps_true)) +
    facet_grid(normal ~ heteroskedastic)
}
Code
sim_df %>%
  group_by(heteroskedastic, normal, n_obs) %>%
  summarize(cil=mean(cil), ciu=mean(ciu), covered=mean(covered)) %>%
  mutate(ci_width=ciu - cil, 
         # Monte Carlo standard error of the estimated coverage at the nominal level
         covered_se=sqrt(level * (1 - level) / n_sims),
         covered_lower=covered - 2 * covered_se,
         covered_upper=covered + 2 * covered_se) %>%
  select(heteroskedastic, normal, n_obs, 
         covered_lower, covered, covered_upper, ci_width)
`summarise()` has grouped output by 'heteroskedastic', 'normal'. You can
override using the `.groups` argument.
# A tibble: 8 × 7
# Groups:   heteroskedastic, normal [4]
  heteroskedastic normal    n_obs covered_lower covered covered_upper ci_width
  <chr>           <chr>     <dbl>         <dbl>   <dbl>         <dbl>    <dbl>
1 hetero          lognormal    10         0.798   0.809         0.821    1.70 
2 hetero          lognormal   500         0.781   0.792         0.804    0.237
3 hetero          normal       10         0.573   0.585         0.596    0.805
4 hetero          normal      500         0.545   0.556         0.568    0.115
5 homo            lognormal    10         0.797   0.808         0.820    1.73 
6 homo            lognormal   500         0.781   0.793         0.804    0.237
7 homo            normal       10         0.789   0.800         0.812    0.977
8 homo            normal      500         0.785   0.797         0.808    0.111

We see that coverage is close to the nominal 80% level in every case except the heteroskedastic normal one, where it falls to roughly 0.56–0.59. As might be expected, larger residual variance widens the confidence intervals, as does a smaller number of observations.
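
One way to probe the heteroskedastic normal failure is to rebuild the slope interval from a heteroskedasticity-consistent covariance estimate. The sketch below assumes the `sandwich` and `lmtest` packages are available (neither is used elsewhere in this solution) and uses HC3 as one possible choice; the `x_check`, `check_df`, and `check_fit` objects are introduced only for illustration. On a single draw the robust interval will generally differ from the classical one, and repeating the comparison inside `RunExperiment` would show whether coverage returns to the nominal level.

Code
# Sketch (assumes the sandwich and lmtest packages are installed).
library(sandwich)
library(lmtest)
x_check <- DrawX(500)
check_df <- DrawData(x_check, \(n) DrawEps(n, normal=TRUE, x=x_check[, 1]), c(1, 2))
check_fit <- lm(y ~ 1 + x1, check_df)
# Classical vs heteroskedasticity-consistent (HC3) 80% intervals for the slope:
confint(check_fit, parm="x1", level=0.8)
coefci(check_fit, parm="x1", level=0.8, vcov.=vcovHC(check_fit, type="HC3"))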

Code
# Even with only 10 observations, the sampling distribution of betahat is
# centered near the true slope of 2.
ggplot(sim_df %>% filter(n_obs < 100)) + 
  geom_histogram(aes(x=betahat, fill=factor(n_obs), group=factor(n_obs))) +
  facet_grid(normal ~ heteroskedastic)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
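
As a complementary diagnostic (a sketch, not part of the graded output), the empirical spread of betahat across simulations can be compared with the average interval width; the under-coverage in the heteroskedastic normal cells should show up as intervals that are narrow relative to sd(betahat).

Code
# Sketch: compare the sampling variability of betahat with the average CI width.
sim_df %>%
  group_by(heteroskedastic, normal, n_obs) %>%
  summarize(betahat_sd=sd(betahat),
            mean_ci_width=mean(ciu - cil),
            .groups="drop")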