$$ \newcommand{\mybold}[1]{\boldsymbol{#1}} \newcommand{\trans}{\intercal} \newcommand{\norm}[1]{\left\Vert#1\right\Vert} \newcommand{\abs}[1]{\left|#1\right|} \newcommand{\bbr}{\mathbb{R}} \newcommand{\bbz}{\mathbb{Z}} \newcommand{\bbc}{\mathbb{C}} \newcommand{\gauss}[1]{\mathcal{N}\left(#1\right)} \newcommand{\chisq}[1]{\mathcal{\chi}^2_{#1}} \newcommand{\studentt}[1]{\mathrm{StudentT}_{#1}} \newcommand{\fdist}[2]{\mathrm{FDist}_{#1,#2}} \newcommand{\argmin}[1]{\underset{#1}{\mathrm{argmin}}\,} \newcommand{\projop}[1]{\underset{#1}{\mathrm{Proj}}\,} \newcommand{\proj}[1]{\underset{#1}{\mybold{P}}} \newcommand{\expect}[1]{\mathbb{E}\left[#1\right]} \newcommand{\prob}[1]{\mathbb{P}\left(#1\right)} \newcommand{\dens}[1]{\mathit{p}\left(#1\right)} \newcommand{\var}[1]{\mathrm{Var}\left(#1\right)} \newcommand{\cov}[1]{\mathrm{Cov}\left(#1\right)} \newcommand{\sumn}{\sum_{n=1}^N} \newcommand{\meann}{\frac{1}{N} \sumn} \newcommand{\cltn}{\frac{1}{\sqrt{N}} \sumn} \newcommand{\trace}[1]{\mathrm{trace}\left(#1\right)} \newcommand{\diag}[1]{\mathrm{Diag}\left(#1\right)} \newcommand{\grad}[2]{\nabla_{#1} \left. #2 \right.} \newcommand{\gradat}[3]{\nabla_{#1} \left. #2 \right|_{#3}} \newcommand{\fracat}[3]{\left. \frac{#1}{#2} \right|_{#3}} \newcommand{\W}{\mybold{W}} \newcommand{\w}{w} \newcommand{\wbar}{\bar{w}} \newcommand{\wv}{\mybold{w}} \newcommand{\X}{\mybold{X}} \newcommand{\x}{x} \newcommand{\xbar}{\bar{x}} \newcommand{\xv}{\mybold{x}} \newcommand{\Xcov}{\Sigmam_{\X}} \newcommand{\Xcovhat}{\hat{\Sigmam}_{\X}} \newcommand{\Covsand}{\Sigmam_{\mathrm{sand}}} \newcommand{\Covsandhat}{\hat{\Sigmam}_{\mathrm{sand}}} \newcommand{\Z}{\mybold{Z}} \newcommand{\z}{z} \newcommand{\zv}{\mybold{z}} \newcommand{\zbar}{\bar{z}} \newcommand{\Y}{\mybold{Y}} \newcommand{\Yhat}{\hat{\Y}} \newcommand{\y}{y} \newcommand{\yv}{\mybold{y}} \newcommand{\yhat}{\hat{\y}} \newcommand{\ybar}{\bar{y}} \newcommand{\res}{\varepsilon} \newcommand{\resv}{\mybold{\res}} \newcommand{\resvhat}{\hat{\mybold{\res}}} \newcommand{\reshat}{\hat{\res}} \newcommand{\betav}{\mybold{\beta}} \newcommand{\betavhat}{\hat{\betav}} \newcommand{\betahat}{\hat{\beta}} \newcommand{\betastar}{{\beta^{*}}} \newcommand{\bv}{\mybold{\b}} \newcommand{\bvhat}{\hat{\bv}} \newcommand{\alphav}{\mybold{\alpha}} \newcommand{\alphavhat}{\hat{\av}} \newcommand{\alphahat}{\hat{\alpha}} \newcommand{\omegav}{\mybold{\omega}} \newcommand{\gv}{\mybold{\gamma}} \newcommand{\gvhat}{\hat{\gv}} \newcommand{\ghat}{\hat{\gamma}} \newcommand{\hv}{\mybold{\h}} \newcommand{\hvhat}{\hat{\hv}} \newcommand{\hhat}{\hat{\h}} \newcommand{\gammav}{\mybold{\gamma}} \newcommand{\gammavhat}{\hat{\gammav}} \newcommand{\gammahat}{\hat{\gamma}} \newcommand{\new}{\mathrm{new}} \newcommand{\zerov}{\mybold{0}} \newcommand{\onev}{\mybold{1}} \newcommand{\id}{\mybold{I}} \newcommand{\sigmahat}{\hat{\sigma}} \newcommand{\etav}{\mybold{\eta}} \newcommand{\muv}{\mybold{\mu}} \newcommand{\Sigmam}{\mybold{\Sigma}} \newcommand{\rdom}[1]{\mathbb{R}^{#1}} \newcommand{\RV}[1]{\tilde{#1}} \def\A{\mybold{A}} \def\A{\mybold{A}} \def\av{\mybold{a}} \def\a{a} \def\B{\mybold{B}} \def\S{\mybold{S}} \def\sv{\mybold{s}} \def\s{s} \def\R{\mybold{R}} \def\rv{\mybold{r}} \def\r{r} \def\V{\mybold{V}} \def\vv{\mybold{v}} \def\v{v} \def\U{\mybold{U}} \def\uv{\mybold{u}} \def\u{u} \def\W{\mybold{W}} \def\wv{\mybold{w}} \def\w{w} \def\tv{\mybold{t}} \def\t{t} \def\Sc{\mathcal{S}} \def\ev{\mybold{e}} \def\Lammat{\mybold{\Lambda}} $$

Interpreting the coefficients and R output

library(tidyverse)
library(sandwich)

theme_update(text = element_text(size=24))
options(repr.plot.width=12, repr.plot.height=6)
── Attaching core tidyverse packages ─────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ─────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install <- FALSE

# https://avehtari.github.io/ROS-Examples/examples.html
if (install) {
    install.packages("rprojroot")
    remotes::install_github("avehtari/ROS-Examples", subdir="rpackage")    
}

library(rosdata)
data(kidiq)
head(kidiq)
A data.frame: 6 × 5
  kid_score mom_hs    mom_iq mom_work mom_age
      <int>  <int>     <dbl>    <int>   <int>
1        65      1 121.11753        4      27
2        98      1  89.36188        4      25
3        85      1 115.44316        4      27
4        83      1  99.44964        3      25
5       115      1  92.74571        4      27
6        98      0 107.90184        1      18

Let’s regress the kid’s IQ score on a constant and on whether or not the mom went to high school. This should be interpreted as a comparison between groups, not the effect of “going to high school.”

fit1 <- lm(kid_score ~ mom_hs, kidiq)
print(fit1)

Call:
lm(formula = kid_score ~ mom_hs, data = kidiq)

Coefficients:
(Intercept)       mom_hs  
      77.55        11.77  
kidiq %>% mutate(pred=fit1$fitted.values) %>%
ggplot() +
    geom_point(aes(x=mom_hs, y=kid_score)) +
    geom_point(aes(x=mom_hs, y=pred), color="red", size=5)

# There are many ways to get estimates out of lm.
print(summary(fit1))

cat("\n----------------\n")
print(summary(fit1)$coefficients)

cat("\n----------------\n")
print(coefficients(fit1))

Call:
lm(formula = kid_score ~ mom_hs, data = kidiq)

Residuals:
   Min     1Q Median     3Q    Max 
-57.55 -13.32   2.68  14.68  58.45 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   77.548      2.059  37.670  < 2e-16 ***
mom_hs        11.771      2.322   5.069 5.96e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 19.85 on 432 degrees of freedom
Multiple R-squared:  0.05613,   Adjusted R-squared:  0.05394 
F-statistic: 25.69 on 1 and 432 DF,  p-value: 5.957e-07


----------------
            Estimate Std. Error   t value      Pr(>|t|)
(Intercept) 77.54839   2.058612 37.670231 1.392224e-138
mom_hs      11.77126   2.322427  5.068516  5.956524e-07

----------------
(Intercept)      mom_hs 
   77.54839    11.77126 

We can interpret the coefficients in terms of within-group means. Arguably the easiest way to see this is to re-write our regressors. The problem uses \(\xv_n^\trans = (1, i_n)\), where \(i_n\) is \(1\) if the mom went to high school and \(0\) otherwise. It is somewhat easier to see what the coefficients are estimating if you write the regression in terms of the transformed variable

\[ \zv_n = \begin{pmatrix} 1 & -1 \\ 0 & 1 \end{pmatrix} \xv_n = \begin{pmatrix} 1 - i_n \\ i_n \end{pmatrix}, \]

since then

\[ \Z^\trans \Z = \begin{pmatrix} N_0 & 0 \\ 0 & N_1 \\ \end{pmatrix}, \]

where \(N_0\) and \(N_1\) are the respective counts of moms without and with a high school education. It follows that, if we regress \(\y_n \sim \zv_n^\trans \gammav\), then

\[ \gammahat = \begin{pmatrix}\ybar_0 \\ \ybar_1 \end{pmatrix}, \]

the means of the response within the two groups. Since \(\zv_n\) is an invertible transformation of \(\xv_n\), the least squares fit is the same at each possible regressor value, so we get

\[ \zv_n^\trans \gammahat = \xv_n^\trans \begin{pmatrix} 1 & 0 \\ -1 & 1 \end{pmatrix} \gammahat = \xv_n^\trans \betahat \quad\textrm{for all }\xv_n\quad \Rightarrow \]

\[ \betahat = \begin{pmatrix} 1 & 0 \\ -1 & 1 \end{pmatrix}\gammahat = \begin{pmatrix} 1 & 0 \\ -1 & 1 \end{pmatrix} \begin{pmatrix}\ybar_0 \\ \ybar_1 \end{pmatrix} = \begin{pmatrix} \ybar_0 \\ \ybar_1 - \ybar_0 \end{pmatrix}. \]

So in this case the least squares estimates are just comparisons of the within-group means.
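
To make this concrete, here is a minimal sketch that runs the regression on the transformed regressors directly (the names `kidiq_z`, `z0`, `z1`, and `fit_z` are just illustrative, not part of the original analysis): fitting on \((1 - i_n, i_n)\) with no intercept recovers the two group means, and multiplying by the matrix above recovers the coefficients of the original fit.

# Regress on the transformed regressors with no intercept; the coefficients
# are the within-group means (ybar_0, ybar_1).
kidiq_z <- kidiq %>% mutate(z0 = 1 - mom_hs, z1 = mom_hs)
fit_z <- lm(kid_score ~ z0 + z1 - 1, kidiq_z)
gammahat <- coefficients(fit_z)
print(gammahat)

# Mapping gammahat back through the transposed transformation matrix
# gives (ybar_0, ybar_1 - ybar_0), matching the original fit.
transform_mat <- matrix(c(1, -1, 0, 1), nrow=2)
print(transform_mat %*% gammahat)
print(coefficients(fit1))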

Exercise

Prove that, if \(\xv_n\) takes one of \(P\) distinct values, then \(\betahat\) is always a linear combination of the sample means of the response within the groups of observations sharing each distinct value. Find the linear combination, and recover the present result as a special case.

mean0 <- mean(kidiq %>% filter(mom_hs == 0) %>% pull(kid_score))
mean1 <- mean(kidiq %>% filter(mom_hs == 1) %>% pull(kid_score))
cbind(c(mean0, mean1 - mean0), coefficients(fit1))
A matrix: 2 × 2 of type dbl
(Intercept) 77.54839 77.54839
mom_hs 11.77126 11.77126

Let’s go through and confirm that lm is calculating exactly the same quantities we’ve been looking at in class.

Check that the coefficient estimates are given by \(\betavhat = (\X^\trans \X)^{-1} \X^\trans \Y\).

xmat <- model.matrix(fit1)
yvec <- kidiq$kid_score

xtx <- t(xmat) %*% xmat
betahat <- solve(xtx, t(xmat) %*% yvec)

cat("\nThe betahat is the same in lm:\n")
print(cbind(betahat, coefficients(fit1)))

The betahat is the same in lm:
                [,1]     [,2]
(Intercept) 77.54839 77.54839
mom_hs      11.77126 11.77126

Check that the residuals are given by \(\Y - \X \betavhat\).

reshat <- yvec - xmat %*% betahat
cat("\nThe estimated residuals are the same in lm:\n")
print(max(abs(reshat - fit1$residuals)))

The estimated residuals are the same in lm:
[1] 4.902745e-13

The estimated residual variance is given by \(\sigmahat^2 = \frac{1}{N - P} \sumn \reshat_n^2\).

sigmahat2 <- sum(reshat^2) / (nrow(xmat) - ncol(xmat))
cat("\nThe residual variance estimates are the same in lm:\n")
print(sigmahat2)
print(sigma(fit1)^2)

The residual variance estimates are the same in lm:
[1] 394.1231
[1] 394.1231

The estimated covariance \(\cov{\betavhat - \betav}\) is given by \(\sigmahat^2 (\X^\trans\X)^{-1}\), the homoskedastic covariance estimate.

betahat_cov <- sigmahat2 * solve(xtx)

cat("\nThe betahat covariance are the same in lm:\n")
print(betahat_cov)
print(vcov(fit1))

The betahat covariance are the same in lm:
            (Intercept)    mom_hs
(Intercept)    4.237883 -4.237883
mom_hs        -4.237883  5.393669
            (Intercept)    mom_hs
(Intercept)    4.237883 -4.237883
mom_hs        -4.237883  5.393669

The sandwich covariance matrix can also be used, though it is not the default in lm. Note that the sandwich package has many variants; read the documentation for more information.

cat("\nThe sandwich covariances match:\n")
score_cov <- t(xmat) %*% (diag(as.numeric(reshat)^2) %*% xmat)
betahat_sandwich_cov <- solve(xtx, score_cov) %*% solve(xtx)
print(betahat_sandwich_cov)
print(sandwich::vcovCL(fit1, type="HC0", cadjust=FALSE))

The sandwich covariances match:
            (Intercept)    mom_hs
(Intercept)    5.420399 -5.420399
mom_hs        -5.420399  6.481451
            (Intercept)    mom_hs
(Intercept)    5.420399 -5.420399
mom_hs        -5.420399  6.481451
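
As an aside, beyond the HC0 computation above: the sandwich package’s vcovHC function exposes the other common heteroskedasticity-consistent variants (for example, HC1 rescales HC0 by \(N/(N-P)\), and HC3, the vcovHC default, divides each squared residual by \((1 - h_n)^2\), where \(h_n\) is the leverage). A minimal sketch:

# A couple of the other heteroskedasticity-consistent variants.
print(sandwich::vcovHC(fit1, type="HC1"))
print(sandwich::vcovHC(fit1, type="HC3"))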

The reported “standard errors” are the square roots of the diagonal entries of the estimated covariance matrix.

cat("\nThe betahat standard errors are the same in lm:\n")
betahat_sd <- sqrt(diag(betahat_cov))
print(cbind(betahat_sd, summary(fit1)$coefficients[,"Std. Error"]))

The betahat standard errors are the same in lm:
            betahat_sd         
(Intercept)   2.058612 2.058612
mom_hs        2.322427 2.322427

Recall that, under the normal assumption, \(\betahat_k - \beta_k \sim \gauss{0, \sigma^2 v_k}\), where \(v_k\) denotes the \(k\)–th diagonal entry of \((\X^\trans\X)^{-1}\). We have shown that

\[ t_k := \frac{\betahat_k - \beta_k}{\sigmahat \sqrt{v_k}} \sim \studentt{N-P}. \]

It follows that, if \(\prob{t_k \le -z_\alpha} = \alpha / 2\), then

\[ \prob{\beta_k \in \left(\betahat_k \pm z_\alpha \sigmahat \sqrt{v_k}\right)} = 1 - \alpha. \]

Here \(\sigmahat \sqrt{v_k}\) is just the “standard error” from above, that is, the square root of the \(k\)–th diagonal entry of the estimated covariance matrix.

alpha = 0.05
zalpha <- -1 * qt(alpha / 2, df=nrow(xmat) - ncol(xmat))
ci <- betahat[2] +  betahat_sd[2] * c(-zalpha, zalpha)

cat("\nThe confidence values are the same as lm:\n")
print(ci)
print(confint(fit1, "mom_hs", level=1 - alpha))

The confidence values are the same as lm:
[1]  7.206598 16.335924
          2.5 %   97.5 %
mom_hs 7.206598 16.33592

The t-values and p-values correspond to hypothesis tests of the hypothesis that \(\beta_k = 0\), using the Student-t distribution.

First, the t-value is the statistic \(t_k\) defined above, evaluated at \(\beta_k = 0\). In other words, it is the value the t-statistic takes if \(\beta_k\) were actually equal to \(0\): the number of standard errors by which \(\betahat_k\) differs from zero.

The p-value is then the smallest \(\alpha\) at which \(0\) would have been excluded from your \(1 - \alpha\) confidence interval. Equivalently, it is the probability of observing a \(\abs{t_k}\) as large or larger than the one you actually observed, had \(\beta_k\) actually been equal to zero.

cat("\nThe t values are the same in lm:\n")
t_value <- betahat / betahat_sd
print(cbind(t_value, summary(fit1)$coefficients[, "t value"]))

cat("\nThe p values are the same in lm:\n")
df <- nrow(xmat) - ncol(xmat)
p_value <- pt(- abs(t_value), df=df) * 2
print(cbind(p_value, summary(fit1)$coefficients[, "Pr(>|t|)"]))

The t values are the same in lm:
                 [,1]      [,2]
(Intercept) 37.670231 37.670231
mom_hs       5.068516  5.068516

The p values are the same in lm:
                     [,1]          [,2]
(Intercept) 1.392224e-138 1.392224e-138
mom_hs       5.956524e-07  5.956524e-07

We can check that the F-statistic matches as well.

f_stats <- summary(fit1)$fstatistic
print(summary(fit1))
print(f_stats)

Call:
lm(formula = kid_score ~ mom_hs, data = kidiq)

Residuals:
   Min     1Q Median     3Q    Max 
-57.55 -13.32   2.68  14.68  58.45 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   77.548      2.059  37.670  < 2e-16 ***
mom_hs        11.771      2.322   5.069 5.96e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 19.85 on 432 degrees of freedom
Multiple R-squared:  0.05613,   Adjusted R-squared:  0.05394 
F-statistic: 25.69 on 1 and 432 DF,  p-value: 5.957e-07

    value     numdf     dendf 
 25.68986   1.00000 432.00000 
non_intercept_rows <- names(coefficients(fit1)) != "(Intercept)"
betahat_ni_cov <- betahat_cov[non_intercept_rows, non_intercept_rows, drop=FALSE]
betahat_ni <- coefficients(fit1)[non_intercept_rows, drop=FALSE]
f_stat_df <- sum(non_intercept_rows)
f_stat_df2 <- length(fit1$fitted.values) - length(coefficients(fit1))
f_stat <- t(betahat_ni) %*% solve(betahat_ni_cov, betahat_ni) / f_stat_df

cat("\nThe f statistic is same in lm:\n")
print(cbind(f_stat, f_stats["value"]))

cat("\nThe df statistic is same in lm:\n")
print(cbind(f_stat_df, f_stats["numdf"]))
print(cbind(f_stat_df2, f_stats["dendf"]))

cat("\nThe f test p value is same in lm:\n")
1 - pf(f_stat, f_stat_df, f_stat_df2)
cat("(Note that the summary() p-value is apparently computed in the print statement and not accessible directly)")

The f statistic is same in lm:
          [,1]     [,2]
value 25.68986 25.68986

The df statistic is same in lm:
      f_stat_df  
numdf         1 1
      f_stat_df2    
dendf        432 432

The f test p value is same in lm:
(Note that the summary() p-value is apparently computed in the print statement and not accessible directly)
A matrix: 1 × 1 of type dbl
5.956524e-07
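
One further sanity check, not shown in the output above but implied by the theory: when only a single non-intercept coefficient is tested, the F statistic is the square of the corresponding t statistic, and the two p-values coincide.

# With one tested coefficient, F = t^2 and the p-values agree.
t_mom_hs <- summary(fit1)$coefficients["mom_hs", "t value"]
cat("t^2:  ", t_mom_hs^2, "\n")
cat("F:    ", f_stats["value"], "\n")
cat("p (t):", summary(fit1)$coefficients["mom_hs", "Pr(>|t|)"], "\n")
cat("p (F):", 1 - pf(f_stats["value"], f_stats["numdf"], f_stats["dendf"]), "\n")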