实现简单线性回归

R
econometrics
Author

Bowen Qin

Published

April 9, 2024

Modified

April 11, 2024

library(tidyverse)
theme_set(hrbrthemes::theme_ipsum_rc())

准备花几期时间,自己写代码,实现常用的计量方法。

生成模拟数据

set.seed(111)
x <- 1:4
names(x) <- LETTERS[1:4]
df <- tibble(
  x1 = runif(1000, -10, 10),
  x2 = sample(1:20, 1000, replace = TRUE),
  x3 = sample(LETTERS[1:4], 1000, replace = TRUE),
  error = rnorm(1000, sd = 1000),
  y = 1 + 2 * x1 + 3 * x2 + x[x3]
)
df
# A tibble: 1,000 × 5
       x1    x2 x3     error     y
    <dbl> <int> <chr>  <dbl> <dbl>
 1  1.86     11 A      285.  38.7 
 2  4.53      5 B      176.  27.1 
 3 -2.59     11 A     -610.  29.8 
 4  0.298     9 A       92.1 29.6 
 5 -2.45     20 D      611.  60.1 
 6 -1.63     15 B      170.  44.7 
 7 -9.79      6 C     -104.   2.43
 8  0.646    18 A     -253.  57.3 
 9 -1.36     10 B     -169.  30.3 
10 -8.13      6 B     -509.   4.75
# ℹ 990 more rows

1 步骤拆解

step 1: 根据formula生成矩阵

因为formula是R里面一个特殊的class,他的具体机制我没弄清楚,所以这一步用R中的model.matrix.lm()实现

formula <- y ~ x1 + x2 + x3
df_model <- model.frame(formula, df)
X <- model.matrix.lm(df_model)
head(X)
  (Intercept)         x1 x2 x3B x3C x3D
1           1  1.8596257 11   0   0   0
2           1  4.5296224  5   1   0   0
3           1 -2.5915599 11   0   0   0
4           1  0.2984766  9   0   0   0
5           1 -2.4467357 20   0   0   1
6           1 -1.6332535 15   1   0   0
Y <- model.response(df_model)
head(Y)
       1        2        3        4        5        6 
38.71925 27.05924 29.81688 29.59695 60.10653 44.73349 

step 2: 得到回归系数

假设对于总体数据来说:

\[ \mathbf Y = \mathbf X \boldsymbol \beta + \boldsymbol \epsilon \]

但是我们得到的拟合方程是:

\[ \hat{\mathbf Y} = \mathbf X \hat{\boldsymbol \beta} \]

\[ \begin{align} \boldsymbol e &= \mathbf Y - \hat{\mathbf Y} \\ &= \mathbf Y - \mathbf X \hat{\boldsymbol \beta} \end{align} \]

要想使\(\mathbf Y - \hat{\mathbf Y}\)最小,则\(\boldsymbol e\)\(\mathbf X\)正交,即

\[ \begin{align} &\mathbf {X^T (Y - X\hat{\beta})} = \mathbf {0} \\ &\mathbf {\hat{\beta}} = \mathbf {(X^T X)^{-1}X^{T} Y} \end{align} \]

get_beta <- function(X, Y) {
  solve(crossprod(X, X)) %*% crossprod(X, Y)
}
beta <- get_beta(X, Y)
beta
            [,1]
(Intercept)    2
x1             2
x2             3
x3B            1
x3C            2
x3D            3

step 3: 求出拟合值和误差

\[ \begin{align} \mathbf{\hat{Y}} &= \mathbf{X\hat{\beta}} \\ \mathbf{e} &= \mathbf{ Y - \hat{Y}} \end{align} \]

get_fit <- function(X, beta) {
  X %*% beta
}

Y_hat <- get_fit(X, beta)
head(Y_hat)
      [,1]
1 38.71925
2 27.05924
3 29.81688
4 29.59695
5 60.10653
6 44.73349
get_residual <- function(Y, Y_hat) {
  Y - Y_hat
}
residual <- get_residual(Y, Y_hat)
head(residual)
           [,1]
1 -7.105427e-15
2 -2.842171e-14
3 -3.552714e-15
4 -1.421085e-14
5  2.842171e-14
6  0.000000e+00

step 4: 求出系数的标准误

感觉这一部分的可拓展性比较强,在不同的假设下(同方差、异方差等),系数的标准误不同。这次先假设同方差且无自相关(球形扰动项)。

求一下自由度

get_freedom <- function(X) {
  nrow(X) - ncol(X)
}
freedom <- get_freedom(X)
freedom
[1] 994
get_var_sepherical <- function(residual, freedom) {
  as.numeric(crossprod(residual, residual) / freedom)
}
residual_var <- get_var_sepherical(residual, freedom)
residual_var
[1] 5.95194e-28

球形扰动项假设下,系数的标准误为

\[ \mathbf {\text{SE}(\hat{\beta})} = \mathbf {\sqrt {\text{Var}(e) (X^T X)^{-1}_{kk}}} \]

get_beta_se <- function(X, residual_var) {
  sqrt(residual_var * diag(solve(crossprod(X, X))))
}
beta_se <- get_beta_se(X, residual_var)
beta_se
 (Intercept)           x1           x2          x3B          x3C          x3D 
2.135352e-15 1.373596e-16 1.354870e-16 2.187586e-15 2.153656e-15 2.223240e-15 

step 5: 系数的t值和p值

计算系数的t值

\[ t_k = \frac{\hat{\beta_k} - \beta_k}{ \text{SE}(\hat{ \beta_k}) } \]

get_beta_t <- function(beta, beta_se, assump = 0) {
  (beta - assump) / beta_se
}
beta_t <- get_beta_t(beta, beta_se)
beta_t
                    [,1]
(Intercept) 9.366136e+14
x1          1.456032e+16
x2          2.214234e+16
x3B         4.571248e+14
x3C         9.286532e+14
x3D         1.349382e+15

置信区间

\[ [\hat{\beta_k} - t_{\frac{\alpha}{2}} \text{SE}(\hat{ \beta_k}), \hat{\beta_k} + t_{\frac{\alpha}{2}} \text{SE}(\hat{ \beta_k})] \]

get_conf <- function(beta, beta_se, freedom, p = 0.05) {
  beta_lconf <- beta + qt(p/2, freedom) * beta_se
  beta_hconf <- beta - qt(p/2, freedom) * beta_se
  cbind(beta_lconf, beta_hconf)
}

conf <- get_conf(beta, beta_se, freedom)
conf
            [,1] [,2]
(Intercept)    2    2
x1             2    2
x2             3    3
x3B            1    1
x3C            2    2
x3D            3    3

系数的p-value

\[ p = P(|T| > |t_k|) \]

get_pvalue <- function(beta_t, freedom) {
  2 * pt(abs(beta_t), df = freedom, lower.tail = FALSE)
}
pvalue <- get_pvalue(beta_t, freedom)
pvalue
            [,1]
(Intercept)    0
x1             0
x2             0
x3B            0
x3C            0
x3D            0

step 6: 整理结果

result_df <- as_tibble(beta, rownames = "term") %>% 
  rename(estimate = V1) %>% 
  bind_cols(std_error = beta_se,
            statistics = beta_t[, 1],
            p_value = pvalue[, 1])
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.

2 打包成一个函数

前面编写的函数有

get_beta <- function(X, Y) {
  solve(crossprod(X, X)) %*% crossprod(X, Y)
}

get_fit <- function(X, beta) {
  X %*% beta
}

get_residual <- function(Y, Y_hat) {
  Y - Y_hat
}

get_freedom <- function(X) {
  nrow(X) - ncol(X)
}

get_var_sepherical <- function(residual, freedom) {
  as.numeric(crossprod(residual, residual) / freedom)
}

get_beta_se <- function(X, residual) {
  sqrt(residual * diag(solve(crossprod(X, X))))
}

get_beta_t <- function(beta, beta_se, assump = 0) {
  (beta - assump) / beta_se
}

get_conf <- function(beta, beta_se, freedom, p = 0.05) {
  beta_lconf <- beta + qt(p/2, freedom) * beta_se
  beta_hconf <- beta - qt(p/2, freedom) * beta_se
  cbind(beta_lconf, beta_hconf)
}

get_pvalue <- function(beta_t, freedom) {
  2 * pt(abs(beta_t), df = freedom, lower.tail = FALSE)
}

将这些函数作为积木拼入一个大函数中

my_lm <- function(formula, df) {
  df_model <- model.frame(formula, df)
  X <- model.matrix.lm(df_model)
  Y <- model.response(df_model)
  
  beta <- get_beta(X, Y)
  
  Y_hat <- get_fit(X, beta)
  residual <- get_residual(Y, Y_hat)
  
  freedom <- get_freedom(X)
  residual_var <- get_var_sepherical(residual, freedom)
  
  beta_se <- get_beta_se(X, residual_var)
  
  beta_t <- get_beta_t(beta, beta_se)
  #conf <- get_conf(beta, beta_se, freedom)
  pvalue <- get_pvalue(beta_t, freedom)
  
  
  result_df <- as_tibble(beta,rownames = "term") %>% 
    rename(estimate = V1) %>% 
    bind_cols(std_error = beta_se,
              statistics = beta_t[, 1],
              p_value = pvalue[, 1])
  
  return(result_df)
}
my_result <- my_lm(y ~ x1 + x2 + x3, df = df)
my_result
# A tibble: 6 × 5
  term        estimate std_error statistics p_value
  <chr>          <dbl>     <dbl>      <dbl>   <dbl>
1 (Intercept)     2.00  2.14e-15    9.37e14       0
2 x1              2.00  1.37e-16    1.46e16       0
3 x2              3     1.35e-16    2.21e16       0
4 x3B             1     2.19e-15    4.57e14       0
5 x3C             2.00  2.15e-15    9.29e14       0
6 x3D             3.00  2.22e-15    1.35e15       0

3lm()的结果对比

lm_result <- lm(y ~ x1 + x2 + x3, df) %>% 
  broom::tidy()
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
lm_result
# A tibble: 6 × 5
  term        estimate std.error statistic p.value
  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)     2     9.24e-16   2.16e15       0
2 x1              2     5.94e-17   3.37e16       0
3 x2              3     5.86e-17   5.12e16       0
4 x3B             1.00  9.47e-16   1.06e15       0
5 x3C             2.00  9.32e-16   2.15e15       0
6 x3D             3     9.62e-16   3.12e15       0

结果看起来不对,但其实是浮点数的数字存储机制造成的,比如

sqrt(2) ^ 2 == 2
[1] FALSE

这时应该使用near()来判断两个数是否相等

near(sqrt(2) ^ 2, 2)
[1] TRUE
near(my_result$std_error, lm_result$std.error)
(Intercept)          x1          x2         x3B         x3C         x3D 
       TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
near(my_result$statistics, lm_result$statistic)
(Intercept)          x1          x2         x3B         x3C         x3D 
      FALSE       FALSE       FALSE       FALSE       FALSE       FALSE 

最终得到的t值不同,原因还不清楚。

4 使用真实的数据集

mtcars
# A tibble: 32 × 11
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# ℹ 22 more rows
my_lm(mpg ~ ., mtcars)
# A tibble: 11 × 5
   term        estimate std_error statistics p_value
   <chr>          <dbl>     <dbl>      <dbl>   <dbl>
 1 (Intercept)  12.3      18.7         0.657  0.518 
 2 cyl          -0.111     1.05       -0.107  0.916 
 3 disp          0.0133    0.0179      0.747  0.463 
 4 hp           -0.0215    0.0218     -0.987  0.335 
 5 drat          0.787     1.64        0.481  0.635 
 6 wt           -3.72      1.89       -1.96   0.0633
 7 qsec          0.821     0.731       1.12   0.274 
 8 vs            0.318     2.10        0.151  0.881 
 9 am            2.52      2.06        1.23   0.234 
10 gear          0.655     1.49        0.439  0.665 
11 carb         -0.199     0.829      -0.241  0.812 
lm(mpg ~ ., mtcars) %>% 
  broom::tidy()
# A tibble: 11 × 5
   term        estimate std.error statistic p.value
   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)  12.3      18.7        0.657  0.518 
 2 cyl          -0.111     1.05      -0.107  0.916 
 3 disp          0.0133    0.0179     0.747  0.463 
 4 hp           -0.0215    0.0218    -0.987  0.335 
 5 drat          0.787     1.64       0.481  0.635 
 6 wt           -3.72      1.89      -1.96   0.0633
 7 qsec          0.821     0.731      1.12   0.274 
 8 vs            0.318     2.10       0.151  0.881 
 9 am            2.52      2.06       1.23   0.234 
10 gear          0.655     1.49       0.439  0.665 
11 carb         -0.199     0.829     -0.241  0.812 

这次得到的结果又完全一样。前面t值不同的原因比较复杂,可能是函数运算机制不同。