library(tidyverse)
theme_set(hrbrthemes::theme_ipsum_rc())
准备花几期时间,自己写代码,实现常用的计量方法。
生成模拟数据
set.seed(111)
<- 1:4
x names(x) <- LETTERS[1:4]
<- tibble(
df x1 = runif(1000, -10, 10),
x2 = sample(1:20, 1000, replace = TRUE),
x3 = sample(LETTERS[1:4], 1000, replace = TRUE),
error = rnorm(1000, sd = 1000),
y = 1 + 2 * x1 + 3 * x2 + x[x3]
) df
# A tibble: 1,000 × 5
x1 x2 x3 error y
<dbl> <int> <chr> <dbl> <dbl>
1 1.86 11 A 285. 38.7
2 4.53 5 B 176. 27.1
3 -2.59 11 A -610. 29.8
4 0.298 9 A 92.1 29.6
5 -2.45 20 D 611. 60.1
6 -1.63 15 B 170. 44.7
7 -9.79 6 C -104. 2.43
8 0.646 18 A -253. 57.3
9 -1.36 10 B -169. 30.3
10 -8.13 6 B -509. 4.75
# ℹ 990 more rows
1 步骤拆解
step 1: 根据formula生成矩阵
因为formula
是R里面一个特殊的class
,他的具体机制我没弄清楚,所以这一步用R中的model.matrix.lm()
实现
<- y ~ x1 + x2 + x3
formula <- model.frame(formula, df)
df_model <- model.matrix.lm(df_model)
X head(X)
(Intercept) x1 x2 x3B x3C x3D
1 1 1.8596257 11 0 0 0
2 1 4.5296224 5 1 0 0
3 1 -2.5915599 11 0 0 0
4 1 0.2984766 9 0 0 0
5 1 -2.4467357 20 0 0 1
6 1 -1.6332535 15 1 0 0
<- model.response(df_model)
Y head(Y)
1 2 3 4 5 6
38.71925 27.05924 29.81688 29.59695 60.10653 44.73349
step 2: 得到回归系数
假设对于总体数据来说:
\[ \mathbf Y = \mathbf X \boldsymbol \beta + \boldsymbol \epsilon \]
但是我们得到的拟合方程是:
\[ \hat{\mathbf Y} = \mathbf X \hat{\boldsymbol \beta} \]
且
\[ \begin{align} \boldsymbol e &= \mathbf Y - \hat{\mathbf Y} \\ &= \mathbf Y - \mathbf X \hat{\boldsymbol \beta} \end{align} \]
要想使\(\mathbf Y - \hat{\mathbf Y}\)最小,则\(\boldsymbol e\)与\(\mathbf X\)正交,即
\[ \begin{align} &\mathbf {X^T (Y - X\hat{\beta})} = \mathbf {0} \\ &\mathbf {\hat{\beta}} = \mathbf {(X^T X)^{-1}X^{T} Y} \end{align} \]
<- function(X, Y) {
get_beta solve(crossprod(X, X)) %*% crossprod(X, Y)
}<- get_beta(X, Y)
beta beta
[,1]
(Intercept) 2
x1 2
x2 3
x3B 1
x3C 2
x3D 3
step 3: 求出拟合值和误差
\[ \begin{align} \mathbf{\hat{Y}} &= \mathbf{X\hat{\beta}} \\ \mathbf{e} &= \mathbf{ Y - \hat{Y}} \end{align} \]
<- function(X, beta) {
get_fit %*% beta
X
}
<- get_fit(X, beta)
Y_hat head(Y_hat)
[,1]
1 38.71925
2 27.05924
3 29.81688
4 29.59695
5 60.10653
6 44.73349
<- function(Y, Y_hat) {
get_residual - Y_hat
Y
}<- get_residual(Y, Y_hat)
residual head(residual)
[,1]
1 -7.105427e-15
2 -2.842171e-14
3 -3.552714e-15
4 -1.421085e-14
5 2.842171e-14
6 0.000000e+00
step 4: 求出系数的标准误
感觉这一部分的可拓展性比较强,在不同的假设下(同方差、异方差等),系数的标准误不同。这次先假设同方差且无自相关(球形扰动项)。
求一下自由度
<- function(X) {
get_freedom nrow(X) - ncol(X)
}<- get_freedom(X)
freedom freedom
[1] 994
<- function(residual, freedom) {
get_var_sepherical as.numeric(crossprod(residual, residual) / freedom)
}<- get_var_sepherical(residual, freedom)
residual_var residual_var
[1] 5.95194e-28
球形扰动项假设下,系数的标准误为
\[ \mathbf {\text{SE}(\hat{\beta})} = \mathbf {\sqrt {\text{Var}(e) (X^T X)^{-1}_{kk}}} \]
<- function(X, residual_var) {
get_beta_se sqrt(residual_var * diag(solve(crossprod(X, X))))
}<- get_beta_se(X, residual_var)
beta_se beta_se
(Intercept) x1 x2 x3B x3C x3D
2.135352e-15 1.373596e-16 1.354870e-16 2.187586e-15 2.153656e-15 2.223240e-15
step 5: 系数的t值和p值
计算系数的t值
\[ t_k = \frac{\hat{\beta_k} - \beta_k}{ \text{SE}(\hat{ \beta_k}) } \]
<- function(beta, beta_se, assump = 0) {
get_beta_t - assump) / beta_se
(beta
}<- get_beta_t(beta, beta_se)
beta_t beta_t
[,1]
(Intercept) 9.366136e+14
x1 1.456032e+16
x2 2.214234e+16
x3B 4.571248e+14
x3C 9.286532e+14
x3D 1.349382e+15
置信区间
\[ [\hat{\beta_k} - t_{\frac{\alpha}{2}} \text{SE}(\hat{ \beta_k}), \hat{\beta_k} + t_{\frac{\alpha}{2}} \text{SE}(\hat{ \beta_k})] \]
<- function(beta, beta_se, freedom, p = 0.05) {
get_conf <- beta + qt(p/2, freedom) * beta_se
beta_lconf <- beta - qt(p/2, freedom) * beta_se
beta_hconf cbind(beta_lconf, beta_hconf)
}
<- get_conf(beta, beta_se, freedom)
conf conf
[,1] [,2]
(Intercept) 2 2
x1 2 2
x2 3 3
x3B 1 1
x3C 2 2
x3D 3 3
系数的p-value
\[ p = P(|T| > |t_k|) \]
<- function(beta_t, freedom) {
get_pvalue 2 * pt(abs(beta_t), df = freedom, lower.tail = FALSE)
}<- get_pvalue(beta_t, freedom)
pvalue pvalue
[,1]
(Intercept) 0
x1 0
x2 0
x3B 0
x3C 0
x3D 0
step 6: 整理结果
<- as_tibble(beta, rownames = "term") %>%
result_df rename(estimate = V1) %>%
bind_cols(std_error = beta_se,
statistics = beta_t[, 1],
p_value = pvalue[, 1])
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
2 打包成一个函数
前面编写的函数有
<- function(X, Y) {
get_beta solve(crossprod(X, X)) %*% crossprod(X, Y)
}
<- function(X, beta) {
get_fit %*% beta
X
}
<- function(Y, Y_hat) {
get_residual - Y_hat
Y
}
<- function(X) {
get_freedom nrow(X) - ncol(X)
}
<- function(residual, freedom) {
get_var_sepherical as.numeric(crossprod(residual, residual) / freedom)
}
<- function(X, residual) {
get_beta_se sqrt(residual * diag(solve(crossprod(X, X))))
}
<- function(beta, beta_se, assump = 0) {
get_beta_t - assump) / beta_se
(beta
}
<- function(beta, beta_se, freedom, p = 0.05) {
get_conf <- beta + qt(p/2, freedom) * beta_se
beta_lconf <- beta - qt(p/2, freedom) * beta_se
beta_hconf cbind(beta_lconf, beta_hconf)
}
<- function(beta_t, freedom) {
get_pvalue 2 * pt(abs(beta_t), df = freedom, lower.tail = FALSE)
}
将这些函数作为积木拼入一个大函数中
<- function(formula, df) {
my_lm <- model.frame(formula, df)
df_model <- model.matrix.lm(df_model)
X <- model.response(df_model)
Y
<- get_beta(X, Y)
beta
<- get_fit(X, beta)
Y_hat <- get_residual(Y, Y_hat)
residual
<- get_freedom(X)
freedom <- get_var_sepherical(residual, freedom)
residual_var
<- get_beta_se(X, residual_var)
beta_se
<- get_beta_t(beta, beta_se)
beta_t #conf <- get_conf(beta, beta_se, freedom)
<- get_pvalue(beta_t, freedom)
pvalue
<- as_tibble(beta,rownames = "term") %>%
result_df rename(estimate = V1) %>%
bind_cols(std_error = beta_se,
statistics = beta_t[, 1],
p_value = pvalue[, 1])
return(result_df)
}
<- my_lm(y ~ x1 + x2 + x3, df = df)
my_result my_result
# A tibble: 6 × 5
term estimate std_error statistics p_value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 2.00 2.14e-15 9.37e14 0
2 x1 2.00 1.37e-16 1.46e16 0
3 x2 3 1.35e-16 2.21e16 0
4 x3B 1 2.19e-15 4.57e14 0
5 x3C 2.00 2.15e-15 9.29e14 0
6 x3D 3.00 2.22e-15 1.35e15 0
3 和lm()
的结果对比
<- lm(y ~ x1 + x2 + x3, df) %>%
lm_result ::tidy() broom
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
lm_result
# A tibble: 6 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 2 9.24e-16 2.16e15 0
2 x1 2 5.94e-17 3.37e16 0
3 x2 3 5.86e-17 5.12e16 0
4 x3B 1.00 9.47e-16 1.06e15 0
5 x3C 2.00 9.32e-16 2.15e15 0
6 x3D 3 9.62e-16 3.12e15 0
结果看起来不对,但其实是浮点数的数字存储机制造成的,比如
sqrt(2) ^ 2 == 2
[1] FALSE
这时应该使用near()
来判断两个数是否相等
near(sqrt(2) ^ 2, 2)
[1] TRUE
near(my_result$std_error, lm_result$std.error)
(Intercept) x1 x2 x3B x3C x3D
TRUE TRUE TRUE TRUE TRUE TRUE
near(my_result$statistics, lm_result$statistic)
(Intercept) x1 x2 x3B x3C x3D
FALSE FALSE FALSE FALSE FALSE FALSE
最终得到的t值不同,原因还不清楚。
4 使用真实的数据集
mtcars
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# ℹ 22 more rows
my_lm(mpg ~ ., mtcars)
# A tibble: 11 × 5
term estimate std_error statistics p_value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 12.3 18.7 0.657 0.518
2 cyl -0.111 1.05 -0.107 0.916
3 disp 0.0133 0.0179 0.747 0.463
4 hp -0.0215 0.0218 -0.987 0.335
5 drat 0.787 1.64 0.481 0.635
6 wt -3.72 1.89 -1.96 0.0633
7 qsec 0.821 0.731 1.12 0.274
8 vs 0.318 2.10 0.151 0.881
9 am 2.52 2.06 1.23 0.234
10 gear 0.655 1.49 0.439 0.665
11 carb -0.199 0.829 -0.241 0.812
lm(mpg ~ ., mtcars) %>%
::tidy() broom
# A tibble: 11 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 12.3 18.7 0.657 0.518
2 cyl -0.111 1.05 -0.107 0.916
3 disp 0.0133 0.0179 0.747 0.463
4 hp -0.0215 0.0218 -0.987 0.335
5 drat 0.787 1.64 0.481 0.635
6 wt -3.72 1.89 -1.96 0.0633
7 qsec 0.821 0.731 1.12 0.274
8 vs 0.318 2.10 0.151 0.881
9 am 2.52 2.06 1.23 0.234
10 gear 0.655 1.49 0.439 0.665
11 carb -0.199 0.829 -0.241 0.812
这次得到的结果又完全一样。前面t值不同的原因比较复杂,可能是函数运算机制不同。