当前位置：首页 > article >正文

子集选择——基于R语言实现（最优子集选择法、逐步回归法、Lasso回归法、交叉验证法）

article 2024/12/28 10:20:32

( a )使用 `rnorm()` 函数生成预测变量X(`n=100`)与噪声向量 $\epsilon$ `(n=100)`

# Simulate the inputs; the fixed seed makes every run draw the same sample.
set.seed(1)
x <- rnorm(n = 100)    # predictor X: 100 standard-normal draws
eps <- rnorm(n = 100)  # noise vector epsilon: 100 standard-normal draws

( b ) 生成响应变量Y（`n=100`）, $Y=\beta_0+\beta_1X+\beta_2X^{2}+\beta_3X^{3}+\epsilon$ ,在本次实验中，设定 $\beta_0=1,\beta_1=2,\beta_2=3,\beta_3=4$

# Response from the cubic model with coefficients 1, 2, 3, 4 plus the noise term.
y <- 1 + 2 * x + 3 * x^2 + 4 * x^3 + eps

( c ) 最优子集法

( c ) regsubsets()实现最优子集算法，从包含 $X^{0},X^{1},...,X^{10}$ 的模型中选出最优的模型；
根据 $C_p,BIC,Adjust R^{2}$ 选择出最优模型；
给出最优子集模型的系数估计值。

library(leaps)

# Bundle the response and the predictor into one data frame for the formula
# interface.
best <- data.frame(y, x)

# Best-subset search over the raw polynomial terms x^1 ... x^10;
# nvmax = 10 lets the search report models with up to all ten terms.
# `raw = T` is kept verbatim because the printed term labels embed it.
b <- regsubsets(y ~ poly(x, 10, raw = T), data = best, nvmax = 10)
s <- summary(b)

# In the printed table a "*" marks a term included in the best model of
# that size.
print(s)


> print(s)#print the summary of the model,通过查看summary(b)的结果，*表示变量被选入
Subset selection object
Call: regsubsets.formula(y ~ poly(x, 10, raw = T), data = best, nvmax = 10)
10 Variables  (and intercept)
                       Forced in Forced out
poly(x, 10, raw = T)1      FALSE      FALSE
poly(x, 10, raw = T)2      FALSE      FALSE
poly(x, 10, raw = T)3      FALSE      FALSE
poly(x, 10, raw = T)4      FALSE      FALSE
poly(x, 10, raw = T)5      FALSE      FALSE
poly(x, 10, raw = T)6      FALSE      FALSE
poly(x, 10, raw = T)7      FALSE      FALSE
poly(x, 10, raw = T)8      FALSE      FALSE
poly(x, 10, raw = T)9      FALSE      FALSE
poly(x, 10, raw = T)10     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
          poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3
1  ( 1 )  " "                   " "                   "*"                  
2  ( 1 )  " "                   "*"                   "*"                  
3  ( 1 )  "*"                   "*"                   "*"                  
4  ( 1 )  "*"                   "*"                   "*"                  
5  ( 1 )  "*"                   "*"                   "*"                  
6  ( 1 )  "*"                   "*"                   "*"                  
7  ( 1 )  "*"                   "*"                   "*"                  
8  ( 1 )  "*"                   "*"                   "*"                  
9  ( 1 )  "*"                   "*"                   "*"                  
10  ( 1 ) "*"                   "*"                   "*"                  
          poly(x, 10, raw = T)4 poly(x, 10, raw = T)5 poly(x, 10, raw = T)6
1  ( 1 )  " "                   " "                   " "                  
2  ( 1 )  " "                   " "                   " "                  
3  ( 1 )  " "                   " "                   " "                  
4  ( 1 )  " "                   "*"                   " "                  
5  ( 1 )  " "                   "*"                   "*"                  
6  ( 1 )  " "                   " "                   " "                  
7  ( 1 )  " "                   "*"                   "*"                  
8  ( 1 )  "*"                   " "                   "*"                  
9  ( 1 )  "*"                   "*"                   "*"                  
10  ( 1 ) "*"                   "*"                   "*"                  
          poly(x, 10, raw = T)7 poly(x, 10, raw = T)8 poly(x, 10, raw = T)9
1  ( 1 )  " "                   " "                   " "                  
2  ( 1 )  " "                   " "                   " "                  
3  ( 1 )  " "                   " "                   " "                  
4  ( 1 )  " "                   " "                   " "                  
5  ( 1 )  " "                   " "                   " "                  
6  ( 1 )  "*"                   "*"                   "*"                  
7  ( 1 )  " "                   "*"                   " "                  
8  ( 1 )  " "                   "*"                   "*"                  
9  ( 1 )  " "                   "*"                   "*"                  
10  ( 1 ) "*"                   "*"                   "*"                  
          poly(x, 10, raw = T)10
1  ( 1 )  " "                   
2  ( 1 )  " "                   
3  ( 1 )  " "                   
4  ( 1 )  " "                   
5  ( 1 )  " "                   
6  ( 1 )  " "                   
7  ( 1 )  "*"                   
8  ( 1 )  "*"                   
9  ( 1 )  "*"                   
10  ( 1 ) "*"

通过查看summary(b)的结果，"*"表示该变量被选入对应规模的最优模型；例如含3个变量的最优模型选入了 $X,X^2,X^3$ ，含4个变量的最优模型选入了 $X,X^2,X^3,X^5$ 。

# List the components available in the summary object.
names(summary(b))

> names(summary(b))#查看summary(b)的属性
[1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

# Logical matrix: one row per model size, TRUE where a term is in that model.
s[["which"]]

> s$which#查看哪些变量被选入模型
   (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3
1         TRUE                 FALSE                 FALSE                  TRUE
2         TRUE                 FALSE                  TRUE                  TRUE
3         TRUE                  TRUE                  TRUE                  TRUE
4         TRUE                  TRUE                  TRUE                  TRUE
5         TRUE                  TRUE                  TRUE                  TRUE
6         TRUE                  TRUE                  TRUE                  TRUE
7         TRUE                  TRUE                  TRUE                  TRUE
8         TRUE                  TRUE                  TRUE                  TRUE
9         TRUE                  TRUE                  TRUE                  TRUE
10        TRUE                  TRUE                  TRUE                  TRUE
   poly(x, 10, raw = T)4 poly(x, 10, raw = T)5 poly(x, 10, raw = T)6 poly(x, 10, raw = T)7
1                  FALSE                 FALSE                 FALSE                 FALSE
2                  FALSE                 FALSE                 FALSE                 FALSE
3                  FALSE                 FALSE                 FALSE                 FALSE
4                  FALSE                  TRUE                 FALSE                 FALSE
5                  FALSE                  TRUE                  TRUE                 FALSE
6                  FALSE                 FALSE                 FALSE                  TRUE
7                  FALSE                  TRUE                  TRUE                 FALSE
8                   TRUE                 FALSE                  TRUE                 FALSE
9                   TRUE                  TRUE                  TRUE                 FALSE
10                  TRUE                  TRUE                  TRUE                  TRUE
   poly(x, 10, raw = T)8 poly(x, 10, raw = T)9 poly(x, 10, raw = T)10
1                  FALSE                 FALSE                  FALSE
2                  FALSE                 FALSE                  FALSE
3                  FALSE                 FALSE                  FALSE
4                  FALSE                 FALSE                  FALSE
5                  FALSE                 FALSE                  FALSE
6                   TRUE                  TRUE                  FALSE
7                   TRUE                 FALSE                   TRUE
8                   TRUE                  TRUE                   TRUE
9                   TRUE                  TRUE                   TRUE
10                  TRUE                  TRUE                   TRUE

# Selection criteria for the best model of each size:
# smaller Cp / BIC is better, larger adjusted R^2 is better.
s$cp
s$bic
s$adjr2

# Model size preferred by each criterion.
which.min(s$cp)
which.min(s$bic)
which.max(s$adjr2)

# Coefficient estimates of the 3-term and the 4-term model.
coef(b, id = 3)
coef(b, id = 4)

# 2 x 2 panel: BIC overview of the subsets, then each criterion against size.
par(mfrow = c(2, 2))
plot(b, scale = "bic")
plot(1:10, summary(b)$cp, type = "b")
plot(1:10, summary(b)$bic, type = "b")
plot(1:10, summary(b)$adjr2, type = "b")

> s$cp#选择cp最小的进入模型
 [1] 1123.2892318  109.3256041    2.1859433    0.6067483    2.1782005    3.9955812    5.7869063
 [8]    7.1694092    9.1535580   11.0000000
> s$bic#选择BIC最小的进入模型
 [1] -262.7744 -437.2907 -509.6393 -508.9084 -504.7773 -500.3748 -496.0018 -492.0868 -487.4994
[10] -483.0666
> s$adjr2#选择adjr2最大的进入模型
 [1] 0.9334429 0.9887867 0.9947516 0.9948979 0.9948680 0.9948233 0.9947792 0.9947581 0.9947008
[10] 0.9946505
> which.min(s$cp)
[1] 4
> which.min(s$bic)
[1] 3
> which.max(s$adjr2)
[1] 4

# Plot the selection criteria and mark the model size each one prefers.
# A marker's y-value must be read at the same index as its x-position, so
# both come from which.min() / which.max() instead of hand-typed numbers.
best_cp <- which.min(s$cp)
best_bic <- which.min(s$bic)
best_adjr2 <- which.max(s$adjr2)

par(mfrow = c(2, 2))
plot(b, scale = "bic", main = "BIC")
plot(1:10, summary(b)$cp, type = "b")
points(best_cp, s$cp[best_cp], pch = 4, col = "red", lwd = 7)
plot(1:10, summary(b)$bic, type = "b")
points(best_bic, s$bic[best_bic], pch = 4, col = "red", lwd = 7)
plot(1:10, summary(b)$adjr2, type = "b")
points(best_adjr2, s$adjr2[best_adjr2], pch = 4, col = "red", lwd = 7)

在这里插入图片描述
通过最优子集法分析，并通过图像得出，根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3$ ；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3+\beta_4x^5$ 。


> coefficients(b,id=3)#选择BIC最小的进入模型
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
             1.061507              1.975280              2.876209              4.017639 
> coefficients(b,id=4)#选择cp最小、Adjust R2最大的进入模型
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
           1.07200775            2.38745596            2.84575641            3.55797426 
poly(x, 10, raw = T)5 
           0.08072292

通过最优子集法分析，并通过参数的估计得出，根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+1.06+1.98x+2.88x^2+4.02x^3$ ，这个模型与问题（b）中的结果十分接近；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+1.07+2.39x+2.85x^2+3.56x^3+0.08x^5$ 。

(d)逐步选择

1 向前逐步选择

# Forward stepwise selection over the same ten polynomial terms.
# nvmax = 10 is set explicitly: regsubsets() otherwise stops at its default
# of 8 terms, which leaves the criterion vectors shorter than the 1:10 axis
# they are plotted against.
fit2 <- regsubsets(y ~ poly(x, 10, raw = T), data = best, nvmax = 10,
                   method = "forward")
s2 <- summary(fit2)

# Model size preferred by Cp, BIC and adjusted R^2 respectively.
which.min(s2$cp)
which.min(s2$bic)
which.max(s2$adjr2)


> fit2=regsubsets(y~poly(x,10,raw = T),method="forward",data=best)
> s2=summary(fit2)
> which.min(s2$cp)
[1] 4
> which.min(s2$bic)
[1] 3
> which.max(s2$adjr2)
[1] 4

向前逐步选择结果与（c）得出的最优拟合模型一致,根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3$ ；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3+\beta_4x^5$ 。

2 向后逐步选择

# Backward stepwise selection, reporting models with up to all ten terms.
fit3 <- regsubsets(y ~ poly(x, 10, raw = T), data = best, nvmax = 10,
                   method = "backward")
s3 <- summary(fit3)

# Model size preferred by Cp, BIC and adjusted R^2 respectively.
which.min(s3$cp)
which.min(s3$bic)
which.max(s3$adjr2)

> fit3=regsubsets(y~poly(x,10,raw = T),method="backward",data=best,nvmax = 10)
> s3=summary(fit3)
> which.min(s3$cp)
[1] 4
> which.min(s3$bic)
[1] 3
> which.max(s3$adjr2)
[1] 4

向后逐步选择中，根据 $B I C$ 选择出来的最优模型与（c）一致，为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3$ ；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+\beta_0+\beta_1x+\beta_2x^2+\beta_3x^3+\beta_4x^9$ ，其第四个变量为 $x^9$ ，与（c）中选入的 $x^5$ 不同。

3 两种方法比较——向前逐步选择与向后逐步选择

3.1绘图

# Plot one selection criterion against model size and mark its optimum.
#   values: criterion value for each model size (index = number of variables)
#   pick:   which.min or which.max, depending on the criterion
#   label:  y-axis label
# The x-axis follows the length of `values`, and the marker's y-value is read
# at the same index as its x-position. Returns the chosen size invisibly.
plot_criterion <- function(values, pick, label) {
  best_size <- pick(values)
  plot(seq_along(values), values, type = "b",
       xlab = "Number of variables", ylab = label)
  points(best_size, values[best_size], pch = 4, col = "red", lwd = 7)
  invisible(best_size)
}

par(mfrow = c(2, 3))
# Top row: forward stepwise selection.
plot_criterion(s2$cp, which.min, "Cp (forward)")
plot_criterion(s2$bic, which.min, "BIC (forward)")
plot_criterion(s2$adjr2, which.max, "Adjusted R^2 (forward)")
# Bottom row: backward stepwise selection.
plot_criterion(s3$cp, which.min, "Cp (backward)")
plot_criterion(s3$bic, which.min, "BIC (backward)")
plot_criterion(s3$adjr2, which.max, "Adjusted R^2 (backward)")

在这里插入图片描述

3.2 求解模型系数

# Models with three selected terms (id is the number of terms, not the degree).
coef(fit2, id = 3)  # forward stepwise
coef(fit3, id = 3)  # backward stepwise
# Models with four selected terms.
coef(fit2, id = 4)  # forward stepwise
coef(fit3, id = 4)  # backward stepwise


> #两种方法比较
> coefficients(fit2,id=3)#向前逐步选择
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
             1.061507              1.975280              2.876209              4.017639 
> coefficients(fit3,id=3)#向后逐步选择
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
             1.061507              1.975280              2.876209              4.017639 
> coefficients(fit2,id=4)#向前逐步选择
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
           1.07200775            2.38745596            2.84575641            3.55797426 
poly(x, 10, raw = T)5 
           0.08072292 
> coefficients(fit3,id=4)#向后逐步选择
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
          1.079236362           2.231905828           2.833494180           3.819555807 
poly(x, 10, raw = T)9 
          0.001290827

对于向前逐步选择，向前逐步选择结果与（c）得出的最优拟合模型一致,根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+ 1.06+1.98x+2.88x^2+4.02x^3$ ；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+1.07+2.39x+2.85x^2+3.56x^3+0.08x^5$ 。

向后逐步回归中，根据 $B I C$ 选择出来的最优模型与（c）相同，为 $y=\epsilon+1.06+1.98x+2.88x^2+4.02x^3$ ；根据 $C_p,Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+1.08+2.23x+2.83x^2+3.82x^3+0.001x^9$ ，与（c）的结果不同。

(e)lasso法选择：交叉验证法求出最优 $\lambda$ ,并对系数进行估计。

# Design matrix of the ten raw polynomial terms (first column = intercept).
# It gets its own name so the predictor vector `x` is not overwritten and
# stays usable afterwards.
x_design <- model.matrix(y ~ poly(x, 10, raw = T), data = best)
print(x_design)
# Drop the first column (the intercept); glmnet adds an intercept itself.
xmat <- x_design[, -1]
print(xmat)

library(glmnet)

# Candidate penalties: 100 values spaced geometrically from 10^10 to 10^-2.
grid <- 10^seq(from = 10, to = -2, length.out = 100)

# Seed the RNG so the fold assignment, and hence the result, is repeatable.
set.seed(1)

# 5-fold cross-validation over `grid`; alpha = 1 selects the lasso penalty
# (alpha = 0 would be ridge regression).
fit5 <- cv.glmnet(xmat, y, lambda = grid, alpha = 1, nfolds = 5)

# Cross-validation curve.
plot(fit5)

在这里插入图片描述

# Penalty with the smallest mean cross-validated error.
bestlambda <- fit5$lambda.min
bestlambda

> bestlambda
[1] 0.07054802

通过交叉验证法求出最优 $\lambda=0.07$

# Lasso path on the full data (alpha = 1), then the coefficients at the
# cross-validated penalty.
fit6 <- glmnet(xmat, y, alpha = 1)
coef(fit6, s = bestlambda)

> fit6=glmnet(xmat,y,alpha=1)#拟合模型lasso回归
> coefficients(fit6,s=bestlambda)#输出系数
11 x 1 sparse Matrix of class "dgCMatrix"
                                s1
(Intercept)            1.178301396
poly(x, 10, raw = T)1  2.142635982
poly(x, 10, raw = T)2  2.628493946
poly(x, 10, raw = T)3  3.812038946
poly(x, 10, raw = T)4  0.042147458
poly(x, 10, raw = T)5  0.012647742
poly(x, 10, raw = T)6  .          
poly(x, 10, raw = T)7  0.003884896
poly(x, 10, raw = T)8  .          
poly(x, 10, raw = T)9  .          
poly(x, 10, raw = T)10 .

通过分析得出有6个变量的系数不等于0，说明使用lasso法筛选出来6个变量，得出拟合模型为 $y=\epsilon+1.18+2.14x+2.63x^2+3.81x^3+0.04x^4+0.01x^5+0.004x^7$ .

(f) 现在依据 $Y=\beta_0+\beta_7X^{7}+\epsilon$ 产生响应变量Y，使用最优子集选择法与lasso法，对比分析。

1 产生响应变量Y

# New response for part (f): Y = 1 + 2 * X^7 + eps.
# The predictor is read from `best$x`, the copy stored in the data frame,
# because the workspace name `x` may no longer hold the predictor vector at
# this point in the script.
y1 <- 1 + 2 * best$x^7 + eps
best1 <- data.frame(y1, x = best$x)

2最优子集选择法

# Best-subset selection for the new response over x^1 ... x^10.
fit7 <- regsubsets(y1 ~ poly(x, 10, raw = T), data = best1, nvmax = 10)
s7 <- summary(fit7)

# Model size preferred by Cp, BIC and adjusted R^2 respectively.
which.min(s7$cp)
which.min(s7$bic)
which.max(s7$adjr2)

# Coefficient estimates of the 2-, 1- and 4-term models.
coef(fit7, id = 2)
coef(fit7, id = 1)
coef(fit7, id = 4)

> set.seed(1)
> x=rnorm(100)
> eps=rnorm(100)
> #(f)
> y1=1+2*x^7+eps
> best1=data.frame(y1,x)
> library(leaps)
> fit7=regsubsets(y1~poly(x,10,raw = T),data=best1,nvmax=10)
> s7=summary(fit7)
> which.min(s7$cp)
[1] 2
> which.min(s7$bic)
[1] 1
> which.max(s7$adjr2)
[1] 4
> coefficients(fit7,id=2)
          (Intercept) poly(x, 10, raw = T)2 poly(x, 10, raw = T)7 
            1.0704904            -0.1417084             2.0015552 
> coefficients(fit7,id=1)
          (Intercept) poly(x, 10, raw = T)7 
            0.9589402             2.0007705 
> coefficients(fit7,id=4)
          (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2 poly(x, 10, raw = T)3 
            1.0762524             0.2914016            -0.1617671            -0.2526527 
poly(x, 10, raw = T)7 
            2.0091338

通过最优子集选择法，并通过参数的估计得出，根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+0.96+2x^7$ ，这个模型与假设的结果十分接近；根据 $C_p$ 选择出来的最优模型为 $y=\epsilon+1.07-0.14x^2+2x^7$ ；根据 $Adjust R^2$ 选择出来的最优模型为 $y=\epsilon+1.08+0.29x-0.16x^2-0.25x^3+2x^7$ .

# 2 x 2 panel for the new response: BIC overview, then each criterion against
# model size with a red cross on the marked size.
par(mfrow = c(2, 2))
plot(fit7, scale = "bic")

plot(1:10, s7$cp, type = "b")
points(x = 2, y = s7$cp[2], col = "red", pch = 4, lwd = 7)

plot(1:10, s7$bic, type = "b")
points(x = 1, y = s7$bic[1], col = "red", pch = 4, lwd = 7)

plot(1:10, s7$adjr2, type = "b")
points(x = 4, y = s7$adjr2[4], col = "red", pch = 4, lwd = 7)

在这里插入图片描述

3 lasso法

3.1 交叉验证法

library(glmnet)

# Predictor matrix for the new response, without the intercept column.
xmat <- model.matrix(y1 ~ poly(x, 10, raw = T), data = best1)[, -1]

# Seed the RNG so the cross-validation folds are repeatable.
set.seed(1)

# Candidate penalties: 100 values spaced geometrically from 10^10 to 10^-2.
grid <- 10^seq(from = 10, to = -2, length.out = 100)

# 5-fold cross-validated lasso (alpha = 1) over the penalty grid.
fit8 <- cv.glmnet(xmat, y1, lambda = grid, alpha = 1, nfolds = 5)

# Cross-validation curve.
plot(fit8)

在这里插入图片描述

# Penalty with the smallest mean cross-validated error.
best.lambda <- fit8$lambda.min
best.lambda


> best.lambda
[1] 0.05336699

# Lasso coefficients at the cross-validated penalty.
coef(fit8, s = best.lambda)

> predict(fit8, s = best.lambda, type = "coefficients")
11 x 1 sparse Matrix of class "dgCMatrix"
                                  s1
(Intercept)             1.0247546165
poly(x, 10, raw = T)1   .           
poly(x, 10, raw = T)2  -0.0810199644
poly(x, 10, raw = T)3   .           
poly(x, 10, raw = T)4   .           
poly(x, 10, raw = T)5   0.0004449398
poly(x, 10, raw = T)6   .           
poly(x, 10, raw = T)7   1.9966707008
poly(x, 10, raw = T)8   .           
poly(x, 10, raw = T)9   0.0007067609
poly(x, 10, raw = T)10  .

通过分析得出有4个变量的系数不等于0，说明使用lasso法筛选出来4个变量，得出拟合模型为 $y=\epsilon+ 1.02-0.08x^2+0.0004x^5+1.997x^7+0.0007x^9$ .

通过 $Y=\beta_0+\beta_7X^{7}+\epsilon$ 产生响应变量Y，使用最优子集选择法与lasso法，对比分析，得出通过最优子集选择法，并通过参数的估计得出，

根据 $B I C$ 选择出来的最优模型为 $y=\epsilon+0.96+2x^7$ ，这个模型与假设的结果十分接近；
根据 $C_p$ 选择出来的最优模型为y= $\epsilon+1.07-0.14x^2+2x^7$ ；
根据 $Adjust R^2$ 选择出来的最优模型为y= $\epsilon+1.08+0.29x-0.16x^2-0.25x^3+2x^7$ .