suppressMessages({
library(foreach) #flexible looping construct
library(stargazer) # for tables
})
We illustrate the use of the BIC model selection procedures using simulated data from a polynomial regression model.
Consider a polynomial regression model with a polynomial of order \(p_0\):
\[\begin{align} Y_i & = \beta_0+\beta_1 X_{i}+\beta_2 X_i^2+\ldots+\beta_{p_0} X_i^{p_0}+U_i,\\ U_i &\sim N(0,1),\\ X_i &\sim N(0,1). \end{align}\]
Let’s define a custom R function to generate the data. Here, all \(\beta\)’s are set to one:
DGP<-function(n,p0){
#p0 = the number of polynomial terms
#n = the number of observations
X<-rnorm(n)
U<-rnorm(n)
Y<-1+U
for (j in 1:p0){
Y<-Y+X^j
}
return(list(y=Y,x=X))
}
Let’s simulate some data. To make the results reproducible, we will use a specified state/seed of the random number generator. Each time you run the code, the same sequence of ovservations is simulated.
n=30
p0=4
set.seed(42)
MyData<-DGP(n=n,p0=p0)
We can obtain the OLS estimator for the polynomial regression using
the function poly(x,p,raw=TRUE)
, where:
x
is the data vector used to construct the polynomial
terms.p
is the degree of the polynomial.raw=TRUE
instructs R to not use
orthogonal polynomial terms.Below, we estimate the model with the true number of the polynomial
terms p=p0
:
model4<-lm(MyData$y ~ poly(MyData$x,p=p0,raw=TRUE))
stargazer(model4,
covariate.labels=c(as.character(seq(from=1,to=4,by=1))),
omit.stat=c("ll","f","ser","n","adj.rsq")
,type="html",notes.append = FALSE,notes = c("<sup>⋆</sup>p<0.1; <sup>⋆⋆</sup>p<0.05; <sup>⋆⋆⋆</sup>p<0.01")
#,type="text"
)
Dependent variable: | |
y | |
1 | 1.117*** |
(0.388) | |
2 | 1.023*** |
(0.347) | |
3 | 0.922*** |
(0.096) | |
4 | 0.989*** |
(0.063) | |
Constant | 0.881*** |
(0.288) | |
R2 | 0.993 |
Note: | ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01 |
model4<-lm(MyData$y ~ poly(MyData$x,p=p0,raw=TRUE))
summary(model4)
##
## Call:
## lm(formula = MyData$y ~ poly(MyData$x, p = p0, raw = TRUE))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9250 -0.6420 0.4428 0.7135 1.7134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.88129 0.28818 3.058 0.00525 **
## poly(MyData$x, p = p0, raw = TRUE)1 1.11676 0.38835 2.876 0.00812 **
## poly(MyData$x, p = p0, raw = TRUE)2 1.02338 0.34713 2.948 0.00684 **
## poly(MyData$x, p = p0, raw = TRUE)3 0.92229 0.09614 9.593 7.37e-10 ***
## poly(MyData$x, p = p0, raw = TRUE)4 0.98906 0.06266 15.784 1.65e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.093 on 25 degrees of freedom
## Multiple R-squared: 0.9933, Adjusted R-squared: 0.9922
## F-statistic: 928.1 on 4 and 25 DF, p-value: < 2.2e-16
Now let’s increase the number of regressors (polynomial terms) beyond
what was used to generate the data: p=10
model10<-lm(MyData$y ~ poly(MyData$x,p=10,raw=TRUE))
stargazer(model10,
covariate.labels=c(as.character(seq(from=1,to=10,by=1)),"Intercept"),
omit.stat=c("ll","f","ser","n","adj.rsq")
,type="html",notes.append = FALSE,notes = c("<sup>⋆</sup>p<0.1; <sup>⋆⋆</sup>p<0.05; <sup>⋆⋆⋆</sup>p<0.01")
#,type="text"
)
Dependent variable: | |
y | |
1 | 0.437 |
(1.203) | |
2 | 3.453 |
(3.131) | |
3 | 0.283 |
(3.387) | |
4 | -0.863 |
(3.957) | |
5 | 0.998 |
(2.619) | |
6 | 0.684 |
(1.944) | |
7 | -0.327 |
(0.722) | |
8 | -0.146 |
(0.410) | |
9 | 0.032 |
(0.065) | |
10 | 0.013 |
(0.031) | |
Intercept | 0.490 |
(0.447) | |
R2 | 0.995 |
Note: | ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01 |
Let’s try a misspecified model:
model3<-lm(MyData$y ~ poly(MyData$x,p=3,raw=TRUE))
stargazer(model3,
covariate.labels=c(as.character(seq(from=1,to=3,by=1)),"Intercept"),
omit.stat=c("ll","f","ser","n","adj.rsq")
,type="html",notes.append = FALSE,notes = c("<sup>⋆</sup>p<0.1; <sup>⋆⋆</sup>p<0.05; <sup>⋆⋆⋆</sup>p<0.01")
#,type="text"
)
Dependent variable: | |
y | |
1 | 1.729 |
(1.255) | |
2 | 6.194*** |
(0.373) | |
3 | 0.386 |
(0.292) | |
Intercept | -1.156 |
(0.837) | |
R2 | 0.927 |
Note: | ⋆p<0.1; ⋆⋆p<0.05; ⋆⋆⋆p<0.01 |
BIC()
function computes BICcat("BIC for: \n",
'- correct model =', BIC(model4),'\n',
'- overspecified model =', BIC(model10),'\n',
'- under-specified model =', BIC(model3),'\n')
## BIC for:
## - correct model = 105.3882
## - overspecified model = 117.5081
## - under-specified model = 173.8286
The custom function below runs all polynomial regression up to the
max_p
order, and picks the one with the lowest BIC
value:
MySelect<-function(MyData,max_p){
Powers<-seq(from=1,to=max_p,by=1)
myBICs<-foreach(j=Powers) %do% BIC(lm(MyData$y ~ poly(MyData$x,p=j,raw=TRUE)))
p_hat<-which.min(myBICs)
return(p_hat)
}
Let’s try BIC selection:
MySelect(MyData,max_p=10)
## [1] 4
Let’s check how likely BIC to select the right model using MC simulations.
n
is the sample size.p0
is the true degree.max_p
is the largest degree to try.R
is the number of MC repetitions.set.seed(seed)
for reproducabilityMC_BIC<-function(n,p0,max_p,R,seed){
set.seed(seed)
Success<-foreach(r=1:R, .combine='rbind') %do% {
MyData<-DGP(n,p0=p0)
p_hat<-MySelect(MyData,max_p=max_p)
(p_hat==p0)
}
cat("With n=",n, "the probability of BIC selecting the true model is",mean(Success))
}
MC_BIC(n=30,p0=4,max_p=10,R=10^2,seed=42)
## With n= 30 the probability of BIC selecting the true model is 0.73
MC_BIC(n=100,p0=4,max_p=10,R=10^2,seed=42)
## With n= 100 the probability of BIC selecting the true model is 0.97
We re-define the function that generates the coefficients:
DGP<-function(n,p0){
X<-rnorm(n)
U<-rnorm(n)
Y<-U+0.01*X^p0
for (j in 1:p0-1){
Y<-Y+X^j
}
return(list(y=Y,x=X))
}
MC_BIC(n=1000,p0=4,max_p=10,R=10^2,seed=42)
## With n= 1000 the probability of BIC selecting the true model is 0.12
Execution time:
execution_time <- system.time({
# Code to measure
result <- MC_BIC(n=100,p0=4,max_p=10,R=10^3,seed=42)
})
## With n= 100 the probability of BIC selecting the true model is 0.04
cat("\n Execution time:\n")
##
## Execution time:
print(execution_time)
## user system elapsed
## 3.182 0.019 3.202
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(doRNG)
## Loading required package: rngtools
cl <- makeCluster(8) #Use 8 workers/cores
registerDoParallel(cl)
invisible(clusterEvalQ(cl, {
suppressPackageStartupMessages(library(foreach, quietly = TRUE))
})) #load library foreach for all workers
Parallel version of the MC function: - use %dopar%
instead. - export=c("DGP","MySelect"))
to ensure that
workers see custom functions
MC_BIC_par<-function(n,p0,max_p,R){
Success<-foreach(r=1:R, .combine='rbind',.export=c("DGP","MySelect")) %dopar% {
MyData<-DGP(n,p0=p0)
p_hat<-MySelect(MyData,max_p=max_p)
(p_hat==p0)
}
cat("With n=",n, "the probability of BIC selecting the true model is",mean(Success))
}
Run parallel loop:
registerDoRNG(42)
execution_time <- system.time({
# Code to measure
result <- MC_BIC_par(n=100,p0=4,max_p=10,R=10^2)
})
## With n= 100 the probability of BIC selecting the true model is 0.06
cat("\n Execution time:\n")
##
## Execution time:
print(execution_time)
## user system elapsed
## 0.021 0.004 0.152