suppressMessages({
library(foreach) #flexible looping construct
library(stargazer) # for tables
})

We illustrate the use of the BIC model selection procedures using simulated data from a polynomial regression model.

DGP

Consider a polynomial regression model with a polynomial of order \(p_0\):

\[\begin{align} Y_i & = \beta_0+\beta_1 X_{i}+\beta_2 X_i^2+\ldots+\beta_{p_0} X_i^{p_0}+U_i,\\ U_i &\sim N(0,1),\\ X_i &\sim N(0,1). \end{align}\]

Let’s define a custom R function to generate the data. Here, all \(\beta\)’s are set to one:

DGP<-function(n,p0){
  #p0 = the number of polynomial terms
  #n = the number of observations
  X<-rnorm(n)
  U<-rnorm(n)
  Y<-1+U
  for (j in 1:p0){
    Y<-Y+X^j
  }
  return(list(y=Y,x=X))
}

Let’s simulate some data. To make the results reproducible, we will use a specified state/seed of the random number generator. Each time you run the code, the same sequence of ovservations is simulated.

n=30
p0=4
set.seed(42)
MyData<-DGP(n=n,p0=p0)

Polynomial regression

We can obtain the OLS estimator for the polynomial regression using the function poly(x,p,raw=TRUE), where:

x is the data vector used to construct the polynomial terms.
p is the degree of the polynomial.
raw=TRUE instructs R to not use orthogonal polynomial terms.

Below, we estimate the model with the true number of the polynomial terms p=p0:

model4<-lm(MyData$y ~ poly(MyData$x,p=p0,raw=TRUE))
stargazer(model4,
          covariate.labels=c(as.character(seq(from=1,to=4,by=1))),
          omit.stat=c("ll","f","ser","n","adj.rsq")
          ,type="html",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01")
          #,type="text"
          )


	Dependent variable:

	y

1	1.117^***
	(0.388)

2	1.023^***
	(0.347)

3	0.922^***
	(0.096)

4	0.989^***
	(0.063)

Constant	0.881^***
	(0.288)


R²	0.993

Note:	^⋆p<0.1; ^⋆⋆p<0.05; ^⋆⋆⋆p<0.01

model4<-lm(MyData$y ~ poly(MyData$x,p=p0,raw=TRUE))
summary(model4)

## 
## Call:
## lm(formula = MyData$y ~ poly(MyData$x, p = p0, raw = TRUE))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9250 -0.6420  0.4428  0.7135  1.7134 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          0.88129    0.28818   3.058  0.00525 ** 
## poly(MyData$x, p = p0, raw = TRUE)1  1.11676    0.38835   2.876  0.00812 ** 
## poly(MyData$x, p = p0, raw = TRUE)2  1.02338    0.34713   2.948  0.00684 ** 
## poly(MyData$x, p = p0, raw = TRUE)3  0.92229    0.09614   9.593 7.37e-10 ***
## poly(MyData$x, p = p0, raw = TRUE)4  0.98906    0.06266  15.784 1.65e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.093 on 25 degrees of freedom
## Multiple R-squared:  0.9933, Adjusted R-squared:  0.9922 
## F-statistic: 928.1 on 4 and 25 DF,  p-value: < 2.2e-16

All estimates are significant.
All estimates are reasonably close to one (the true value)

Now let’s increase the number of regressors (polynomial terms) beyond what was used to generate the data: p=10

model10<-lm(MyData$y ~ poly(MyData$x,p=10,raw=TRUE))
stargazer(model10,
          covariate.labels=c(as.character(seq(from=1,to=10,by=1)),"Intercept"),
          omit.stat=c("ll","f","ser","n","adj.rsq")
          ,type="html",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01")
          #,type="text"
          )


	Dependent variable:

	y

1	0.437
	(1.203)

2	3.453
	(3.131)

3	0.283
	(3.387)

4	-0.863
	(3.957)

5	0.998
	(2.619)

6	0.684
	(1.944)

7	-0.327
	(0.722)

8	-0.146
	(0.410)

9	0.032
	(0.065)

10	0.013
	(0.031)

Intercept	0.490
	(0.447)


R²	0.995

Note:	^⋆p<0.1; ^⋆⋆p<0.05; ^⋆⋆⋆p<0.01

No significant estimates.
The estimates for the first four parameters can substantially deviate from the true values due to the large variances.
The variances (and the standard errors) are large because the extra regressors are correlated with the relevant regressors.

Let’s try a misspecified model:

model3<-lm(MyData$y ~ poly(MyData$x,p=3,raw=TRUE))
stargazer(model3,
          covariate.labels=c(as.character(seq(from=1,to=3,by=1)),"Intercept"),
          omit.stat=c("ll","f","ser","n","adj.rsq")
          ,type="html",notes.append = FALSE,notes = c("<sup>&sstarf;</sup>p<0.1; <sup>&sstarf;&sstarf;</sup>p<0.05; <sup>&sstarf;&sstarf;&sstarf;</sup>p<0.01")
          #,type="text"
          )


	Dependent variable:

	y

1	1.729
	(1.255)

2	6.194^***
	(0.373)

3	0.386
	(0.292)

Intercept	-1.156
	(0.837)


R²	0.927

Note:	^⋆p<0.1; ^⋆⋆p<0.05; ^⋆⋆⋆p<0.01

Biased estimates due to the omitted variables

BIC

R BIC() function computes BIC

cat("BIC for: \n",
   '- correct model         =', BIC(model4),'\n', 
   '- overspecified model   =', BIC(model10),'\n',
   '- under-specified model =', BIC(model3),'\n')

## BIC for: 
##  - correct model         = 105.3882 
##  - overspecified model   = 117.5081 
##  - under-specified model = 173.8286

The custom function below runs all polynomial regression up to the max_p order, and picks the one with the lowest BIC value:

MySelect<-function(MyData,max_p){
  Powers<-seq(from=1,to=max_p,by=1)
  myBICs<-foreach(j=Powers) %do% BIC(lm(MyData$y ~ poly(MyData$x,p=j,raw=TRUE)))
    p_hat<-which.min(myBICs)
  return(p_hat)
}

Let’s try BIC selection:

MySelect(MyData,max_p=10)

## [1] 4

Monter Carlo simulations for the accuracy of BIC in selecting the correct model

Let’s check how likely BIC to select the right model using MC simulations.

n is the sample size.
p0 is the true degree.
max_p is the largest degree to try.
R is the number of MC repetitions.
set.seed(seed) for reproducability

MC_BIC<-function(n,p0,max_p,R,seed){
    set.seed(seed)
    Success<-foreach(r=1:R, .combine='rbind')  %do% {
      MyData<-DGP(n,p0=p0)
      p_hat<-MySelect(MyData,max_p=max_p)
      (p_hat==p0)
  }
  cat("With n=",n, "the probability of BIC selecting the true model is",mean(Success))
}

MC_BIC(n=30,p0=4,max_p=10,R=10^2,seed=42)

## With n= 30 the probability of BIC selecting the true model is 0.73

MC_BIC(n=100,p0=4,max_p=10,R=10^2,seed=42)

## With n= 100 the probability of BIC selecting the true model is 0.97

What if some of the coefficients are small?

We re-define the function that generates the coefficients:

The coefficient on the term of order \(p_0\) is now small:

DGP<-function(n,p0){
  X<-rnorm(n)
  U<-rnorm(n)
  Y<-U+0.01*X^p0
  for (j in 1:p0-1){
    Y<-Y+X^j
  }
  return(list(y=Y,x=X))
}

MC_BIC(n=1000,p0=4,max_p=10,R=10^2,seed=42)

## With n= 1000 the probability of BIC selecting the true model is 0.12

Now, the probability of selecting the true order \(p_0\) is much smaller.
BIC thinks the \(p_0\) term is irrelevant.

Parallel loops

Without parallel loops

Execution time:

execution_time <- system.time({
  # Code to measure
  result <- MC_BIC(n=100,p0=4,max_p=10,R=10^3,seed=42)
})

## With n= 100 the probability of BIC selecting the true model is 0.04

cat("\n Execution time:\n")

## 
##  Execution time:

print(execution_time)

##    user  system elapsed 
##   3.182   0.019   3.202

Over 3 secs sequential execution of the loop

Set up parallel loops

library(doParallel)

## Loading required package: iterators

## Loading required package: parallel

library(doRNG)

## Loading required package: rngtools

cl <- makeCluster(8) #Use 8 workers/cores
registerDoParallel(cl)
invisible(clusterEvalQ(cl, {
  suppressPackageStartupMessages(library(foreach, quietly = TRUE))
})) #load library foreach for all workers

Parallel version of the MC function: - use %dopar% instead. - export=c("DGP","MySelect")) to ensure that workers see custom functions

MC_BIC_par<-function(n,p0,max_p,R){
    Success<-foreach(r=1:R, .combine='rbind',.export=c("DGP","MySelect"))  %dopar% {
      MyData<-DGP(n,p0=p0)
      p_hat<-MySelect(MyData,max_p=max_p)
      (p_hat==p0)
  }
  cat("With n=",n, "the probability of BIC selecting the true model is",mean(Success))
}

Run parallel loop:

registerDoRNG(42)
execution_time <- system.time({
  # Code to measure
  result <- MC_BIC_par(n=100,p0=4,max_p=10,R=10^2)
})

## With n= 100 the probability of BIC selecting the true model is 0.06

cat("\n Execution time:\n")

## 
##  Execution time:

print(execution_time)

##    user  system elapsed 
##   0.021   0.004   0.152

Execution time: 0.025 sec!

Lab 01: BIC illustration