library("MCMCpack")
library("ggplot2")
library("GGally")
h <- swiss
str(h)
## 'data.frame': 47 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
theme_set(theme_bw())
# plotmatrix(h)
ggpairs(h)
formula <- Fertility ~ Agriculture + Examination + Catholic + Education
Обычный МНК:
m1 <- lm(formula, data = h)
summary(m1)
##
## Call:
## lm(formula = formula, data = h)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7813 -6.3308 0.8113 5.7205 15.5569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.05542 6.94881 13.104 < 2e-16 ***
## Agriculture -0.22065 0.07360 -2.998 0.00455 **
## Examination -0.26058 0.27411 -0.951 0.34722
## Catholic 0.12442 0.03727 3.339 0.00177 **
## Education -0.96161 0.19455 -4.943 1.28e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.736 on 42 degrees of freedom
## Multiple R-squared: 0.6498, Adjusted R-squared: 0.6164
## F-statistic: 19.48 on 4 and 42 DF, p-value: 3.95e-09
priors <- c(2, 0.8, 0.5, 0.3, -5)
m1.mcmc <- MCMCregress(formula, data = h,
b0 = priors, B0 = 0.0001)
summary(m1.mcmc)
##
## Iterations = 1001:11000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## (Intercept) 90.5927 7.04269 0.0704269 0.0704269
## Agriculture -0.2162 0.07436 0.0007436 0.0007241
## Examination -0.2479 0.28145 0.0028145 0.0028614
## Catholic 0.1250 0.03777 0.0003777 0.0003777
## Education -0.9630 0.20025 0.0020025 0.0020025
## sigma2 62.7837 14.41234 0.1441234 0.1629205
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## (Intercept) 76.80204 85.93207 90.5969 95.15602 104.57959
## Agriculture -0.36345 -0.26524 -0.2161 -0.16730 -0.07206
## Examination -0.80269 -0.43400 -0.2477 -0.06113 0.29906
## Catholic 0.04986 0.09998 0.1252 0.15047 0.19832
## Education -1.35832 -1.09493 -0.9641 -0.82896 -0.56468
## sigma2 40.51536 52.49152 60.7454 70.65440 96.90170
str(m1.mcmc)
## mcmc [1:10000, 1:6] 101.4 91 80.8 100.3 100.3 ...
## - attr(*, "mcpar")= num [1:3] 1001 11000 1
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "(Intercept)" "Agriculture" "Examination" "Catholic" ...
## - attr(*, "title")= chr "MCMCregress Posterior Sample"
## - attr(*, "y")= num [1:47, 1] 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:47] "Courtelary" "Delemont" "Franches-Mnt" "Moutier" ...
## .. ..$ : NULL
## - attr(*, "call")= language MCMCregress(formula = formula, data = h, b0 = priors, B0 = 1e-04)
beta.sample <- as.data.frame(m1.mcmc)
beta.sample$t <- 1:nrow(beta.sample) # добавим номер элемента последовательности
str(beta.sample)
## 'data.frame': 10000 obs. of 7 variables:
## $ (Intercept): num 101.4 91 80.8 100.3 100.3 ...
## $ Agriculture: num -0.36 -0.237 -0.108 -0.25 -0.353 ...
## $ Examination: num -0.377 -0.212 0.04 -0.628 -0.268 ...
## $ Catholic : num 0.122 0.151 0.122 0.117 0.149 ...
## $ Education : num -1.015 -0.951 -1.064 -0.992 -1.232 ...
## $ sigma2 : num 86.8 60.3 40.4 42 45.5 ...
## $ t : int 1 2 3 4 5 6 7 8 9 10 ...
Зададим удобные имена переменным
colnames(beta.sample)[1] <- "const"
Построим апостериорное распределение свободного члена и визуально оценим сходимость:
ggplot(beta.sample,aes(x = const)) + geom_density()
qplot(t, Agriculture, data = beta.sample)
Оценим модель с другим априорным мнением о коэффициентах:
priors <- c(0.2, 0.1, 0.2, -0.3, -0.1)
m1.mcmc <- MCMCregress(formula, data = h,
b0 = priors, B0 = 0.0001)
summary(m1.mcmc)
##
## Iterations = 1001:11000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## (Intercept) 90.5837 7.04276 0.0704276 0.0704276
## Agriculture -0.2161 0.07436 0.0007436 0.0007242
## Examination -0.2476 0.28145 0.0028145 0.0028614
## Catholic 0.1250 0.03777 0.0003777 0.0003777
## Education -0.9630 0.20025 0.0020025 0.0020025
## sigma2 62.7840 14.41242 0.1441242 0.1629224
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## (Intercept) 76.79315 85.9221 90.5882 95.14615 104.57198
## Agriculture -0.36335 -0.2652 -0.2161 -0.16723 -0.07196
## Examination -0.80246 -0.4338 -0.2474 -0.06093 0.29934
## Catholic 0.04988 0.1000 0.1252 0.15049 0.19833
## Education -1.35829 -1.0949 -0.9641 -0.82894 -0.56466
## sigma2 40.51396 52.4900 60.7440 70.65677 96.89862
Сюда добавить: geweke.diag
, heidel.diag
, rafbery.diag
, gelman.diag
и более симпатичные графики :)
Почиташки:
MCMCglmm
в виде курса лекций :)