Permutation testing is similar to bootstrapping, except that it resamples without replacement: once a value is selected it cannot be selected again, so no value appears twice. This amounts to shuffling the values. For a univariate statistic (e.g. the mean), shuffling changes nothing. With two or more variables, however, reshuffling one variable changes any test statistic that relates them (e.g. a correlation or regression coefficient). The shuffle is usually applied to the response/outcome/y variable, and the approach is especially common for tests involving discrete or categorical ("yes", "no") variables.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ds <- data.frame(y = runif(10), x = runif(10)); head(ds)
## y x
## 1 0.20200727 0.2020174
## 2 0.88228326 0.5967211
## 3 0.38736966 0.4249668
## 4 0.24538380 0.6020442
## 5 0.12865606 0.7334585
## 6 0.07374217 0.3428088
cor(ds$y, ds$x, method = "spearman") # the correlation is the test statistic for the paired measurements
## [1] -0.3090909
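As a quick check of the univariate claim above, shuffling does not change the mean (a one-line sketch):
all.equal(mean(ds$y), mean(sample(ds$y))) # TRUE: a univariate statistic is invariant to shuffling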
ds$resample_y <- sample(ds$y); head(ds)
## y x resample_y
## 1 0.20200727 0.2020174 0.89397897
## 2 0.88228326 0.5967211 0.24538380
## 3 0.38736966 0.4249668 0.88228326
## 4 0.24538380 0.6020442 0.44260192
## 5 0.12865606 0.7334585 0.51159146
## 6 0.07374217 0.3428088 0.03607334
cor(ds$resample_y, ds$x, method = "spearman") # shuffling the order of one variable changes the coefficient
## [1] 0.2363636
# Repeat the permutation 1000 times
library(ggplot2)
tibble(num = 1:1000) %>% # data_frame() is deprecated; tibble() is re-exported by dplyr
  group_by(num) %>% # one row per permutation: mutate() then runs once per group
  mutate(corr = cor(sample(ds$y), ds$x, method = "spearman")) %>%
  ggplot(aes(x = corr)) +
  geom_freqpoly() # estimated null (background) distribution of the correlation coefficient
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
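The same permutation distribution yields an empirical two-sided p-value by hand: the proportion of permuted correlations at least as extreme as the observed one (adding 1 to numerator and denominator avoids reporting exactly zero). A minimal sketch; the names obs and perms are ours:
obs <- cor(ds$y, ds$x, method = "spearman") # observed statistic
perms <- replicate(1000, cor(sample(ds$y), ds$x, method = "spearman"))
(sum(abs(perms) >= abs(obs)) + 1) / (length(perms) + 1) # empirical two-sided p-value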
# R package
library(coin)
## Loading required package: survival
ds <- data.frame(y = runif(10), x = runif(10))
cor(ds$y, ds$x, method = "spearman")
## [1] 0.2363636
spearman_test(y ~ x, data = ds)
##
## Asymptotic Spearman Correlation Test
##
## data: y by x
## Z = 0.70909, p-value = 0.4783
## alternative hypothesis: true rho is not equal to 0
spearman_test(y ~ x, data = ds, distribution = "approximate") # Monte Carlo permutation rather than the asymptotic approximation
##
## Approximative Spearman Correlation Test
##
## data: y by x
## Z = 0.70909, p-value = 0.5212
## alternative hypothesis: true rho is not equal to 0
# vignette("coin", package = "coin") # for more information, see the package vignette
The “spearman_test” function in the “coin” package tests whether the observed correlation is significant relative to an estimated null distribution of the correlation coefficient; with distribution = "approximate" that distribution is estimated by random permutation, using 10,000 permutations by default.
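The number of permutations can be raised explicitly by passing an approximate() object (a sketch; the nresample argument name is used by recent versions of coin, while older releases called it B):
spearman_test(y ~ x, data = ds,
              distribution = approximate(nresample = 50000)) # 50,000 Monte Carlo permutations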
Ref: https://uoftcoders.github.io/studyGroup/lessons/r/resampling/lesson/
Next we run a simple Metropolis-Hastings MCMC simulation on a toy example; a more detailed walkthrough is available on the reference page cited below this code block.
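The sampler defined below implements the Metropolis rule: from the current parameter vector \(\theta = (a, b, sd)\), propose \(\theta'\) from a symmetric normal and accept it with probability \[\alpha = \min\left\{1,\; \frac{p(\theta' \mid y)}{p(\theta \mid y)}\right\} = \min\left\{1,\; \exp\big(\log p(\theta' \mid y) - \log p(\theta \mid y)\big)\right\},\] working on the log scale for numerical stability. Because the proposal is symmetric, no Hastings correction term is needed; on rejection the chain repeats its current state.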
# making example data
trueA <- 5
trueB <- 0
trueSd <- 10
sampleSize <- 31
# create independent x-values
x <- (-(sampleSize-1)/2):((sampleSize-1)/2)
# create dependent values according to ax + b + N(0,sd)
y <- trueA * x + trueB + rnorm(n=sampleSize,mean=0,sd=trueSd)
# plot data
plot(x,y, main="Test Data")
# Define likelihood function
likelihood <- function(param){
  a = param[1]
  b = param[2]
  sd = param[3]
  pred = a*x + b
  singlelikelihoods = dnorm(y, mean = pred, sd = sd, log = T)
  sumll = sum(singlelikelihoods)
  return(sumll)
}
# Example: plot the likelihood profile of the slope a
slopevalues <- function(x){return(likelihood(c(x, trueB, trueSd)))}
slopelikelihoods <- sapply(seq(3, 7, by=.05), slopevalues) # sapply returns a numeric vector for plotting
plot(seq(3, 7, by=.05), slopelikelihoods, type="l",
     xlab = "values of slope parameter a", ylab = "Log likelihood")
# Prior distribution
prior <- function(param){
  a = param[1]
  b = param[2]
  sd = param[3]
  aprior = dunif(a, min=0, max=10, log = T)
  bprior = dnorm(b, sd = 5, log = T)
  sdprior = dunif(sd, min=0, max=30, log = T)
  return(aprior+bprior+sdprior)
}
# log posterior = log likelihood + log prior (up to a normalising constant)
posterior <- function(param){
  return (likelihood(param) + prior(param))
}
######## Metropolis algorithm ################
proposalfunction <- function(param){
  # symmetric normal random-walk proposal around the current parameter values
  return(rnorm(3, mean = param, sd = c(0.1,0.5,0.3)))
}
run_metropolis_MCMC <- function(startvalue, iterations){
  chain = array(dim = c(iterations+1,3))
  chain[1,] = startvalue
  for (i in 1:iterations){
    proposal = proposalfunction(chain[i,])
    # acceptance probability: exponentiated log-posterior difference
    probab = exp(posterior(proposal) - posterior(chain[i,]))
    if (runif(1) < probab){
      chain[i+1,] = proposal # accept the proposal
    }else{
      chain[i+1,] = chain[i,] # reject: repeat the current state
    }
  }
  return(chain)
}
startvalue = c(4,0,10)
chain = run_metropolis_MCMC(startvalue, 10000)
burnIn = 5000
acceptance = 1-mean(duplicated(chain[-(1:burnIn),])) # acceptance rate: a repeated row means a rejection
### Summary: #######################
par(mfrow = c(2,3))
hist(chain[-(1:burnIn),1],nclass=30, main="Posterior of a", xlab="True value = red line" )
abline(v = mean(chain[-(1:burnIn),1]))
abline(v = trueA, col="red" )
hist(chain[-(1:burnIn),2],nclass=30, main="Posterior of b", xlab="True value = red line")
abline(v = mean(chain[-(1:burnIn),2]))
abline(v = trueB, col="red" )
hist(chain[-(1:burnIn),3],nclass=30, main="Posterior of sd", xlab="True value = red line")
abline(v = mean(chain[-(1:burnIn),3]) )
abline(v = trueSd, col="red" )
plot(chain[-(1:burnIn),1], type = "l", xlab="True value = red line" , main = "Chain values of a")
abline(h = trueA, col="red" )
plot(chain[-(1:burnIn),2], type = "l", xlab="True value = red line" , main = "Chain values of b")
abline(h = trueB, col="red" )
plot(chain[-(1:burnIn),3], type = "l", xlab="True value = red line" , main = "Chain values of sd")
abline(h = trueSd, col="red" )
# for comparison:
summary(lm(y~x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.210 -5.393 -1.997 7.841 19.248
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5371 1.8507 -0.29 0.774
## x 5.6279 0.2069 27.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.3 on 29 degrees of freedom
## Multiple R-squared: 0.9623, Adjusted R-squared: 0.961
## F-statistic: 739.8 on 1 and 29 DF, p-value: < 2.2e-16
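For a numeric comparison with the lm() fit, the posterior means and spreads of the retained draws can be tabulated directly; a small sketch (the name post is ours):
post <- chain[-(1:burnIn), ] # retained draws after burn-in
colnames(post) <- c("a", "b", "sd")
round(rbind(mean = colMeans(post), sd = apply(post, 2, sd)), 3) # compare with the lm() estimates above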
Ref: https://theoreticalecology.wordpress.com/2010/09/17/metropolis-hastings-mcmc-in-r/
The simulation below uses the jackknife to bias-correct the sample maximum of 100 draws from Uniform(0, 5). It records how often the jackknife estimate is closer to the true maximum of 5 than the raw sample maximum is, along with the mean absolute error of each estimator.
# code from bu
numvec <- rep(NA, 100000)
maxbiasvec <- rep(NA, 100000)
jackbiasvec <- rep(NA, 100000)
for (i in 1:100000){
  samp <- runif(100, min = 0, max = 5)
  # jackknife bias-corrected estimate of the maximum
  jack <- max(samp) + (100-1)/100 * (max(samp) - max(samp[!samp == max(samp)]))
  numvec[i] <- ifelse(abs(5-jack) < abs(max(samp)-5), 1, 0) # 1 if the jackknife is closer to the truth
  maxbiasvec[i] <- abs(5 - max(samp))
  jackbiasvec[i] <- abs(5 - jack)
}
mean(numvec)
## [1] 0.66614
mean(jackbiasvec)
## [1] 0.04930729
mean(maxbiasvec)
## [1] 0.04938722
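The estimator in the loop is the standard jackknife bias correction \(\hat\theta_{\text{jack}} = n\hat\theta - (n-1)\,\overline{\hat\theta}_{(\cdot)}\), where \(\overline{\hat\theta}_{(\cdot)}\) is the average of the leave-one-out estimates. For the maximum, deleting any non-maximal point leaves the estimate unchanged and deleting the maximum yields the second-largest value \(\hat\theta_{(2)}\), so \[\hat\theta_{\text{jack}} = \hat\theta + \frac{n-1}{n}\left(\hat\theta - \hat\theta_{(2)}\right),\] which is exactly the expression computed for jack above with \(n = 100\). The output shows the jackknife estimate is closer to the true maximum about two-thirds of the time, with a slightly smaller mean absolute error.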
Suppose \(p(x, y)\) is a p.d.f. or p.m.f. that is difficult to sample from directly. Suppose, though, that we can easily sample from the conditional distributions \(p(x|y)\) and \(p(y|x)\).
The Gibbs sampler proceeds as follows:
1. Set \(x\) and \(y\) to some initial starting values.
2. Sample \(x|y\), then sample \(y|x\), then \(x|y\) again, and so on.
Let’s suppose we are constructing a Gibbs sampler for the bivariate distribution \[f(x,y) = k\,x^2 \exp\{-xy^2 - y^2 + 2y - 4x\}, \qquad x > 0,\; y \in \mathbb{R},\] with unknown normalising constant \(k>0\). The full conditional distributions are \[x \mid y \sim \mathrm{Ga}(3,\, y^2+4)\] and \[y \mid x \sim N\!\left(\frac{1}{1+x},\, \frac{1}{2(1+x)}\right),\] where \(\mathrm{Ga}\) denotes the Gamma distribution parameterised by shape and rate.
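These conditionals can be read off from \(f(x,y)\) directly: holding \(y\) fixed, \(f \propto x^2 e^{-x(y^2+4)}\), which is the kernel of \(\mathrm{Ga}(3,\, y^2+4)\); holding \(x\) fixed, \(f \propto e^{-(1+x)y^2 + 2y}\), and completing the square yields the stated normal distribution.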
gibbs = function(N, thin)
{
  x = 0
  y = 0
  cat(paste("Iter","x","y","\n"))
  for (i in 1:N) {
    for (j in 1:thin) { # thinning: only every thin-th draw is reported
      x = rgamma(1, 3, y*y+4) # x | y ~ Ga(3, y^2 + 4)
      y = rnorm(1, 1/(x+1), 1/sqrt(2*x+2)) # y | x ~ N(1/(1+x), 1/(2(1+x)))
    }
    cat(paste(i, x, y, "\n"))
  }
}
gibbs(10,100)
## Iter x y
## 1 0.566421456729337 0.829296909186305
## 2 0.695655879093854 -0.303454968900284
## 3 0.982355323570921 0.518012501468744
## 4 0.288529881456506 0.857854718995394
## 5 0.411077486318104 1.04345740915507
## 6 0.258271594130548 0.784987332055443
## 7 0.958096319070601 1.17333236964631
## 8 0.484763389731867 0.70134420433492
## 9 0.573898414220757 0.870268529276595
## 10 0.998080698685726 -0.539143803446724
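Printing inside the loop is fine for a demonstration, but for analysis one normally stores the draws. A minimal variant with the same full conditionals that returns a matrix of samples (the names gibbs_mat and draws are ours):
gibbs_mat <- function(N = 1000, thin = 10) {
  out <- matrix(NA_real_, nrow = N, ncol = 2, dimnames = list(NULL, c("x", "y")))
  x <- 0; y <- 0
  for (i in 1:N) {
    for (j in 1:thin) { # keep only every thin-th pair to reduce autocorrelation
      x <- rgamma(1, 3, y*y + 4) # x | y ~ Ga(3, y^2 + 4)
      y <- rnorm(1, 1/(x + 1), 1/sqrt(2*x + 2)) # y | x ~ N(1/(1+x), 1/(2(1+x)))
    }
    out[i, ] <- c(x, y)
  }
  out
}
draws <- gibbs_mat()
summary(draws)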
Another implementation, a Gibbs sampler for the joint posterior of the mean \(\mu\) and precision \(\tau\) of a normal sample, is shown below.
# summary statistics of sample
n <- 30
ybar <- 15
s2 <- 3
# sample from the joint posterior (mu, tau | data)
mu <- rep(NA, 11000)
tau <- rep(NA, 11000)
burnin <- 1000 # burn-in iterations ("T" avoided as a name: it is shorthand for TRUE in R)
tau[1] <- 1 # initialisation
for(i in 2:11000) {
  mu[i] <- rnorm(n = 1, mean = ybar, sd = sqrt(1 / (n * tau[i - 1]))) # mu | tau, data
  tau[i] <- rgamma(n = 1, shape = n / 2,
                   scale = 2 / ((n - 1) * s2 + n * (mu[i] - ybar)^2)) # tau | mu, data
}
mu <- mu[-(1:burnin)] # remove burn-in
tau <- tau[-(1:burnin)] # remove burn-in
hist(mu)
hist(tau)
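Posterior point estimates and 95% credible intervals follow directly from the retained draws; a short sketch:
c(mean = mean(mu), quantile(mu, c(0.025, 0.975))) # posterior mean and 95% credible interval for mu
c(mean = mean(tau), quantile(tau, c(0.025, 0.975))) # the same for tau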
Ref:
https://darrenjw.wordpress.com/2011/07/16/gibbs-sampler-in-various-languages-revisited/
https://stats.stackexchange.com/questions/266665/gibbs-sampler-examples-in-r
Casella, G. & George, E.I. (1992). Explaining the Gibbs Sampler. The American Statistician, 46, 167-174.