Statistical Inference

Central Limit Theorem

Mean of the distribution follows normal distribution when the sample size approaches to infinity. - Discrete Case

# for Discrete distribution
par(mfrow = c(2,1)) #split plot window (row:2, column:1)
# Binomial distribution
n <- 3
p <- 0.5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
  sample1.mean <- c(sample1.mean,mean(rbinom(2,n,p)))
  sample2.mean <- c(sample2.mean,mean(rbinom(50,n,p)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of binomial sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of binomial sample n=50"),xlab = "Mean")

#Poisson distribution
lambda <- 5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
  sample1.mean <- c(sample1.mean,mean(rpois(2,lambda)))
  sample2.mean <- c(sample2.mean,mean(rpois(100,lambda)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of Poisson sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of Poisson sample n=100"),xlab = "Mean")

Continuous Case

# for Discrete distribution
par(mfrow = c(2,1)) #split plot window (row:2, column:1)
# Binomial distribution
n <- 3
p <- 0.5;
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
  sample1.mean <- c(sample1.mean,mean(rbinom(2,n,p)))
  sample2.mean <- c(sample2.mean,mean(rbinom(50,n,p)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of binomial sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of binomial sample n=50"),xlab = "Mean")

#Poisson distribution
lambda <- 5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
  sample1.mean <- c(sample1.mean,mean(rpois(2,lambda)))
  sample2.mean <- c(sample2.mean,mean(rpois(100,lambda)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of Poisson sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of Poisson sample n=100"),xlab = "Mean")

Point estimation

Refer to slide and lecture notes to point estimator derivation of each distribution (Method of moment, maximum likelihood estimation)

# Bernoulli distribution
theta0 <- 0.6
N <- 1000
x <- rbinom(N,1,theta0)
N1 <- sum(x)
N0 <- N - N1
theta <- N1/N #Estimated theta
theta

## [1] 0.592

# Normal distribution
mu0 <- 1
sigma0 <- 2
x <- rnorm(1000, mu0, sigma0)
(xbar <- mean(x)) #Estimated mean

## [1] 1.003554

(s2 <- sd(x)) #Estimated standard deviation

## [1] 1.981359

# using R package
# install.packages("fitdistrplus")
library(fitdistrplus)

## Loading required package: MASS

## Loading required package: survival

## Loading required package: npsurv

## Loading required package: lsei

test <- rnorm(n = 1000, mean = 0, sd = 1) #Samples from normal distribution
mmedist(test, "norm")$estimate #Method of moments estimation: normal distribution

##       mean         sd 
## 0.00585879 1.00641990

mledist(test, "norm", optim.method = "Nelder-Mead")$estimate #Maximum likelihood estimation: normal distribution

##       mean         sd 
## 0.00585879 1.00641990

####Check the below results####
test2 <- rpois(n = 5, lambda = 0.5) #Samples from poisson distribution
mmedist(test2, "pois")$estimate

## lambda 
##    0.8

mledist(test2, "pois")$estimate

## lambda 
##    0.8

test3 <- rexp(n = 100, rate = 1) #Samples from exponential distribution
mmedist(test3, "exp")$estimate

##     rate 
## 1.050387

mledist(test3, "exp")$estimate

##     rate 
## 1.050387

mledist() provides multiple optimization methods: Nelder-Mead(default), BFGS, CG, SANN etc. We can estimate any distributions existing in R using fitdistrplus package See the results from estimation of various distributions (Binomial, chi-square, beta, gamma …)

Point estimation - More examples

The \(CO_2\) uptake of six plants from Quebec and six plants from Mississippi was measured at several levels of ambient CO2 concentration. Half the plants of each type were chilled overnight before the experiment was conducted. This data is available at “\(CO_2\)” dataset in R.

1. Estimate the mean and standard deviation of \(CO_2\) uptake level of nonchilled plants using method of moments. Assume that the distribution of \(CO_2\) uptake level is normal distribution.
1. Estimate the mean and standard deviation of \(CO_2\) uptake level of nonchilled plants using maximum likelihood estimation. Assume that the distribution of \(CO_2\) uptake level is normal.

library(fitdistrplus)

# a
data("CO2")
obj <- CO2[which(CO2$Treatment == "nonchilled"),]
mmedist(obj$uptake,"norm")$estimate

##      mean        sd 
## 30.642857  9.588762

# b
mledist(obj$uptake,"norm")$estimate

##      mean        sd 
## 30.642857  9.588762

Interval estimation

Interval estimation of mean with known variance

After we found a point estimate of the population mean, we would need a way to quantify its accuracy. Here, we discuss the case where the population variance \(\sigma^2\) is assumed known.
Let us denote the \(100(1 - \alpha/2)\) percentile of the standard normal distribution as \(z_{\alpha/2}\). For random sample of sufficiently large size, the end points of the interval estimate at \(1-\alpha\) confidence level is given as follows:
\(\bar{x} \pm z_{\frac{\alpha}{2}}(\frac{\sigma}{\sqrt n})\)

Interval estimation of mean with unknown variance

After we found a point estimate of the population mean, we would need a way to quantify its accuracy. Here, we discuss the case where the population variance is not assumed.
Let us denote the \(100(1 - \alpha/2)\) percentile of the Student t distribution with \(n-1\) degrees of freedom as \(t_{\alpha/2}\). For random samples of sufficiently large size, and with standard deviation \(s\), the end points of the interval estimate at \((1-\alpha)\) confidence level is given as follows:
\(\bar{x} \pm t_{\frac{\alpha}{2}}(\frac{\sigma}{\sqrt n})\)

n.m <- 6
n.w <- 4
mu.m <- 117.5 # mean sys.bp of man
mu.w <- 126.8 # mean sys.bp of woman
s.m <- 9.7 # sample sd of man
s.w <- 12.0 # sample sd of woman 
alpha <- 0.975

error.m <- qt(alpha, df = n.m - 1)*(s.m/sqrt(n.m))
ci.m <- mu.m + c(-error.m,error.m)
ci.m

## [1] 107.3205 127.6795

error.w <- qt(alpha, df = n.w - 1)*(s.w/sqrt(n.w))
ci.w <- mu.w + c(-error.w,error.w)
ci.w

## [1] 107.7053 145.8947

mu <- mean(mu.m, mu.w)

range <- seq(-30 + mu, 30 + mu, 0.1)
plot(range, dnorm(range, mu.m, s.m), type = 'l', xlab = 'x', ylab = 'probability density',
     main = 'Interval estimation for systolic blood pressure between man and woman', cex = 1)
lines(range, dnorm(range, mu.w, s.w), lty = 2, col = "blue", type = 'l')
abline(v = ci.m[1], col = "red")
abline(v = ci.m[2], col = "red")
abline(v = ci.w[1], col = "blue")
abline(v = ci.w[2], col = "blue")
legend(192, 0.6, legend = c("sys.bp.m", "sysbp.w", "CI for man", "CI for woman"), 
       lty = c(1:2,1:2), col = c("blue", "red", "blue", "red"))

# pooled estimate of the common sd
sd.p <- sqrt((((n.m - 1)*s.m^2) + ((n.w - 1)*s.w^2)) / (n.m + n.w - 2))
t <- qt(0.975, df = n.m + n.w - 2)

error <- t*sd.p*sqrt(1/n.m + 1/n.w)
ci <- (mu.m - mu.w) + c(-error, error)
ci

## [1] -25.109606   6.509606

range <- seq(-30 + mu, 30 + mu, 0.1)
plot(range, dnorm(range, mu.m, s.m), type = 'l', xlab = 'x', ylab = 'probability density',
     main = 'Interval estimation for systolic blood pressure between man and woman', cex = 1)
lines(range, dnorm(range, mu.w, s.w), lty = 2, col = "blue", type = 'l')
abline(v = mu.m + ci[1], col = "red")
abline(v = mu.m + ci[2], col = "red")
abline(v = mu.w + ci[1], col = "blue")
abline(v = mu.w + ci[2], col = "blue")
legend(192, 0.6, legend = c("sys.bp.m", "sysbp.w", "CI for man", "CI for woman"), 
       lty = c(1:2,1:2), col = c("blue", "red", "blue", "red"))

Problem

The R Program provides datasets, including ‘iris’ and ‘iris3’. You can easily get the dataset with data(iris). - ‘iris’ is a data frame with 150 cases (rows) and 5 variables (columns) named Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species.

There are 3 species(setosa, versicolor, virginica) in dataset. You can check the names of species in the iris dataset.
Assume that Sepal.Length, Sepal.Width, Petal.Length and Petal.Width of each species are following normal distribution.
1. Estimate the mean and standard deviation of sepal length, sepal width, petal length and petal width, for each species respectively.
1. Find two-sided confidence interval of each estimated data with 95% of confidence level.

# Direct calculation
# a - mean and sd estimation

data(iris)

setosa.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "setosa")])
setosa.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "setosa")])

setosa.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "setosa")])
setosa.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "setosa")])

setosa.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "setosa")])
setosa.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "setosa")])

setosa.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "setosa")])
setosa.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "setosa")])

#a-versicolor
versicolor.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "versicolor")])
versicolor.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "versicolor")])

versicolor.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "versicolor")])
versicolor.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "versicolor")])

versicolor.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "versicolor")])
versicolor.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "versicolor")])

versicolor.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "versicolor")])
versicolor.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "versicolor")])

#a-virginica
virginica.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "virginica")])
virginica.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "virginica")])

virginica.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "virginica")])
virginica.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "virginica")])

virginica.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "virginica")])
virginica.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "virginica")])

virginica.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "virginica")])
virginica.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "virginica")])

# b - ci estimation (2 sided 0.95 --> 0.975)

#setosa
setosa.length <- length(iris$Species[which(iris$Species == "setosa")])
setosa.error1 <- qt(0.975,setosa.length - 1)*setosa.Sd.Sepal.lenght/sqrt(setosa.length)
setosa.Sepal.lenght <- setosa.Avg.Sepal.lenght + c(-setosa.error1,setosa.error1)

setosa.error2 <- qt(0.975,setosa.length - 1)*setosa.Sd.Sepal.width/sqrt(setosa.length)
setosa.Sepal.width <- setosa.Avg.Sepal.width + c(-setosa.error2,setosa.error2)

setosa.error3 <- qt(0.975,setosa.length - 1)*setosa.Sd.Petal.lenght/sqrt(setosa.length)
setosa.Petal.lenght <- setosa.Avg.Petal.lenght + c(-setosa.error3,setosa.error3)

setosa.error4 <- qt(0.975,setosa.length - 1)*setosa.Sd.Petal.width/sqrt(setosa.length)
setosa.Petal.width <- setosa.Avg.Petal.width + c(-setosa.error4,setosa.error4)

#b-versicolor
versicolor.length <- length(iris$Species[which(iris$Species == "versicolor")])

versicolor.error1 <- qt(0.975, setosa.length - 1)*versicolor.Sd.Sepal.lenght/sqrt(versicolor.length)
versicolor.Sepal.lenght <- versicolor.Avg.Sepal.lenght + c(-versicolor.error1,versicolor.error1)

versicolor.error2 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Sepal.width/sqrt(versicolor.length)
versicolor.Sepal.width <- versicolor.Avg.Sepal.width + c(-versicolor.error2,versicolor.error2)

versicolor.error3 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Petal.lenght/sqrt(versicolor.length)
versicolor.Petal.lenght <- versicolor.Avg.Petal.lenght + c(-versicolor.error3,versicolor.error3)

versicolor.error4 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Petal.width/sqrt(versicolor.length)
versicolor.Petal.width <- versicolor.Avg.Petal.width + c(-versicolor.error4,versicolor.error4)

#b-virginica
virginica.length <- length(iris$Species[which(iris$Species == "virginica")])
virginica.error1 <- qt(0.975,setosa.length - 1)*virginica.Sd.Sepal.lenght/sqrt(virginica.length)
virginica.Sepal.lenght <- virginica.Avg.Sepal.lenght + c(-virginica.error1,virginica.error1)

virginica.error2 <- qt(0.975,setosa.length - 1)*virginica.Sd.Sepal.width/sqrt(virginica.length)
virginica.Sepal.width <- virginica.Avg.Sepal.width + c(-virginica.error2,virginica.error2)

virginica.error3 <- qt(0.975,setosa.length - 1)*virginica.Sd.Petal.lenght/sqrt(virginica.length)
virginica.Petal.lenght <- virginica.Avg.Petal.lenght + c(-virginica.error3,virginica.error3)

virginica.error4 <- qt(0.975,setosa.length - 1)*virginica.Sd.Petal.width/sqrt(virginica.length)
virginica.Petal.width <- virginica.Avg.Petal.width + c(-virginica.error4,virginica.error4)

You can simplify this job with tidyr and dplyr package.

# load dataset 
data(iris)

# inspect data set 
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# estimate mean, sd, and ci
# install.packages("tidyr")
suppressMessages(library(tidyr)) # for gather or spread data frame to tidy data set
suppressMessages(library(dplyr)) # for grouping, selecting, summarizing tidy data set
library(knitr) # to get knit output

# Length
tidy_length <- iris %>% 
               gather(Length, measure, c(1,3)) %>% 
               select(-c(1,2))

tidy_length %>% 
  group_by(Species, Length) %>% 
  summarise(mean = mean(measure), sd = sd(measure),
            df = n() - 1, err = qt(0.975, df)*sd/sqrt(df + 1), 
            lower = mean - err, upper = mean + err) %>% 
  kable()

Species	Length	mean	sd	df	err	lower	upper
setosa	Petal.Length	1.462	0.1736640	49	0.0493548	1.412645	1.511355
setosa	Sepal.Length	5.006	0.3524897	49	0.1001765	4.905824	5.106177
versicolor	Petal.Length	4.260	0.4699110	49	0.1335472	4.126453	4.393547
versicolor	Sepal.Length	5.936	0.5161711	49	0.1466942	5.789306	6.082694
virginica	Petal.Length	5.552	0.5518947	49	0.1568467	5.395153	5.708847
virginica	Sepal.Length	6.588	0.6358796	49	0.1807150	6.407285	6.768715

# Width
tidy_width <- iris %>% 
              gather(Width, measure, c(2,4)) %>% 
              select(-c(1,2))

tidy_width %>% 
  group_by(Species, Width) %>% 
  summarise(mean = mean(measure), sd = sd(measure), 
            df = n() - 1, err = qt(0.975, df)*sd/sqrt(df + 1), 
            lower = mean - err, upper = mean + err) %>% 
  kable()

Species	Width	mean	sd	df	err	lower	upper
setosa	Petal.Width	0.246	0.1053856	49	0.0299503	0.2160497	0.2759503
setosa	Sepal.Width	3.428	0.3790644	49	0.1077289	3.3202711	3.5357289
versicolor	Petal.Width	1.326	0.1977527	49	0.0562007	1.2697993	1.3822007
versicolor	Sepal.Width	2.770	0.3137983	49	0.0891805	2.6808195	2.8591805
virginica	Petal.Width	2.026	0.2746501	49	0.0780547	1.9479453	2.1040547
virginica	Sepal.Width	2.974	0.3224966	49	0.0916525	2.8823475	3.0656525

for more information about these packages and tidy data principle, plz refer to the following paper

Tidy Data, Hadley Wickham, Journal of statistical software, 2014

Statistical Inference - Practice

Bis335

Central Limit Theorem

Point estimation

Point estimation - More examples

Interval estimation

Problem