Mean of the distribution follows normal distribution when the sample size approaches to infinity. - Discrete Case
# for Discrete distribution
par(mfrow = c(2,1)) #split plot window (row:2, column:1)
# Binomial distribution
n <- 3
p <- 0.5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
sample1.mean <- c(sample1.mean,mean(rbinom(2,n,p)))
sample2.mean <- c(sample2.mean,mean(rbinom(50,n,p)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of binomial sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of binomial sample n=50"),xlab = "Mean")
#Poisson distribution
lambda <- 5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
sample1.mean <- c(sample1.mean,mean(rpois(2,lambda)))
sample2.mean <- c(sample2.mean,mean(rpois(100,lambda)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of Poisson sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of Poisson sample n=100"),xlab = "Mean")
# for Discrete distribution
par(mfrow = c(2,1)) #split plot window (row:2, column:1)
# Binomial distribution
n <- 3
p <- 0.5;
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
sample1.mean <- c(sample1.mean,mean(rbinom(2,n,p)))
sample2.mean <- c(sample2.mean,mean(rbinom(50,n,p)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of binomial sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of binomial sample n=50"),xlab = "Mean")
#Poisson distribution
lambda <- 5
sample1.mean <- c()
sample2.mean <- c()
for (i in 1:10000) {
sample1.mean <- c(sample1.mean,mean(rpois(2,lambda)))
sample2.mean <- c(sample2.mean,mean(rpois(100,lambda)))
}
hist(sample1.mean,breaks = 100,main = paste("Mean of Poisson sample n=2"),xlab = "Mean")
hist(sample2.mean,breaks = 100,main = paste("Mean of Poisson sample n=100"),xlab = "Mean")
Refer to slide and lecture notes to point estimator derivation of each distribution (Method of moment, maximum likelihood estimation)
# Bernoulli distribution
theta0 <- 0.6
N <- 1000
x <- rbinom(N,1,theta0)
N1 <- sum(x)
N0 <- N - N1
theta <- N1/N #Estimated theta
theta
## [1] 0.592
# Normal distribution
mu0 <- 1
sigma0 <- 2
x <- rnorm(1000, mu0, sigma0)
(xbar <- mean(x)) #Estimated mean
## [1] 1.003554
(s2 <- sd(x)) #Estimated standard deviation
## [1] 1.981359
# using R package
# install.packages("fitdistrplus")
library(fitdistrplus)
## Loading required package: MASS
## Loading required package: survival
## Loading required package: npsurv
## Loading required package: lsei
test <- rnorm(n = 1000, mean = 0, sd = 1) #Samples from normal distribution
mmedist(test, "norm")$estimate #Method of moments estimation: normal distribution
## mean sd
## 0.00585879 1.00641990
mledist(test, "norm", optim.method = "Nelder-Mead")$estimate #Maximum likelihood estimation: normal distribution
## mean sd
## 0.00585879 1.00641990
####Check the below results####
test2 <- rpois(n = 5, lambda = 0.5) #Samples from poisson distribution
mmedist(test2, "pois")$estimate
## lambda
## 0.8
mledist(test2, "pois")$estimate
## lambda
## 0.8
test3 <- rexp(n = 100, rate = 1) #Samples from exponential distribution
mmedist(test3, "exp")$estimate
## rate
## 1.050387
mledist(test3, "exp")$estimate
## rate
## 1.050387
mledist() provides multiple optimization methods: Nelder-Mead(default), BFGS, CG, SANN etc. We can estimate any distributions existing in R using fitdistrplus package See the results from estimation of various distributions (Binomial, chi-square, beta, gamma …)
The \(CO_2\) uptake of six plants from Quebec and six plants from Mississippi was measured at several levels of ambient CO2 concentration. Half the plants of each type were chilled overnight before the experiment was conducted. This data is available at “\(CO_2\)” dataset in R.
library(fitdistrplus)
# a
data("CO2")
obj <- CO2[which(CO2$Treatment == "nonchilled"),]
mmedist(obj$uptake,"norm")$estimate
## mean sd
## 30.642857 9.588762
# b
mledist(obj$uptake,"norm")$estimate
## mean sd
## 30.642857 9.588762
Interval estimation of mean with known variance
After we found a point estimate of the population mean, we would need a way to quantify its accuracy. Here, we discuss the case where the population variance \(\sigma^2\) is assumed known.
Let us denote the \(100(1 - \alpha/2)\) percentile of the standard normal distribution as \(z_{\alpha/2}\). For random sample of sufficiently large size, the end points of the interval estimate at \(1-\alpha\) confidence level is given as follows:
\(\bar{x} \pm z_{\frac{\alpha}{2}}(\frac{\sigma}{\sqrt n})\)
Interval estimation of mean with unknown variance
After we found a point estimate of the population mean, we would need a way to quantify its accuracy. Here, we discuss the case where the population variance is not assumed.
Let us denote the \(100(1 - \alpha/2)\) percentile of the Student t distribution with \(n-1\) degrees of freedom as \(t_{\alpha/2}\). For random samples of sufficiently large size, and with standard deviation \(s\), the end points of the interval estimate at \((1-\alpha)\) confidence level is given as follows:
\(\bar{x} \pm t_{\frac{\alpha}{2}}(\frac{\sigma}{\sqrt n})\)
n.m <- 6
n.w <- 4
mu.m <- 117.5 # mean sys.bp of man
mu.w <- 126.8 # mean sys.bp of woman
s.m <- 9.7 # sample sd of man
s.w <- 12.0 # sample sd of woman
alpha <- 0.975
error.m <- qt(alpha, df = n.m - 1)*(s.m/sqrt(n.m))
ci.m <- mu.m + c(-error.m,error.m)
ci.m
## [1] 107.3205 127.6795
error.w <- qt(alpha, df = n.w - 1)*(s.w/sqrt(n.w))
ci.w <- mu.w + c(-error.w,error.w)
ci.w
## [1] 107.7053 145.8947
mu <- mean(mu.m, mu.w)
range <- seq(-30 + mu, 30 + mu, 0.1)
plot(range, dnorm(range, mu.m, s.m), type = 'l', xlab = 'x', ylab = 'probability density',
main = 'Interval estimation for systolic blood pressure between man and woman', cex = 1)
lines(range, dnorm(range, mu.w, s.w), lty = 2, col = "blue", type = 'l')
abline(v = ci.m[1], col = "red")
abline(v = ci.m[2], col = "red")
abline(v = ci.w[1], col = "blue")
abline(v = ci.w[2], col = "blue")
legend(192, 0.6, legend = c("sys.bp.m", "sysbp.w", "CI for man", "CI for woman"),
lty = c(1:2,1:2), col = c("blue", "red", "blue", "red"))
# pooled estimate of the common sd
sd.p <- sqrt((((n.m - 1)*s.m^2) + ((n.w - 1)*s.w^2)) / (n.m + n.w - 2))
t <- qt(0.975, df = n.m + n.w - 2)
error <- t*sd.p*sqrt(1/n.m + 1/n.w)
ci <- (mu.m - mu.w) + c(-error, error)
ci
## [1] -25.109606 6.509606
range <- seq(-30 + mu, 30 + mu, 0.1)
plot(range, dnorm(range, mu.m, s.m), type = 'l', xlab = 'x', ylab = 'probability density',
main = 'Interval estimation for systolic blood pressure between man and woman', cex = 1)
lines(range, dnorm(range, mu.w, s.w), lty = 2, col = "blue", type = 'l')
abline(v = mu.m + ci[1], col = "red")
abline(v = mu.m + ci[2], col = "red")
abline(v = mu.w + ci[1], col = "blue")
abline(v = mu.w + ci[2], col = "blue")
legend(192, 0.6, legend = c("sys.bp.m", "sysbp.w", "CI for man", "CI for woman"),
lty = c(1:2,1:2), col = c("blue", "red", "blue", "red"))
The R Program provides datasets, including ‘iris’ and ‘iris3’. You can easily get the dataset with data(iris). - ‘iris’ is a data frame with 150 cases (rows) and 5 variables (columns) named Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species.
There are 3 species(setosa, versicolor, virginica) in dataset. You can check the names of species in the iris dataset.
Assume that Sepal.Length, Sepal.Width, Petal.Length and Petal.Width of each species are following normal distribution.
# Direct calculation
# a - mean and sd estimation
data(iris)
setosa.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "setosa")])
setosa.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "setosa")])
setosa.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "setosa")])
setosa.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "setosa")])
setosa.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "setosa")])
setosa.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "setosa")])
setosa.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "setosa")])
setosa.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "setosa")])
#a-versicolor
versicolor.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "versicolor")])
versicolor.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "versicolor")])
versicolor.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "versicolor")])
versicolor.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "versicolor")])
versicolor.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "versicolor")])
versicolor.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "versicolor")])
versicolor.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "versicolor")])
versicolor.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "versicolor")])
#a-virginica
virginica.Avg.Sepal.lenght <- mean(iris$Sepal.Length[which(iris$Species == "virginica")])
virginica.Sd.Sepal.lenght <- sd(iris$Sepal.Length[which(iris$Species == "virginica")])
virginica.Avg.Sepal.width <- mean(iris$Sepal.Width[which(iris$Species == "virginica")])
virginica.Sd.Sepal.width <- sd(iris$Sepal.Width[which(iris$Species == "virginica")])
virginica.Avg.Petal.lenght <- mean(iris$Petal.Length[which(iris$Species == "virginica")])
virginica.Sd.Petal.lenght <- sd(iris$Petal.Length[which(iris$Species == "virginica")])
virginica.Avg.Petal.width <- mean(iris$Petal.Width[which(iris$Species == "virginica")])
virginica.Sd.Petal.width <- sd(iris$Petal.Width[which(iris$Species == "virginica")])
# b - ci estimation (2 sided 0.95 --> 0.975)
#setosa
setosa.length <- length(iris$Species[which(iris$Species == "setosa")])
setosa.error1 <- qt(0.975,setosa.length - 1)*setosa.Sd.Sepal.lenght/sqrt(setosa.length)
setosa.Sepal.lenght <- setosa.Avg.Sepal.lenght + c(-setosa.error1,setosa.error1)
setosa.error2 <- qt(0.975,setosa.length - 1)*setosa.Sd.Sepal.width/sqrt(setosa.length)
setosa.Sepal.width <- setosa.Avg.Sepal.width + c(-setosa.error2,setosa.error2)
setosa.error3 <- qt(0.975,setosa.length - 1)*setosa.Sd.Petal.lenght/sqrt(setosa.length)
setosa.Petal.lenght <- setosa.Avg.Petal.lenght + c(-setosa.error3,setosa.error3)
setosa.error4 <- qt(0.975,setosa.length - 1)*setosa.Sd.Petal.width/sqrt(setosa.length)
setosa.Petal.width <- setosa.Avg.Petal.width + c(-setosa.error4,setosa.error4)
#b-versicolor
versicolor.length <- length(iris$Species[which(iris$Species == "versicolor")])
versicolor.error1 <- qt(0.975, setosa.length - 1)*versicolor.Sd.Sepal.lenght/sqrt(versicolor.length)
versicolor.Sepal.lenght <- versicolor.Avg.Sepal.lenght + c(-versicolor.error1,versicolor.error1)
versicolor.error2 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Sepal.width/sqrt(versicolor.length)
versicolor.Sepal.width <- versicolor.Avg.Sepal.width + c(-versicolor.error2,versicolor.error2)
versicolor.error3 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Petal.lenght/sqrt(versicolor.length)
versicolor.Petal.lenght <- versicolor.Avg.Petal.lenght + c(-versicolor.error3,versicolor.error3)
versicolor.error4 <- qt(0.975,setosa.length - 1)*versicolor.Sd.Petal.width/sqrt(versicolor.length)
versicolor.Petal.width <- versicolor.Avg.Petal.width + c(-versicolor.error4,versicolor.error4)
#b-virginica
virginica.length <- length(iris$Species[which(iris$Species == "virginica")])
virginica.error1 <- qt(0.975,setosa.length - 1)*virginica.Sd.Sepal.lenght/sqrt(virginica.length)
virginica.Sepal.lenght <- virginica.Avg.Sepal.lenght + c(-virginica.error1,virginica.error1)
virginica.error2 <- qt(0.975,setosa.length - 1)*virginica.Sd.Sepal.width/sqrt(virginica.length)
virginica.Sepal.width <- virginica.Avg.Sepal.width + c(-virginica.error2,virginica.error2)
virginica.error3 <- qt(0.975,setosa.length - 1)*virginica.Sd.Petal.lenght/sqrt(virginica.length)
virginica.Petal.lenght <- virginica.Avg.Petal.lenght + c(-virginica.error3,virginica.error3)
virginica.error4 <- qt(0.975,setosa.length - 1)*virginica.Sd.Petal.width/sqrt(virginica.length)
virginica.Petal.width <- virginica.Avg.Petal.width + c(-virginica.error4,virginica.error4)
You can simplify this job with tidyr and dplyr package.
# load dataset
data(iris)
# inspect data set
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# estimate mean, sd, and ci
# install.packages("tidyr")
suppressMessages(library(tidyr)) # for gather or spread data frame to tidy data set
suppressMessages(library(dplyr)) # for grouping, selecting, summarizing tidy data set
library(knitr) # to get knit output
# Length
tidy_length <- iris %>%
gather(Length, measure, c(1,3)) %>%
select(-c(1,2))
tidy_length %>%
group_by(Species, Length) %>%
summarise(mean = mean(measure), sd = sd(measure),
df = n() - 1, err = qt(0.975, df)*sd/sqrt(df + 1),
lower = mean - err, upper = mean + err) %>%
kable()
Species | Length | mean | sd | df | err | lower | upper |
---|---|---|---|---|---|---|---|
setosa | Petal.Length | 1.462 | 0.1736640 | 49 | 0.0493548 | 1.412645 | 1.511355 |
setosa | Sepal.Length | 5.006 | 0.3524897 | 49 | 0.1001765 | 4.905824 | 5.106177 |
versicolor | Petal.Length | 4.260 | 0.4699110 | 49 | 0.1335472 | 4.126453 | 4.393547 |
versicolor | Sepal.Length | 5.936 | 0.5161711 | 49 | 0.1466942 | 5.789306 | 6.082694 |
virginica | Petal.Length | 5.552 | 0.5518947 | 49 | 0.1568467 | 5.395153 | 5.708847 |
virginica | Sepal.Length | 6.588 | 0.6358796 | 49 | 0.1807150 | 6.407285 | 6.768715 |
# Width
tidy_width <- iris %>%
gather(Width, measure, c(2,4)) %>%
select(-c(1,2))
tidy_width %>%
group_by(Species, Width) %>%
summarise(mean = mean(measure), sd = sd(measure),
df = n() - 1, err = qt(0.975, df)*sd/sqrt(df + 1),
lower = mean - err, upper = mean + err) %>%
kable()
Species | Width | mean | sd | df | err | lower | upper |
---|---|---|---|---|---|---|---|
setosa | Petal.Width | 0.246 | 0.1053856 | 49 | 0.0299503 | 0.2160497 | 0.2759503 |
setosa | Sepal.Width | 3.428 | 0.3790644 | 49 | 0.1077289 | 3.3202711 | 3.5357289 |
versicolor | Petal.Width | 1.326 | 0.1977527 | 49 | 0.0562007 | 1.2697993 | 1.3822007 |
versicolor | Sepal.Width | 2.770 | 0.3137983 | 49 | 0.0891805 | 2.6808195 | 2.8591805 |
virginica | Petal.Width | 2.026 | 0.2746501 | 49 | 0.0780547 | 1.9479453 | 2.1040547 |
virginica | Sepal.Width | 2.974 | 0.3224966 | 49 | 0.0916525 | 2.8823475 | 3.0656525 |
for more information about these packages and tidy data principle, plz refer to the following paper
Tidy Data, Hadley Wickham, Journal of statistical software, 2014