values <- rnorm(n = 30,mean = 9900,sd = 120) #actual expected mean: 9900 (<10000)
xbar <- mean(values) #sample mean
mu0 <- 10000 #hypothesized value
sigma <- 120 #known standard deviation
n <- 30 #sample size
z <- (xbar - mu0)/(sigma/sqrt(n))
z
## [1] -4.511959
alpha <- 0.05 #significance level
z.alpha <- qnorm(1 - alpha) #critical value
- z.alpha #critical value for lower tail test
## [1] -1.644854
xbar + (z.alpha*(sigma/sqrt(n))) #confidence interval of the sample mean
## [1] 9937.185
Because \(z < z_{\alpha}\), we can reject the null hypothesis
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
values <- rnorm(n = 30,mean = 9900,sd = 120) #actual expected mean: 9900 (<10000)
mu0 <- 10000 #hypothesized value
sigma <- 120 #known standard deviation
n <- 30 #sample size
z.test(values, alternative = "less", mu = mu0, sigma.x = sigma, conf.level = 0.95)
##
## One-sample z-Test
##
## data: values
## z = -4.4591, p-value = 4.116e-06
## alternative hypothesis: true mean is less than 10000
## 95 percent confidence interval:
## NA 9938.344
## sample estimates:
## mean of x
## 9902.307
The followings are daily energy intakes in kJ for 11 women.
x |
---|
5260 |
5470 |
5640 |
6180 |
6390 |
6515 |
6805 |
7515 |
7515 |
8230 |
8770 |
We wish to investigate whether the women’s energy intake deviates systematically from a recommended value of 7725 kJ.
Hypothesis: \(H0: \mu=7725 kJ, H_{a}: \mu \neq 7725\)
daily.intake <- c(5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770)
test <- t.test(daily.intake, mu = 7725, alternative = "two.sided")
test
##
## One Sample t-test
##
## data: daily.intake
## t = -2.8208, df = 10, p-value = 0.01814
## alternative hypothesis: true mean is not equal to 7725
## 95 percent confidence interval:
## 5986.348 7520.925
## sample estimates:
## mean of x
## 6753.636
sx <- sd(daily.intake) #standard deviation
n <- length(daily.intake)
se <- sx/sqrt(n) #standard error
a <- 0.05
df <- length(daily.intake) - 1
t.crit <- qt(a/2,df) # left a/2 tail
We would like to compare daily energy expenditures between lean and obese women.
Data: “energy” data from package “ISwR”
Hypothesis: \(H0: \mu_{1} = \mu_{2}, H_{a}: \mu_{1} \neq \mu_{2}\) (1: lean, 2: obese)
#Daily energy expenditure data
library(ISwR)
attach(energy)
#Data split
sample.lean <- energy$expend[which(energy$stature == "lean")]
sample.obese <- energy$expend[which(energy$stature == "obese")]
t.test(sample.lean,sample.obese) #t-test
##
## Welch Two Sample t-test
##
## data: sample.lean and sample.obese
## t = -3.8555, df = 15.919, p-value = 0.001411
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.459167 -1.004081
## sample estimates:
## mean of x mean of y
## 8.066154 10.297778
detach(energy)
Since p < 0.05, We can reject null hypothesis with significant level \(\alpha = 0.05\)
Graphical display of data (boxplot) Function: boxplot()
#boxplot about data from each 2 groups
library(ISwR)
attach(energy)
boxplot(energy$expend~energy$stature,col = c("cyan","magenta"), ylab = "Energy Expenditure")
mean.lean <- mean(sample.lean)
mean.obese <- mean(sample.obese)
abline(h = mean.lean,col = "cyan")
abline(h = mean.obese,col = "magenta")
detach(energy)
Two sample t-test example Cystic fibrosis lung function data Data: “cystfibr” data from package “ISwR” We will check that is there a significant difference in lung function between male and female patients
#cyctic fibrosis dataset
library(ISwR)
attach(cystfibr)
## The following object is masked from package:ISwR:
##
## tlc
#switch integer variable to factor
sex <- factor(cystfibr$sex,levels = 0:1,labels = c("male","female"))
sample.male <- cystfibr$pemax[which(sex == "male")]
sample.female <- cystfibr$pemax[which(sex == "female")]
t.test(sample.male,sample.female)
##
## Welch Two Sample t-test
##
## data: sample.male and sample.female
## t = 1.5353, df = 21.584, p-value = 0.1393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.710638 44.801547
## sample estimates:
## mean of x mean of y
## 117.50000 98.45455
detach(cystfibr)
Suppose a sample of 20 students were given a diagnostic test before studying a particular module and then again after completing the module. We want to find out if our teaching leads to improvements in students’ knowledge. Hypothesis \(H0: \mu_{1} \geq \mu_{2}, H_{a}: \mu_{1} < \mu_{2}\), (1: before, 2:after)
#Data preparation
pre <- c(18,21,16,22,19,24,17,21,23,18,14,16,16,19,18,20,12,22,15,17)
post <- c(22,25,17,24,16,29,20,23,19,20,15,15,18,26,18,24,18,25,19,16)
#One-tailed test
pt <- t.test(pre,post,paired = TRUE,alternative = "less")
pt
##
## Paired t-test
##
## data: pre and post
## t = -3.2313, df = 19, p-value = 0.002197
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.9529882
## sample estimates:
## mean of the differences
## -2.05
#Graphical interpretation with distribution
t <- pt$statistic # calculated t-statistic
df <- pt$parameter #degree of freedom
tc <- qt(0.05,df) #critical t-value (p=0.05)
x <- seq(-4,4,0.05)
plot(x,dt(x,df),type = "l") #t-distribution curve with degree of freedom
xt <- seq(-4,tc,0.05);
yt <- dt(xt,df)
#make polygon which of vertices are (-4,0),(xt,yt),(0,tc)
polygon(c(-4,xt,tc),c(0,yt,0),col = "skyblue") #Polygon represents rejection region
abline(v = t);
text(t,0.15,paste("t=",round(t,3)))
Hypothesis \(H0: \sigma_{1}^2 = \sigma_{2}^2, H_{a}: \sigma_{1}^2 \neq \sigma_{2}^2\), (1: before, 2:after)
#Simulation data tutorial from R manual
x <- rnorm(50, mean = 0, sd = 2)
y <- rnorm(30, mean = 1, sd = 1)
var.test(x, y) # Do x and y have the same variance?
##
## F test to compare two variances
##
## data: x and y
## F = 3.0165, num df = 49, denom df = 29, p-value = 0.00213
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.515579 5.675365
## sample estimates:
## ratio of variances
## 3.016539
Dataset “InsectSprays” represents the counts of insects in agricultural experimental units treated with 6 different insecticides (A to F). We want to test each insect sprays have equal effectiveness Hypothesis: \(H_{0}:\mu_{A}=\mu_{B}=..=\mu_{F}, H_{a}:\) one of mean counts are not equal to the others
#ANOVA
#1. oneway.test
attach(InsectSprays)
oneway.test(count ~ spray)
##
## One-way analysis of means (not assuming equal variances)
##
## data: count and spray
## F = 36.065, num df = 5.000, denom df = 30.043, p-value = 7.999e-12
#2 ANOVA using aov
aov.out <- aov(count ~ spray)
summary(aov.out)
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 2669 533.8 34.7 <2e-16 ***
## Residuals 66 1015 15.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Mean plot
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
plotmeans(count~spray, main = "Means Plot with 95% CI")
#Mean display by factors
plot.design(count~spray)
print(model.tables(aov.out,"means"),digits = 3)
## Tables of means
## Grand mean
##
## 9.5
##
## spray
## spray
## A B C D E F
## 14.50 15.33 2.08 4.92 3.50 16.67
detach(InsectSprays)
Consider the study in which standing and supine systolic blood pressures were compared. This study was performed on 12 subjects. Their blood pressures were measured in both positions.
subject | standing | supine |
---|---|---|
1 | 132 | 136 |
2 | 146 | 145 |
3 | 135 | 140 |
4 | 141 | 147 |
5 | 139 | 142 |
6 | 162 | 160 |
7 | 128 | 137 |
8 | 137 | 136 |
9 | 145 | 149 |
10 | 151 | 158 |
11 | 131 | 120 |
12 | 143 | 150 |
Suggested null and alternative hypotheses could be:
\(H_{0}\): There is no difference between the mean blood pressures in two samples.
\(H_{1}\): There is a difference between the mean blood pressures in two samples.
Perform a paired t-test on the standing and supine blood pressure data.
Plot t-distribution using given condition in problem 1, and perform graphical checks for this paired t-test (including rejection region and t-statistic).
#1-a
Standing <- c(132,146,135,141,139,162,128,137,145,151,131,143)
Supine <- c(136,145,140,147,142,160,137,136,149,158,120,150)
pt <- t.test(Standing,Supine,paired = TRUE,alternative = "two.sided")
#1-b
#t statistic value
t <- pt$statistic
#degree of freedom
dof <- pt$parameter
#critical value
tc <- qt(0.025,dof)
x <- seq(-4,4,0.025)
xtn <- seq(-4,-abs(tc),0.025)
xtp <- seq(abs(tc),4,0.025)
ytn <- dt(xtn,dof)
ytp <- dt(xtp,dof)
#t graph between -4 : 4
plot(x,dt(x,dof),type = "l")
#reject region
polygon(c(-4,xtn,tc),c(0,ytn,0),col = 'skyblue')
polygon(c(xtp,4,tc),c(0,ytp,0),col = 'skyblue')
#current location of statistic value
abline(v = t)
text(t,0.15,paste("t=",round(t,3)))
Using ‘iris’ dataset, answer the following questions.
#2-a
#load the iris data
data("iris")
#result of analysis of variance
summary(aov(iris$Sepal.Length~iris$Species))
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 63.21 31.606 119.3 <2e-16 ***
## Residuals 147 38.96 0.265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(iris$Sepal.Width~iris$Species))
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 11.35 5.672 49.16 <2e-16 ***
## Residuals 147 16.96 0.115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(iris$Petal.Length~iris$Species))
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 437.1 218.55 1180 <2e-16 ***
## Residuals 147 27.2 0.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(iris$Petal.Width~iris$Species))
## Df Sum Sq Mean Sq F value Pr(>F)
## iris$Species 2 80.41 40.21 960 <2e-16 ***
## Residuals 147 6.16 0.04
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#2-b
# load gplot packages
library(gplots)
# plotmean functinon
suppressWarnings(plotmeans(iris$Petal.Length~iris$Species))
#2-c
#Petal length data of each species
setosa <- iris$Petal.Length[which(iris$Species == "setosa")]
versicolor <- iris$Petal.Length[which(iris$Species == "versicolor")]
virginica <- iris$Petal.Length[which(iris$Species == "virginica")]
#two sided t test between setosa and versicolor
pt1 <- t.test(setosa,versicolor,alternative = "two.sided")
pt1
##
## Welch Two Sample t-test
##
## data: setosa and versicolor
## t = -39.493, df = 62.14, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.939618 -2.656382
## sample estimates:
## mean of x mean of y
## 1.462 4.260
#two sided t test between virginica and versicolor
pt2 <- t.test(virginica,versicolor,alternative = "two.sided")
pt2
##
## Welch Two Sample t-test
##
## data: virginica and versicolor
## t = 12.604, df = 95.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.08851 1.49549
## sample estimates:
## mean of x mean of y
## 5.552 4.260
#two sided t test between setosa and virginica
pt3 <- t.test(setosa,virginica,alternative = "two.sided")
pt3
##
## Welch Two Sample t-test
##
## data: setosa and virginica
## t = -49.986, df = 58.609, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.253749 -3.926251
## sample estimates:
## mean of x mean of y
## 1.462 5.552