values <- rnorm(n = 30,mean = 9900,sd = 120) #actual expected mean: 9900 (<10000)
xbar <- mean(values) #sample mean
mu0 <- 10000 #hypothesized value
sigma <- 120 #known standard deviation
n <- 30 #sample size
z <- (xbar - mu0)/(sigma/sqrt(n))
z## [1] -4.511959alpha <- 0.05 #significance level
z.alpha <- qnorm(1 - alpha) #critical value
- z.alpha #critical value for lower tail test## [1] -1.644854xbar + (z.alpha*(sigma/sqrt(n))) #confidence interval of the sample mean## [1] 9937.185Because \(z < z_{\alpha}\), we can reject the null hypothesis
library(BSDA)## Loading required package: lattice## 
## Attaching package: 'BSDA'## The following object is masked from 'package:datasets':
## 
##     Orangevalues <- rnorm(n = 30,mean = 9900,sd = 120) #actual expected mean: 9900 (<10000)
mu0 <- 10000 #hypothesized value
sigma <- 120 #known standard deviation
n <- 30 #sample size
z.test(values, alternative = "less", mu = mu0, sigma.x = sigma, conf.level = 0.95)## 
##  One-sample z-Test
## 
## data:  values
## z = -4.4591, p-value = 4.116e-06
## alternative hypothesis: true mean is less than 10000
## 95 percent confidence interval:
##        NA 9938.344
## sample estimates:
## mean of x 
##  9902.307The followings are daily energy intakes in kJ for 11 women.
| x | 
|---|
| 5260 | 
| 5470 | 
| 5640 | 
| 6180 | 
| 6390 | 
| 6515 | 
| 6805 | 
| 7515 | 
| 7515 | 
| 8230 | 
| 8770 | 
We wish to investigate whether the women’s energy intake deviates systematically from a recommended value of 7725 kJ.
Hypothesis: \(H0: \mu=7725 kJ, H_{a}: \mu \neq 7725\)
daily.intake <- c(5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770)
test <- t.test(daily.intake, mu = 7725, alternative = "two.sided")
test## 
##  One Sample t-test
## 
## data:  daily.intake
## t = -2.8208, df = 10, p-value = 0.01814
## alternative hypothesis: true mean is not equal to 7725
## 95 percent confidence interval:
##  5986.348 7520.925
## sample estimates:
## mean of x 
##  6753.636sx <- sd(daily.intake) #standard deviation
n <- length(daily.intake)
se <- sx/sqrt(n) #standard error
a <- 0.05
df <- length(daily.intake) - 1
t.crit <- qt(a/2,df) # left a/2 tailWe would like to compare daily energy expenditures between lean and obese women.
Data: “energy” data from package “ISwR”
Hypothesis: \(H0: \mu_{1} = \mu_{2}, H_{a}: \mu_{1} \neq \mu_{2}\) (1: lean, 2: obese)
#Daily energy expenditure data
library(ISwR)
attach(energy)
#Data split
sample.lean <- energy$expend[which(energy$stature == "lean")]
sample.obese <- energy$expend[which(energy$stature == "obese")]
t.test(sample.lean,sample.obese) #t-test## 
##  Welch Two Sample t-test
## 
## data:  sample.lean and sample.obese
## t = -3.8555, df = 15.919, p-value = 0.001411
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.459167 -1.004081
## sample estimates:
## mean of x mean of y 
##  8.066154 10.297778detach(energy)Since p < 0.05, We can reject null hypothesis with significant level \(\alpha = 0.05\)
Graphical display of data (boxplot) Function: boxplot()
#boxplot about data from each 2 groups
library(ISwR)
attach(energy)
boxplot(energy$expend~energy$stature,col = c("cyan","magenta"), ylab = "Energy Expenditure")
mean.lean <- mean(sample.lean)
mean.obese <- mean(sample.obese)
abline(h = mean.lean,col = "cyan")
abline(h = mean.obese,col = "magenta")detach(energy)Two sample t-test example Cystic fibrosis lung function data Data: “cystfibr” data from package “ISwR” We will check that is there a significant difference in lung function between male and female patients
#cyctic fibrosis dataset
library(ISwR)
attach(cystfibr)## The following object is masked from package:ISwR:
## 
##     tlc#switch integer variable to factor
sex <- factor(cystfibr$sex,levels = 0:1,labels = c("male","female"))
sample.male <- cystfibr$pemax[which(sex == "male")]
sample.female <- cystfibr$pemax[which(sex == "female")]
t.test(sample.male,sample.female)## 
##  Welch Two Sample t-test
## 
## data:  sample.male and sample.female
## t = 1.5353, df = 21.584, p-value = 0.1393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.710638 44.801547
## sample estimates:
## mean of x mean of y 
## 117.50000  98.45455detach(cystfibr)Suppose a sample of 20 students were given a diagnostic test before studying a particular module and then again after completing the module. We want to find out if our teaching leads to improvements in students’ knowledge. Hypothesis \(H0: \mu_{1} \geq \mu_{2}, H_{a}: \mu_{1} < \mu_{2}\), (1: before, 2:after)
#Data preparation
pre <- c(18,21,16,22,19,24,17,21,23,18,14,16,16,19,18,20,12,22,15,17)
post <- c(22,25,17,24,16,29,20,23,19,20,15,15,18,26,18,24,18,25,19,16)
#One-tailed test
pt <- t.test(pre,post,paired = TRUE,alternative = "less")
pt## 
##  Paired t-test
## 
## data:  pre and post
## t = -3.2313, df = 19, p-value = 0.002197
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##        -Inf -0.9529882
## sample estimates:
## mean of the differences 
##                   -2.05#Graphical interpretation with distribution
t <- pt$statistic # calculated t-statistic
df <- pt$parameter #degree of freedom
tc <- qt(0.05,df) #critical t-value (p=0.05)
x <- seq(-4,4,0.05)
plot(x,dt(x,df),type = "l") #t-distribution curve with degree of freedom
xt <- seq(-4,tc,0.05);
yt <- dt(xt,df)
#make polygon which of vertices are (-4,0),(xt,yt),(0,tc)
polygon(c(-4,xt,tc),c(0,yt,0),col = "skyblue") #Polygon represents rejection region
abline(v = t); 
text(t,0.15,paste("t=",round(t,3)))Hypothesis \(H0: \sigma_{1}^2 = \sigma_{2}^2, H_{a}: \sigma_{1}^2 \neq \sigma_{2}^2\), (1: before, 2:after)
#Simulation data tutorial from R manual
x <- rnorm(50, mean = 0, sd = 2) 
y <- rnorm(30, mean = 1, sd = 1) 
var.test(x, y) # Do x and y have the same variance?## 
##  F test to compare two variances
## 
## data:  x and y
## F = 3.0165, num df = 49, denom df = 29, p-value = 0.00213
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.515579 5.675365
## sample estimates:
## ratio of variances 
##           3.016539Dataset “InsectSprays” represents the counts of insects in agricultural experimental units treated with 6 different insecticides (A to F). We want to test each insect sprays have equal effectiveness Hypothesis: \(H_{0}:\mu_{A}=\mu_{B}=..=\mu_{F}, H_{a}:\) one of mean counts are not equal to the others
#ANOVA
#1. oneway.test
attach(InsectSprays)
oneway.test(count ~ spray)## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  count and spray
## F = 36.065, num df = 5.000, denom df = 30.043, p-value = 7.999e-12#2 ANOVA using aov
aov.out <- aov(count ~ spray)
summary(aov.out)##             Df Sum Sq Mean Sq F value Pr(>F)    
## spray        5   2669   533.8    34.7 <2e-16 ***
## Residuals   66   1015    15.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#Mean plot
library(gplots)## 
## Attaching package: 'gplots'## The following object is masked from 'package:stats':
## 
##     lowessplotmeans(count~spray, main = "Means Plot with 95% CI")#Mean display by factors
plot.design(count~spray)print(model.tables(aov.out,"means"),digits = 3)## Tables of means
## Grand mean
##     
## 9.5 
## 
##  spray 
## spray
##     A     B     C     D     E     F 
## 14.50 15.33  2.08  4.92  3.50 16.67detach(InsectSprays)Consider the study in which standing and supine systolic blood pressures were compared. This study was performed on 12 subjects. Their blood pressures were measured in both positions.
| subject | standing | supine | 
|---|---|---|
| 1 | 132 | 136 | 
| 2 | 146 | 145 | 
| 3 | 135 | 140 | 
| 4 | 141 | 147 | 
| 5 | 139 | 142 | 
| 6 | 162 | 160 | 
| 7 | 128 | 137 | 
| 8 | 137 | 136 | 
| 9 | 145 | 149 | 
| 10 | 151 | 158 | 
| 11 | 131 | 120 | 
| 12 | 143 | 150 | 
Suggested null and alternative hypotheses could be:
\(H_{0}\): There is no difference between the mean blood pressures in two samples.
\(H_{1}\): There is a difference between the mean blood pressures in two samples.
Perform a paired t-test on the standing and supine blood pressure data.
Plot t-distribution using given condition in problem 1, and perform graphical checks for this paired t-test (including rejection region and t-statistic).
#1-a
Standing <- c(132,146,135,141,139,162,128,137,145,151,131,143)
Supine <- c(136,145,140,147,142,160,137,136,149,158,120,150)
pt <- t.test(Standing,Supine,paired = TRUE,alternative = "two.sided")
#1-b
#t statistic value
t <- pt$statistic
#degree of freedom
dof <- pt$parameter
#critical value
tc <- qt(0.025,dof)
x <- seq(-4,4,0.025)
xtn <- seq(-4,-abs(tc),0.025)
xtp <- seq(abs(tc),4,0.025)
ytn <- dt(xtn,dof)
ytp <- dt(xtp,dof)
#t graph between -4 : 4
plot(x,dt(x,dof),type = "l")
#reject region
polygon(c(-4,xtn,tc),c(0,ytn,0),col = 'skyblue')
polygon(c(xtp,4,tc),c(0,ytp,0),col = 'skyblue')
#current location of statistic value
abline(v = t)
text(t,0.15,paste("t=",round(t,3)))Using ‘iris’ dataset, answer the following questions.
#2-a
#load the iris data
data("iris")
#result of analysis of variance 
summary(aov(iris$Sepal.Length~iris$Species))##               Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species   2  63.21  31.606   119.3 <2e-16 ***
## Residuals    147  38.96   0.265                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1summary(aov(iris$Sepal.Width~iris$Species))##               Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species   2  11.35   5.672   49.16 <2e-16 ***
## Residuals    147  16.96   0.115                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1summary(aov(iris$Petal.Length~iris$Species))##               Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species   2  437.1  218.55    1180 <2e-16 ***
## Residuals    147   27.2    0.19                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1summary(aov(iris$Petal.Width~iris$Species))##               Df Sum Sq Mean Sq F value Pr(>F)    
## iris$Species   2  80.41   40.21     960 <2e-16 ***
## Residuals    147   6.16    0.04                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#2-b
# load gplot packages
library(gplots)
# plotmean functinon
suppressWarnings(plotmeans(iris$Petal.Length~iris$Species))#2-c
#Petal length data of each species
setosa <- iris$Petal.Length[which(iris$Species == "setosa")]
versicolor <- iris$Petal.Length[which(iris$Species == "versicolor")]
virginica <- iris$Petal.Length[which(iris$Species == "virginica")]
#two sided t test between setosa and versicolor
pt1 <- t.test(setosa,versicolor,alternative = "two.sided")
pt1## 
##  Welch Two Sample t-test
## 
## data:  setosa and versicolor
## t = -39.493, df = 62.14, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.939618 -2.656382
## sample estimates:
## mean of x mean of y 
##     1.462     4.260#two sided t test between virginica and versicolor
pt2 <- t.test(virginica,versicolor,alternative = "two.sided")
pt2## 
##  Welch Two Sample t-test
## 
## data:  virginica and versicolor
## t = 12.604, df = 95.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.08851 1.49549
## sample estimates:
## mean of x mean of y 
##     5.552     4.260#two sided t test between setosa and virginica
pt3 <- t.test(setosa,virginica,alternative = "two.sided")
pt3## 
##  Welch Two Sample t-test
## 
## data:  setosa and virginica
## t = -49.986, df = 58.609, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.253749 -3.926251
## sample estimates:
## mean of x mean of y 
##     1.462     5.552