Test the hypothesis whether the students smoking habit is independent of their exercise
# Tutorial for Chi-Square Goodness of Fit Test
library(MASS) #Load the MASS package
tbl <- table(survey$Smoke,survey$Exer) # Make a contingency table
tbl # See the contingency table
##
## Freq None Some
## Heavy 7 1 3
## Never 87 18 84
## Occas 12 3 4
## Regul 9 1 7
chisq.test(tbl) # Pearson's Chi-squared test of goodness of fit
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 5.4885, df = 6, p-value = 0.4828
We cannot reject the null hypothesis that the smoking habit is independent of the exercise level of the students.
# Tutorial for Chi-Square Goodness of Fit Test example
fo <- c(35,10,15)
k <- 3; n <- sum(fo); fe <- rep(n/k,k)
sum((fo - fe)^2/fe) #calculated chi-square statistic
## [1] 17.5
df <- k - 1; alpha <- 0.05 # Degree of freedom and confidence level
qchisq(1 - alpha,df) #critical chi-square
## [1] 5.991465
tc <- c("Observed Frequency");
tr <- c("More important","Less important","About the Same")
mo <- matrix(fo,dimnames = list(tr,tc))
mo
## Observed Frequency
## More important 35
## Less important 10
## About the Same 15
chisq.test(mo)
##
## Chi-squared test for given probabilities
##
## data: mo
## X-squared = 17.5, df = 2, p-value = 0.0001585
#Tutorial for Chi-Square Independence Test
data <- matrix(c(11,30,36,23,8,31,25,36,13,28,28,31),nrow = 3,byrow = TRUE)
chisq.test(data)
##
## Pearson's Chi-squared test
##
## data: data
## X-squared = 6.3912, df = 6, p-value = 0.3808
#SIGN.test() function of "BSDA" package
# install.packages("BSDA")
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
#data input
data <- c(51.8,54.5,54.5,55.8,56.7,57.3, 59.1,59.5,60.4,61.2,61.8,64,64.9,65.5,70.2)
SIGN.test(data, md = 65) #one sample sign test
##
## One-sample Sign-Test
##
## data: data
## s = 2, p-value = 0.007385
## alternative hypothesis: true median is not equal to 65
## 95 percent confidence interval:
## 55.96035 63.60803
## sample estimates:
## median of x
## 59.5
##
## Achieved and Interpolated Confidence Intervals:
##
## Conf.Level L.E.pt U.E.pt
## Lower Achieved CI 0.8815 56.7000 61.800
## Interpolated CI 0.9500 55.9604 63.608
## Upper Achieved CI 0.9648 55.8000 64.000
#data input
x <- c(172,165,206,184,174,142,190,159,161,200)
y <- c(201,179,159,192,177,170,182,179,169,210)
SIGN.test(x - y,md = 0) #paired one-sample sign test
##
## One-sample Sign-Test
##
## data: x - y
## s = 2, p-value = 0.1094
## alternative hypothesis: true median is not equal to 0
## 95 percent confidence interval:
## -25.404444 4.431111
## sample estimates:
## median of x
## -9
##
## Achieved and Interpolated Confidence Intervals:
##
## Conf.Level L.E.pt U.E.pt
## Lower Achieved CI 0.8906 -20.0000 -3.0000
## Interpolated CI 0.9500 -25.4044 4.4311
## Upper Achieved CI 0.9785 -28.0000 8.0000
wilcox.test(x,y,paired = TRUE) #Wilcoxon signed rank test
## Warning in wilcox.test.default(x, y, paired = TRUE): cannot compute exact
## p-value with ties
##
## Wilcoxon signed rank test with continuity correction
##
## data: x and y
## V = 13, p-value = 0.1525
## alternative hypothesis: true location shift is not equal to 0
#cyctic fibrosis dataset
library(ISwR)
attach(cystfibr)
## The following object is masked from package:ISwR:
##
## tlc
#switch integer variable to factor
sex <- factor(cystfibr$sex,levels = 0:1,labels = c("male","female"))
sample.male <- cystfibr$pemax[which(sex == "male")]
sample.female <- cystfibr$pemax[which(sex == "female")]
wilcox.test(sample.male,sample.female)
## Warning in wilcox.test.default(sample.male, sample.female): cannot compute
## exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: sample.male and sample.female
## W = 100, p-value = 0.2158
## alternative hypothesis: true location shift is not equal to 0
detach(cystfibr)
since p = 0.2158, We cannot reject null hypothesis with significance level 0.05
Now we will analyze microarray dataset using R. Before we start, we will learn ExpressionSet class of R, which is common data structure of microarray experiments. When the microarray data is uploaded to NCBI GEO database, researchers should provide the data in unified format and common information (MIAME: Minimum Information about a Microarray Experiment) about their experiment.
Object “ExpressionSet” contains the expression data and some common information - experimentData: Brief information about researcher, research title, and published information - phenoData: Information about sample, like disease class, tissue, and gender etc. - For other features, you can refer google search or brief introduction of ExpressionSet class in Bioconductor consortium (link below). - Reference link: https://www.bioconductor.org/packages/release/bioc/vignettes/Biobase/inst/doc/ExpressionSetIntroduction.pdf
You can see the structure of the “ExpressionSet” class by using str() function.
suppressMessages(library(golubEsets))
data(Golub_Train)
str(Golub_Train@assayData$exprs)
## int [1:7129, 1:38] -214 -153 -58 88 -295 -558 199 -176 252 206 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:7129] "AFFX-BioB-5_at" "AFFX-BioB-M_at" "AFFX-BioB-3_at" "AFFX-BioC-5_at" ...
## ..$ : chr [1:38] "1" "2" "3" "4" ...
str(Golub_Train@phenoData@data)
## 'data.frame': 38 obs. of 11 variables:
## $ Samples : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ALL.AML : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
## $ BM.PB : Factor w/ 2 levels "BM","PB": 1 1 1 1 1 1 1 1 1 1 ...
## $ T.B.cell : Factor w/ 2 levels "B-cell","T-cell": 1 2 2 1 1 2 1 1 2 2 ...
## $ FAB : Factor w/ 4 levels "M1","M2","M4",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Date : Factor w/ 27 levels "","1/24/1984",..: 26 NA NA NA NA NA 10 NA NA 19 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 NA NA 2 1 1 2 2 ...
## $ pctBlasts: int NA NA NA NA NA NA NA NA NA NA ...
## $ Treatment: Factor w/ 2 levels "Failure","Success": NA NA NA NA NA NA NA NA NA NA ...
## $ PS : num 1 0.41 0.87 0.91 0.89 0.76 0.78 0.77 0.89 0.56 ...
## $ Source : Factor w/ 4 levels "CALGB","CCG",..: 3 3 3 3 3 3 3 3 3 3 ...
suppressMessages(library(golubEsets, warn.conflicts = FALSE, quietly = TRUE))
data("Golub_Merge") #Data preparation
expression.data <- exprs(Golub_Merge) #Extract expression data
# Extract phenotype data, pData() means data parsing
phenotype.data <- pData(phenoData(Golub_Merge))
#intersection of index with class ALL/AML and BM.PB
index.ALL.PB <- intersect(which(phenotype.data$ALL.AML == "ALL"),
which(phenotype.data$BM.PB == "PB"))
index.AML.PB <- intersect(which(phenotype.data$ALL.AML == "AML"),
which(phenotype.data$BM.PB == "PB"))
#p-value of wilcoxon rank sum test
p.value <- c()
for (i in 1:dim(expression.data)[1]) { #calculate p-values for each genes
ALL.PB <- expression.data[i,index.ALL.PB]
AML.PB <- expression.data[i,index.AML.PB]
p.value <- c(p.value,wilcox.test(ALL.PB,AML.PB)$p.value)
}
#significant microarray probes
deg.index <- which(p.value < 0.05)
length(deg.index)
## [1] 1061
#index of increasingly ordered p-value
ordered.index <- order(p.value)
#top 10 differentially expressed probes
top10 <- head(row.names(expression.data)[ordered.index],10)
top10
## [1] "AFFX-TrpnX-M_at" "AB000895_at" "AD000684_cds1_at"
## [4] "AF000562_at" "AF005887_at" "AF006084_at"
## [7] "AF015950_at" "D14678_at" "D26308_at"
## [10] "D28423_at"
# mapping gene symbol
# source("http://bioconductor.org/biocLite.R")
# biocLite("hu6800.db")
suppressPackageStartupMessages(library(hu6800.db))
maptbl <- mget(row.names(expression.data), hu6800SYMBOL)
tbl <- unlist(maptbl, use.names = TRUE)
print(tbl[names(tbl) %in% top10])
## AFFX-TrpnX-M_at AB000895_at AD000684_cds1_at AF000562_at
## NA "DCHS1" "LSR" "UPK2"
## AF005887_at AF006084_at AF015950_at D14678_at
## "ATF6" "ARPC1B" "TERT" "KIFC1"
## D26308_at D28423_at
## "BLVRB" "SRSF3"
#Kruskal-Wallis test
data(InsectSprays)
kruskal.test(InsectSprays$count ~ InsectSprays$spray)
##
## Kruskal-Wallis rank sum test
##
## data: InsectSprays$count by InsectSprays$spray
## Kruskal-Wallis chi-squared = 54.691, df = 5, p-value = 1.511e-10
#Kruskal-Wallis test
kruskal.test(iris$Petal.Length ~ iris$Species)
##
## Kruskal-Wallis rank sum test
##
## data: iris$Petal.Length by iris$Species
## Kruskal-Wallis chi-squared = 130.41, df = 2, p-value < 2.2e-16
kruskal.test(iris$Petal.Width ~ iris$Species)
##
## Kruskal-Wallis rank sum test
##
## data: iris$Petal.Width by iris$Species
## Kruskal-Wallis chi-squared = 131.19, df = 2, p-value < 2.2e-16
We can compare the results with results from ANOVA
Acute myeloid leukemia (AML) is one of the most common and deadly forms of hematopoietic malignancies. Stirewalt et al. made transcriptome expression data of AML from multiple tissues (hematopoietic cells, bone marrow, peripheral blood). This dataset is stored at NCBI GEO as “GDS3057”. Find the differentially expressed genes between leukemia and normal cells.
Using non-parametric method, make tables of top 20 differentially expressed genes in ‘bone marrow’ and ‘peripheral blood’ respectively. The table should contain microarray probe ID, corresponding gene symbol, and p-value of each testing. (Hint: You can get gene symbol by extracting featureData from exprs class data.)
Find top 20 differentially expressed genes in ‘bone marrow’ using parametric testing method.
Compare the result in (a) and (b). Are the ‘significant’ DEGs are conserved in non-parametric testing? State about the difference of parametric and non-parametric testing methods, their proper usage conditions, and possible problems of each methods.
Hint: You can use GEOquery package to download data as gds file format (refer R tutorial at 2nd week). Change the gds file format to exprs class data by using “GDS2eSet()” function from “Biobase” package like “GDS2Set(yourdata)”.
# source("http://bioconductor.org/biocLite.R")
# biocLite()
# biocLite("GEOquery")
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
# load the data
gds <- getGEO("GDS3057")
## File stored at:
## C:\Users\sypark\AppData\Local\Temp\RtmpcxN6fT/GDS3057.soft.gz
## Parsed with column specification:
## cols(
## .default = col_double(),
## ID_REF = col_character(),
## IDENTIFIER = col_character()
## )
## See spec(...) for full column specifications.
datas <- GDS2eSet(gds)
## File stored at:
## C:\Users\sypark\AppData\Local\Temp\RtmpcxN6fT/GPL96.annot.gz
expression.data <- exprs(datas)
phenotype.data <- pData(datas)
# index of leukemia and normal data of bone marrow and peripheral_blood
index_Bone_marrow_leukemia <- intersect(which(phenotype.data$cell.type == "bone marrow"),
which(phenotype.data$disease.state == "leukemia"))
index_Bone_marrow_normal <- intersect(which(phenotype.data$cell.type == "bone marrow"),
which(phenotype.data$disease.state == "normal"))
index_peripheral_blood_leukemia <- intersect(which(phenotype.data$cell.type == "peripheral blood"),
which(phenotype.data$disease.state == "leukemia"))
index_peripheral_blood_normal <- intersect(which(phenotype.data$cell.type == "peripheral blood"),
which(phenotype.data$disease.state == "normal"))
# p-value of wilcox test for bone marrow
Bone_marrow_p_value <- c()
for (i in 1:dim(expression.data)[1]) {
Bone_marrow_leukemia <- expression.data[i,index_Bone_marrow_leukemia]
Bone_marrow_normal <- expression.data[i,index_Bone_marrow_normal]
Bone_marrow_p_value <- c(Bone_marrow_p_value,
wilcox.test(Bone_marrow_leukemia,Bone_marrow_normal)$p.value)
}
# sort the data
ordered_index <- order(Bone_marrow_p_value)
Bone_marrow_index <- head(ordered_index,20)
Bone_marrow_p.value <- Bone_marrow_p_value[Bone_marrow_index]
Bone_marrow_microarray.probe.ID <- rownames(expression.data)[Bone_marrow_index]
Bone_marrow_gene.symbol <- datas@featureData@data$`Gene symbol`[Bone_marrow_index]
Bone_marrow_result <- data.frame(Bone_marrow_microarray.probe.ID,
Bone_marrow_gene.symbol,Bone_marrow_p.value)
# p-value of wilcox test for peripheral_blood
peripheral_blood_p_value <- c()
for (i in 1:dim(expression.data)[1]) {
peripheral_blood_leukemia <- expression.data[i,index_peripheral_blood_leukemia]
peripheral_blood_normal <- expression.data[i,index_peripheral_blood_normal]
peripheral_blood_p_value <- c(peripheral_blood_p_value,
wilcox.test(peripheral_blood_leukemia,
peripheral_blood_normal)$p.value)
}
#sort the data
ordered_index <- order(peripheral_blood_p_value)
peripheral_blood_index <- head(ordered_index,20)
peripheral_blood_p.value <- peripheral_blood_p_value[peripheral_blood_index]
peripheral_blood_microarray.probe.ID <- rownames(expression.data)[peripheral_blood_index]
peripheral_blood_gene.symbol <- datas@featureData@data$`Gene symbol`[peripheral_blood_index]
peripheral_blood_result <- data.frame(peripheral_blood_microarray.probe.ID,
peripheral_blood_gene.symbol,peripheral_blood_p.value)
#1-b
Bone_marrow_p_value <- c()
for (i in 1:dim(expression.data)[1]) {
Bone_marrow_leukemia <- expression.data[i,index_Bone_marrow_leukemia]
Bone_marrow_normal <- expression.data[i,index_Bone_marrow_normal]
Bone_marrow_p_value <- c(Bone_marrow_p_value,t.test(Bone_marrow_leukemia,
Bone_marrow_normal,
alternative = "two.sided")$p.value)
}
ordered_index <- order(Bone_marrow_p_value)
Bone_marrow_index <- head(ordered_index,20)
Bone_marrow_p.value <- Bone_marrow_p_value[Bone_marrow_index]
Bone_marrow_microarray.probe.ID <- rownames(expression.data)[Bone_marrow_index]
Bone_marrow_gene.symbol <- datas@featureData@data$`Gene symbol`[Bone_marrow_index]
t_Bone_marrow_result <- data.frame(Bone_marrow_microarray.probe.ID,
Bone_marrow_gene.symbol,Bone_marrow_p.value)
peripheral_blood_p_value <- c()
for (i in 1:dim(expression.data)[1]) {
peripheral_blood_leukemia <- expression.data[i,index_peripheral_blood_leukemia]
peripheral_blood_normal <- expression.data[i,index_peripheral_blood_normal]
peripheral_blood_p_value <- c(peripheral_blood_p_value,
t.test(peripheral_blood_leukemia,peripheral_blood_normal,
alternative = "two.sided")$p.value)
}
ordered_index <- order(peripheral_blood_p_value)
peripheral_blood_index <- head(ordered_index,20)
peripheral_blood_p.value <- peripheral_blood_p_value[peripheral_blood_index]
peripheral_blood_microarray.probe.ID <- rownames(expression.data)[peripheral_blood_index]
peripheral_blood_gene.symbol <- datas@featureData@data$`Gene symbol`[peripheral_blood_index]
t_peripheral_blood_result <- data.frame(peripheral_blood_microarray.probe.ID,
peripheral_blood_gene.symbol,peripheral_blood_p.value)
#1-c
intersect(Bone_marrow_result$Bone_marrow_microarray.probe.ID,
t_Bone_marrow_result$Bone_marrow_microarray.probe.ID)
## character(0)
intersect(t_peripheral_blood_result$peripheral_blood_microarray.probe.ID,
peripheral_blood_result$peripheral_blood_microarray.probe.ID)
## character(0)
intersect(Bone_marrow_result$Bone_marrow_microarray.probe.ID,
peripheral_blood_result$peripheral_blood_microarray.probe.ID)
## [1] "200736_s_at"
intersect(t_Bone_marrow_result$Bone_marrow_microarray.probe.ID,
t_peripheral_blood_result$peripheral_blood_microarray.probe.ID)
## character(0)
There are two ways of testing for independence in two-way contingency table. One is using the Pearson chi-square statistics and the other one is using the likelihood ratio chi-square statistics. The first one is implemented on chisq.test() but the other one is not implemented. Following below rules, write your own function for chi-square testing for independence, that can calculate likelihood ratio chi-square statistics.
Input of the function is a contingency table.
The function should provide calculation by using the Pearson chi-square statistic and the likelihood ratio chi-square statistics.
Parameters of function should contain Boolean variable for choosing one of each calculation methods
Function should return both chi-square statistic and p-value.
Operation example: chisq.test.user(data=mydata, likelihood=TRUE) ==> return the likelihood ratio chi-square statistic and p-value of the test.
Test your function with the contingency table by multi-drug allergy categorization (data from slide 11, tutorial slide 5). Make your conclusion for independency of the data.
chisq.test.user <- function(x,y) {
Row_margianl <- rep(0,dim(x)[1])
for (i in 1:dim(x)[1]) {
Row_margianl[i] <- sum(x[i,])
}
#Row margianl frequency
Col_margianl <- rep(0,dim(x)[2])
for (i in 1:dim(x)[2]) {
Col_margianl[i] <- sum(x[,i])
}
#Column margianl frequency
exp_value <- matrix(rep(0,dim(x)[1]*dim(x)[2]),nrow = dim(x)[1],byrow = TRUE)
for (i in 1:dim(x)[1]) {
for (j in 1:dim(x)[2]) {exp_value[i,j] <- Row_margianl[i] * Col_margianl[j]/sum(x)}
}
#Expected value matrix
if (y) {
G2 = 0
# likelihood ratio statistic
for (i in 1:dim(x)[1]) {
for (j in 1:dim(x)[2]) {
G2 <- G2 + 2*x[i,j]*log(x[i,j]/exp_value[i,j])
}
}
Value = c(statistic = G2,
p_value = pchisq(G2,(dim(x)[1] - 1)*(dim(x)[2] - 1),lower.tail = FALSE))
return(Value)
} else {
Chisq = 0
#Pearson Chi-square statistic
for (i in 1:dim(x)[1]) {
for (j in 1:dim(x)[2]) {
Chisq <- Chisq + (x[i,j] - exp_value[i,j])^2/exp_value[i,j]
}
}
Value = c(statistic = Chisq,
p_value = pchisq(Chisq,(dim(x)[1] - 1)*(dim(x)[2] - 1),lower.tail = FALSE))
return(Value)
}
}
mat <- matrix(c(11,30,36,23,8,31,25,36,13,28,28,31),nrow = 3,byrow = TRUE)
chisq.test.user(mat,TRUE)
## statistic p_value
## 6.4498448 0.3747267
chisq.test.user(mat,FALSE)
## statistic p_value
## 6.3912453 0.3808181