You installed Bioconductor Package in week 2, R Tutorial Class. We will practice the R code, with probability concept. Before we start tutorial, you may install a probability package. In R, each project have its workspace. Before you use functions in packages, you may call functions from the library.

# install.packages("prob")
library(prob)

Contents

Introduction

  • Probability and Statistics, Frequentist and Bayesian Views
  • Data Types, Data Display, Sample space and Event, Event Algebra

Probability

  • Probability, Conditional Probability, Independency, Bayes’ theorem

Random variable

  • Discrete and continuous random variables

Probabilistic properties of random variables

  • Expected value, various types of averages, variance,
  • Joint/marginal/conditional probability distribution of random variables
  • Correlation and Covariance

Tutorial

I. Sample Space & Event (s7)

# Discrete sample space & Events
library(prob)

# Sample space of two coin toss
two.coin <- tosscoin(2)
two.coin
##   toss1 toss2
## 1     H     H
## 2     T     H
## 3     H     T
## 4     T     T
# event of obtaining exactly one head
subset(two.coin, (toss1 == 'H' | toss2 == 'H') & !(toss1 == 'H' & toss2 == 'H'))
##   toss1 toss2
## 2     T     H
## 3     H     T
# Sample of rolling two dice
two.dice <- rolldie(2)
two.dice
##    X1 X2
## 1   1  1
## 2   2  1
## 3   3  1
## 4   4  1
## 5   5  1
## 6   6  1
## 7   1  2
## 8   2  2
## 9   3  2
## 10  4  2
## 11  5  2
## 12  6  2
## 13  1  3
## 14  2  3
## 15  3  3
## 16  4  3
## 17  5  3
## 18  6  3
## 19  1  4
## 20  2  4
## 21  3  4
## 22  4  4
## 23  5  4
## 24  6  4
## 25  1  5
## 26  2  5
## 27  3  5
## 28  4  5
## 29  5  5
## 30  6  5
## 31  1  6
## 32  2  6
## 33  3  6
## 34  4  6
## 35  5  6
## 36  6  6
# Event that the sum of the number on the two dice equals 3  
subset(two.dice, X1 + X2 == 3)
##   X1 X2
## 2  2  1
## 7  1  2
# Continuous sample space & Events
# Sample space of measurement the relative humidity of air (assume uniform distribution)
f <- function(x){x/x}
S <- integrate(f, 0, 100)
S
## 100 with absolute error < 1.1e-12
# Event that the relative humidity is at least 90% 
Event <- integrate(f, 0, 90)
Event
## 90 with absolute error < 1e-12

Probability (s10)

# Two coin toss event
two.coin <- tosscoin(2, makespace = TRUE)

# P(one head)
p1h <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'H') & !(toss1 == 'H' & toss2 == 'H'))
p1h
## [1] 0.5
# P(one tail)
p1t <- Prob(two.coin, event = (toss1 == 'T' | toss2 == 'T') & !(toss1 == 'T' & toss2 == 'T'))
p1t
## [1] 0.5
# P(one head U one tail)
p1h1t <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'T') | (toss1 == 'T' & toss2 == 'H'))
p1h1t
## [1] 1
# P(head)
ph <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'H'))
ph
## [1] 0.75
# P(tail)
pt <- Prob(two.coin, event = (toss1 == 'T' | toss2 == 'T'))
pt
## [1] 0.75
# P(head n tail)
phnt <- Prob(two.coin, event = !(toss1 == 'H' | toss2 == 'H') | !(toss1 == 'T' | toss2 == 'T'))
phnt
## [1] 0.5
# P(head U tail)
phut <- ph + pt - phnt
phut
## [1] 1

Conditional Probability (s11)

# Tossing two dice
two.dice <- rolldie(2, makespace = TRUE)

# A := {difference is 3}
A <- subset(two.dice, abs(X1 - X2) == 3)
A
##    X1 X2      probs
## 4   4  1 0.02777778
## 11  5  2 0.02777778
## 18  6  3 0.02777778
## 19  1  4 0.02777778
## 26  2  5 0.02777778
## 33  3  6 0.02777778
pA <- Prob(A)
pA
## [1] 0.1666667
# B := {sum is a multiple of 3}
B <- subset(two.dice, abs(X1 + X2) %in% c(3,6,9,12))
B
##    X1 X2      probs
## 2   2  1 0.02777778
## 5   5  1 0.02777778
## 7   1  2 0.02777778
## 10  4  2 0.02777778
## 15  3  3 0.02777778
## 18  6  3 0.02777778
## 20  2  4 0.02777778
## 23  5  4 0.02777778
## 25  1  5 0.02777778
## 28  4  5 0.02777778
## 33  3  6 0.02777778
## 36  6  6 0.02777778
pB <- Prob(B)
pB
## [1] 0.3333333
# P(A|B)
Prob(two.dice, event = (abs(X1 - X2) == 3), (given = abs(X1 + X2) %in% c(3,6,9,12)))
## [1] 0.1666667
# P(AnB)/P(B)
Prob(intersect(A, B))/Prob(B)
## [1] 0.1666667

Independency (s12)

# Dice example
one.die <- rolldie(1, makespace = TRUE)
one.die
##   X1     probs
## 1  1 0.1666667
## 2  2 0.1666667
## 3  3 0.1666667
## 4  4 0.1666667
## 5  5 0.1666667
## 6  6 0.1666667
# A := the event that the number is even = {2,4,6}
A <- subset(one.die, (X1 %% 2 == 0))
A
##   X1     probs
## 2  2 0.1666667
## 4  4 0.1666667
## 6  6 0.1666667
# B := the event that the number is greater than or equal to 3 = {3,4,5,6}
B <- subset(one.die, X1 >= 3)
B
##   X1     probs
## 3  3 0.1666667
## 4  4 0.1666667
## 5  5 0.1666667
## 6  6 0.1666667
# P(A)
Prob(A)
## [1] 0.5
# P(B)
Prob(B)
## [1] 0.6666667
# P(A n B)
Prob(intersect(A,B))
## [1] 0.3333333
# P(B|A)
Prob(B, given = A)
## [1] 0.6666667
# P(B) = P(B|A) ?
identical(Prob(B), Prob(B, given = A))
## [1] TRUE
# P(A n B) = P(A)P(B) ?
identical(Prob(intersect(A,B)), Prob(A)*Prob(B))
## [1] TRUE

Example of Bayes’ Rule (s14)

# Probability of test positive when RA is given, P(+|RA) = 0.95
# Probability of test positive when RA is not given, P(+|!RA) = 0.01
# background probability of RA and Normal, P(RA) = 0.005, P(Normal) = 0.995 
test <- matrix(c(0.95, 0.01, 0.05, 0.99), nrow = 2, ncol = 2, byrow = TRUE)
rownames(test) <- c("TP", "TN")
colnames(test) <- c("RA", "Normal")
prevalance <- c(0.005, 0.995)

# considering prevalence
test_corrected <- matrix(c(test[,1] * prevalance[1], test[,2] * prevalance[2]), 
                         nrow = 2, ncol = 2, byrow = FALSE)
rownames(test_corrected) <- c("TP", "TN")
colnames(test_corrected) <- c("RA", "Normal")
test_corrected <- as.data.frame(test_corrected)
test_corrected$Sum <- rowSums(test_corrected)
test_corrected$P_correct_Diagnosis[1] <- test_corrected$RA[1]/test_corrected$Sum[1]
test_corrected$P_correct_Diagnosis[2] <- test_corrected$Normal[2]/test_corrected$Sum[2]

# percent
test_corrected * 100
##       RA Normal   Sum P_correct_Diagnosis
## TP 0.475  0.995  1.47            32.31293
## TN 0.025 98.505 98.53            99.97463

Random Variable of two coin toss (s 17)

# toss 2 coins
library(prob)
two.coin <- tosscoin(2, makespace = TRUE)
two.coin <- addrv(two.coin, X = ifelse(toss1 == "H", 1, 0) + ifelse(toss2 == "H", 1, 0))
pmf <- marginal(two.coin, vars = "X")
pmf
##   X probs
## 1 0  0.25
## 2 1  0.50
## 3 2  0.25
barplot(pmf$probs, names.arg = pmf$X, main = "Histogram of X", 
        xlab = "X", ylab = "f(X)", ylim = c(0, 0.5))

Discrete Random Variable (s18)

# roll 2 dices
two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
pmf
##     X      probs
## 1   2 0.02777778
## 2   3 0.05555556
## 3   4 0.08333333
## 4   5 0.11111111
## 5   6 0.13888889
## 6   7 0.16666667
## 7   8 0.13888889
## 8   9 0.11111111
## 9  10 0.08333333
## 10 11 0.05555556
## 11 12 0.02777778
barplot(pmf$probs, names.arg = pmf$X, main = "Histogram of X", 
        xlab = "X", ylab = "f(X)", ylim = c(0, 0.2))

Continuous Random Variable (s20)

f <- function(x) {1.5 - 6 * (x - 50.0)^2}
integrate(f, 49.8, 50.1)
## 0.432 with absolute error < 4.8e-15

Expectation of random variable (s22)

two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
Ex <- sum(pmf$X*pmf$probs)

Various types of Averages (s24)

# install.packages("psych")
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:prob':
## 
##     sim
## The following object is masked from 'package:fBasics':
## 
##     tr
## The following object is masked from 'package:timeSeries':
## 
##     outlier
# Arithmatic mean
vec <- c(10,2,19,24,6,23,47,24,54,77)
sum(vec)/length(vec)
## [1] 28.6
mean(vec)
## [1] 28.6
# Harmonic mean
x <- c(40, 20) # x1, x2, km/hr
length(x)/sum(1/x)
## [1] 26.66667
harmonic.mean(x)
## [1] 26.66667
mean(x) # cf
## [1] 30
geometric.mean(x) # cf
## [1] 28.28427
# Geometric mean
pop <- c(10000, 10500, 11550, 13860, 18156)
x <- c(1.05, 1.10, 1.20, 1.31)

# calculation of geometric mean
prod(x)^(1 / length(x))
## [1] 1.160803
geometric.mean(x)
## [1] 1.160803
# Population change over 4 decade
pop[1]*mean(x)^4
## [1] 18420.6
pop[1]*harmonic.mean(x)^4
## [1] 17900.19
pop[1]*geometric.mean(x)^4
## [1] 18156.6

Variance and Standard Deviation (s25)

two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
Ex2 <- sum(pmf$X^2 * pmf$probs)
Ex <- sum(pmf$X * pmf$probs)
Var <- Ex2 - Ex^2
SD <- sqrt(Var)

Marginal distribution of RV X, Y (s27)

tbl <- matrix(c(rep(0,3),rep(1/32,2),rep(3/32,2),rep(1/16,3),1/8,2/8,0,1/8,rep(1/64,4),rep(0,3)), 
              nrow = 3, byrow = TRUE)
rownames(tbl) <- c(seq(0,2))
colnames(tbl) <- c(seq(0,6))
tbl <- as.data.frame(tbl)
tbl$py <- rowSums(tbl)
px <- colSums(tbl)
tbl <- rbind(tbl, px)
rownames(tbl) <- c(seq(0,2), "px")

barplot(tbl$py[1:3], names.arg = rownames(tbl)[1:3], main = "probability mass function of y", 
        xlab = "y", ylab = "p(y)", ylim = c(0, 1))

barplot(t(tbl)[1:7,4], names.arg = colnames(tbl)[1:7], main = "probability mass function of x", 
        xlab = "x", ylab = "p(x)", ylim = c(0, 1))

Marginal expectation of RV X, Y & correlation (s29 ~ s30)

Ex <- sum(seq(0,6)*tbl[4,1:7])
Ex2 <- sum(seq(0,6)^2*tbl[4,1:7])
Var_x <- Ex2 - Ex^2
SD_x <- sqrt(Var_x)

Ey <- sum(seq(0,2)*tbl[1:3,8])
Ey2 <- sum(seq(0,2)^2*tbl[1:3,8])
Var_y <- Ey2 - Ey^2
SD_y <- sqrt(Var_y)

# Cov = E(XY) - E(X)E(Y)
Exy <- sum(0*seq(0,6)*tbl[1,1:7], 1*seq(0,6)*tbl[2,1:7], 2*seq(0,6)*tbl[3,1:7])
ExEy <- Ex*Ey
Cov <- Exy - ExEy

# Correlation Coefficient (Pearson Correlation Coefficient)
PCC <- Cov / sqrt(Var_x*Var_y)
PCC
## [1] -0.4929085