Probability

You installed Bioconductor Package in week 2, R Tutorial Class. We will practice the R code, with probability concept. Before we start tutorial, you may install a probability package. In R, each project have its workspace. Before you use functions in packages, you may call functions from the library.

# install.packages("prob")
library(prob)

Introduction

Probability and Statistics, Frequentist and Bayesian Views
Data Types, Data Display, Sample space and Event, Event Algebra

Probability

Probability, Conditional Probability, Independency, Bayes’ theorem

Random variable

Discrete and continuous random variables

Probabilistic properties of random variables

Expected value, various types of averages, variance,
Joint/marginal/conditional probability distribution of random variables
Correlation and Covariance

Tutorial

I. Sample Space & Event (s7)

# Discrete sample space & Events
library(prob)

# Sample space of two coin toss
two.coin <- tosscoin(2)
two.coin

##   toss1 toss2
## 1     H     H
## 2     T     H
## 3     H     T
## 4     T     T

# event of obtaining exactly one head
subset(two.coin, (toss1 == 'H' | toss2 == 'H') & !(toss1 == 'H' & toss2 == 'H'))

##   toss1 toss2
## 2     T     H
## 3     H     T

# Sample of rolling two dice
two.dice <- rolldie(2)
two.dice

##    X1 X2
## 1   1  1
## 2   2  1
## 3   3  1
## 4   4  1
## 5   5  1
## 6   6  1
## 7   1  2
## 8   2  2
## 9   3  2
## 10  4  2
## 11  5  2
## 12  6  2
## 13  1  3
## 14  2  3
## 15  3  3
## 16  4  3
## 17  5  3
## 18  6  3
## 19  1  4
## 20  2  4
## 21  3  4
## 22  4  4
## 23  5  4
## 24  6  4
## 25  1  5
## 26  2  5
## 27  3  5
## 28  4  5
## 29  5  5
## 30  6  5
## 31  1  6
## 32  2  6
## 33  3  6
## 34  4  6
## 35  5  6
## 36  6  6

# Event that the sum of the number on the two dice equals 3  
subset(two.dice, X1 + X2 == 3)

##   X1 X2
## 2  2  1
## 7  1  2

# Continuous sample space & Events
# Sample space of measurement the relative humidity of air (assume uniform distribution)
f <- function(x){x/x}
S <- integrate(f, 0, 100)
S

## 100 with absolute error < 1.1e-12

# Event that the relative humidity is at least 90% 
Event <- integrate(f, 0, 90)
Event

## 90 with absolute error < 1e-12

Probability (s10)

# Two coin toss event
two.coin <- tosscoin(2, makespace = TRUE)

# P(one head)
p1h <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'H') & !(toss1 == 'H' & toss2 == 'H'))
p1h

## [1] 0.5

# P(one tail)
p1t <- Prob(two.coin, event = (toss1 == 'T' | toss2 == 'T') & !(toss1 == 'T' & toss2 == 'T'))
p1t

## [1] 0.5

# P(one head U one tail)
p1h1t <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'T') | (toss1 == 'T' & toss2 == 'H'))
p1h1t

## [1] 1

# P(head)
ph <- Prob(two.coin, event = (toss1 == 'H' | toss2 == 'H'))
ph

## [1] 0.75

# P(tail)
pt <- Prob(two.coin, event = (toss1 == 'T' | toss2 == 'T'))
pt

## [1] 0.75

# P(head n tail)
phnt <- Prob(two.coin, event = !(toss1 == 'H' | toss2 == 'H') | !(toss1 == 'T' | toss2 == 'T'))
phnt

## [1] 0.5

# P(head U tail)
phut <- ph + pt - phnt
phut

## [1] 1

Conditional Probability (s11)

# Tossing two dice
two.dice <- rolldie(2, makespace = TRUE)

# A := {difference is 3}
A <- subset(two.dice, abs(X1 - X2) == 3)
A

##    X1 X2      probs
## 4   4  1 0.02777778
## 11  5  2 0.02777778
## 18  6  3 0.02777778
## 19  1  4 0.02777778
## 26  2  5 0.02777778
## 33  3  6 0.02777778

pA <- Prob(A)
pA

## [1] 0.1666667

# B := {sum is a multiple of 3}
B <- subset(two.dice, abs(X1 + X2) %in% c(3,6,9,12))
B

##    X1 X2      probs
## 2   2  1 0.02777778
## 5   5  1 0.02777778
## 7   1  2 0.02777778
## 10  4  2 0.02777778
## 15  3  3 0.02777778
## 18  6  3 0.02777778
## 20  2  4 0.02777778
## 23  5  4 0.02777778
## 25  1  5 0.02777778
## 28  4  5 0.02777778
## 33  3  6 0.02777778
## 36  6  6 0.02777778

pB <- Prob(B)
pB

## [1] 0.3333333

# P(A|B)
Prob(two.dice, event = (abs(X1 - X2) == 3), (given = abs(X1 + X2) %in% c(3,6,9,12)))

## [1] 0.1666667

# P(AnB)/P(B)
Prob(intersect(A, B))/Prob(B)

## [1] 0.1666667

Independency (s12)

# Dice example
one.die <- rolldie(1, makespace = TRUE)
one.die

##   X1     probs
## 1  1 0.1666667
## 2  2 0.1666667
## 3  3 0.1666667
## 4  4 0.1666667
## 5  5 0.1666667
## 6  6 0.1666667

# A := the event that the number is even = {2,4,6}
A <- subset(one.die, (X1 %% 2 == 0))
A

##   X1     probs
## 2  2 0.1666667
## 4  4 0.1666667
## 6  6 0.1666667

# B := the event that the number is greater than or equal to 3 = {3,4,5,6}
B <- subset(one.die, X1 >= 3)
B

##   X1     probs
## 3  3 0.1666667
## 4  4 0.1666667
## 5  5 0.1666667
## 6  6 0.1666667

# P(A)
Prob(A)

## [1] 0.5

# P(B)
Prob(B)

## [1] 0.6666667

# P(A n B)
Prob(intersect(A,B))

## [1] 0.3333333

# P(B|A)
Prob(B, given = A)

## [1] 0.6666667

# P(B) = P(B|A) ?
identical(Prob(B), Prob(B, given = A))

## [1] TRUE

# P(A n B) = P(A)P(B) ?
identical(Prob(intersect(A,B)), Prob(A)*Prob(B))

## [1] TRUE

Example of Bayes’ Rule (s14)

# Probability of test positive when RA is given, P(+|RA) = 0.95
# Probability of test positive when RA is not given, P(+|!RA) = 0.01
# background probability of RA and Normal, P(RA) = 0.005, P(Normal) = 0.995 
test <- matrix(c(0.95, 0.01, 0.05, 0.99), nrow = 2, ncol = 2, byrow = TRUE)
rownames(test) <- c("TP", "TN")
colnames(test) <- c("RA", "Normal")
prevalance <- c(0.005, 0.995)

# considering prevalence
test_corrected <- matrix(c(test[,1] * prevalance[1], test[,2] * prevalance[2]), 
                         nrow = 2, ncol = 2, byrow = FALSE)
rownames(test_corrected) <- c("TP", "TN")
colnames(test_corrected) <- c("RA", "Normal")
test_corrected <- as.data.frame(test_corrected)
test_corrected$Sum <- rowSums(test_corrected)
test_corrected$P_correct_Diagnosis[1] <- test_corrected$RA[1]/test_corrected$Sum[1]
test_corrected$P_correct_Diagnosis[2] <- test_corrected$Normal[2]/test_corrected$Sum[2]

# percent
test_corrected * 100

##       RA Normal   Sum P_correct_Diagnosis
## TP 0.475  0.995  1.47            32.31293
## TN 0.025 98.505 98.53            99.97463

Random Variable of two coin toss (s 17)

# toss 2 coins
library(prob)
two.coin <- tosscoin(2, makespace = TRUE)
two.coin <- addrv(two.coin, X = ifelse(toss1 == "H", 1, 0) + ifelse(toss2 == "H", 1, 0))
pmf <- marginal(two.coin, vars = "X")
pmf

##   X probs
## 1 0  0.25
## 2 1  0.50
## 3 2  0.25

barplot(pmf$probs, names.arg = pmf$X, main = "Histogram of X", 
        xlab = "X", ylab = "f(X)", ylim = c(0, 0.5))

Discrete Random Variable (s18)

# roll 2 dices
two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
pmf

##     X      probs
## 1   2 0.02777778
## 2   3 0.05555556
## 3   4 0.08333333
## 4   5 0.11111111
## 5   6 0.13888889
## 6   7 0.16666667
## 7   8 0.13888889
## 8   9 0.11111111
## 9  10 0.08333333
## 10 11 0.05555556
## 11 12 0.02777778

barplot(pmf$probs, names.arg = pmf$X, main = "Histogram of X", 
        xlab = "X", ylab = "f(X)", ylim = c(0, 0.2))

Continuous Random Variable (s20)

f <- function(x) {1.5 - 6 * (x - 50.0)^2}
integrate(f, 49.8, 50.1)

## 0.432 with absolute error < 4.8e-15

Expectation of random variable (s22)

two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
Ex <- sum(pmf$X*pmf$probs)

Various types of Averages (s24)

# install.packages("psych")
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:prob':
## 
##     sim

## The following object is masked from 'package:fBasics':
## 
##     tr

## The following object is masked from 'package:timeSeries':
## 
##     outlier

# Arithmatic mean
vec <- c(10,2,19,24,6,23,47,24,54,77)
sum(vec)/length(vec)

## [1] 28.6

mean(vec)

## [1] 28.6

# Harmonic mean
x <- c(40, 20) # x1, x2, km/hr
length(x)/sum(1/x)

## [1] 26.66667

harmonic.mean(x)

## [1] 26.66667

mean(x) # cf

## [1] 30

geometric.mean(x) # cf

## [1] 28.28427

# Geometric mean
pop <- c(10000, 10500, 11550, 13860, 18156)
x <- c(1.05, 1.10, 1.20, 1.31)

# calculation of geometric mean
prod(x)^(1 / length(x))

## [1] 1.160803

geometric.mean(x)

## [1] 1.160803

# Population change over 4 decade
pop[1]*mean(x)^4

## [1] 18420.6

pop[1]*harmonic.mean(x)^4

## [1] 17900.19

pop[1]*geometric.mean(x)^4

## [1] 18156.6

Variance and Standard Deviation (s25)

two.dice <- rolldie(2, makespace = TRUE)
two.dice <- addrv(two.dice, X = X1 + X2)
pmf <- marginal(two.dice, vars = "X")
Ex2 <- sum(pmf$X^2 * pmf$probs)
Ex <- sum(pmf$X * pmf$probs)
Var <- Ex2 - Ex^2
SD <- sqrt(Var)

Marginal distribution of RV X, Y (s27)

tbl <- matrix(c(rep(0,3),rep(1/32,2),rep(3/32,2),rep(1/16,3),1/8,2/8,0,1/8,rep(1/64,4),rep(0,3)), 
              nrow = 3, byrow = TRUE)
rownames(tbl) <- c(seq(0,2))
colnames(tbl) <- c(seq(0,6))
tbl <- as.data.frame(tbl)
tbl$py <- rowSums(tbl)
px <- colSums(tbl)
tbl <- rbind(tbl, px)
rownames(tbl) <- c(seq(0,2), "px")

barplot(tbl$py[1:3], names.arg = rownames(tbl)[1:3], main = "probability mass function of y", 
        xlab = "y", ylab = "p(y)", ylim = c(0, 1))

barplot(t(tbl)[1:7,4], names.arg = colnames(tbl)[1:7], main = "probability mass function of x", 
        xlab = "x", ylab = "p(x)", ylim = c(0, 1))

Marginal expectation of RV X, Y & correlation (s29 ~ s30)

Ex <- sum(seq(0,6)*tbl[4,1:7])
Ex2 <- sum(seq(0,6)^2*tbl[4,1:7])
Var_x <- Ex2 - Ex^2
SD_x <- sqrt(Var_x)

Ey <- sum(seq(0,2)*tbl[1:3,8])
Ey2 <- sum(seq(0,2)^2*tbl[1:3,8])
Var_y <- Ey2 - Ey^2
SD_y <- sqrt(Var_y)

# Cov = E(XY) - E(X)E(Y)
Exy <- sum(0*seq(0,6)*tbl[1,1:7], 1*seq(0,6)*tbl[2,1:7], 2*seq(0,6)*tbl[3,1:7])
ExEy <- Ex*Ey
Cov <- Exy - ExEy

# Correlation Coefficient (Pearson Correlation Coefficient)
PCC <- Cov / sqrt(Var_x*Var_y)
PCC

## [1] -0.4929085

Probability

Bis335

Contents

Introduction

Probability

Random variable

Probabilistic properties of random variables

Tutorial

I. Sample Space & Event (s7)

Probability (s10)

Conditional Probability (s11)

Independency (s12)

Example of Bayes’ Rule (s14)

Random Variable of two coin toss (s 17)

Discrete Random Variable (s18)

Continuous Random Variable (s20)

Expectation of random variable (s22)

Various types of Averages (s24)

Variance and Standard Deviation (s25)

Marginal distribution of RV X, Y (s27)

Marginal expectation of RV X, Y & correlation (s29 ~ s30)