Contents

Part I: Nonlinear expansion of classification

Part II: Decision Tree Approaches

Part III: Support Vector Machine


Tutorial

Classification Adv 1 - Nonlinear expansion of classification

Polynomial Regression and Step Function

  • You can get the following descriptions and more details with ‘help(name of dataset)’

  • help(Wage) - comparison for nonlinear regressions

  • Wage and other data for a group of 3000 male workers in the Mid-Atlantic region

  • help(Carseats) - classification trees

  • A simulated data set containing sales of child car seats at 400 different stores

  • library(MASS); help(Boston) - regression trees, bagging, random forests, and boosting

  • Housing values and other variables for 506 suburbs of Boston

#install.packages("ISLR")
library(ISLR)
attach(Wage)
head(Wage)
fit = lm(wage ~ poly(age, 4),data = Wage)
coef(summary(fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
#range function returns a vector containing the minimum and maximum of all the given arguments
agelims = range(age)
age.grid = seq(from = agelims[1], to = agelims[2]) #age from 18 to 80
preds = predict(fit, newdata = list(age = age.grid),se = TRUE)
se.bands = cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)

#Left - polynomial regression
#Right - logistic regression
{par(mfrow = c(1,2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))

plot(age,wage,xlim = agelims, cex=.5, col="darkgrey")
title("Degree-4 polynomial", outer=T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

fit = glm(I(wage>250)~poly(age,4),data = Wage, family = binomial)
preds = predict(fit, newdata = list(age = age.grid), se = T)
pfit = exp(preds$fit)/(1 + exp(preds$fit))
se.bands.logit = cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))

plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age), I((wage > 250)/5), cex = .5, pch = "|", col = "darkgrey")
lines(age.grid, pfit, lwd = 1.5, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue",lty = 3)}
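  • As a quick check (not part of the original chunk), the fitted probabilities can also be obtained directly with type = "response"; the point estimates match pfit, although confidence bands computed on that scale could fall outside [0, 1], which is why the logit-scale bands above are preferred. The object name prob.grid below is just a placeholder.
prob.grid = predict(fit, newdata = list(age = age.grid), type = "response")
max(abs(prob.grid - pfit)) # essentially zero (differences only at floating-point precision)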

  • The function cut() returns an ordered categorical variable; fitting a linear model on it gives a step function of age.
  • Here cut() automatically picked the cutpoints at 33.5, 49, and 64.5 years of age.
### step function fit using cut()
table(cut(age,4)) # quadrisection of age's range
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit = lm(wage ~ cut(age, 4), data = Wage)
coef(summary(fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01
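  • If you prefer to set the cutpoints yourself, cut() also accepts explicit breakpoints via the breaks argument. A minimal sketch, reusing the breakpoints from the table above (the name age.cut is a placeholder); the resulting groups, and hence the step-function fit, are essentially the same:
age.cut = cut(age, breaks = c(17.9, 33.5, 49, 64.5, 80.1))
table(age.cut) # reproduces the quadrisection above
coef(summary(lm(wage ~ age.cut, data = Wage)))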

Splines

  • Load the ‘splines’ package.
  • The bs() function generates the entire matrix of basis functions for splines with the specified set of knots.
  • By default, cubic splines are produced (see the sketch of the calls below).
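  • A minimal sketch of the calls that would produce the output below; the knot locations c(25, 40, 60), df = 6, and the object names fit.bs/pred.bs follow the standard ISLR splines lab and should be treated as assumptions.
library(splines)
fit.bs = lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
pred.bs = predict(fit.bs, newdata = list(age = age.grid), se = TRUE)
dim(bs(age, knots = c(25, 40, 60))) # basis matrix with three explicit knots
dim(bs(age, df = 6))                # same size when specifying df instead of knots
attr(bs(age, df = 6), "knots")      # knots placed at the 25th/50th/75th percentiles of age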
## [1] 3000    6
## [1] 3000    6
##   25%   50%   75% 
## 33.75 42.00 51.00
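  • The warning below comes from a smoothing-spline fit with leave-one-out cross-validation on the (non-unique) age values; the call named in the warning is:
fit.ss = smooth.spline(age, wage, cv = TRUE) # CV chooses the smoothing parameter
fit.ss$df                                    # effective degrees of freedom selected by CV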

## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful

Generalized Additive Models (GAMs)

# method 1

#install.packages("gam")
library(gam)
## Loading required package: foreach
## Loaded gam 1.16
gam.m3 = gam(wage ~ s(year,4) + s(age,5) + education, data = Wage)
{par(mfrow = c(1,3))
plot(gam.m3, se = TRUE, col = "blue")

gam1 = lm(wage ~ ns(year,4) + ns(age,5) + education, data = Wage)
plot.Gam(gam1, se = TRUE, col = "red")}

  • Comparison of various GAMs using anova()
  • We find that there is compelling evidence that a GAM with a linear function of year is better than a GAM that does not include year at all (p-value = 0.00014).
  • However, there is no evidence that a non-linear function of year is needed (p-value = 0.349).
# method 2
gam.m1 = gam(wage ~ s(age,5) + education, data = Wage)
gam.m2 = gam(wage ~ year + s(age,5) + education, data = Wage)
anova(gam.m1, gam.m2, gam.m3, test = "F")
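  • The anova() output (not reproduced here) contains the p-values quoted above. As an optional follow-up, not part of the original chunk, summary() on a fitted gam object also reports p-values for the nonparametric effects, and predict() works as for other model objects:
summary(gam.m3) # see the 'Anova for Nonparametric Effects' section
preds.gam = predict(gam.m2, newdata = Wage)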

Classification Adv 2 - Decision Tree Approaches

Fitting Classification tree

  • To build a decision tree, use the tree() function in the tree package.
  • To train a binary classifier, we first create a new binary response variable.
  • Grow the tree with all predictors except ‘Sales’.
## Fitting Classification Trees (1)
#install.packages("tree")
library(tree)
Sales <- Carseats$Sales
#takes on a value of Yes if the Sales variable exceeds 8, and takes on a value of No otherwise.
High <- as.factor(ifelse(Sales <= 8, "No", "Yes")) # as.factor() so tree() fits a classification tree
my_Carseats = data.frame(Carseats, High)
tree.carseats = tree(High ~.-Sales,my_Carseats)
summary(tree.carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = my_Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
## [6] "Advertising" "Age"         "US"         
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
{par(mfrow = c(1,1))
plot(tree.carseats)
text(tree.carseats,pretty = 0)
tree.carseats}

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 400 541.500 No ( 0.59000 0.41000 )  
##     2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )  
##       4) Price < 92.5 46  56.530 Yes ( 0.30435 0.69565 )  
##         8) Income < 57 10  12.220 No ( 0.70000 0.30000 )  
##          16) CompPrice < 110.5 5   0.000 No ( 1.00000 0.00000 ) *
##          17) CompPrice > 110.5 5   6.730 Yes ( 0.40000 0.60000 ) *
##         9) Income > 57 36  35.470 Yes ( 0.19444 0.80556 )  
##          18) Population < 207.5 16  21.170 Yes ( 0.37500 0.62500 ) *
##          19) Population > 207.5 20   7.941 Yes ( 0.05000 0.95000 ) *
##       5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )  
##        10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )  
##          20) CompPrice < 124.5 96  44.890 No ( 0.93750 0.06250 )  
##            40) Price < 106.5 38  33.150 No ( 0.84211 0.15789 )  
##              80) Population < 177 12  16.300 No ( 0.58333 0.41667 )  
##               160) Income < 60.5 6   0.000 No ( 1.00000 0.00000 ) *
##               161) Income > 60.5 6   5.407 Yes ( 0.16667 0.83333 ) *
##              81) Population > 177 26   8.477 No ( 0.96154 0.03846 ) *
##            41) Price > 106.5 58   0.000 No ( 1.00000 0.00000 ) *
##          21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )  
##            42) Price < 122.5 51  70.680 Yes ( 0.49020 0.50980 )  
##              84) ShelveLoc: Bad 11   6.702 No ( 0.90909 0.09091 ) *
##              85) ShelveLoc: Medium 40  52.930 Yes ( 0.37500 0.62500 )  
##               170) Price < 109.5 16   7.481 Yes ( 0.06250 0.93750 ) *
##               171) Price > 109.5 24  32.600 No ( 0.58333 0.41667 )  
##                 342) Age < 49.5 13  16.050 Yes ( 0.30769 0.69231 ) *
##                 343) Age > 49.5 11   6.702 No ( 0.90909 0.09091 ) *
##            43) Price > 122.5 77  55.540 No ( 0.88312 0.11688 )  
##              86) CompPrice < 147.5 58  17.400 No ( 0.96552 0.03448 ) *
##              87) CompPrice > 147.5 19  25.010 No ( 0.63158 0.36842 )  
##               174) Price < 147 12  16.300 Yes ( 0.41667 0.58333 )  
##                 348) CompPrice < 152.5 7   5.742 Yes ( 0.14286 0.85714 ) *
##                 349) CompPrice > 152.5 5   5.004 No ( 0.80000 0.20000 ) *
##               175) Price > 147 7   0.000 No ( 1.00000 0.00000 ) *
##        11) Advertising > 13.5 45  61.830 Yes ( 0.44444 0.55556 )  
##          22) Age < 54.5 25  25.020 Yes ( 0.20000 0.80000 )  
##            44) CompPrice < 130.5 14  18.250 Yes ( 0.35714 0.64286 )  
##              88) Income < 100 9  12.370 No ( 0.55556 0.44444 ) *
##              89) Income > 100 5   0.000 Yes ( 0.00000 1.00000 ) *
##            45) CompPrice > 130.5 11   0.000 Yes ( 0.00000 1.00000 ) *
##          23) Age > 54.5 20  22.490 No ( 0.75000 0.25000 )  
##            46) CompPrice < 122.5 10   0.000 No ( 1.00000 0.00000 ) *
##            47) CompPrice > 122.5 10  13.860 No ( 0.50000 0.50000 )  
##              94) Price < 125 5   0.000 Yes ( 0.00000 1.00000 ) *
##              95) Price > 125 5   0.000 No ( 1.00000 0.00000 ) *
##     3) ShelveLoc: Good 85  90.330 Yes ( 0.22353 0.77647 )  
##       6) Price < 135 68  49.260 Yes ( 0.11765 0.88235 )  
##        12) US: No 17  22.070 Yes ( 0.35294 0.64706 )  
##          24) Price < 109 8   0.000 Yes ( 0.00000 1.00000 ) *
##          25) Price > 109 9  11.460 No ( 0.66667 0.33333 ) *
##        13) US: Yes 51  16.880 Yes ( 0.03922 0.96078 ) *
##       7) Price > 135 17  22.070 No ( 0.64706 0.35294 )  
##        14) Income < 46 6   0.000 No ( 1.00000 0.00000 ) *
##        15) Income > 46 11  15.160 Yes ( 0.45455 0.54545 ) *
  • Predict classes for a test set randomly sampled from the whole data set; the test set lets you measure the performance of the decision tree.
  • First, randomly select training samples from the whole set.
  • Then train a tree with tree() using the formula High ~ . - Sales, i.e. predict High from all predictors except ‘Sales’.
  • Predict classes with predict(). For more details, type help('predict.tree').
## Fitting Classification Trees (2)
set.seed(2)
train <- sample(1:nrow(my_Carseats),200)
my_Carseats.test <- my_Carseats[-train,]
High.test <- High[-train]
tree.carseats <- tree(High~.-Sales,my_Carseats,subset = train)
tree.pred <- predict(tree.carseats,my_Carseats.test,type = "class")
table(tree.pred,High.test)
##          High.test
## tree.pred No Yes
##       No  86  27
##       Yes 30  57
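  • From the confusion matrix above, the test accuracy of the unpruned tree is:
(86 + 57)/200 # = 0.715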
  • cv.tree() performs cross-validation in order to determine the optimal level of tree complexity.
  • We use FUN = prune.misclass to indicate that the classification error rate should guide the cross-validation and pruning process, rather than the default for cv.tree(), which is deviance.
  • The function reports the number of terminal nodes of each tree considered (size), the corresponding error rate, and the value of the cost-complexity parameter used (k). Despite its name, dev corresponds to the cross-validation error rate in this instance.
  • The tree with 9 terminal nodes results in the lowest cross-validation error rate, with 50 cross-validation errors.
##Fitting Classification Trees (3)
set.seed(3)
cv.carseats = cv.tree(tree.carseats,FUN = prune.misclass)
cv.carseats
## $size
## [1] 19 17 14 13  9  7  3  2  1
## 
## $dev
## [1] 55 55 53 52 50 56 69 65 80
## 
## $k
## [1]       -Inf  0.0000000  0.6666667  1.0000000  1.7500000  2.0000000
## [7]  4.2500000  5.0000000 23.0000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
  • Plot the error rate as a function of both size and k.
##Fitting Classification Trees (4)
{par(mfrow = c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type = "b")
plot(cv.carseats$k,cv.carseats$dev,type = "b")}

  • Compare the effect of pruning on test-set performance and on the tree structure.
##Fitting Classification Trees (5)
prune.carseats = prune.misclass(tree.carseats,best = 9)
{plot(prune.carseats)
text(prune.carseats,pretty = 0)}

tree.pred <- predict(prune.carseats,my_Carseats.test,type = "class")
table(tree.pred,High.test)
##          High.test
## tree.pred No Yes
##       No  94  24
##       Yes 22  60
(94 + 60)/200
## [1] 0.77
prune.carseats <- prune.misclass(tree.carseats,best = 15)
{plot(prune.carseats)
text(prune.carseats,pretty = 0)}

tree.pred <- predict(prune.carseats,my_Carseats.test,type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
##       No  86  22
##       Yes 30  62
(86 + 62)/200
## [1] 0.74

Bagging and Random Forest

  • The randomForest() function can be used to perform both random forests and bagging (bagging is simply a random forest with mtry equal to the number of predictors).
  • Set up the train and test sets.
#Bagging and Random Forest (1)
library(MASS)
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston)/2)
boston.test <- Boston[-train,"medv"]
  • The argument mtry = 13 indicates that all 13 predictors should be considered for each split of the tree, i.e. bagging. By default, randomForest() uses p/3 variables when building a random forest of regression trees and √p variables when building a random forest of classification trees. We can change the number of trees grown using the ntree argument (default = 500).
bag.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 13,importance = TRUE)
bag.boston
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE,      subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 13
## 
##           Mean of squared residuals: 10.80817
##                     % Var explained: 86.91
yhat.bag <- predict(bag.boston,newdata = Boston[-train,])
{plot(yhat.bag,boston.test)
abline(0,1)}

mean((yhat.bag - boston.test)^2)
## [1] 13.444
bag.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 13,ntree = 25)
yhat.bag <- predict(bag.boston,newdata = Boston[-train,])
mean((yhat.bag - boston.test)^2)
## [1] 13.67774
  • Predict on the test set and calculate the test MSE.
  • Explore variable importance with importance() and varImpPlot().
  • Two measures of variable importance are reported. The former is based upon the mean decrease of accuracy in predictions on the out-of-bag samples when a given variable is excluded from the model. The latter is a measure of the total decrease in node impurity that results from splits over that variable, averaged over all trees.
##Bagging and Random Forest (2) var importance
set.seed(1)
rf.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 6,importance = TRUE)
yhat.rf <- predict(rf.boston,newdata = Boston[-train,])
mean((yhat.rf - boston.test)^2)
## [1] 11.66454
importance(rf.boston)
##           %IncMSE IncNodePurity
## crim    12.132320     986.50338
## zn       1.955579      57.96945
## indus    9.069302     882.78261
## chas     2.210835      45.22941
## nox     11.104823    1044.33776
## rm      31.784033    6359.31971
## age     10.962684     516.82969
## dis     15.015236    1224.11605
## rad      4.118011      95.94586
## tax      8.587932     502.96719
## ptratio 12.503896     830.77523
## black    6.702609     341.30361
## lstat   30.695224    7505.73936
varImpPlot(rf.boston)

Boosting

  • Here we use the gbm package, and within it the gbm() function, to fit boosted regression trees
  • We run gbm() with the option distribution=“gaussian” since this is a regression problem; if it were a binary classification problem, we would use distribution=“bernoulli”.
##Boosting (1)
#install.packages("gbm")
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1.3
set.seed(1)
boost.boston <- gbm(medv~., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
  • lstat and rm are by far the most important variables. We can also produce partial dependence plots for these two variables. These plots illustrate the marginal effect of the selected variables on the response after integrating out the other variables.
##Boosting (2)

{par(mfrow = c(1,2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")}

  • We now use the boosted model to predict medv on the test set:
##Boosting (3)

yhat.boost <- predict(boost.boston,newdata = Boston[-train,],n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 11.84434
boost.boston <- gbm(medv~., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost <- predict(boost.boston,newdata = Boston[-train,],n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 11.51109
  • If we want to, we can perform boosting with a different value of the shrinkage parameter λ in (8.10). The default value is 0.001, but it is easily modified, as in the shrinkage = 0.2 fit above.

Biological example for decision trees (in the lecture material)

  • Decision tree for classifying cancer types
library("tree")
#1:15 ALL, 16:26 AML
cancer <- read.csv("http://bisyn.kaist.ac.kr/bis335/13-Classification_Adv-ALLnAML3.csv")
cancer$class <- c(rep("ALL",15), rep("AML", 11))

#make B bootstrap samples
bootstrap <- list()
B <- 3
for (i in 1:B) {
  bootstrap[[i]] <- cancer[sample(c(1:26), 26, replace = T), ] # random resampling, so the trees can differ from those in problem b)
}

#make decision tree of original sample
cancer[, 4] <- as.factor(cancer[, 4])
treemod <- tree(class ~ TCF15 + IFI44 + SDCBP, cancer, split = "gini")
{plot(treemod)
text(treemod)}

#Pruning
cv.trees <- cv.tree(treemod, FUN = prune.misclass)
plot(cv.trees)

## result: sizes >= 2 have the lowest cross-validation misclassification error
prune.trees <- prune.misclass(treemod, best = 2) # thus select size 2
{plot(prune.trees)
text(prune.trees)}

#make decision tree of bootstrap samples
boots.tree <- list()
for (i in 1:B) {
  bootstrap[[i]][, 4] <- as.factor(bootstrap[[i]][, 4])
  boots.tree[[i]] <- tree(class ~ TCF15 + IFI44 + SDCBP, bootstrap[[i]], split = "gini")
}

{par(mfrow = c(3,1))
plot(boots.tree[[1]])
text(boots.tree[[1]])
plot(boots.tree[[2]])
text(boots.tree[[2]])
plot(boots.tree[[3]])
text(boots.tree[[3]])}

##a) Gini index of each terminal node of the original tree
# Gini impurity: G = sum over the k classes of x[i] * (1 - x[i]), where x holds the class proportions
gini <- function(x,k){
  G <- 0
  for (i in 1:k) {
    G <- G + x[i]*(1 - x[i])
  }
  return(G)
}
treemod  ## focus on the rows marked with '*' (terminal nodes)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 26 35.430 ALL ( 0.5769 0.4231 )  
##   2) TCF15 < -0.582425 13 11.160 ALL ( 0.8462 0.1538 )  
##     4) IFI44 < 0.061325 8  0.000 ALL ( 1.0000 0.0000 ) *
##     5) IFI44 > 0.061325 5  6.730 ALL ( 0.6000 0.4000 ) *
##   3) TCF15 > -0.582425 13 16.050 AML ( 0.3077 0.6923 )  
##     6) TCF15 < -0.33372 6  8.318 ALL ( 0.5000 0.5000 ) *
##     7) TCF15 > -0.33372 7  5.742 AML ( 0.1429 0.8571 ) *
gini(c(1,0),2)   #0
## [1] 0
gini(c(0.6,0.4),2) #0.48
## [1] 0.48
gini(c(0.5,0.5),2) #0.5
## [1] 0.5
gini(c(0.1429,0.8571),2)  #0.2449
## [1] 0.2449592
##b) predict cancer types of three patients using bootstrap trees
test <- cancer[c(9,15,26), ]
treepred1 <- predict(boots.tree[[1]], test, type = "class")
treepred2 <- predict(boots.tree[[2]], test, type = "class")
treepred3 <- predict(boots.tree[[3]], test, type = "class")
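  • A minimal sketch (not in the original code) of combining the three bootstrap predictions by majority vote, as bagging would; the object names votes and bagged.pred are ad hoc:
votes <- data.frame(treepred1, treepred2, treepred3)
bagged.pred <- apply(votes, 1, function(v) names(which.max(table(v)))) # modal class per patient
table(bagged.pred, true = test$class)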

Classification Adv 3 - Support Vector Machine

Support Vector Classifier

  • Khan dataset - pairs() shows scatter plots for each pair of predictors
  • Make a data frame ‘dat’ from part of the Khan data
  • The third line of that block (adding 3 to the class-1 rows of X) makes the two classes more linearly separable.
## Method 1
##Support Vector Classifier (1)
set.seed(1)
x = Khan$xtrain
dim(x);
## [1]   63 2308
y = Khan$ytrain
length(y);
## [1] 63
pairs(x[,c(1,2,3)], col = y, labels = c('gene1','gene2','gene3'))

X <- x[y %in% c(1,2), c(1,2,3)]
Y <- factor(y[as.numeric(y) %in% c(1,2)])
X[Y == 1,] <- X[Y == 1,] + 3
pairs(X[,c(1,2,3)],col = Y, labels = c('gene1','gene2','gene3'))

dat <- data.frame(gene = X, type = Y)
  • We now use svm() to fit the support vector classifier for a given value of the cost parameter. A cost argument allows us to specify the cost of a violation to the margin.
  • When the cost argument is large, then the margins will be narrow and there will be few support vectors on the margin or violating the margin
#install.packages("e1071")
library(e1071)
svmfit.cost10 <- svm(type~., data = dat, kernel = "linear", cost = 10, scale = FALSE)
svmfit.cost.1 <- svm(type~., data = dat, kernel = "linear", cost = 0.1, scale = FALSE)

##Support Vector Classifier (2)
plot(svmfit.cost10, dat, gene.2 ~ gene.1)

plot(svmfit.cost.1, dat, gene.2 ~ gene.1)

svmfit.cost10$index # the indices of the support vectors
## [1]  5 20 30
svmfit.cost.1$index
## [1]  5  7  8 24 27 30
summary(svmfit.cost10) # you can check the number of support vectors for each class
## 
## Call:
## svm(formula = type ~ ., data = dat, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.3333333 
## 
## Number of Support Vectors:  3
## 
##  ( 2 1 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
  • You can use the tune() function to perform cross-validation. By default, tune() performs ten-fold cross-validation on a set of models of interest.
  • You can also extract the best-performing model from the cross-validation.
##Support Vector Classifier (3) 
set.seed(1)
tune.out <- tune(svm,type~.,data = dat,kernel = "linear",
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.2583333  0.2619078
## 2 1e-02 0.2583333  0.2619078
## 3 1e-01 0.0000000  0.0000000
## 4 1e+00 0.0000000  0.0000000
## 5 5e+00 0.0000000  0.0000000
## 6 1e+01 0.0000000  0.0000000
## 7 1e+02 0.0000000  0.0000000
bestmod <- tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = type ~ ., data = dat, ranges = list(cost = c(0.001, 
##     0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
##       gamma:  0.3333333 
## 
## Number of Support Vectors:  8
## 
##  ( 4 4 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
  • Let’s predict y for a test data set. Generate the test data from the test samples of Khan (classes 1 and 2 only).
##Support Vector Classifier (4)
xtest <- Khan$xtest
ytest <- Khan$ytest
Xtest <- xtest[ytest %in% c(1,2),]
Ytest <- factor(ytest[ytest %in% c(1,2)])
testdat <- data.frame(gene = Xtest, type = Ytest)
  • First, predict y with the best model obtained from the cross-validation above.
  • Then, predict y with a model refit on the full training set using svm() with cost = 1.
  • Because the two models are trained slightly differently, their performance can vary.
ypred <- predict(bestmod, testdat)
table(predict = ypred, truth = testdat$type)
##        truth
## predict 1 2
##       1 0 0
##       2 3 6
svmfit <- svm(type~., data = dat, kernel = "linear", cost = 1, scale = FALSE)
ypred <- predict(svmfit,testdat)
table(predict = ypred, truth = testdat$type)
##        truth
## predict 1 2
##       1 0 0
##       2 3 6
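  • For reference (computed from the two tables above), both fits classify 6 of the 9 test observations correctly:
6/9 # ≈ 0.667 test accuracy for both models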

Support Vector Machine

  • To show a more pronounced change in the decision boundary, we will use an artificial data set.
## Method 2
##Support Vector Machine (1)
set.seed(1)
x <- matrix(rnorm(200*2),ncol = 2)
x[1:100,] <- x[1:100,] + 2
x[101:150,] <- x[101:150,] - 2
y <- c(rep(1, 150), rep(2, 50))
dat <- data.frame(gene = x,type = as.factor(y))
  • To fit an SVM with a radial kernel, we use kernel=“radial”.
  • We use the gamma argument to specify a value of γ for the radial basis kernel
svmfit <- svm(type~., data = dat, kernel = "radial", gamma = 1, cost = 0.6)
plot(svmfit, dat, gene.2 ~ gene.1)

svmfit <- svm(type~., data = dat, kernel = "radial", gamma = 1, cost = 1e5)
plot(svmfit, dat, gene.2 ~ gene.1)

  • We can find the best parameters for an SVM with a radial kernel and test the performance of the best model.
  • We first form the training set by sampling 150 of the 200 observations; the remaining 50 form the test set.
  • Using the tune() function, you can perform cross-validation over various parameter combinations.
  • You can calculate accuracy from the contingency table shown after the tuning output below.
  • Acc = (35 + 11)/50 = 0.92
##Support Vector Machine (2)
set.seed(1)
train <- sample(200,150)
tune.out <- tune(svm, type~., data = dat[train,], kernel = "radial", ranges = list(cost = c(0.1,1,10,100), gamma = c(0.5,1,2,4)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     1
## 
## - best performance: 0.1066667 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1    0.1   0.5 0.2400000 0.09532271
## 2    1.0   0.5 0.1200000 0.06885304
## 3   10.0   0.5 0.1133333 0.07062333
## 4  100.0   0.5 0.1466667 0.08195151
## 5    0.1   1.0 0.2000000 0.09428090
## 6    1.0   1.0 0.1066667 0.05621827
## 7   10.0   1.0 0.1400000 0.07336700
## 8  100.0   1.0 0.1533333 0.07730012
## 9    0.1   2.0 0.1866667 0.08195151
## 10   1.0   2.0 0.1200000 0.07568616
## 11  10.0   2.0 0.1333333 0.07027284
## 12 100.0   2.0 0.1466667 0.09322745
## 13   0.1   4.0 0.2400000 0.09532271
## 14   1.0   4.0 0.1400000 0.08577893
## 15  10.0   4.0 0.1466667 0.09838197
## 16 100.0   4.0 0.1600000 0.10517475
table(true = dat[-train,"type"],
      pred = predict(tune.out$best.model, newdata = dat[-train,]))
##     pred
## true  1  2
##    1 35  1
##    2  3 11
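  • The accuracy quoted in the bullet above follows directly from this table:
(35 + 11)/50 # = 0.92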

Performance Testing - ROC Curves

  • With an ROC curve, you can measure the performance of a binary classifier.
  • First, define a helper function to plot ROC curves.
## ROC Curves
#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
rocplot = function(pred, truth, ...){
  # pred: numeric decision values from the SVM; truth: the true class labels
  predob = prediction(pred, truth, label.ordering = c('2','1'))
  perf = performance(predob, "tpr", "fpr")
  plot(perf, ...)}

# The more flexible model (red line, gamma = 50) fits the training data more closely.
{par(mfrow = c(1,2))
svmfit.opt <- svm(type~., data = dat[train,], kernel = "radial", gamma = 1, cost = 1, decision.values = T)
svmfit.flex <- svm(type~., data = dat[train,], kernel = "radial", gamma = 50, cost = 1, decision.values = T)

fitted <- attributes(predict(svmfit.opt, dat[train,], decision.values = TRUE))$decision.values
rocplot(fitted, dat[train, "type"], main = "Training Data")
fitted <- attributes(predict(svmfit.flex, dat[train,], decision.values = T))$decision.values
rocplot(fitted, dat[train, "type"], add = T, col = "red")

fitted <- attributes(predict(svmfit.opt, dat[-train,], decision.values = T))$decision.values
rocplot(fitted, dat[-train, "type"], main = "Test Data")
fitted = attributes(predict(svmfit.flex, dat[-train,], decision.values = T))$decision.values
rocplot(fitted, dat[-train, "type"], add = T, col = "red")}

Applying to gene expression data

  • Check the components of Khan, their dimensions, and the number of samples in each class of the train and test sets.
##Application to Expression Data [Khan]
library(ISLR)
names(Khan)
## [1] "xtrain" "xtest"  "ytrain" "ytest"
dim(Khan$xtrain)
## [1]   63 2308
dim(Khan$xtest)
## [1]   20 2308
length(Khan$ytrain)
## [1] 63
length(Khan$ytest)
## [1] 20
table(Khan$ytrain)
## 
##  1  2  3  4 
##  8 23 12 20
table(Khan$ytest)
## 
## 1 2 3 4 
## 3 6 6 5
##SVM with Multiple Classes
dat <- data.frame(x = Khan$xtrain, y = as.factor(Khan$ytrain))
out <- svm(y~., data = dat, kernel = "linear", cost = 10)
summary(out)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.0004332756 
## 
## Number of Support Vectors:  58
## 
##  ( 20 20 11 7 )
## 
## 
## Number of Classes:  4 
## 
## Levels: 
##  1 2 3 4
table(out$fitted, dat$y)
##    
##      1  2  3  4
##   1  8  0  0  0
##   2  0 23  0  0
##   3  0  0 12  0
##   4  0  0  0 20
dat.te <- data.frame(x = Khan$xtest, y = as.factor(Khan$ytest))
pred.te <- predict(out, newdata = dat.te)
table(pred.te, dat.te$y)
##        
## pred.te 1 2 3 4
##       1 3 0 0 0
##       2 0 6 2 0
##       3 0 0 4 0
##       4 0 0 0 5
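  • From the table above, the multiclass SVM misclassifies only two test observations (two class-3 samples predicted as class 2):
(3 + 6 + 4 + 5)/20 # = 0.9 test accuracy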
##Bootstrap estimates of regression coefficient standard errors (Auto data)
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:survival':
## 
##     aml
# boot.fn returns the coefficients of the quadratic regression of mpg on horsepower,
# fit to the bootstrap sample given by 'index'
boot.fn = function(data, index)
  coefficients(lm(mpg ~ horsepower + I(horsepower^2), data = data, subset = index))
set.seed(1)
boot(Auto, boot.fn, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original        bias     std. error
## t1* 56.900099702  6.098115e-03 2.0944855842
## t2* -0.466189630 -1.777108e-04 0.0334123802
## t3*  0.001230536  1.324315e-06 0.0001208339
summary(lm(mpg ~ horsepower + I(horsepower^2),data = Auto))$coef
##                     Estimate   Std. Error   t value      Pr(>|t|)
## (Intercept)     56.900099702 1.8004268063  31.60367 1.740911e-109
## horsepower      -0.466189630 0.0311246171 -14.97816  2.289429e-40
## I(horsepower^2)  0.001230536 0.0001220759  10.08009  2.196340e-21