You can get the following descriptions and more details with help(name of dataset).
help(Wage) - nonlinear regression (polynomials, step functions, GAMs)
Wage and other data for a group of 3000 male workers in the Mid-Atlantic region
help(Carseats) - classification trees
A simulated data set containing sales of child car seats at 400 different stores
library(MASS); help(Boston) - regression trees (bagging, random forests, boosting)
Housing values and related variables for 506 suburbs of Boston
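For example, the help pages can be opened directly (a minimal sketch; ?topic is shorthand for help(topic)):
library(ISLR)   # Wage, Carseats, Khan, Auto
library(MASS)   # Boston
?Wage; ?Carseats; ?Boston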
#install.packages("ISLR")
library(ISLR)
attach(Wage)
head(Carseats)
fit = lm(wage ~ poly(age, 4),data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
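As a quick side check (a sketch not in the original output), fitting with raw rather than orthogonal polynomials changes the coefficients but leaves the fitted values essentially unchanged:
fit.raw = lm(wage ~ poly(age, 4, raw = TRUE), data = Wage)
max(abs(fitted(fit) - fitted(fit.raw)))   # should be numerically ~0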
#range function returns a vector containing the minimum and maximum of all the given arguments
agelims = range(age)
age.grid = seq(from = agelims[1], to = agelims[2]) #age from 18 to 80
preds = predict(fit, newdata = list(age = age.grid),se = TRUE)
se.bands = cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
#Left - polynomial regression
#Right - logistic regression
{par(mfrow = c(1,2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(age,wage,xlim = agelims, cex=.5, col="darkgrey")
title("Degree-4 polynomial", outer=T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
fit = glm(I(wage>250)~poly(age,4),data = Wage, family = binomial)
preds = predict(fit, newdata = list(age = age.grid), se = T)
pfit = exp(preds$fit)/(1 + exp(preds$fit))
se.bands.logit = cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age), I((wage > 250)/5), cex = .5, pch = "|", col = "darkgrey")
lines(age.grid, pfit, lwd = 1.5, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue",lty = 3)}
### step function (piecewise constant) fit
table(cut(age,4)) # split the range of age into four equal-width intervals
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit = lm(wage ~ cut(age, 4), data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.158392 1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972 1.256350e-01
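A minimal sketch (not part of the original code) to visualize the piecewise-constant fit against the data:
plot(age, wage, cex = .5, col = "darkgrey")
points(age, fitted(fit), col = "darkgreen", pch = 20)   # one constant level per age interval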
# reconstructed from the warning below; the object name fit.ss is illustrative
fit.ss = smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
The s() function, which is part of the gam library, is used to indicate that we would like to use a smoothing spline.
Reference (gam) : https://www.rdocumentation.org/packages/mgcv/versions/1.8-24/topics/gam
# method 1
#install.packages("gam")
library(gam)
## Loading required package: foreach
## Loaded gam 1.16
gam.m3 = gam(wage ~ s(year,4) + s(age,5) + education, data = Wage)
{par(mfrow = c(1,3))
plot(gam.m3, se = TRUE, col = "blue")
gam1 = lm(wage ~ ns(year,4) + ns(age,5) + education, data = Wage)
plot.Gam(gam1, se = TRUE, col = "red")}
# method 2
gam.m1 = gam(wage ~ s(age,5) + education, data = Wage)
gam.m2 = gam(wage ~ year + s(age,5) + education, data = Wage)
anova(gam.m1, gam.m2, gam.m3, test = "F")
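As a hedged follow-up (not in the original code), predictions from a fitted gam object can be obtained with predict(); for example, for the model with a linear year term:
preds.gam = predict(gam.m2, newdata = Wage)
mean((Wage$wage - preds.gam)^2)   # in-sample mean squared error, as a rough sense of fit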
We use the tree package to fit classification trees.
##Fitting Classification Trees (1)
#install.packages("tree")
library(tree)
Sales <- Carseats$Sales
#takes on a value of Yes if the Sales variable exceeds 8, and takes on a value of No otherwise.
High <- ifelse(Sales <= 8,"No","Yes")
my_Carseats = data.frame(Carseats, High)
tree.carseats = tree(High ~.-Sales,my_Carseats)
summary(tree.carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = my_Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.4575 = 170.7 / 373
## Misclassification error rate: 0.09 = 36 / 400
{par(mfrow = c(1,1))
plot(tree.carseats)
text(tree.carseats,pretty = 0)
tree.carseats}
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 400 541.500 No ( 0.59000 0.41000 )
## 2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )
## 4) Price < 92.5 46 56.530 Yes ( 0.30435 0.69565 )
## 8) Income < 57 10 12.220 No ( 0.70000 0.30000 )
## 16) CompPrice < 110.5 5 0.000 No ( 1.00000 0.00000 ) *
## 17) CompPrice > 110.5 5 6.730 Yes ( 0.40000 0.60000 ) *
## 9) Income > 57 36 35.470 Yes ( 0.19444 0.80556 )
## 18) Population < 207.5 16 21.170 Yes ( 0.37500 0.62500 ) *
## 19) Population > 207.5 20 7.941 Yes ( 0.05000 0.95000 ) *
## 5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )
## 10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )
## 20) CompPrice < 124.5 96 44.890 No ( 0.93750 0.06250 )
## 40) Price < 106.5 38 33.150 No ( 0.84211 0.15789 )
## 80) Population < 177 12 16.300 No ( 0.58333 0.41667 )
## 160) Income < 60.5 6 0.000 No ( 1.00000 0.00000 ) *
## 161) Income > 60.5 6 5.407 Yes ( 0.16667 0.83333 ) *
## 81) Population > 177 26 8.477 No ( 0.96154 0.03846 ) *
## 41) Price > 106.5 58 0.000 No ( 1.00000 0.00000 ) *
## 21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )
## 42) Price < 122.5 51 70.680 Yes ( 0.49020 0.50980 )
## 84) ShelveLoc: Bad 11 6.702 No ( 0.90909 0.09091 ) *
## 85) ShelveLoc: Medium 40 52.930 Yes ( 0.37500 0.62500 )
## 170) Price < 109.5 16 7.481 Yes ( 0.06250 0.93750 ) *
## 171) Price > 109.5 24 32.600 No ( 0.58333 0.41667 )
## 342) Age < 49.5 13 16.050 Yes ( 0.30769 0.69231 ) *
## 343) Age > 49.5 11 6.702 No ( 0.90909 0.09091 ) *
## 43) Price > 122.5 77 55.540 No ( 0.88312 0.11688 )
## 86) CompPrice < 147.5 58 17.400 No ( 0.96552 0.03448 ) *
## 87) CompPrice > 147.5 19 25.010 No ( 0.63158 0.36842 )
## 174) Price < 147 12 16.300 Yes ( 0.41667 0.58333 )
## 348) CompPrice < 152.5 7 5.742 Yes ( 0.14286 0.85714 ) *
## 349) CompPrice > 152.5 5 5.004 No ( 0.80000 0.20000 ) *
## 175) Price > 147 7 0.000 No ( 1.00000 0.00000 ) *
## 11) Advertising > 13.5 45 61.830 Yes ( 0.44444 0.55556 )
## 22) Age < 54.5 25 25.020 Yes ( 0.20000 0.80000 )
## 44) CompPrice < 130.5 14 18.250 Yes ( 0.35714 0.64286 )
## 88) Income < 100 9 12.370 No ( 0.55556 0.44444 ) *
## 89) Income > 100 5 0.000 Yes ( 0.00000 1.00000 ) *
## 45) CompPrice > 130.5 11 0.000 Yes ( 0.00000 1.00000 ) *
## 23) Age > 54.5 20 22.490 No ( 0.75000 0.25000 )
## 46) CompPrice < 122.5 10 0.000 No ( 1.00000 0.00000 ) *
## 47) CompPrice > 122.5 10 13.860 No ( 0.50000 0.50000 )
## 94) Price < 125 5 0.000 Yes ( 0.00000 1.00000 ) *
## 95) Price > 125 5 0.000 No ( 1.00000 0.00000 ) *
## 3) ShelveLoc: Good 85 90.330 Yes ( 0.22353 0.77647 )
## 6) Price < 135 68 49.260 Yes ( 0.11765 0.88235 )
## 12) US: No 17 22.070 Yes ( 0.35294 0.64706 )
## 24) Price < 109 8 0.000 Yes ( 0.00000 1.00000 ) *
## 25) Price > 109 9 11.460 No ( 0.66667 0.33333 ) *
## 13) US: Yes 51 16.880 Yes ( 0.03922 0.96078 ) *
## 7) Price > 135 17 22.070 No ( 0.64706 0.35294 )
## 14) Income < 46 6 0.000 No ( 1.00000 0.00000 ) *
## 15) Income > 46 11 15.160 Yes ( 0.45455 0.54545 ) *
## Fitting Classification Trees (2)
set.seed(2)
train <- sample(1:nrow(my_Carseats),200)
my_Carseats.test <- my_Carseats[-train,]
High.test <- High[-train]
tree.carseats <- tree(High~.-Sales,my_Carseats,subset = train)
tree.pred <- predict(tree.carseats,my_Carseats.test,type = "class")
table(tree.pred,High.test)
## High.test
## tree.pred No Yes
## No 86 27
## Yes 30 57
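For later comparison with the pruned trees, the test accuracy implied by this table (a quick sketch, not in the original output):
(86 + 57)/200   # 0.715 of the test observations classified correctly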
We pass FUN = prune.misclass to cv.tree() to indicate that the classification error rate should guide the cross-validation and pruning process, rather than the default, which is deviance. In the output below, dev corresponds to the cross-validation error rate.
##Fitting Classification Trees (3)
set.seed(3)
cv.carseats = cv.tree(tree.carseats,FUN = prune.misclass)
cv.carseats
## $size
## [1] 19 17 14 13 9 7 3 2 1
##
## $dev
## [1] 55 55 53 52 50 56 69 65 80
##
## $k
## [1] -Inf 0.0000000 0.6666667 1.0000000 1.7500000 2.0000000
## [7] 4.2500000 5.0000000 23.0000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
##Fitting Classification Trees (4)
{par(mfrow = c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type = "b")
plot(cv.carseats$k,cv.carseats$dev,type = "b")}
##Fitting Classification Trees (5)
prune.carseats = prune.misclass(tree.carseats,best = 9)
{plot(prune.carseats)
text(prune.carseats,pretty = 0)}
tree.pred <- predict(prune.carseats,my_Carseats.test,type = "class")
table(tree.pred,High.test)
## High.test
## tree.pred No Yes
## No 94 24
## Yes 22 60
(94 + 60)/200
## [1] 0.77
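The same accuracy can also be read off the confusion matrix directly (a small convenience sketch):
conf.mat = table(tree.pred, High.test)
sum(diag(conf.mat))/sum(conf.mat)   # overall test accuracy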
prune.carseats <- prune.misclass(tree.carseats,best = 15)
{plot(prune.carseats)
text(prune.carseats,pretty = 0)}
tree.pred <- predict(prune.carseats,my_Carseats.test,type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 86 22
## Yes 30 62
(86 + 62)/200
## [1] 0.74
##Bagging and Random Forest (1)
library(MASS)
#install.packages("randomForest")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston)/2)
boston.test <- Boston[-train,"medv"]
bag.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 13,importance = TRUE)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 10.80817
## % Var explained: 86.91
yhat.bag <- predict(bag.boston,newdata = Boston[-train,])
{plot(yhat.bag,boston.test)
abline(0,1)}
mean((yhat.bag - boston.test)^2)
## [1] 13.444
bag.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 13,ntree = 25)
yhat.bag <- predict(bag.boston,newdata = Boston[-train,])
mean((yhat.bag - boston.test)^2)
## [1] 13.67774
##Bagging and Random Forest (2): variable importance
set.seed(1)
rf.boston <- randomForest(medv~.,data = Boston,subset = train,mtry = 6,importance = TRUE)
yhat.rf <- predict(rf.boston,newdata = Boston[-train,])
mean((yhat.rf - boston.test)^2)
## [1] 11.66454
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 12.132320 986.50338
## zn 1.955579 57.96945
## indus 9.069302 882.78261
## chas 2.210835 45.22941
## nox 11.104823 1044.33776
## rm 31.784033 6359.31971
## age 10.962684 516.82969
## dis 15.015236 1224.11605
## rad 4.118011 95.94586
## tax 8.587932 502.96719
## ptratio 12.503896 830.77523
## black 6.702609 341.30361
## lstat 30.695224 7505.73936
varImpPlot(rf.boston)
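A small sketch (not part of the original output) listing the predictors ranked by %IncMSE:
imp = importance(rf.boston)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]   # most important variables first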
We use the gbm package, and within it the gbm() function, to fit boosted regression trees.
##Boosting (1)
#install.packages("gbm")
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1.3
set.seed(1)
boost.boston <- gbm(medv~., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
##Boosting (2)
{par(mfrow = c(1,2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")}
##Boosting (3)
yhat.boost <- predict(boost.boston,newdata = Boston[-train,],n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 11.84434
boost.boston <- gbm(medv~., data = Boston[train,], distribution = "gaussian", n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost <- predict(boost.boston,newdata = Boston[-train,],n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 11.51109
library("tree")
#1:15 ALL, 16:26 AML
cancer <- read.csv("http://bisyn.kaist.ac.kr/bis335/13-Classification_Adv-ALLnAML3.csv")
cancer$class <- c(rep("ALL",15), rep("AML", 11))
#make B bootstrap samples
bootstrap <- list()
B <- 3
for (i in 1:B) {
bootstrap[[i]] <- cancer[sample(c(1:26), 26, replace = T), ] # random resampling, so the samples may differ from those in part (b)
}
#make decision tree of original sample
cancer[, 4] <- as.factor(cancer[, 4])
treemod <- tree(class ~ TCF15 + IFI44 + SDCBP, cancer, split = "gini")
{plot(treemod)
text(treemod)}
#Pruning
cv.trees <- cv.tree(treemod, FUN = prune.misclass)
plot(cv.trees)
## result: trees of size >= 2 achieve the lowest cross-validation error, so the smallest such tree is kept
prune.trees <- prune.misclass(treemod, best = 2) #thus select size 2
{plot(prune.trees)
text(prune.trees)}
#make decision tree of bootstrap samples
boots.tree <- list()
for (i in 1:B) {
bootstrap[[i]][, 4] <- as.factor(bootstrap[[i]][, 4])
boots.tree[[i]] <- tree(class ~ TCF15 + IFI44 + SDCBP, bootstrap[[i]], split = "gini")
}
{par(mfrow = c(3,1))
plot(boots.tree[[1]])
text(boots.tree[[1]])
plot(boots.tree[[2]])
text(boots.tree[[2]])
plot(boots.tree[[3]])
text(boots.tree[[3]])}
##a) gini index of each terminal node of original tree
# Gini index for a node: G = sum over the k classes of x[i]*(1 - x[i])
gini <- function(x, k){
  G <- 0
  for (i in 1:k) {
    G <- G + x[i]*(1 - x[i])   # x[i] is the proportion of class i in the node
  }
  return(G)
}
treemod ## focus on the rows marked with '*' (terminal nodes)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 26 35.430 ALL ( 0.5769 0.4231 )
## 2) TCF15 < -0.582425 13 11.160 ALL ( 0.8462 0.1538 )
## 4) IFI44 < 0.061325 8 0.000 ALL ( 1.0000 0.0000 ) *
## 5) IFI44 > 0.061325 5 6.730 ALL ( 0.6000 0.4000 ) *
## 3) TCF15 > -0.582425 13 16.050 AML ( 0.3077 0.6923 )
## 6) TCF15 < -0.33372 6 8.318 ALL ( 0.5000 0.5000 ) *
## 7) TCF15 > -0.33372 7 5.742 AML ( 0.1429 0.8571 ) *
gini(c(1,0),2) #0
## [1] 0
gini(c(0.6,0.4),2) #0.48
## [1] 0.48
gini(c(0.5,0.5),2) #0.5
## [1] 0.5
gini(c(0.1429,0.8571),2) #0.2449
## [1] 0.2449592
##b) predict cancer types of three patients using bootstrap trees
test <- cancer[c(9,15,26), ]
treepred1 <- predict(boots.tree[[1]], test, type = "class")
treepred2 <- predict(boots.tree[[2]], test, type = "class")
treepred3 <- predict(boots.tree[[3]], test, type = "class")
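To turn the three bootstrap trees into a single bagged prediction, take a majority vote per patient (an illustrative sketch; this aggregation step is not part of the original code):
votes <- data.frame(treepred1, treepred2, treepred3)
apply(votes, 1, function(v) names(which.max(table(v))))   # majority-vote class for each test patient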
## Method 1
##Support Vector Classifier (1)
set.seed(1)
x = Khan$xtrain
dim(x);
## [1] 63 2308
y = Khan$ytrain
length(y);
## [1] 63
pairs(x[,c(1,2,3)], col = y, labels = c('gene1','gene2','gene3'))
X <- x[y %in% c(1,2), c(1,2,3)]
Y <- factor(y[as.numeric(y) %in% c(1,2)])
X[Y == 1,] <- X[Y == 1,] + 3
pairs(X[,c(1,2,3)],col = Y, labels = c('gene1','gene2','gene3'))
dat <- data.frame(gene = X, type = Y)
#install.packages("e1071")
library(e1071)
svmfit.cost10 <- svm(type~., data = dat, kernel = "linear", cost = 10, scale = FALSE)
svmfit.cost.1 <- svm(type~., data = dat, kernel = "linear", cost = 0.1, scale = FALSE)
##Support Vector Classifier (2)
plot(svmfit.cost10, dat, gene.2 ~ gene.1)
plot(svmfit.cost.1, dat, gene.2 ~ gene.1)
svmfit.cost10$index # the indices of the support vectors
## [1] 5 20 30
svmfit.cost.1$index
## [1] 5 7 8 24 27 30
summary(svmfit.cost10) # you can check the number of support vectors for each class
##
## Call:
## svm(formula = type ~ ., data = dat, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
## gamma: 0.3333333
##
## Number of Support Vectors: 3
##
## ( 2 1 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
##Support Vector Classifier (3)
set.seed(1)
tune.out <- tune(svm,type~.,data = dat,kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.2583333 0.2619078
## 2 1e-02 0.2583333 0.2619078
## 3 1e-01 0.0000000 0.0000000
## 4 1e+00 0.0000000 0.0000000
## 5 5e+00 0.0000000 0.0000000
## 6 1e+01 0.0000000 0.0000000
## 7 1e+02 0.0000000 0.0000000
bestmod <- tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = type ~ ., data = dat, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
## gamma: 0.3333333
##
## Number of Support Vectors: 8
##
## ( 4 4 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
##Support Vector Classifier (4)
xtest <- Khan$xtest
ytest <- Khan$ytest
Xtest <- xtest[ytest %in% c(1,2),]
Ytest <- factor(ytest[ytest %in% c(1,2)])
testdat <- data.frame(gene = Xtest, type = Ytest)
We first predict with the best model selected by cross-validation (cost = 0.1), and then compare against a fit that uses a different value of the cost parameter (cost = 1.0).
ypred <- predict(bestmod, testdat)
table(predict = ypred, truth = testdat$type)
## truth
## predict 1 2
## 1 0 0
## 2 3 6
svmfit <- svm(type~., data = dat, kernel = "linear", cost = 1, scale = FALSE)
ypred <- predict(svmfit,testdat)
table(predict = ypred, truth = testdat$type)
## truth
## predict 1 2
## 1 0 0
## 2 3 6
## Method 2
##Support Vector Machine (1)
set.seed(1)
x <- matrix(rnorm(200*2),ncol = 2)
x[1:100,] <- x[1:100,] + 2
x[101:150,] <- x[101:150,] - 2
y <- c(rep(1, 150), rep(2, 50))
dat <- data.frame(gene = x,type = as.factor(y))
svmfit <- svm(type~., data = dat, kernel = "radial", gamma = 1, cost = 0.6)
plot(svmfit, dat, gene.2 ~ gene.1)
svmfit <- svm(type~., data = dat, kernel = "radial", gamma = 1, cost = 1e5)
plot(svmfit, dat, gene.2 ~ gene.1)
##Support Vector Machine (2)
set.seed(1)
train <- sample(200,150)
tune.out <- tune(svm, type~., data = dat[train,], kernel = "radial", ranges = list(cost = c(0.1,1,10,100), gamma = c(0.5,1,2,4)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 1
##
## - best performance: 0.1066667
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.2400000 0.09532271
## 2 1.0 0.5 0.1200000 0.06885304
## 3 10.0 0.5 0.1133333 0.07062333
## 4 100.0 0.5 0.1466667 0.08195151
## 5 0.1 1.0 0.2000000 0.09428090
## 6 1.0 1.0 0.1066667 0.05621827
## 7 10.0 1.0 0.1400000 0.07336700
## 8 100.0 1.0 0.1533333 0.07730012
## 9 0.1 2.0 0.1866667 0.08195151
## 10 1.0 2.0 0.1200000 0.07568616
## 11 10.0 2.0 0.1333333 0.07027284
## 12 100.0 2.0 0.1466667 0.09322745
## 13 0.1 4.0 0.2400000 0.09532271
## 14 1.0 4.0 0.1400000 0.08577893
## 15 10.0 4.0 0.1466667 0.09838197
## 16 100.0 4.0 0.1600000 0.10517475
table(true = dat[-train,"type"],
pred = predict(tune.out$best.model, newdata = dat[-train,]))
## pred
## true 1 2
## 1 35 1
## 2 3 11
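A quick sketch (not in the original output) of the corresponding test error rate:
mean(predict(tune.out$best.model, newdata = dat[-train,]) != dat[-train, "type"])   # fraction misclassified on the 50 held-out points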
## ROC Curves
#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
rocplot = function(pred, truth, ...){
predob = prediction(pred, truth, label.ordering = c('2','1'))
perf = performance(predob, "tpr", "fpr")
plot(perf, ...)}
# The more flexible model (red line, gamma = 50) tracks the training data more closely; this need not carry over to the test data.
{par(mfrow = c(1,2))
svmfit.opt <- svm(type~., data = dat, kernel = "radial", gamma = 1, cost = 1, decision.values = T)
svmfit.flex <- svm(type~., data = dat, kernel = "radial", gamma = 50, cost = 1, decision.values = T)
fitted <- attributes(predict(svmfit.opt, dat, decision.values = TRUE))$decision.values
rocplot(fitted, dat[,"type"], main = "Training Data")
fitted <- attributes(predict(svmfit.flex, dat, decision.value = T))$decision.values
rocplot(fitted, dat[,"type"], add = T, col = "red")
fitted <- attributes(predict(svmfit.opt, dat[-train,], decision.values = T))$decision.values
rocplot(fitted, dat[-train, "type"], main = "Test Data")
fitted = attributes(predict(svmfit.flex, dat[-train,], decision.values = T))$decision.values
rocplot(fitted, dat[-train, "type"], add = T, col = "red")}
##Application to Expression Data [Khan]
library(ISLR)
names(Khan)
## [1] "xtrain" "xtest" "ytrain" "ytest"
dim(Khan$xtrain)
## [1] 63 2308
dim(Khan$xtest)
## [1] 20 2308
length(Khan$ytrain)
## [1] 63
length(Khan$ytest)
## [1] 20
table(Khan$ytrain)
##
## 1 2 3 4
## 8 23 12 20
table(Khan$ytest)
##
## 1 2 3 4
## 3 6 6 5
##SVM with Multiple Classes
dat <- data.frame(x = Khan$xtrain, y = as.factor(Khan$ytrain))
out <- svm(y~., data = dat, kernel = "linear", cost = 10)
summary(out)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
## gamma: 0.0004332756
##
## Number of Support Vectors: 58
##
## ( 20 20 11 7 )
##
##
## Number of Classes: 4
##
## Levels:
## 1 2 3 4
table(out$fitted, dat$y)
##
## 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
dat.te <- data.frame(x = Khan$xtest, y = as.factor(Khan$ytest))
pred.te <- predict(out, newdata = dat.te)
table(pred.te, dat.te$y)
##
## pred.te 1 2 3 4
## 1 3 0 0 0
## 2 0 6 2 0
## 3 0 0 4 0
## 4 0 0 0 5
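The overall test error rate for the multiclass SVM (a brief sketch, not shown in the original output):
mean(as.character(pred.te) != as.character(dat.te$y))   # proportion of the 20 test observations misclassified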
##Bootstrap (Auto data)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:survival':
##
## aml
boot.fn = function(data, index)
coefficients(lm(mpg ~ horsepower + I(horsepower^2), data = data, subset = index))
set.seed(1)
boot(Auto, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 56.900099702 6.098115e-03 2.0944855842
## t2* -0.466189630 -1.777108e-04 0.0334123802
## t3* 0.001230536 1.324315e-06 0.0001208339
summary(lm(mpg ~ horsepower + I(horsepower^2),data = Auto))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.900099702 1.8004268063 31.60367 1.740911e-109
## horsepower -0.466189630 0.0311246171 -14.97816 2.289429e-40
## I(horsepower^2) 0.001230536 0.0001220759 10.08009 2.196340e-21