Reference
Using the party package
- Learn to use the ctree() function in the party package to build a decision tree for the iris dataset;
- The dataset has four predictor attributes:
(1) Sepal.Length: sepal length
(2) Sepal.Width: sepal width
(3) Petal.Length: petal length
(4) Petal.Width: petal width
- These four attributes are used to predict the Species (class) of the iris flowers;
- In this package, the ctree() function builds the decision tree and the predict() function makes predictions on another dataset;
- Before the model is built, the iris dataset is split into two subsets: a training set (70%) and a test set (30%). Fixing the random seed makes the random sampling reproducible;
> str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
> set.seed(1234)
> ind <- sample(2,nrow(iris),replace=TRUE,prob = c(0.7,0.3))
> trainData <- iris[ind == 1,]
> testData <- iris[ind == 2,]
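As an optional sanity check (not part of the original session), the split can be verified: the sampled indicator should be roughly 70% ones and 30% twos, and each subset should still contain all three species.

# Optional check, not in the original post
prop.table(table(ind))        # approximate 70/30 split
table(trainData$Species)      # class counts in the training set
table(testData$Species)       # class counts in the test set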
> install.packages("party")
> library(grid)
> library(mvtnorm)
> library(stats4)
> library(modeltools)
> library(zoo)
> library(strucchange)
> library(sandwich)
> library(party)
> myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
> iris_ctree <- ctree(myFormula, data=trainData)
> table(predict(iris_ctree),trainData$Species)
             setosa versicolor virginica
  setosa         40          0         0
  versicolor      0         37         3
  virginica       0          1        31
> print(iris_ctree)
Conditional inference tree with 4 terminal nodes
Response: Species
Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
Number of observations: 112
1) Petal.Length <= 1.9; criterion = 1, statistic = 104.643
  2)* weights = 40
1) Petal.Length > 1.9
  3) Petal.Width <= 1.7; criterion = 1, statistic = 48.939
    4) Petal.Length <= 4.4; criterion = 0.974, statistic = 7.397
      5)* weights = 21
    4) Petal.Length > 4.4
      6)* weights = 19
  3) Petal.Width > 1.7
    7)* weights = 32
> plot(iris_ctree)
> plot(iris_ctree,type="simple")


> testPred <- predict(iris_ctree, newdata = testData)
> table(testPred, testData$Species)
testPred     setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         12         2
  virginica       0          0        14
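A small follow-up sketch (not in the original post): the two confusion matrices above can be reduced to overall accuracy figures with one line each.

# Not in the original post: overall accuracy on each set
mean(predict(iris_ctree) == trainData$Species)                      # (40+37+31)/112, about 0.96
mean(predict(iris_ctree, newdata = testData) == testData$Species)   # (10+12+14)/38,  about 0.95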
Using the rpart package
- Build a decision tree on the bodyfat dataset;
- The rpart() function builds a decision tree, and the subtree with the smallest cross-validated prediction error can then be selected by pruning;
- The pruned tree is used with the predict() function to make predictions on another dataset;
> data("bodyfat", package = "TH.data")
> dim(bodyfat)
[1] 71 10
> attributes(bodyfat)
$names
[1] "age" "DEXfat" "waistcirc"
[4] "hipcirc" "elbowbreadth" "kneebreadth"
[7] "anthro3a" "anthro3b" "anthro3c"
[10] "anthro4"
$row.names
[1] "47" "48" "49" "50" "51" "52" "53" "54" "55"
[10] "56" "57" "58" "59" "60" "61" "62" "63" "64"
[19] "65" "66" "67" "68" "69" "70" "71" "72" "73"
[28] "74" "75" "76" "77" "78" "79" "80" "81" "82"
[37] "83" "84" "85" "86" "87" "88" "89" "90" "91"
[46] "92" "93" "94" "95" "96" "97" "98" "99" "100"
[55] "101" "102" "103" "104" "105" "106" "107" "108" "109"
[64] "110" "111" "112" "113" "114" "115" "116" "117"
$class
[1] "data.frame"
> bodyfat[1:5,]
   age DEXfat waistcirc hipcirc elbowbreadth kneebreadth anthro3a anthro3b anthro3c anthro4
47  57  41.68     100.0   112.0          7.1         9.4     4.42     4.95     4.50    6.13
48  65  43.29      99.5   116.5          6.5         8.9     4.63     5.01     4.48    6.37
49  59  35.41      96.0   108.5          6.2         8.9     4.12     4.74     4.60    5.82
50  58  22.79      72.0    96.5          6.1         9.2     4.03     4.48     3.91    5.66
51  60  36.42      89.5   100.5          7.1        10.0     4.24     4.68     4.15    5.91
> set.seed(1234)
> ind <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7,0.3))
> bodyfat.train <- bodyfat[ind == 1,]
> bodyfat.test <- bodyfat[ind == 2,]
> library(rpart)
> myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
> bodyfat_rpart <- rpart(myFormula, data = bodyfat.train, control = rpart.control(minsplit = 10))
> library(rattle)
> library(rpart.plot)
> library(RColorBrewer)
> fancyRpartPlot(bodyfat_rpart)
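Before the tree is pruned in the next step, it can help to inspect the complexity-parameter table that which.min() reads from. This call uses the standard rpart function printcp() but was not part of the original post.

# Not in the original post: show the cp table; pruning below picks the
# row with the smallest cross-validated error (xerror)
printcp(bodyfat_rpart)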

> opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
> cp <- bodyfat_rpart$cptable[opt,"CP"]
> bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
> fancyRpartPlot(bodyfat_prune)

> DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat.test)
> xlim <- range(bodyfat$DEXfat)
> plot(DEXfat_pred ~ DEXfat, data = bodyfat.test, xlab = "Observed", ylab = "Predicted", ylim = xlim, xlim = xlim)
> abline(a = 0, b = 1)
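As a numeric complement to the scatter plot (not in the original post), the test-set fit can be summarized with the root mean squared error and the squared correlation between predicted and observed DEXfat.

# Not in the original post: numeric summary of test-set fit
rmse <- sqrt(mean((DEXfat_pred - bodyfat.test$DEXfat)^2))
r2   <- cor(DEXfat_pred, bodyfat.test$DEXfat)^2
c(RMSE = rmse, R.squared = r2)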

data("bodyfat", package = "TH.data")
dim(bodyfat)
attributes(bodyfat)
body[1:5,]
set.seed(1234)
ind <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7,0.3))
bodyfat.train <- bodyfat[ind == 1,]
bodyfat.test <- bodyfat[ind == 2,]
library(rpart)
myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
bodyfat_rpart <- rpart(myFormula, data = bodyfat.train, control = rpart.control(minsplit = 10))
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(bodyfat_rpart)
opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
cp <- bodyfat_rpart$cptable[opt,"CP"]
bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
fancyRpartPlot(bodyfat_prune)
DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat.test)
xlim <- range(bodyfat$DEXfat)
plot(DEXfat_pred ~ DEXfat, data = bodyfat.test, xlab = "Observed", ylab = "Predicted", ylim = xlim, xlim = xlim)
abline(a = 0, b = 1)
Using random forests
- Here the randomForest package is used to build a predictive model on the iris data;
- The randomForest() function has two shortcomings: first, it cannot handle missing values, so the user must impute them before calling the function; second, a categorical attribute may have at most 32 levels, so attributes exceeding this limit must be transformed before use;
- Alternatively, a random forest can be built with the cforest() function from the party package, which does not impose this limit on the number of levels; even so, high-cardinality categorical attributes make it consume a great deal of memory and time when the forest is built (a sketch of this alternative is given at the end of this section);
> ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
> trainData <- iris[ind == 1,]
> testData <- iris[ind == 2,]
> install.packages("randomForest")
> library(randomForest)
> myFormula <- Species ~ .
> rf <- randomForest(myFormula, data = trainData, ntree = 100, proximity = TRUE)
> table(predict(rf), trainData$Species)

             setosa versicolor virginica
  setosa         36          0         0
  versicolor      0         31         2
  virginica       0          1        34
> print(rf)
Call:
randomForest(formula = myFormula, data = trainData, ntree = 100, proximity = TRUE)
Type of random forest: classification
Number of trees: 100
No. of variables tried at each split: 2
OOB estimate of error rate: 2.88%
Confusion matrix:
           setosa versicolor virginica class.error
setosa         36          0         0  0.00000000
versicolor      0         31         1  0.03125000
virginica       0          2        34  0.05555556
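Not shown in the original post, but a natural follow-up at this point: randomForest records per-variable importance scores (MeanDecreaseGini for classification) that can be printed or plotted.

# Not in the original post: which attributes does the forest rely on?
importance(rf)
varImpPlot(rf)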
> irisPred <- predict(rf, newdata = testData)
> table(irisPred, testData$Species)
irisPred     setosa versicolor virginica
  setosa         14          0         0
  versicolor      0         17         3
  virginica       0          1        11
> plot(margin(rf, testData$Species))
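Finally, a minimal sketch of the cforest() alternative mentioned in the list above, fitted on the same train/test split. This block is not from the original post, and the ntree and mtry values are illustrative choices only.

# Sketch only, not from the original post: random forest via party::cforest()
library(party)
cf <- cforest(Species ~ ., data = trainData,
              controls = cforest_unbiased(ntree = 100, mtry = 2))
table(predict(cf, newdata = testData), testData$Species)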
