12/12/2016

Decision Trees with R

Reference document

Using the party package

  • Learn to use the ctree() function from the party package to build a decision tree for the iris dataset;
  • Four attributes:
    (1) Sepal.Length: sepal length
    (2) Sepal.Width: sepal width
    (3) Petal.Length: petal length
    (4) Petal.Width: petal width
  • These four attributes are used to predict the Species of the iris flowers;
  • In this package, ctree() builds the decision tree and predict() makes predictions for another dataset;
  • Before the model is built, the iris dataset is split into two subsets: a training set (70%) and a test set (30%). Fixing the random seed makes the randomly selected split reproducible; see the brief illustration below.
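  • As a quick illustration of the reproducibility point above: re-setting the seed to the same value before sample() reproduces exactly the same draw (a minimal sketch, separate from the iris workflow below)
set.seed(1234); sample(2, 5, replace = TRUE)  # some sequence of 1s and 2s
set.seed(1234); sample(2, 5, replace = TRUE)  # exactly the same sequence again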

  • Examine the structure of the dataset
> str(iris)

'data.frame':    150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  • Set the random seed
> set.seed(1234)
  • Use sample() to assign observations to the two subsets: it draws a vector of 1s and 2s of length nrow(iris), with probabilities 0.7 and 0.3
> ind <- sample(2,nrow(iris),replace=TRUE,prob = c(0.7,0.3))
  • Training set & test set
> trainData <- iris[ind == 1,]
> testData <- iris[ind == 2,]
  • (Install and) load the packages
> install.packages("party")  # may need to be run twice

> library(grid)
> library(mvtnorm)
> library(stats4)
> library(modeltools)
> library(zoo)
> library(strucchange)
> library(sandwich)
> library(party)
  • Specify the target variable and the independent variables of the decision tree
# the '~' symbol connects the left- and right-hand sides of a formula
> myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
  • Build the decision tree
> iris_ctree <- ctree(myFormula, data=trainData)
  • Check the results on the training set
> table(predict(iris_ctree),trainData$Species)

             setosa versicolor virginica
  setosa         40          0         0
  versicolor      0         37         3
  virginica       0          1        31
  • Print the decision tree
> print(iris_ctree)

     Conditional inference tree with 4 terminal nodes

Response:  Species 
Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
Number of observations:  112 

1) Petal.Length <= 1.9; criterion = 1, statistic = 104.643
  2)*  weights = 40 
1) Petal.Length > 1.9
  3) Petal.Width <= 1.7; criterion = 1, statistic = 48.939
    4) Petal.Length <= 4.4; criterion = 0.974, statistic = 7.397
      5)*  weights = 21 
    4) Petal.Length > 4.4
      6)*  weights = 19 
  3) Petal.Width > 1.7
    7)*  weights = 32 
  • Plot the tree
> plot(iris_ctree)
> plot(iris_ctree,type="simple")

  • Evaluate the decision tree model on the test set
> testPred <- predict(iris_ctree, newdata = testData)
> table(testPred, testData$Species)

testPred     setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         12         2
  virginica       0          0        14
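  • From the confusion matrix above, the overall test accuracy follows directly: 36 of the 38 test observations are classified correctly (a minimal sketch; testTab is a name introduced here just for illustration)
testTab <- table(testPred, testData$Species)
sum(diag(testTab)) / sum(testTab)  # overall test accuracy: 36/38, about 0.947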
  • Code summary
str(iris)
set.seed(1234)

ind <- sample(2,nrow(iris),replace=TRUE,prob = c(0.7,0.3))
trainData <- iris[ind == 1,]
testData <- iris[ind == 2,]

library(grid)
library(mvtnorm)
library(stats4)
library(modeltools)
library(zoo)
library(strucchange)
library(sandwich)
library(party)

myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
iris_ctree <- ctree(myFormula, data=trainData)

table(predict(iris_ctree),trainData$Species)

print(iris_ctree)
plot(iris_ctree)
plot(iris_ctree,type="simple")

testPred <- predict(iris_ctree, newdata = testData)
table(testPred, testData$Species)

Using the rpart package

  • Build a decision tree on the bodyfat dataset;
  • The rpart() function builds a decision tree, and the subtree with the minimum prediction error can be selected from it;
  • The resulting tree is then used with predict() to make predictions for another dataset;

  • Load the bodyfat dataset and inspect some of its attributes
> data("bodyfat", package = "TH.data")
> dim(bodyfat)  # dimensions: rows x columns

[1] 71 10
> attributes(bodyfat)

$names
 [1] "age"          "DEXfat"       "waistcirc"   
 [4] "hipcirc"      "elbowbreadth" "kneebreadth" 
 [7] "anthro3a"     "anthro3b"     "anthro3c"    
[10] "anthro4"     

$row.names
 [1] "47"  "48"  "49"  "50"  "51"  "52"  "53"  "54"  "55" 
[10] "56"  "57"  "58"  "59"  "60"  "61"  "62"  "63"  "64" 
[19] "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72"  "73" 
[28] "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82" 
[37] "83"  "84"  "85"  "86"  "87"  "88"  "89"  "90"  "91" 
[46] "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100"
[55] "101" "102" "103" "104" "105" "106" "107" "108" "109"
[64] "110" "111" "112" "113" "114" "115" "116" "117"

$class
[1] "data.frame"
> bodyfat[1:5,]

   age DEXfat waistcirc hipcirc elbowbreadth kneebreadth anthro3a anthro3b anthro3c anthro4
47  57  41.68     100.0   112.0          7.1         9.4     4.42     4.95     4.50    6.13
48  65  43.29      99.5   116.5          6.5         8.9     4.63     5.01     4.48    6.37
49  59  35.41      96.0   108.5          6.2         8.9     4.12     4.74     4.60    5.82
50  58  22.79      72.0    96.5          6.1         9.2     4.03     4.48     3.91    5.66
51  60  36.42      89.5   100.5          7.1        10.0     4.24     4.68     4.15    5.91
> set.seed(1234)
> ind <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7,0.3))
> bodyfat.train <- bodyfat[ind == 1,]
> bodyfat.test <- bodyfat[ind == 2,]
> library(rpart)
  • Write the formula
> myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
  • Train the decision tree (minsplit = 10 means a node must contain at least 10 observations before a split is attempted)
> bodyfat_rpart <- rpart(myFormula, data = bodyfat.train, control = rpart.control(minsplit = 10))
  • Plot the decision tree
> library(rattle)
> library(rpart.plot)
> library(RColorBrewer)
> fancyRpartPlot(bodyfat_rpart)
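
  • Before pruning, the complexity-parameter table that drives the next step can be inspected directly (a minimal sketch using rpart's printcp()/plotcp(); the exact values depend on the random split)
printcp(bodyfat_rpart)  # cp table: 'xerror' is the cross-validated error of each subtree
plotcp(bodyfat_rpart)   # plot xerror against cp to see where pruning should stop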

  • Select the subtree with the minimum cross-validated prediction error (xerror) to prune and optimize the model
> opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
> cp <- bodyfat_rpart$cptable[opt,"CP"]
> bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
> fancyRpartPlot(bodyfat_prune)

  • Evaluate on the test set and compare the predictions with the observed values
> DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat.test)
> xlim <- range(bodyfat$DEXfat)
> plot(DEXfat_pred ~ DEXfat, data = bodyfat.test, xlab = "Observed", ylab = "Predicted", ylim = xlim, xlim = xlim)
> abline(a = 0, b = 1)
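
  • Beyond the scatter plot, the fit on the test set can also be summarized numerically (a minimal sketch; rmse is a name introduced here just for illustration)
rmse <- sqrt(mean((DEXfat_pred - bodyfat.test$DEXfat)^2))  # root-mean-square error on the test set
rmse
cor(DEXfat_pred, bodyfat.test$DEXfat)                      # correlation between predicted and observed values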

  • Code summary
data("bodyfat", package = "TH.data")
dim(bodyfat)

attributes(bodyfat)

bodyfat[1:5,]

set.seed(1234)
ind <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7,0.3))
bodyfat.train <- bodyfat[ind == 1,]
bodyfat.test <- bodyfat[ind == 2,]
library(rpart)

myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth

bodyfat_rpart <- rpart(myFormula, data = bodyfat.train, control = rpart.control(minsplit = 10))

library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(bodyfat_rpart)

opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
cp <- bodyfat_rpart$cptable[opt,"CP"]
bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
fancyRpartPlot(bodyfat_prune)

DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat.test)
xlim <- range(bodyfat$DEXfat)
plot(DEXfat_pred ~ DEXfat, data = bodyfat.test, xlab = "Observed", ylab = "Predicted", ylim = xlim, xlim = xlim)
abline(a = 0, b = 1)

Using random forests

  • Here the randomForest package is used to build a predictive model on the iris data;
  • The randomForest() function has two limitations: first, it cannot handle missing values, so the user must impute them before calling the function (see the sketch after this list); second, a categorical attribute may have at most 32 levels, and attributes with more levels must be transformed before the function is used;
  • Alternatively, a random forest can be built with cforest() from the party package, which is not subject to this limit on the number of levels; even so, high-cardinality categorical attributes make it consume a large amount of memory and time when building the forest;
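  • iris has no missing values, but for datasets that do, the randomForest package itself provides two imputation helpers that can be applied before training: na.roughfix() (median/mode imputation) and rfImpute() (proximity-based imputation). A minimal sketch, where df and its factor response column y are hypothetical names, not objects used elsewhere in this post:
# Hypothetical data frame 'df' with NAs in the predictors and a complete factor response 'y'
df_fixed   <- na.roughfix(df)             # replace NAs with column medians (numeric) or modes (factors)
df_imputed <- rfImpute(y ~ ., data = df)  # impute NAs using random-forest proximities (y must have no NAs)
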
> ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
> trainData <- iris[ind == 1,]
> testData <- iris[ind == 2,]
> install.packages("randomForest")
> library(randomForest)
> myFormula <- Species ~ .  # 'Species ~ .' means 'Species' is modeled as a function of all the other attributes
> rf <- randomForest(myFormula, data = trainData, ntree = 100, proximity = TRUE)
  • Check the results on the training set
> table(predict(rf), trainData$Species)

             setosa versicolor virginica
  setosa         36          0         0
  versicolor      0         31         2
  virginica       0          1        34
> print(rf)

Call:
 randomForest(formula = myFormula, data = trainData, ntree = 100,      proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 2

        OOB estimate of  error rate: 2.88%
Confusion matrix:
           setosa versicolor virginica class.error
setosa         36          0         0  0.00000000
versicolor      0         31         1  0.03125000
virginica       0          2        34  0.05555556
> irisPred <- predict(rf, newdata = testData)
> table(irisPred, testData$Species)

irisPred     setosa versicolor virginica
  setosa         14          0         0
  versicolor      0         17         3
  virginica       0          1        11
  • Plot the margin of each observation: the fraction of votes for the correct class minus the largest fraction of votes for any other class, so a positive margin means the observation was classified correctly
> plot(margin(rf, testData$Species))
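
  • A few additional diagnostics of the fitted forest may be worth looking at (a minimal sketch using standard randomForest functions; the output depends on the random split)
importance(rf)   # variable importance (mean decrease in Gini) of each attribute
varImpPlot(rf)   # plot the importance values
plot(rf)         # OOB error rate as the number of trees grows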

  • Code summary
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind == 1,]
testData <- iris[ind == 2,]

install.packages("randomForest")
library(randomForest)

myFormula <- Species ~ .
rf <- randomForest(myFormula, data = trainData, ntree = 100, proximity = TRUE)
table(predict(rf), trainData$Species)

print(rf)

irisPred <- predict(rf, newdata = testData)
table(irisPred, testData$Species)

plot(margin(rf, testData$Species))
