クラスター分析手法の比較

昨日の続きをすこしだけ。
クラスター分析の手法による結果の違いを確認してみたいと思います。
ウォード法、K-means法と{cluster}パッケージに入っているPAM(Partitioning Around Medoids)という手法を比較してみます。

data(iris)
# Use only petal length and petal width (columns 3 and 4) so that the
# clustering results can be inspected in two dimensions.
# Ward's method. "ward.D" is the current name of the method that used to be
# called "ward"; the old name is still accepted but emits a rename message on
# every run. ("ward.D2" would apply Ward's criterion to unsquared distances,
# but that can change the clustering, so the original method is kept here.)
WA <- cutree(hclust(dist(iris[, 3:4]), method = "ward.D"), k = 3)
# K-means. kmeans() picks its starting centers at random, so fix the seed to
# make the cluster labels reproducible from run to run.
set.seed(123)
KM <- kmeans(iris[, 3:4], centers = 3, nstart = 100)$cluster
# PAM (Partitioning Around Medoids), using the Manhattan distance.
# cluster.only = TRUE makes pam() return just the cluster assignment vector.
library(cluster)
PMM <- pam(
  x = iris[, c(3, 4)],
  k = 3,
  metric = "manhattan",
  cluster.only = TRUE
)

# Inspect the results as scatter plots on a 2x2 grid.
# The plotting symbol always encodes the true species; only the colour changes
# between panels (true species, then each clustering result).
par(mfrow = c(2, 2))
invisible(Map(
  function(colours, title) {
    plot(iris[, 3:4], pch = c(15, 16, 17)[iris[, 5]], col = colours, main = title)
  },
  list(iris[, 5], WA, KM, PMM),
  c("正解", "ウォード法", "K-means法", "PAM（マンハッタン距離）")
))

# Inspect the results as cross tables (rows: true species, columns: cluster).
# The commented output below is from the original run; cluster numbers are
# arbitrary labels, so the column order may differ on another run.
(WAT <- table(iris$Species, WA)) # Ward's method
#           WA
#             1  2  3
# setosa     50  0  0
# versicolor  0 45  5
# virginica   0  1 49

(KMT <- table(iris$Species, KM)) # K-means
#           KM
#             1  2  3
# setosa      0  0 50
# versicolor 48  2  0
# virginica   4 46  0

(PMMT <- table(iris$Species, PMM)) # PAM
#           PMM
#             1  2  3
# setosa     50  0  0
# versicolor  0 50  0
# virginica   0  7 43

# Normalized entropy of a two-way contingency table.
#
# For every row of `ct`, calcEntropy0() gives sum(p * log(p)) over that row's
# proportions. The rows are averaged with weights equal to each row's share of
# the grand total, the sign is flipped, and the result is divided by
# log(number of columns).
#
# ct: a two-way table or matrix of counts.
# Returns a single number; smaller means the rows are less mixed.
calcEntropy <- function(ct) {
  row_weight <- apply(ct, 1, sum) / sum(ct)
  row_term <- apply(ct, 1, calcEntropy0)
  -sum(row_weight * row_term) / log(ncol(ct))
}

# Negative entropy term of one count vector: sum(p * log(p)) over the non-zero
# proportions p = pv / sum(pv). The result is <= 0; the caller flips the sign.
#
# pv: numeric vector of counts (for example, one row of a contingency table).
# Returns a single number. An all-zero (or empty) vector returns 0 rather than
# NA: dividing by a zero total would give NaN proportions, which would then
# turn the whole-table entropy into NA as soon as one row is empty.
calcEntropy0 <- function(pv) {
  total <- sum(pv)
  # Guard only the exact-zero total; an NA total falls through and propagates
  # NA as before.
  if (!is.na(total) && total == 0) {
    return(0)
  }
  p1 <- pv / total
  # Drop zero proportions, because 0 * log(0) is NaN in floating point while
  # its limit is 0.
  p2 <- p1[p1 != 0]
  sum(p2 * log(p2))
}

# Purity of a two-way contingency table: the largest count in each row,
# divided by the grand total and summed over the rows.
#
# ct: a two-way table or matrix of counts.
# Returns a single number; larger means the rows are less mixed.
# NOTE(review): the maximum is taken per row. If rows are the true classes and
# columns are the clusters, purity is conventionally taken per cluster
# (column) instead -- confirm the intended orientation.
calcPurity <- function(ct) {
  grand_total <- sum(ct)
  row_best <- apply(ct, 1, max)
  sum(row_best / grand_total)
}

# Entropy: smaller values indicate a better clustering result.
# The commented values below are the output recorded from the original run.
calcEntropy(WAT) # Ward's method
# [1] 0.1283808
calcEntropy(KMT) # K-means
# [1] 0.1355387
calcEntropy(PMMT) # PAM
# [1] 0.1228712

# Purity: larger values indicate a better clustering result.
calcPurity(WAT) # Ward's method
# [1] 0.96
calcPurity(KMT) # K-means
# [1] 0.96
calcPurity(PMMT) # PAM
# [1] 0.9533333

エントロピーで見るとPAMが良いですが、純度ではウォード法とK-means法が良いようですね。
でも大差ない感じ。
もっと“汚い”データだと明確に違いが出ると思います。