# Hiyerarşik Kümeleme

# Load the built-in USArrests data set and keep a data-frame copy in `arrests`.
# data() returns only the *name* of the data set (a character string), so its
# return value must not be assigned; the data itself is the object `USArrests`
# that data() places in the workspace.
data("USArrests")
arrests <- as.data.frame(USArrests)
# The spreadsheet-style viewer is only useful in an interactive session, so
# it is skipped when the script is run non-interactively.
if (interactive()) {
  View(arrests)
}
?USArrests
## starting httpd help server ... done

# Uzaklıkları hesaplama

# Pairwise Euclidean distances between the rows of `arrests`.
# NOTE(review): the columns are used as-is (no scaling), whereas the agnes()
# and diana() calls later in this file pass stand = TRUE -- confirm that
# clustering on the raw values is intended.
res.dist <- dist(arrests, method = "euclidean")

# Agglomerative clustering of the same distance object under four linkage
# criteria; each fit is followed by its base-graphics dendrogram.

# Ward linkage (ward.D2 variant).
res.hc <- hclust(res.dist, method = "ward.D2")
?hclust
plot(res.hc)

# Average linkage. The method name is spelled out in full rather than relying
# on hclust()'s partial matching of "ave"; it selects the same method and
# matches the later re-fit in this file.
res.hc2 <- hclust(res.dist, method = "average")
plot(res.hc2)

# Complete linkage.
res.hc3 <- hclust(res.dist, method = "complete")
plot(res.hc3)

# Single linkage.
res.hc4 <- hclust(res.dist, method = "single")
plot(res.hc4)

# factoextra supplies fviz_dend(), a ggplot2-based dendrogram plot.
library("factoextra")
## Warning: package 'factoextra' was built under R version 3.6.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Dendrogram of the Ward-linkage tree; `cex` sets the label size.
fviz_dend(
  x = res.hc,
  cex = 0.5
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.


# The cophenetic distance between two observations is the height in the
# dendrogram at which the two branches containing them merge into one branch.
# The height of the tree is called the cophenetic distance.
res.coph <- cophenetic(x = res.hc)

# Correlation between the original distances and the cophenetic distances:
# the closer the coefficient is to 1, the more faithfully the clustering
# reflects the data. Values above 0.75 are good.
cor(x = res.dist, y = res.coph)
## [1] 0.7609613
# Cut the Ward-linkage tree into 4 clusters and show each row's cluster id.
group <- cutree(tree = res.hc, k = 4)
group
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              2              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              1              1              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              4              3              1              3              4 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              4              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              2              1              4              1              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              1              4              2 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              4              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              2              2              3              2              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              4              2              2              3              4 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              2              2              4              4              2
# Number of observations in each cluster.
table(group)
## group
##  1  2  3  4 
## 16 14 10 10
# Row names of the observations assigned to the second cluster.
rownames(arrests)[which(group == 2)]
##  [1] "Arkansas"      "Colorado"      "Georgia"       "Massachusetts"
##  [5] "Missouri"      "New Jersey"    "Oklahoma"      "Oregon"       
##  [9] "Rhode Island"  "Tennessee"     "Texas"         "Virginia"     
## [13] "Washington"    "Wyoming"
# Same Ward-linkage dendrogram, cut into 4 clusters with one colour each.
fviz_dend(
  x = res.hc,
  k = 4,                                            # number of clusters
  k_colors = c("blue", "orange", "red", "black"),   # one colour per cluster
  color_labels_by_k = TRUE,                         # colour labels by cluster
  rect = TRUE,                                      # box around each cluster
  cex = 0.5                                         # label size
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Re-fit the average-linkage tree and report the correlation between the
# original distances and its cophenetic distances.
res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))
## [1] 0.7658983
# Complete-linkage dendrogram cut into 3 clusters. k_colors supplies exactly
# one colour per cluster: the previous 4-colour vector produced a "Length of
# color vector was longer than the number of clusters" warning and only its
# first 3 entries were used, so the plot itself is unchanged.
# NOTE(review): this plots res.hc3 (complete linkage) directly after
# evaluating res.hc2 (average linkage) -- confirm which tree was intended.
fviz_dend(res.hc3, k = 3, cex = 0.5, k_colors = c("blue", "orange", "red"))
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Agglomerative clustering works bottom-up.
library("cluster")
## Warning: package 'cluster' was built under R version 3.6.2
# AGNES (Agglomerative Nesting), run directly on the USArrests data with
# standardised variables, Euclidean distance and Ward linkage.
res.agnes <- agnes(
  x = USArrests,
  metric = "euclidean",
  stand = TRUE,
  method = "ward"
)
# Dendrogram of the AGNES result without a cut.
fviz_dend(x = res.agnes, cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Same dendrogram cut into 3 clusters.
fviz_dend(x = res.agnes, k = 3, cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# DIANA (DIvisive ANAlysis clustering), run directly on the USArrests data
# with standardised variables and Euclidean distance.
res.diana <- diana(
  x = USArrests,
  metric = "euclidean",
  stand = TRUE
)
# Dendrogram of the DIANA result.
fviz_dend(x = res.diana, cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.