# Load the data set from cancer.csv, keeping text columns as character vectors.
cancer <- read.csv("cancer.csv", stringsAsFactors = FALSE)
# View() opens a data viewer window, which is only useful (and only reliably
# available) in an interactive session, so skip it in batch runs.
if (interactive()) {
  View(cancer)
}
# Print the column names and types of the raw data.
str(cancer)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
# The first column, 'id', is a unique identifier and carries no useful
# information for classification; the last column, 'X', shows only NA in the
# str() output above. Both are dropped by name so the selection does not
# depend on the columns sitting at positions 1 and 33.
cancer1 <- cancer[, !(names(cancer) %in% c("id", "X"))]
# View() opens a data viewer window; only call it in an interactive session.
if (interactive()) {
  View(cancer1)
}
str(cancer1)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Per-column summary statistics of the cleaned data set.
summary(object = cancer1)
## diagnosis radius_mean texture_mean perimeter_mean
## Length:569 Min. : 6.981 Min. : 9.71 Min. : 43.79
## Class :character 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Mode :character Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean radius_se
## Min. :0.00000 Min. :0.1060 Min. :0.04996 Min. :0.1115
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324
## Median :0.03350 Median :0.1792 Median :0.06154 Median :0.3242
## Mean :0.04892 Mean :0.1812 Mean :0.06280 Mean :0.4052
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789
## Max. :0.20120 Max. :0.3040 Max. :0.09744 Max. :2.8730
## texture_se perimeter_se area_se smoothness_se
## Min. :0.3602 Min. : 0.757 Min. : 6.802 Min. :0.001713
## 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169
## Median :1.1080 Median : 2.287 Median : 24.530 Median :0.006380
## Mean :1.2169 Mean : 2.866 Mean : 40.337 Mean :0.007041
## 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146
## Max. :4.8850 Max. :21.980 Max. :542.200 Max. :0.031130
## compactness_se concavity_se concave.points_se symmetry_se
## Min. :0.002252 Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
## Median :0.020450 Median :0.02589 Median :0.010930 Median :0.018730
## Mean :0.025478 Mean :0.03189 Mean :0.011796 Mean :0.020542
## 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
## Max. :0.135400 Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave.points_worst symmetry_worst fractal_dimension_worst
## Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
# diagnosis(tanı), hedef değişkenimiz (nitelik).
# Number of patients per diagnosis code.
table(cancer1[["diagnosis"]])
##
## B M
## 357 212
# Recode the target as a factor: "B" becomes "Benign", "M" becomes "Malignant".
cancer1[["diagnosis"]] <- factor(
  cancer1[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Class shares as percentages, rounded to one decimal place.
round(100 * prop.table(table(cancer1[["diagnosis"]])), digits = 1)
##
## Benign Malignant
## 62.7 37.3
str(cancer1)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# All 30 remaining features are numeric.
# Look at the distribution of three arbitrarily chosen features:
summary(cancer1[, c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
# smoothness_mean 0,05 ile 0.16, area_mean 143.5 ile 2501.0 arasında değişirken,
# area_mean etkisi mesafe hesaplamasında smoothness_meanden çok daha büyük olacaktır.
# Sınıflandırıcımız için sorun var, bu yüzden özellikleri yeniden ölçeklendirmek için normalleştirme uygulayalım
# Min-max normalization: linearly rescale a numeric vector to [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, NA values are ignored when finding the range and stay NA
#          in the result. The default (FALSE) keeps the original behaviour,
#          where a single NA makes the whole result NA.
#
# Returns a numeric vector of the same length as x. A constant input (max
# equal to min) yields zeros instead of the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  # Nothing to rescale; also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Zero-width range: every non-missing value equals the minimum, so map it to 0.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# With normalize() defined, rescale every numeric feature of the data set.
# Rather than normalizing each of the 30 feature columns one by one, lapply()
# applies normalize() to every column and as.data.frame() turns the resulting
# list back into a data frame. The "_n" suffix is a reminder that the values
# are normalized.
# 'diagnosis' (the first column of cancer1) is the non-numeric target, so it
# is excluded with a negative index rather than a hard-coded 2:31 range.
cancer_n <- as.data.frame(lapply(cancer1[-1], normalize))
# View() opens a data viewer window; only call it in an interactive session.
if (interactive()) {
  View(cancer_n)
}
# Check that every feature now ranges from 0 to 1:
summary(cancer_n)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174
## Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729
## Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169
## 3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3046 1st Qu.:0.1397 1st Qu.:0.06926 1st Qu.:0.1009
## Median :0.3904 Median :0.2247 Median :0.14419 Median :0.1665
## Mean :0.3948 Mean :0.2606 Mean :0.20806 Mean :0.2431
## 3rd Qu.:0.4755 3rd Qu.:0.3405 3rd Qu.:0.30623 3rd Qu.:0.3678
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_mean fractal_dimension_mean radius_se texture_se
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378 1st Qu.:0.1047
## Median :0.3697 Median :0.2439 Median :0.07702 Median :0.1653
## Mean :0.3796 Mean :0.2704 Mean :0.10635 Mean :0.1893
## 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304 3rd Qu.:0.2462
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## perimeter_se area_se smoothness_se compactness_se
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
## Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
## Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
## 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## concavity_se concave.points_se symmetry_se fractal_dimension_se
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675
## Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919
## Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019
## 3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## radius_worst texture_worst perimeter_worst area_worst
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1807 1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113
## Median :0.2504 Median :0.3569 Median :0.2353 Median :0.12321
## Mean :0.2967 Mean :0.3640 Mean :0.2831 Mean :0.17091
## 3rd Qu.:0.3863 3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.1163 1st Qu.:0.09145 1st Qu.:0.2231
## Median :0.3971 Median :0.1791 Median :0.18107 Median :0.3434
## Mean :0.4041 Mean :0.2202 Mean :0.21740 Mean :0.3938
## 3rd Qu.:0.4942 3rd Qu.:0.3025 3rd Qu.:0.30583 3rd Qu.:0.5546
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## symmetry_worst fractal_dimension_worst
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1851 1st Qu.:0.1077
## Median :0.2478 Median :0.1640
## Mean :0.2633 Mean :0.1896
## 3rd Qu.:0.3182 3rd Qu.:0.2429
## Max. :1.0000 Max. :1.0000
# Split the normalized features into a training set and a test set:
# rows 1-469 are used for training and the last 100 rows (470-569) for testing.
# NOTE(review): the split is positional, not random - confirm that the rows of
# cancer.csv are not ordered in a way that biases the test set.
cancer_train <- cancer_n[seq_len(469), ]
cancer_test <- cancer_n[seq(470, 569), ]
# The target variable 'diagnosis' is not part of cancer_n, so the class labels
# for the same row ranges are taken from cancer1 and kept separately; they are
# passed to knn() as the training classes and used to score the predictions.
cancer_train_labels <- cancer1[seq_len(469), "diagnosis"]
cancer_test_labels <- cancer1[seq(470, 569), "diagnosis"]
summary(cancer_test_labels)
## Benign Malignant
## 77 23
# install.packages("class")
library(class)
## Warning: package 'class' was built under R version 3.6.3
# Choosing k: the square root of the number of training rows is used as a
# starting point.
sqrt(469)
## [1] 21.65641
# Classify the test rows with kNN on the min-max normalized features.
cancer_predict <- knn(cancer_train, cancer_test, cl = cancer_train_labels, k = 21)
cancer_predict
## [1] Benign Benign Benign Benign Benign Benign Benign
## [8] Benign Benign Benign Malignant Benign Benign Benign
## [15] Benign Benign Benign Benign Malignant Benign Benign
## [22] Benign Benign Malignant Benign Benign Benign Benign
## [29] Benign Malignant Malignant Benign Malignant Benign Malignant
## [36] Benign Benign Benign Benign Benign Malignant Benign
## [43] Benign Malignant Benign Benign Benign Malignant Malignant
## [50] Benign Benign Benign Malignant Benign Benign Benign
## [57] Benign Benign Benign Benign Benign Benign Benign
## [64] Benign Malignant Benign Malignant Malignant Benign Benign
## [71] Benign Benign Benign Benign Benign Benign Benign
## [78] Benign Benign Benign Benign Benign Benign Benign
## [85] Benign Benign Benign Benign Benign Benign Benign
## [92] Benign Benign Malignant Malignant Malignant Malignant Malignant
## [99] Malignant Benign
## Levels: Benign Malignant
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
# Confusion matrix for the min-max model. Malignant is declared the positive
# class so that Sensitivity measures how many malignant cases are detected;
# without 'positive', caret reports the first factor level (Benign) as positive.
confusionMatrix(
  table(cancer_predict, cancer_test_labels),
  positive = "Malignant"
)
## Confusion Matrix and Statistics
##
## cancer_test_labels
## cancer_predict Benign Malignant
## Benign 77 2
## Malignant 0 21
##
## Accuracy : 0.98
## 95% CI : (0.9296, 0.9976)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 2.106e-09
##
## Kappa : 0.9418
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9130
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9747
## Prevalence : 0.2300
## Detection Rate : 0.2100
## Detection Prevalence : 0.2100
## Balanced Accuracy : 0.9565
##
## 'Positive' Class : Malignant
##
# The test data consisted of 100 observations. In the interpretation below, Malignant is
# treated as the positive class and Benign as the negative class.
# 77 cases were correctly predicted as Benign (TN -> True Negatives), i.e. 77% of the test set.
# 21 cases were correctly predicted as Malignant (TP -> True Positives), i.e. 21% of the test set.
# There were 2 False Negatives (FN): 2 cases that are actually malignant were predicted as benign.
# False negatives are the dangerous errors here, because a malignant tumour goes undetected,
# so the main focus when improving the model is to reduce the number of FNs.
# There were 0 False Positives (FP): no case that is actually benign was predicted as malignant.
# The overall accuracy of the model is 98% ((TN + TP) / 100); the 2 remaining FNs show that
# there is still room to improve the model's performance.
# Test verileri 100 gözlemden oluşuyordu.77 hasta (TN->Gerçek Negatifler) gerçekte
# Benign (B) olduğu doğru bir şekilde tahmin edilmiştir.
# Ayrıca, 100 gözlemden 21'i (TP-> Gerçek Pozitifler) gerçekte Malign (M) olarak
# doğru bir şekilde tahmin edilmiş, böylece 100 tahminden toplam 21'inde TP yani Doğası gereği Gerçek Pozitif.
# 2 Yanlış Negatif (FN) vakası var, yani aslında doğası gereği malign olan 2 hasta iyi huylu olduğu tahmin edildi.
# FN'ler varsa, aynı nedenle potansiyel bir tehdit oluşturmakta ve ana odak noktası
# modelin doğruluğunu arttırmak FN'leri azaltmaktır.
# 0 Yanlış Pozitif (FP) durumu var, yani 0 vaka gerçekte iyi huyluydu ama
# malign olarak tahmin edildi.
# Modelin toplam doğruluğu %98'dir ((TN+TP)/100); kalan 2 FN vakası, model performansının
# hâlâ iyileştirilebileceğini göstermektedir.
# The "gmodels" package can show the same results as a cross table.
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
# Rows are the true labels, columns the predictions; the chi-square
# contribution of each cell is left out of the printout.
CrossTable(cancer_test_labels, cancer_predict, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | cancer_predict
## cancer_test_labels | Benign | Malignant | Row Total |
## -------------------|-----------|-----------|-----------|
## Benign | 77 | 0 | 77 |
## | 1.000 | 0.000 | 0.770 |
## | 0.975 | 0.000 | |
## | 0.770 | 0.000 | |
## -------------------|-----------|-----------|-----------|
## Malignant | 2 | 21 | 23 |
## | 0.087 | 0.913 | 0.230 |
## | 0.025 | 1.000 | |
## | 0.020 | 0.210 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 79 | 21 | 100 |
## | 0.790 | 0.210 | |
## -------------------|-----------|-----------|-----------|
##
##
# z-score standardization: scale() centres each feature and divides it by its
# standard deviation; the target column (first column) is dropped first.
cancer_z <- as.data.frame(scale(cancer1[-1], center = TRUE, scale = TRUE))
summary(object = cancer_z)
## radius_mean texture_mean perimeter_mean area_mean
## Min. :-2.0279 Min. :-2.2273 Min. :-1.9828 Min. :-1.4532
## 1st Qu.:-0.6888 1st Qu.:-0.7253 1st Qu.:-0.6913 1st Qu.:-0.6666
## Median :-0.2149 Median :-0.1045 Median :-0.2358 Median :-0.2949
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4690 3rd Qu.: 0.5837 3rd Qu.: 0.4992 3rd Qu.: 0.3632
## Max. : 3.9678 Max. : 4.6478 Max. : 3.9726 Max. : 5.2459
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## Min. :-3.10935 Min. :-1.6087 Min. :-1.1139 Min. :-1.2607
## 1st Qu.:-0.71034 1st Qu.:-0.7464 1st Qu.:-0.7431 1st Qu.:-0.7373
## Median :-0.03486 Median :-0.2217 Median :-0.3419 Median :-0.3974
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.63564 3rd Qu.: 0.4934 3rd Qu.: 0.5256 3rd Qu.: 0.6464
## Max. : 4.76672 Max. : 4.5644 Max. : 4.2399 Max. : 3.9245
## symmetry_mean fractal_dimension_mean radius_se texture_se
## Min. :-2.74171 Min. :-1.8183 Min. :-1.0590 Min. :-1.5529
## 1st Qu.:-0.70262 1st Qu.:-0.7220 1st Qu.:-0.6230 1st Qu.:-0.6942
## Median :-0.07156 Median :-0.1781 Median :-0.2920 Median :-0.1973
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.53031 3rd Qu.: 0.4706 3rd Qu.: 0.2659 3rd Qu.: 0.4661
## Max. : 4.48081 Max. : 4.9066 Max. : 8.8991 Max. : 6.6494
## perimeter_se area_se smoothness_se compactness_se
## Min. :-1.0431 Min. :-0.7372 Min. :-1.7745 Min. :-1.2970
## 1st Qu.:-0.6232 1st Qu.:-0.4943 1st Qu.:-0.6235 1st Qu.:-0.6923
## Median :-0.2864 Median :-0.3475 Median :-0.2201 Median :-0.2808
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2428 3rd Qu.: 0.1067 3rd Qu.: 0.3680 3rd Qu.: 0.3893
## Max. : 9.4537 Max. :11.0321 Max. : 8.0229 Max. : 6.1381
## concavity_se concave.points_se symmetry_se fractal_dimension_se
## Min. :-1.0566 Min. :-1.9118 Min. :-1.5315 Min. :-1.0960
## 1st Qu.:-0.5567 1st Qu.:-0.6739 1st Qu.:-0.6511 1st Qu.:-0.5846
## Median :-0.1989 Median :-0.1404 Median :-0.2192 Median :-0.2297
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3365 3rd Qu.: 0.4722 3rd Qu.: 0.3554 3rd Qu.: 0.2884
## Max. :12.0621 Max. : 6.6438 Max. : 7.0657 Max. : 9.8429
## radius_worst texture_worst perimeter_worst area_worst
## Min. :-1.7254 Min. :-2.22204 Min. :-1.6919 Min. :-1.2213
## 1st Qu.:-0.6743 1st Qu.:-0.74797 1st Qu.:-0.6890 1st Qu.:-0.6416
## Median :-0.2688 Median :-0.04348 Median :-0.2857 Median :-0.3409
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5216 3rd Qu.: 0.65776 3rd Qu.: 0.5398 3rd Qu.: 0.3573
## Max. : 4.0906 Max. : 3.88249 Max. : 4.2836 Max. : 5.9250
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :-2.6803 Min. :-1.4426 Min. :-1.3047 Min. :-1.7435
## 1st Qu.:-0.6906 1st Qu.:-0.6805 1st Qu.:-0.7558 1st Qu.:-0.7557
## Median :-0.0468 Median :-0.2693 Median :-0.2180 Median :-0.2233
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5970 3rd Qu.: 0.5392 3rd Qu.: 0.5307 3rd Qu.: 0.7119
## Max. : 3.9519 Max. : 5.1084 Max. : 4.6965 Max. : 2.6835
## symmetry_worst fractal_dimension_worst
## Min. :-2.1591 Min. :-1.6004
## 1st Qu.:-0.6413 1st Qu.:-0.6913
## Median :-0.1273 Median :-0.2163
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4497 3rd Qu.: 0.4504
## Max. : 6.0407 Max. : 6.8408
# Same positional split as for the min-max data (rows 1-469 for training,
# rows 470-569 for testing), applied to the z-score standardized features.
cancer_train2 <- cancer_z[seq_len(469), ]
cancer_test2 <- cancer_z[seq(470, 569), ]
cancer_train2_labels <- cancer1[seq_len(469), "diagnosis"]
cancer_test2_labels <- cancer1[seq(470, 569), "diagnosis"]
# Run kNN again with the same k so the two scalings can be compared.
cancer_predict2 <- knn(cancer_train2, cancer_test2, cl = cancer_train2_labels, k = 21)
# Cross table of true labels (rows) against predictions (columns).
CrossTable(cancer_test2_labels, cancer_predict2, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | cancer_predict2
## cancer_test2_labels | Benign | Malignant | Row Total |
## --------------------|-----------|-----------|-----------|
## Benign | 77 | 0 | 77 |
## | 1.000 | 0.000 | 0.770 |
## | 0.975 | 0.000 | |
## | 0.770 | 0.000 | |
## --------------------|-----------|-----------|-----------|
## Malignant | 2 | 21 | 23 |
## | 0.087 | 0.913 | 0.230 |
## | 0.025 | 1.000 | |
## | 0.020 | 0.210 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 79 | 21 | 100 |
## | 0.790 | 0.210 | |
## --------------------|-----------|-----------|-----------|
##
##
library(caret)
# Confusion matrix for the z-score model. Malignant is declared the positive
# class so that Sensitivity measures how many malignant cases are detected;
# without 'positive', caret reports the first factor level (Benign) as positive.
confusionMatrix(
  table(cancer_predict2, cancer_test2_labels),
  positive = "Malignant"
)
## Confusion Matrix and Statistics
##
## cancer_test2_labels
## cancer_predict2 Benign Malignant
## Benign 77 2
## Malignant 0 21
##
## Accuracy : 0.98
## 95% CI : (0.9296, 0.9976)
## No Information Rate : 0.77
## P-Value [Acc > NIR] : 2.106e-09
##
## Kappa : 0.9418
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9130
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9747
## Prevalence : 0.2300
## Detection Rate : 0.2100
## Detection Prevalence : 0.2100
## Balanced Accuracy : 0.9565
##
## 'Positive' Class : Malignant
##
# k için yaygın bir başlangıç değeri, eğitim kümesindeki gözlem sayısının kareköküdür;
# burada sqrt(469) ≈ 21.66 olduğundan k = 21 kullanıldı. Doğruluğu artırmak için farklı k değerleri denenebilir.
# Ayrıca, FN'leri de mümkün olduğunca düşük tutmalıyız.
# Random split into training and test sets: each row is assigned to group 1
# (training) with probability 0.7 or group 2 (test) with probability 0.3.
# The seed makes the assignment reproducible.
set.seed(1234)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(cancer1), replace = TRUE, prob = c(0.7, 0.3))
training <- cancer1[ind == 1, ]
test <- cancer1[ind == 2, ]
str(training)
## 'data.frame': 392 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 12.4 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 15.7 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 82.6 ...
## $ area_mean : num 1001 1326 1203 386 477 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1278 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.17 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.1578 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.0809 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.209 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0761 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.335 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.89 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 2.22 ...
## $ area_se : num 153.4 74.1 94 27.2 27.2 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.00751 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0335 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0367 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0114 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0216 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00508 ...
## $ radius_worst : num 25.4 25 23.6 14.9 15.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 23.8 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 103.4 ...
## $ area_worst : num 2019 1956 1709 568 742 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.179 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.525 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.535 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.174 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.399 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.1244 ...
# Resampling setup: 10-fold cross-validation repeated 3 times. Class
# probabilities and twoClassSummary are requested so that ROC, Sens and Spec
# are reported for each candidate k.
trControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
set.seed(222)
# Tune kNN over k = 1..60 and select the k with the largest ROC value.
# Predictors are centred and scaled as part of the fit.
# The former tuneLength = 20 is dropped: the explicit tuneGrid already defines
# the candidate values, as the 60-row result table shows. 'preProcess' is
# spelled out instead of relying on partial matching of 'preProc'.
fit <- train(
  diagnosis ~ .,
  data = training,
  method = "knn",
  trControl = trControl,
  preProcess = c("center", "scale"),
  metric = "ROC",
  tuneGrid = expand.grid(k = 1:60)
)
fit
## k-Nearest Neighbors
##
## 392 samples
## 30 predictor
## 2 classes: 'Benign', 'Malignant'
##
## Pre-processing: centered (30), scaled (30)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 353, 353, 354, 353, 353, 352, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.9449817 0.9776923 0.9122711
## 2 0.9747168 0.9686154 0.9172161
## 3 0.9870761 0.9804615 0.9340659
## 4 0.9882537 0.9895897 0.9267399
## 5 0.9880595 0.9883077 0.9340659
## 6 0.9875178 0.9830256 0.9340659
## 7 0.9884763 0.9843590 0.9243590
## 8 0.9893115 0.9882564 0.9316850
## 9 0.9902068 0.9895897 0.9340659
## 10 0.9913956 0.9895385 0.9342491
## 11 0.9921760 0.9908718 0.9267399
## 12 0.9916009 0.9908718 0.9243590
## 13 0.9914438 0.9935385 0.9194139
## 14 0.9913446 0.9961026 0.9098901
## 15 0.9907643 0.9961026 0.9148352
## 16 0.9901292 0.9961026 0.9075092
## 17 0.9901386 0.9948205 0.8976190
## 18 0.9896114 0.9947692 0.8979853
## 19 0.9893202 0.9947692 0.8956044
## 20 0.9897225 0.9948205 0.8904762
## 21 0.9895665 0.9961026 0.8906593
## 22 0.9895082 0.9947692 0.8957875
## 23 0.9892666 0.9947692 0.8908425
## 24 0.9895130 0.9934359 0.8857143
## 25 0.9890280 0.9947179 0.8831502
## 26 0.9890355 0.9934359 0.8807692
## 27 0.9892226 0.9947179 0.8782051
## 28 0.9899793 0.9960000 0.8829670
## 29 0.9912575 0.9960000 0.8805861
## 30 0.9921368 0.9960000 0.8758242
## 31 0.9919007 0.9960000 0.8756410
## 32 0.9920032 0.9960000 0.8756410
## 33 0.9920686 0.9973333 0.8708791
## 34 0.9923070 0.9973333 0.8780220
## 35 0.9925487 0.9986667 0.8708791
## 36 0.9919080 0.9973846 0.8661172
## 37 0.9918680 0.9986667 0.8684982
## 38 0.9920569 0.9986667 0.8684982
## 39 0.9918135 0.9986667 0.8637363
## 40 0.9919125 1.0000000 0.8661172
## 41 0.9919728 1.0000000 0.8661172
## 42 0.9916779 0.9986667 0.8661172
## 43 0.9916193 0.9986667 0.8661172
## 44 0.9917144 0.9986667 0.8661172
## 45 0.9916248 1.0000000 0.8637363
## 46 0.9918243 1.0000000 0.8684982
## 47 0.9916378 1.0000000 0.8637363
## 48 0.9912990 0.9986667 0.8637363
## 49 0.9910555 1.0000000 0.8637363
## 50 0.9911560 1.0000000 0.8589744
## 51 0.9912513 1.0000000 0.8613553
## 52 0.9911595 1.0000000 0.8589744
## 53 0.9909124 1.0000000 0.8589744
## 54 0.9906743 0.9973333 0.8518315
## 55 0.9905757 0.9986667 0.8542125
## 56 0.9903852 0.9986667 0.8470696
## 57 0.9901875 0.9973333 0.8518315
## 58 0.9900850 0.9960000 0.8423077
## 59 0.9898415 0.9960000 0.8423077
## 60 0.9897115 0.9960513 0.8327839
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 35.
# Plot the tuning results of the fitted model across the candidate k values.
plot(x = fit)

# Variable importance of the predictors for the fitted model.
varImp(object = fit)
## ROC curve variable importance
##
## only 20 most important variables shown (out of 30)
##
## Importance
## perimeter_worst 100.00
## concave.points_worst 99.28
## radius_worst 98.67
## area_worst 98.29
## concave.points_mean 97.46
## perimeter_mean 92.13
## concavity_mean 91.69
## area_mean 90.14
## radius_mean 89.95
## concavity_worst 88.60
## area_se 88.47
## perimeter_se 80.68
## radius_se 77.78
## compactness_worst 76.95
## compactness_mean 76.03
## concave.points_se 60.90
## texture_worst 59.88
## concavity_se 59.79
## smoothness_worst 58.80
## texture_mean 56.66
# Predict the held-out rows with the tuned model and score the predictions.
pred <- predict(fit, newdata = test)
# Malignant is declared the positive class so that Sensitivity measures how
# many malignant cases are detected; without 'positive', caret reports the
# first factor level (Benign) as positive.
confusionMatrix(
  data = pred,
  reference = test$diagnosis,
  positive = "Malignant"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 103 9
## Malignant 0 65
##
## Accuracy : 0.9492
## 95% CI : (0.9057, 0.9765)
## No Information Rate : 0.5819
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8937
##
## Mcnemar's Test P-Value : 0.007661
##
## Sensitivity : 0.8784
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9196
## Prevalence : 0.4181
## Detection Rate : 0.3672
## Detection Prevalence : 0.3672
## Balanced Accuracy : 0.9392
##
## 'Positive' Class : Malignant
##