# Load the raw data set. Text columns are kept as character so that the target
# variable can be converted to a factor explicitly later in the script.
cancer <- read.csv("cancer.csv", stringsAsFactors = FALSE)

# View() opens a spreadsheet-style viewer; it is only useful in an interactive
# session and can fail in batch/headless runs, so it is skipped there.
if (interactive()) {
  View(cancer)
}
# Compact overview of the column names, types and first values.
str(cancer)
## 'data.frame':    569 obs. of  33 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ X                      : logi  NA NA NA NA NA NA ...
# The first variable, 'id', is unique per row by nature and provides no useful
# information, so it can be removed. The last variable, 'X', can be removed as
# well. Both are dropped by name so the selection does not depend on the
# position of the columns in the file.

cancer1 <- cancer[, setdiff(names(cancer), c("id", "X")), drop = FALSE]
# The viewer is only useful in an interactive session.
if (interactive()) {
  View(cancer1)
}
str(cancer1)
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
# Per-column summary of the target variable and of every numeric feature.
summary(cancer1)
##   diagnosis          radius_mean      texture_mean   perimeter_mean  
##  Length:569         Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  Class :character   1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##  Mode  :character   Median :13.370   Median :18.84   Median : 86.24  
##                     Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##                     3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##                     Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean   radius_se     
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996        Min.   :0.1115  
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770        1st Qu.:0.2324  
##  Median :0.03350     Median :0.1792   Median :0.06154        Median :0.3242  
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280        Mean   :0.4052  
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612        3rd Qu.:0.4789  
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744        Max.   :2.8730  
##    texture_se      perimeter_se       area_se        smoothness_se     
##  Min.   :0.3602   Min.   : 0.757   Min.   :  6.802   Min.   :0.001713  
##  1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850   1st Qu.:0.005169  
##  Median :1.1080   Median : 2.287   Median : 24.530   Median :0.006380  
##  Mean   :1.2169   Mean   : 2.866   Mean   : 40.337   Mean   :0.007041  
##  3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190   3rd Qu.:0.008146  
##  Max.   :4.8850   Max.   :21.980   Max.   :542.200   Max.   :0.031130  
##  compactness_se      concavity_se     concave.points_se   symmetry_se      
##  Min.   :0.002252   Min.   :0.00000   Min.   :0.000000   Min.   :0.007882  
##  1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638   1st Qu.:0.015160  
##  Median :0.020450   Median :0.02589   Median :0.010930   Median :0.018730  
##  Mean   :0.025478   Mean   :0.03189   Mean   :0.011796   Mean   :0.020542  
##  3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710   3rd Qu.:0.023480  
##  Max.   :0.135400   Max.   :0.39600   Max.   :0.052790   Max.   :0.078950  
##  fractal_dimension_se  radius_worst   texture_worst   perimeter_worst 
##  Min.   :0.0008948    Min.   : 7.93   Min.   :12.02   Min.   : 50.41  
##  1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11  
##  Median :0.0031870    Median :14.97   Median :25.41   Median : 97.66  
##  Mean   :0.0037949    Mean   :16.27   Mean   :25.68   Mean   :107.26  
##  3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40  
##  Max.   :0.0298400    Max.   :36.04   Max.   :49.54   Max.   :251.20  
##    area_worst     smoothness_worst  compactness_worst concavity_worst 
##  Min.   : 185.2   Min.   :0.07117   Min.   :0.02729   Min.   :0.0000  
##  1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145  
##  Median : 686.5   Median :0.13130   Median :0.21190   Median :0.2267  
##  Mean   : 880.6   Mean   :0.13237   Mean   :0.25427   Mean   :0.2722  
##  3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829  
##  Max.   :4254.0   Max.   :0.22260   Max.   :1.05800   Max.   :1.2520  
##  concave.points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.09993      Median :0.2822   Median :0.08004        
##  Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.29100      Max.   :0.6638   Max.   :0.20750
# 'diagnosis' is the target variable (the class label).
table(cancer1[["diagnosis"]])
## 
##   B   M 
## 357 212
# The table above shows the number of patients per diagnosis.
# Recode "B" as "Benign" and "M" as "Malignant", then show the class shares as
# percentages.
cancer1[["diagnosis"]] <- factor(
  cancer1[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Round the percentages to one decimal place.
round(100 * prop.table(table(cancer1[["diagnosis"]])), digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
# Re-inspect the structure after recoding 'diagnosis' as a factor.
str(cancer1)
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
# All 30 remaining features are numeric.
# Take a closer look at three arbitrarily chosen features:
summary(cancer1[, c("radius_mean", "area_mean", "smoothness_mean")])
##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340
# smoothness_mean 0.05 ile 0.16 arasında, area_mean ise 143.5 ile 2501.0 arasında değiştiğinden,
# mesafe hesaplamasında area_mean'in etkisi smoothness_mean'inkinden çok daha büyük olacaktır.
# Bu durum sınıflandırıcımız için sorun yaratır; bu yüzden özellikleri yeniden ölçeklendirmek için normalleştirme uygulayalım.
# Min-max normalization.
#
# Rescales a numeric vector linearly so that its smallest value maps to 0 and
# its largest value maps to 1.
#
# Args:
#   x:     Numeric vector.
#   na.rm: If TRUE, missing values are ignored when finding the range and stay
#          NA in the result. Defaults to FALSE, in which case any NA in `x`
#          makes the whole result NA (the previous behaviour).
#
# Returns:
#   A numeric vector of the same length as `x`. When all values of `x` are
#   identical the range is zero; the result is then all zeros rather than the
#   NaN that dividing 0 by 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  if (!is.na(span) && span == 0) {
    # Constant input: avoid 0 / 0 = NaN.
    return(x - lo)
  }
  (x - lo) / span
}

# With normalize() defined, apply it to the numeric features of the data set.
# Rather than normalizing each of the 30 features one by one, use lapply(),
# which takes a list (here: the columns of a data frame) and applies the
# function to every element.
#
# After removing 'id', the first variable is 'diagnosis', which is not numeric.
# Every column except 'diagnosis' is therefore selected, by name rather than by
# the fixed position range 2:31. The list returned by lapply() is stored as the
# data frame cancer_n via as.data.frame().
cancer_n <- as.data.frame(
  lapply(cancer1[setdiff(names(cancer1), "diagnosis")], normalize)
)

# The _n suffix is a reminder that this data frame holds normalized values.

# The viewer is only useful in an interactive session.
if (interactive()) {
  View(cancer_n)
}

# Check whether the data were normalized (each Min. should be 0 and each Max.
# should be 1):
summary(cancer_n)
##   radius_mean      texture_mean    perimeter_mean     area_mean     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
##  Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
##  Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
##  3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  smoothness_mean  compactness_mean concavity_mean    concave.points_mean
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000     
##  1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009     
##  Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665     
##  Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431     
##  3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000     
##  symmetry_mean    fractal_dimension_mean   radius_se         texture_se    
##  Min.   :0.0000   Min.   :0.0000         Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.2823   1st Qu.:0.1630         1st Qu.:0.04378   1st Qu.:0.1047  
##  Median :0.3697   Median :0.2439         Median :0.07702   Median :0.1653  
##  Mean   :0.3796   Mean   :0.2704         Mean   :0.10635   Mean   :0.1893  
##  3rd Qu.:0.4530   3rd Qu.:0.3404         3rd Qu.:0.13304   3rd Qu.:0.2462  
##  Max.   :1.0000   Max.   :1.0000         Max.   :1.00000   Max.   :1.0000  
##   perimeter_se        area_se        smoothness_se    compactness_se   
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
##  Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
##  Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
##  3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   concavity_se     concave.points_se  symmetry_se     fractal_dimension_se
##  Min.   :0.00000   Min.   :0.0000    Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.03811   1st Qu.:0.1447    1st Qu.:0.1024   1st Qu.:0.04675     
##  Median :0.06538   Median :0.2070    Median :0.1526   Median :0.07919     
##  Mean   :0.08054   Mean   :0.2235    Mean   :0.1781   Mean   :0.10019     
##  3rd Qu.:0.10619   3rd Qu.:0.2787    3rd Qu.:0.2195   3rd Qu.:0.12656     
##  Max.   :1.00000   Max.   :1.0000    Max.   :1.0000   Max.   :1.00000     
##   radius_worst    texture_worst    perimeter_worst    area_worst     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.1807   1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113  
##  Median :0.2504   Median :0.3569   Median :0.2353   Median :0.12321  
##  Mean   :0.2967   Mean   :0.3640   Mean   :0.2831   Mean   :0.17091  
##  3rd Qu.:0.3863   3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  smoothness_worst compactness_worst concavity_worst   concave.points_worst
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000      
##  1st Qu.:0.3000   1st Qu.:0.1163    1st Qu.:0.09145   1st Qu.:0.2231      
##  Median :0.3971   Median :0.1791    Median :0.18107   Median :0.3434      
##  Mean   :0.4041   Mean   :0.2202    Mean   :0.21740   Mean   :0.3938      
##  3rd Qu.:0.4942   3rd Qu.:0.3025    3rd Qu.:0.30583   3rd Qu.:0.5546      
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000      
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.0000   Min.   :0.0000         
##  1st Qu.:0.1851   1st Qu.:0.1077         
##  Median :0.2478   Median :0.1640         
##  Mean   :0.2633   Mean   :0.1896         
##  3rd Qu.:0.3182   3rd Qu.:0.2429         
##  Max.   :1.0000   Max.   :1.0000
# Split the cancer_n data frame into cancer_train and cancer_test:
# the first 469 rows are used for training and the last 100 rows for testing.

cancer_train <- cancer_n[seq_len(469), ]
cancer_test <- cancer_n[seq(470, 569), ]

# The target variable 'diagnosis' is not part of cancer_n, so it is absent from
# both feature sets. Training the kNN model needs these class labels, so they
# are stored separately, taken from cancer1 for the same rows.

cancer_train_labels <- cancer1[["diagnosis"]][seq_len(469)]
cancer_test_labels <- cancer1[["diagnosis"]][seq(470, 569)]

summary(cancer_test_labels)
##    Benign Malignant 
##        77        23
# install.packages("class")
library(class)
## Warning: package 'class' was built under R version 3.6.3
# Which k? The square root of the number of training rows is used as a guide.
sqrt(469)
## [1] 21.65641
# Classify every test row with kNN, using the 21 nearest training rows.
cancer_predict <- knn(
  cancer_train,
  cancer_test,
  cl = cancer_train_labels,
  k = 21
)

cancer_predict
##   [1] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##   [8] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##  [15] Benign    Benign    Benign    Benign    Malignant Benign    Benign   
##  [22] Benign    Benign    Malignant Benign    Benign    Benign    Benign   
##  [29] Benign    Malignant Malignant Benign    Malignant Benign    Malignant
##  [36] Benign    Benign    Benign    Benign    Benign    Malignant Benign   
##  [43] Benign    Malignant Benign    Benign    Benign    Malignant Malignant
##  [50] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##  [57] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [64] Benign    Malignant Benign    Malignant Malignant Benign    Benign   
##  [71] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [78] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [85] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [92] Benign    Benign    Malignant Malignant Malignant Malignant Malignant
##  [99] Malignant Benign   
## Levels: Benign Malignant
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
# Evaluate the predictions against the true test labels. Malignant is the
# outcome of interest, so it is declared the positive class. Without the
# `positive` argument caret uses the first factor level (Benign), and
# "Sensitivity" would then describe benign cases instead of the malignant
# cases treated as positives in the discussion below.
confusionMatrix(table(cancer_predict, cancer_test_labels),
                positive = "Malignant")
## Confusion Matrix and Statistics
## 
##               cancer_test_labels
## cancer_predict Benign Malignant
##      Benign        77         2
##      Malignant      0        21
##                                           
##                Accuracy : 0.98            
##                  95% CI : (0.9296, 0.9976)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 2.106e-09       
##                                           
##                   Kappa : 0.9418          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9130          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9747          
##              Prevalence : 0.2300          
##          Detection Rate : 0.2100          
##    Detection Prevalence : 0.2100          
##       Balanced Accuracy : 0.9565          
##                                           
##        'Positive' Class : Malignant       
## 
# The test data consisted of 100 observations.
# Of these, 77 cases (77%) were correctly predicted as Benign (B); these are the True Negatives (TN).
# In addition, 21 of the 100 observations (21%) were correctly predicted as Malignant (M); these are the True Positives (TP).
# Thus a total of 21 out of 100 predictions were True Positives.

# There were 2 False Negatives (FN): 2 cases that are actually malignant were predicted as benign.
# False negatives pose a potential threat because a malignant tumour goes undetected, so the main
# focus when improving the accuracy of the model is to reduce the FNs.

# There were 0 cases of False Positives (FP) meaning 0 cases were actually benign in nature but 
# got predicted as malignant.

# The overall accuracy of the model is 98% ((TN + TP) / 100), which shows that there may still be
# room to improve the model's performance.

# Test verileri 100 gözlemden oluşuyordu. Bunlardan 77'si (TN -> Gerçek Negatifler)
# doğru bir şekilde Benign (B) olarak tahmin edilmiştir.
# Ayrıca, 100 gözlemden 21'i (TP -> Gerçek Pozitifler) doğru bir şekilde Malign (M) olarak
# tahmin edilmiştir; böylece 100 tahminden toplam 21'i Gerçek Pozitiftir.

# 2 Yanlış Negatif (FN) vakası var; yani gerçekte malign olan 2 hasta iyi huylu olarak tahmin edildi.
# Yanlış negatifler, malign bir tümörün gözden kaçması anlamına geldiği için potansiyel bir tehdit oluşturur;
# bu nedenle modelin doğruluğunu artırırken ana odak noktası FN'leri azaltmaktır.

# 0 Yanlış Pozitif (FP) durumu var, yani 0 vaka gerçekte iyi huyluydu ama
# malign olarak tahmin edildi.

# Modelin toplam doğruluğu %98'dir ((TN+TP)/100); bu da model performansını
# iyileştirme olanağının hâlâ bulunabileceğini göstermektedir.
# Ayrıca çapraz tablo sonuçlarını görmek için "gmodels" paketini kullanabiliriz.
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
# Cross-tabulate the actual test labels (rows) against the predictions
# (columns). The argument expressions are printed as the table headers.
CrossTable(x = cancer_test_labels, y = cancer_predict, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                    | cancer_predict 
## cancer_test_labels |    Benign | Malignant | Row Total | 
## -------------------|-----------|-----------|-----------|
##             Benign |        77 |         0 |        77 | 
##                    |     1.000 |     0.000 |     0.770 | 
##                    |     0.975 |     0.000 |           | 
##                    |     0.770 |     0.000 |           | 
## -------------------|-----------|-----------|-----------|
##          Malignant |         2 |        21 |        23 | 
##                    |     0.087 |     0.913 |     0.230 | 
##                    |     0.025 |     1.000 |           | 
##                    |     0.020 |     0.210 |           | 
## -------------------|-----------|-----------|-----------|
##       Column Total |        79 |        21 |       100 | 
##                    |     0.790 |     0.210 |           | 
## -------------------|-----------|-----------|-----------|
## 
## 
# z-score standardization of every feature (all columns except 'diagnosis').
cancer_z <- as.data.frame(scale(cancer1[, -1]))
summary(cancer_z)
##   radius_mean       texture_mean     perimeter_mean      area_mean      
##  Min.   :-2.0279   Min.   :-2.2273   Min.   :-1.9828   Min.   :-1.4532  
##  1st Qu.:-0.6888   1st Qu.:-0.7253   1st Qu.:-0.6913   1st Qu.:-0.6666  
##  Median :-0.2149   Median :-0.1045   Median :-0.2358   Median :-0.2949  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4690   3rd Qu.: 0.5837   3rd Qu.: 0.4992   3rd Qu.: 0.3632  
##  Max.   : 3.9678   Max.   : 4.6478   Max.   : 3.9726   Max.   : 5.2459  
##  smoothness_mean    compactness_mean  concavity_mean    concave.points_mean
##  Min.   :-3.10935   Min.   :-1.6087   Min.   :-1.1139   Min.   :-1.2607    
##  1st Qu.:-0.71034   1st Qu.:-0.7464   1st Qu.:-0.7431   1st Qu.:-0.7373    
##  Median :-0.03486   Median :-0.2217   Median :-0.3419   Median :-0.3974    
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000    
##  3rd Qu.: 0.63564   3rd Qu.: 0.4934   3rd Qu.: 0.5256   3rd Qu.: 0.6464    
##  Max.   : 4.76672   Max.   : 4.5644   Max.   : 4.2399   Max.   : 3.9245    
##  symmetry_mean      fractal_dimension_mean   radius_se         texture_se     
##  Min.   :-2.74171   Min.   :-1.8183        Min.   :-1.0590   Min.   :-1.5529  
##  1st Qu.:-0.70262   1st Qu.:-0.7220        1st Qu.:-0.6230   1st Qu.:-0.6942  
##  Median :-0.07156   Median :-0.1781        Median :-0.2920   Median :-0.1973  
##  Mean   : 0.00000   Mean   : 0.0000        Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.53031   3rd Qu.: 0.4706        3rd Qu.: 0.2659   3rd Qu.: 0.4661  
##  Max.   : 4.48081   Max.   : 4.9066        Max.   : 8.8991   Max.   : 6.6494  
##   perimeter_se        area_se        smoothness_se     compactness_se   
##  Min.   :-1.0431   Min.   :-0.7372   Min.   :-1.7745   Min.   :-1.2970  
##  1st Qu.:-0.6232   1st Qu.:-0.4943   1st Qu.:-0.6235   1st Qu.:-0.6923  
##  Median :-0.2864   Median :-0.3475   Median :-0.2201   Median :-0.2808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.2428   3rd Qu.: 0.1067   3rd Qu.: 0.3680   3rd Qu.: 0.3893  
##  Max.   : 9.4537   Max.   :11.0321   Max.   : 8.0229   Max.   : 6.1381  
##   concavity_se     concave.points_se  symmetry_se      fractal_dimension_se
##  Min.   :-1.0566   Min.   :-1.9118   Min.   :-1.5315   Min.   :-1.0960     
##  1st Qu.:-0.5567   1st Qu.:-0.6739   1st Qu.:-0.6511   1st Qu.:-0.5846     
##  Median :-0.1989   Median :-0.1404   Median :-0.2192   Median :-0.2297     
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000     
##  3rd Qu.: 0.3365   3rd Qu.: 0.4722   3rd Qu.: 0.3554   3rd Qu.: 0.2884     
##  Max.   :12.0621   Max.   : 6.6438   Max.   : 7.0657   Max.   : 9.8429     
##   radius_worst     texture_worst      perimeter_worst     area_worst     
##  Min.   :-1.7254   Min.   :-2.22204   Min.   :-1.6919   Min.   :-1.2213  
##  1st Qu.:-0.6743   1st Qu.:-0.74797   1st Qu.:-0.6890   1st Qu.:-0.6416  
##  Median :-0.2688   Median :-0.04348   Median :-0.2857   Median :-0.3409  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5216   3rd Qu.: 0.65776   3rd Qu.: 0.5398   3rd Qu.: 0.3573  
##  Max.   : 4.0906   Max.   : 3.88249   Max.   : 4.2836   Max.   : 5.9250  
##  smoothness_worst  compactness_worst concavity_worst   concave.points_worst
##  Min.   :-2.6803   Min.   :-1.4426   Min.   :-1.3047   Min.   :-1.7435     
##  1st Qu.:-0.6906   1st Qu.:-0.6805   1st Qu.:-0.7558   1st Qu.:-0.7557     
##  Median :-0.0468   Median :-0.2693   Median :-0.2180   Median :-0.2233     
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000     
##  3rd Qu.: 0.5970   3rd Qu.: 0.5392   3rd Qu.: 0.5307   3rd Qu.: 0.7119     
##  Max.   : 3.9519   Max.   : 5.1084   Max.   : 4.6965   Max.   : 2.6835     
##  symmetry_worst    fractal_dimension_worst
##  Min.   :-2.1591   Min.   :-1.6004        
##  1st Qu.:-0.6413   1st Qu.:-0.6913        
##  Median :-0.1273   Median :-0.2163        
##  Mean   : 0.0000   Mean   : 0.0000        
##  3rd Qu.: 0.4497   3rd Qu.: 0.4504        
##  Max.   : 6.0407   Max.   : 6.8408
# Same split as before, now on the z-score standardized features:
# the first 469 rows for training, the last 100 rows for testing.
cancer_train2 <- cancer_z[seq_len(469), ]
cancer_test2 <- cancer_z[seq(470, 569), ]

# Class labels for the same rows, taken from cancer1.
cancer_train2_labels <- cancer1[["diagnosis"]][seq_len(469)]
cancer_test2_labels <- cancer1[["diagnosis"]][seq(470, 569)]

# kNN with the same k as in the min-max run.
cancer_predict2 <- knn(
  cancer_train2,
  cancer_test2,
  cl = cancer_train2_labels,
  k = 21
)

# Actual labels (rows) against predictions (columns).
CrossTable(x = cancer_test2_labels, y = cancer_predict2, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                     | cancer_predict2 
## cancer_test2_labels |    Benign | Malignant | Row Total | 
## --------------------|-----------|-----------|-----------|
##              Benign |        77 |         0 |        77 | 
##                     |     1.000 |     0.000 |     0.770 | 
##                     |     0.975 |     0.000 |           | 
##                     |     0.770 |     0.000 |           | 
## --------------------|-----------|-----------|-----------|
##           Malignant |         2 |        21 |        23 | 
##                     |     0.087 |     0.913 |     0.230 | 
##                     |     0.025 |     1.000 |           | 
##                     |     0.020 |     0.210 |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |        79 |        21 |       100 | 
##                     |     0.790 |     0.210 |           | 
## --------------------|-----------|-----------|-----------|
## 
## 
library(caret)
# Evaluate the z-score based predictions. Malignant is declared the positive
# class; without the `positive` argument caret uses the first factor level
# (Benign), and "Sensitivity" would describe benign cases.
confusionMatrix(table(cancer_predict2, cancer_test2_labels),
                positive = "Malignant")
## Confusion Matrix and Statistics
## 
##                cancer_test2_labels
## cancer_predict2 Benign Malignant
##       Benign        77         2
##       Malignant      0        21
##                                           
##                Accuracy : 0.98            
##                  95% CI : (0.9296, 0.9976)
##     No Information Rate : 0.77            
##     P-Value [Acc > NIR] : 2.106e-09       
##                                           
##                   Kappa : 0.9418          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9130          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9747          
##              Prevalence : 0.2300          
##          Detection Rate : 0.2100          
##    Detection Prevalence : 0.2100          
##       Balanced Accuracy : 0.9565          
##                                           
##        'Positive' Class : Malignant       
## 
# k değeri genellikle eğitim gözlem sayısının karekökü civarında seçilir; burada
# sqrt(469) ≈ 21.66 olduğundan k = 21 kullanıldı. Doğruluğu artırmak için farklı k değerleri denenebilir.
# Ayrıca, FN'leri de mümkün olduğunca düşük tutmalıyız.
# Fix the RNG seed so that the random split below is reproducible.
set.seed(1234)
# Draw a group number (1 or 2) for every row, with probabilities 0.7 and 0.3.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(cancer1), replace = TRUE, prob = c(0.7, 0.3))
# Group 1 becomes the training set, group 2 the test set.
training <- cancer1[ind == 1, ]
test <- cancer1[ind == 2, ]
str(training)
## 'data.frame':    392 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 12.4 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 15.7 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 82.6 ...
##  $ area_mean              : num  1001 1326 1203 386 477 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1278 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.17 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.1578 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.0809 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.209 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0761 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.335 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.89 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 2.22 ...
##  $ area_se                : num  153.4 74.1 94 27.2 27.2 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.00751 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0335 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0367 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0114 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0216 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00508 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 15.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 23.8 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 103.4 ...
##  $ area_worst             : num  2019 1956 1709 568 742 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.179 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.525 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.535 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.174 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.399 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.1244 ...
# Resampling scheme: 10-fold cross-validation repeated 3 times. Class
# probabilities are requested because twoClassSummary reports ROC, Sens and
# Spec for each candidate model.
trControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Tune k-NN over k = 1..60 on centred and scaled predictors and pick the k
# with the largest ROC value. The candidate values come from tuneGrid alone:
# tuneLength has no effect once an explicit grid is supplied, so it is not
# passed. preProcess is spelled out in full instead of relying on partial
# argument matching.
set.seed(222)
fit <- train(
  diagnosis ~ .,
  data = training,
  method = "knn",
  preProcess = c("center", "scale"),
  metric = "ROC",
  trControl = trControl,
  tuneGrid = expand.grid(k = 1:60)
)


# Print the fitted train object: the resampled ROC / Sens / Spec for every
# candidate k and the value of k that was finally selected.
fit
## k-Nearest Neighbors 
## 
## 392 samples
##  30 predictor
##   2 classes: 'Benign', 'Malignant' 
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 353, 353, 354, 353, 353, 352, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec     
##    1  0.9449817  0.9776923  0.9122711
##    2  0.9747168  0.9686154  0.9172161
##    3  0.9870761  0.9804615  0.9340659
##    4  0.9882537  0.9895897  0.9267399
##    5  0.9880595  0.9883077  0.9340659
##    6  0.9875178  0.9830256  0.9340659
##    7  0.9884763  0.9843590  0.9243590
##    8  0.9893115  0.9882564  0.9316850
##    9  0.9902068  0.9895897  0.9340659
##   10  0.9913956  0.9895385  0.9342491
##   11  0.9921760  0.9908718  0.9267399
##   12  0.9916009  0.9908718  0.9243590
##   13  0.9914438  0.9935385  0.9194139
##   14  0.9913446  0.9961026  0.9098901
##   15  0.9907643  0.9961026  0.9148352
##   16  0.9901292  0.9961026  0.9075092
##   17  0.9901386  0.9948205  0.8976190
##   18  0.9896114  0.9947692  0.8979853
##   19  0.9893202  0.9947692  0.8956044
##   20  0.9897225  0.9948205  0.8904762
##   21  0.9895665  0.9961026  0.8906593
##   22  0.9895082  0.9947692  0.8957875
##   23  0.9892666  0.9947692  0.8908425
##   24  0.9895130  0.9934359  0.8857143
##   25  0.9890280  0.9947179  0.8831502
##   26  0.9890355  0.9934359  0.8807692
##   27  0.9892226  0.9947179  0.8782051
##   28  0.9899793  0.9960000  0.8829670
##   29  0.9912575  0.9960000  0.8805861
##   30  0.9921368  0.9960000  0.8758242
##   31  0.9919007  0.9960000  0.8756410
##   32  0.9920032  0.9960000  0.8756410
##   33  0.9920686  0.9973333  0.8708791
##   34  0.9923070  0.9973333  0.8780220
##   35  0.9925487  0.9986667  0.8708791
##   36  0.9919080  0.9973846  0.8661172
##   37  0.9918680  0.9986667  0.8684982
##   38  0.9920569  0.9986667  0.8684982
##   39  0.9918135  0.9986667  0.8637363
##   40  0.9919125  1.0000000  0.8661172
##   41  0.9919728  1.0000000  0.8661172
##   42  0.9916779  0.9986667  0.8661172
##   43  0.9916193  0.9986667  0.8661172
##   44  0.9917144  0.9986667  0.8661172
##   45  0.9916248  1.0000000  0.8637363
##   46  0.9918243  1.0000000  0.8684982
##   47  0.9916378  1.0000000  0.8637363
##   48  0.9912990  0.9986667  0.8637363
##   49  0.9910555  1.0000000  0.8637363
##   50  0.9911560  1.0000000  0.8589744
##   51  0.9912513  1.0000000  0.8613553
##   52  0.9911595  1.0000000  0.8589744
##   53  0.9909124  1.0000000  0.8589744
##   54  0.9906743  0.9973333  0.8518315
##   55  0.9905757  0.9986667  0.8542125
##   56  0.9903852  0.9986667  0.8470696
##   57  0.9901875  0.9973333  0.8518315
##   58  0.9900850  0.9960000  0.8423077
##   59  0.9898415  0.9960000  0.8423077
##   60  0.9897115  0.9960513  0.8327839
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 35.
# Plot the fitted train object (presumably the resampled ROC across the k
# grid; confirm against the rendered figure).
plot(fit)

# Show the ROC-curve-based variable importance of the predictors.
varImp(fit)
## ROC curve variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                      Importance
## perimeter_worst          100.00
## concave.points_worst      99.28
## radius_worst              98.67
## area_worst                98.29
## concave.points_mean       97.46
## perimeter_mean            92.13
## concavity_mean            91.69
## area_mean                 90.14
## radius_mean               89.95
## concavity_worst           88.60
## area_se                   88.47
## perimeter_se              80.68
## radius_se                 77.78
## compactness_worst         76.95
## compactness_mean          76.03
## concave.points_se         60.90
## texture_worst             59.88
## concavity_se              59.79
## smoothness_worst          58.80
## texture_mean              56.66
# Score the tuned model on the held-out test set. "Malignant" is declared the
# positive class so that sensitivity reflects the malignant cases that are
# detected and the 9 malignant cases predicted as benign count as false
# negatives; by default caret would take the first level ("Benign") instead.
pred <- predict(fit, newdata = test)
confusionMatrix(
  data = pred,
  reference = test$diagnosis,
  positive = "Malignant"
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       103         9
##   Malignant      0        65
##                                           
##                Accuracy : 0.9492          
##                  95% CI : (0.9057, 0.9765)
##     No Information Rate : 0.5819          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8937          
##                                           
##  Mcnemar's Test P-Value : 0.007661        
##                                           
##             Sensitivity : 0.8784          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9196          
##              Prevalence : 0.4181          
##          Detection Rate : 0.3672          
##    Detection Prevalence : 0.3672          
##       Balanced Accuracy : 0.9392          
##                                           
##        'Positive' Class : Malignant       
##