Notes

# Attrition hedef değişken
exit <- read.csv("attr.txt", header = TRUE, sep = ",",stringsAsFactors = TRUE)
#View(exit)
summary(exit)
##  Attrition    Yrs_Exp      Work_Challenging     Work_Envir    Compensation
##  No :24    Min.   :2.000   No :28           Excellent:28   Excellent:21   
##  Yes:28    1st Qu.:2.500   Yes:24           Low      :24   Low      :31   
##            Median :4.000                                                  
##            Mean   :3.519                                                  
##            3rd Qu.:4.500                                                  
##            Max.   :5.000                                                  
##      Tech_Exper
##  Excellent:44  
##  Low      : 8  
##                
##                
##                
## 
str(exit)
## 'data.frame':    52 obs. of  6 variables:
##  $ Attrition       : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 2 2 2 ...
##  $ Yrs_Exp         : num  2.5 2 2.5 2 2 2 2 2.5 2 3 ...
##  $ Work_Challenging: Factor w/ 2 levels "No","Yes": 1 2 2 1 2 1 1 1 1 2 ...
##  $ Work_Envir      : Factor w/ 2 levels "Excellent","Low": 2 1 1 1 2 2 1 2 1 2 ...
##  $ Compensation    : Factor w/ 2 levels "Excellent","Low": 2 1 2 2 2 2 1 1 2 1 ...
##  $ Tech_Exper      : Factor w/ 2 levels "Excellent","Low": 1 1 1 1 2 1 2 1 1 1 ...
head(exit,10)
##    Attrition Yrs_Exp Work_Challenging Work_Envir Compensation Tech_Exper
## 1        Yes     2.5               No        Low          Low  Excellent
## 2         No     2.0              Yes  Excellent    Excellent  Excellent
## 3         No     2.5              Yes  Excellent          Low  Excellent
## 4        Yes     2.0               No  Excellent          Low  Excellent
## 5         No     2.0              Yes        Low          Low        Low
## 6        Yes     2.0               No        Low          Low  Excellent
## 7         No     2.0               No  Excellent    Excellent        Low
## 8        Yes     2.5               No        Low    Excellent  Excellent
## 9        Yes     2.0               No  Excellent          Low  Excellent
## 10       Yes     3.0              Yes        Low    Excellent  Excellent
tail(exit)
##    Attrition Yrs_Exp Work_Challenging Work_Envir Compensation Tech_Exper
## 47        No     4.0              Yes  Excellent    Excellent  Excellent
## 48        No     4.5               No  Excellent          Low        Low
## 49       Yes     5.0               No  Excellent    Excellent  Excellent
## 50        No     5.0               No  Excellent    Excellent  Excellent
## 51       Yes     2.0              Yes  Excellent    Excellent  Excellent
## 52        No     4.0              Yes  Excellent    Excellent  Excellent
#model1
names(exit)
## [1] "Attrition"        "Yrs_Exp"          "Work_Challenging" "Work_Envir"      
## [5] "Compensation"     "Tech_Exper"
model1 <- glm(Attrition~ Yrs_Exp+Work_Challenging+Work_Envir+Compensation+Tech_Exper,
              data = exit,
              family = "binomial")
summary(model1)
## 
## Call:
## glm(formula = Attrition ~ Yrs_Exp + Work_Challenging + Work_Envir + 
##     Compensation + Tech_Exper, family = "binomial", data = exit)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.37759  -0.20326   0.04508   0.22389   2.95410  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)          -1.1964     2.4077  -0.497   0.6193   
## Yrs_Exp               0.1320     0.5102   0.259   0.7959   
## Work_ChallengingYes  -3.4180     1.4091  -2.426   0.0153 * 
## Work_EnvirLow         4.6118     1.6783   2.748   0.0060 **
## CompensationLow       2.8160     1.3513   2.084   0.0372 * 
## Tech_ExperLow        -3.9598     1.7030  -2.325   0.0201 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71.779  on 51  degrees of freedom
## Residual deviance: 26.018  on 46  degrees of freedom
## AIC: 38.018
## 
## Number of Fisher Scoring iterations: 7
# Yrs_Exp dışında tüm değişkenler anlamlı
# Yrs_Exp değişkeninin çıkarıp tekrar model kuruyoruz

model2 <- glm(Attrition~ Work_Challenging+Work_Envir+Compensation+Tech_Exper,
              data = exit,
              family = "binomial")
summary(model2)
## 
## Call:
## glm(formula = Attrition ~ Work_Challenging + Work_Envir + Compensation + 
##     Tech_Exper, family = "binomial", data = exit)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.36759  -0.20050   0.05191   0.22273   2.86409  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)          -0.6216     0.9101  -0.683   0.4946   
## Work_ChallengingYes  -3.4632     1.4025  -2.469   0.0135 * 
## Work_EnvirLow         4.5215     1.5995   2.827   0.0047 **
## CompensationLow       2.7090     1.2542   2.160   0.0308 * 
## Tech_ExperLow        -3.8547     1.6065  -2.400   0.0164 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71.779  on 51  degrees of freedom
## Residual deviance: 26.086  on 47  degrees of freedom
## AIC: 36.086
## 
## Number of Fisher Scoring iterations: 7
# İki model arasında farklılık var mı?
# değişken çıktıktan sonra yeni modelde farklılık var mı?
# anova ile test ediyoruz.

anova(model2, model1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Attrition ~ Work_Challenging + Work_Envir + Compensation + Tech_Exper
## Model 2: Attrition ~ Yrs_Exp + Work_Challenging + Work_Envir + Compensation + 
##     Tech_Exper
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        47     26.086                     
## 2        46     26.018  1 0.067459   0.7951
# Chi değeri 0.7951>0.01(0.05) 
# İki model arasında farklılık yok, değişkeni çıkarabiliriz.
# model katsayıları için:
anova(model2, "PhiSq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Attrition
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev
## NULL                                51     71.779
## Work_Challenging  1  15.6908        50     56.089
## Work_Envir        1  17.4619        49     38.627
## Compensation      1   3.3265        48     35.300
## Tech_Exper        1   9.2142        47     26.086
coef(model2)
##         (Intercept) Work_ChallengingYes       Work_EnvirLow     CompensationLow 
##          -0.6215815          -3.4632331           4.5215090           2.7089540 
##       Tech_ExperLow 
##          -3.8547410
# negatif olanlar Attrition=Yes olasılığını düşürmekte.
# pozitif olanlar artırmakta
# ne kadar artırıp azalttığını görmek için üstelini alıyoruz:
exp(coef(model2))
##         (Intercept) Work_ChallengingYes       Work_EnvirLow     CompensationLow 
##          0.53709434          0.03132831         91.97428006         15.01356337 
##       Tech_ExperLow 
##          0.02117909
# Work_Env değişkenini (diğer değişkenle sabit kaldığında)
# bir birim artırdığımızda Attrition=Yes olasılığı 91.97 kere artmakta.
# çoklu bağlantıyı kontrol etmek istediğimizde:
library(car)
## Zorunlu paket yükleniyor: carData
vif(model2)
## Work_Challenging       Work_Envir     Compensation       Tech_Exper 
##         1.984868         2.461992         1.611694         1.562850
# 5'ten büyük değerler çoklu bağlantı olduğuna işaret eder.
# Değişkenleri kendi isteğimizle sınırlayarak sonuca bakmak istersek:
prediction <- data.frame(Yrs_Exp = 3,
                     Work_Challenging = "Yes",
                     Work_Envir = "Excellent",
                     Compensation = "Excellent",
                     Tech_Exper = "Low")
presults <- predict(model2, newdata = prediction, type = "response")
presults
##            1 
## 0.0003562379
# Burada sonuçları 0.5 ile kıyaslıyoruz:
# 0.5'den büyük ise 1'e yakın demektir. sonuç: Attrition=Yes 
# 0.5'den küçük ise, sonuç: Attrition=No.
# confusion matrix görmek için:
prediction2 <- predict(model2, type = "response")
table(exit$Attrition, prediction2 >0.5)
##      
##       FALSE TRUE
##   No     22    2
##   Yes     3   25
# modeli daha da geliştirebilir miyiz? 
# verisetini eğitim ve test setine bölelim.

library(caret)
## Zorunlu paket yükleniyor: ggplot2
## Zorunlu paket yükleniyor: lattice
set.seed(123)
Index <- createDataPartition(exit$Attrition, p=0.8, list = FALSE)

egitim <- exit[Index,]
test <- exit[-Index,]

summary(egitim)
##  Attrition    Yrs_Exp      Work_Challenging     Work_Envir    Compensation
##  No :20    Min.   :2.000   No :25           Excellent:23   Excellent:18   
##  Yes:23    1st Qu.:2.250   Yes:18           Low      :20   Low      :25   
##            Median :4.000                                                  
##            Mean   :3.581                                                  
##            3rd Qu.:4.500                                                  
##            Max.   :5.000                                                  
##      Tech_Exper
##  Excellent:36  
##  Low      : 7  
##                
##                
##                
## 
summary(test)
##  Attrition    Yrs_Exp      Work_Challenging     Work_Envir    Compensation
##  No :4     Min.   :2.000   No :3            Excellent:5    Excellent:3    
##  Yes:5     1st Qu.:2.500   Yes:6            Low      :4    Low      :6    
##            Median :3.000                                                  
##            Mean   :3.222                                                  
##            3rd Qu.:4.000                                                  
##            Max.   :5.000                                                  
##      Tech_Exper
##  Excellent:8   
##  Low      :1   
##                
##                
##                
## 

Üçüncü modeli de siz sonlandırın:)

model3