# Attrition hedef değişken
exit <- read.csv("attr.txt", header = TRUE, sep = ",",stringsAsFactors = TRUE)
#View(exit)
summary(exit)
## Attrition Yrs_Exp Work_Challenging Work_Envir Compensation
## No :24 Min. :2.000 No :28 Excellent:28 Excellent:21
## Yes:28 1st Qu.:2.500 Yes:24 Low :24 Low :31
## Median :4.000
## Mean :3.519
## 3rd Qu.:4.500
## Max. :5.000
## Tech_Exper
## Excellent:44
## Low : 8
##
##
##
##
str(exit)
## 'data.frame': 52 obs. of 6 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 2 2 2 ...
## $ Yrs_Exp : num 2.5 2 2.5 2 2 2 2 2.5 2 3 ...
## $ Work_Challenging: Factor w/ 2 levels "No","Yes": 1 2 2 1 2 1 1 1 1 2 ...
## $ Work_Envir : Factor w/ 2 levels "Excellent","Low": 2 1 1 1 2 2 1 2 1 2 ...
## $ Compensation : Factor w/ 2 levels "Excellent","Low": 2 1 2 2 2 2 1 1 2 1 ...
## $ Tech_Exper : Factor w/ 2 levels "Excellent","Low": 1 1 1 1 2 1 2 1 1 1 ...
head(exit,10)
## Attrition Yrs_Exp Work_Challenging Work_Envir Compensation Tech_Exper
## 1 Yes 2.5 No Low Low Excellent
## 2 No 2.0 Yes Excellent Excellent Excellent
## 3 No 2.5 Yes Excellent Low Excellent
## 4 Yes 2.0 No Excellent Low Excellent
## 5 No 2.0 Yes Low Low Low
## 6 Yes 2.0 No Low Low Excellent
## 7 No 2.0 No Excellent Excellent Low
## 8 Yes 2.5 No Low Excellent Excellent
## 9 Yes 2.0 No Excellent Low Excellent
## 10 Yes 3.0 Yes Low Excellent Excellent
tail(exit)
## Attrition Yrs_Exp Work_Challenging Work_Envir Compensation Tech_Exper
## 47 No 4.0 Yes Excellent Excellent Excellent
## 48 No 4.5 No Excellent Low Low
## 49 Yes 5.0 No Excellent Excellent Excellent
## 50 No 5.0 No Excellent Excellent Excellent
## 51 Yes 2.0 Yes Excellent Excellent Excellent
## 52 No 4.0 Yes Excellent Excellent Excellent
#model1
names(exit)
## [1] "Attrition" "Yrs_Exp" "Work_Challenging" "Work_Envir"
## [5] "Compensation" "Tech_Exper"
model1 <- glm(Attrition~ Yrs_Exp+Work_Challenging+Work_Envir+Compensation+Tech_Exper,
data = exit,
family = "binomial")
summary(model1)
##
## Call:
## glm(formula = Attrition ~ Yrs_Exp + Work_Challenging + Work_Envir +
## Compensation + Tech_Exper, family = "binomial", data = exit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.37759 -0.20326 0.04508 0.22389 2.95410
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1964 2.4077 -0.497 0.6193
## Yrs_Exp 0.1320 0.5102 0.259 0.7959
## Work_ChallengingYes -3.4180 1.4091 -2.426 0.0153 *
## Work_EnvirLow 4.6118 1.6783 2.748 0.0060 **
## CompensationLow 2.8160 1.3513 2.084 0.0372 *
## Tech_ExperLow -3.9598 1.7030 -2.325 0.0201 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71.779 on 51 degrees of freedom
## Residual deviance: 26.018 on 46 degrees of freedom
## AIC: 38.018
##
## Number of Fisher Scoring iterations: 7
# Yrs_Exp dışında tüm değişkenler anlamlı
# Yrs_Exp değişkeninin çıkarıp tekrar model kuruyoruz
model2 <- glm(Attrition~ Work_Challenging+Work_Envir+Compensation+Tech_Exper,
data = exit,
family = "binomial")
summary(model2)
##
## Call:
## glm(formula = Attrition ~ Work_Challenging + Work_Envir + Compensation +
## Tech_Exper, family = "binomial", data = exit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.36759 -0.20050 0.05191 0.22273 2.86409
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6216 0.9101 -0.683 0.4946
## Work_ChallengingYes -3.4632 1.4025 -2.469 0.0135 *
## Work_EnvirLow 4.5215 1.5995 2.827 0.0047 **
## CompensationLow 2.7090 1.2542 2.160 0.0308 *
## Tech_ExperLow -3.8547 1.6065 -2.400 0.0164 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71.779 on 51 degrees of freedom
## Residual deviance: 26.086 on 47 degrees of freedom
## AIC: 36.086
##
## Number of Fisher Scoring iterations: 7
# İki model arasında farklılık var mı?
# değişken çıktıktan sonra yeni modelde farklılık var mı?
# anova ile test ediyoruz.
anova(model2, model1, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Attrition ~ Work_Challenging + Work_Envir + Compensation + Tech_Exper
## Model 2: Attrition ~ Yrs_Exp + Work_Challenging + Work_Envir + Compensation +
## Tech_Exper
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 47 26.086
## 2 46 26.018 1 0.067459 0.7951
# Chi değeri 0.7951>0.01(0.05)
# İki model arasında farklılık yok, değişkeni çıkarabiliriz.
# model katsayıları için:
anova(model2, "PhiSq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Attrition
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 51 71.779
## Work_Challenging 1 15.6908 50 56.089
## Work_Envir 1 17.4619 49 38.627
## Compensation 1 3.3265 48 35.300
## Tech_Exper 1 9.2142 47 26.086
coef(model2)
## (Intercept) Work_ChallengingYes Work_EnvirLow CompensationLow
## -0.6215815 -3.4632331 4.5215090 2.7089540
## Tech_ExperLow
## -3.8547410
# negatif olanlar Attrition=Yes olasılığını düşürmekte.
# pozitif olanlar artırmakta
# ne kadar artırıp azalttığını görmek için üstelini alıyoruz:
exp(coef(model2))
## (Intercept) Work_ChallengingYes Work_EnvirLow CompensationLow
## 0.53709434 0.03132831 91.97428006 15.01356337
## Tech_ExperLow
## 0.02117909
# Work_Env değişkenini (diğer değişkenle sabit kaldığında)
# bir birim artırdığımızda Attrition=Yes olasılığı 91.97 kere artmakta.
# çoklu bağlantıyı kontrol etmek istediğimizde:
library(car)
## Zorunlu paket yükleniyor: carData
vif(model2)
## Work_Challenging Work_Envir Compensation Tech_Exper
## 1.984868 2.461992 1.611694 1.562850
# 5'ten büyük değerler çoklu bağlantı olduğuna işaret eder.
# Değişkenleri kendi isteğimizle sınırlayarak sonuca bakmak istersek:
prediction <- data.frame(Yrs_Exp = 3,
Work_Challenging = "Yes",
Work_Envir = "Excellent",
Compensation = "Excellent",
Tech_Exper = "Low")
presults <- predict(model2, newdata = prediction, type = "response")
presults
## 1
## 0.0003562379
# Burada sonuçları 0.5 ile kıyaslıyoruz:
# 0.5'den büyük ise 1'e yakın demektir. sonuç: Attrition=Yes
# 0.5'den küçük ise, sonuç: Attrition=No.
# confusion matrix görmek için:
prediction2 <- predict(model2, type = "response")
table(exit$Attrition, prediction2 >0.5)
##
## FALSE TRUE
## No 22 2
## Yes 3 25
# modeli daha da geliştirebilir miyiz?
# verisetini eğitim ve test setine bölelim.
library(caret)
## Zorunlu paket yükleniyor: ggplot2
## Zorunlu paket yükleniyor: lattice
set.seed(123)
Index <- createDataPartition(exit$Attrition, p=0.8, list = FALSE)
egitim <- exit[Index,]
test <- exit[-Index,]
summary(egitim)
## Attrition Yrs_Exp Work_Challenging Work_Envir Compensation
## No :20 Min. :2.000 No :25 Excellent:23 Excellent:18
## Yes:23 1st Qu.:2.250 Yes:18 Low :20 Low :25
## Median :4.000
## Mean :3.581
## 3rd Qu.:4.500
## Max. :5.000
## Tech_Exper
## Excellent:36
## Low : 7
##
##
##
##
summary(test)
## Attrition Yrs_Exp Work_Challenging Work_Envir Compensation
## No :4 Min. :2.000 No :3 Excellent:5 Excellent:3
## Yes:5 1st Qu.:2.500 Yes:6 Low :4 Low :6
## Median :3.000
## Mean :3.222
## 3rd Qu.:4.000
## Max. :5.000
## Tech_Exper
## Excellent:8
## Low :1
##
##
##
##
model3