# Packages ----
# Every dependency is attached up front so that a missing package stops the
# script immediately instead of part-way through the analysis.
library(modeldata) # makes the `attrition` data frame available
library(e1071) # naiveBayes()
## Warning: package 'e1071' was built under R version 3.6.3
library(caret) # createDataPartition(), confusionMatrix()
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2

# Interactive-only helpers ----
# The help page and the spreadsheet viewer need a live session. Guarding them
# lets the rest of the script run unattended (e.g. through Rscript).
# print() is required because values are not auto-printed inside `{ }`.
if (interactive()) {
  print(help("attrition"))
  ## starting httpd help server ... done
  View(attrition)
}

# First look at the data ----
# Column names of the data set.
names(attrition)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EnvironmentSatisfaction" "Gender"
## [11] "HourlyRate" "JobInvolvement"
## [13] "JobLevel" "JobRole"
## [15] "JobSatisfaction" "MaritalStatus"
## [17] "MonthlyIncome" "MonthlyRate"
## [19] "NumCompaniesWorked" "OverTime"
## [21] "PercentSalaryHike" "PerformanceRating"
## [23] "RelationshipSatisfaction" "StockOptionLevel"
## [25] "TotalWorkingYears" "TrainingTimesLastYear"
## [27] "WorkLifeBalance" "YearsAtCompany"
## [29] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [31] "YearsWithCurrManager"

# Class balance of the outcome: "No" outnumbers "Yes" roughly five to one.
table(attrition$Attrition)
##
## No Yes
## 1233 237

# Storage type of every column (integer, factor or ordered factor).
str(attrition)
## 'data.frame': 1470 obs. of 31 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human_Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : Ord.factor w/ 5 levels "Below_College"<..: 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human_Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EnvironmentSatisfaction : Ord.factor w/ 4 levels "Low"<"Medium"<..: 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : Ord.factor w/ 4 levels "Low"<"Medium"<..: 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare_Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : Ord.factor w/ 4 levels "Low"<"Medium"<..: 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : Ord.factor w/ 4 levels "Low"<"Good"<"Excellent"<..: 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: Ord.factor w/ 4 levels "Low"<"Medium"<..: 1 4 2 3 4 3 1 2 2 2 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : Ord.factor w/ 4 levels "Bad"<"Good"<"Better"<..: 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
# Per-column overview of the full data set: quartiles and mean for the
# integer columns, level counts for the factor columns.
print(summary(attrition))
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :1233 Non-Travel : 150 Min. : 102.0
## 1st Qu.:30.00 Yes: 237 Travel_Frequently: 277 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :1043 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human_Resources : 63 Min. : 1.000 Below_College:170
## Research_Development:961 1st Qu.: 2.000 College :282
## Sales :446 Median : 7.000 Bachelor :572
## Mean : 9.193 Master :398
## 3rd Qu.:14.000 Doctor : 48
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender HourlyRate
## Human_Resources : 27 Low :284 Female:588 Min. : 30.00
## Life_Sciences :606 Medium :287 Male :882 1st Qu.: 48.00
## Marketing :159 High :453 Median : 66.00
## Medical :464 Very_High:446 Mean : 65.89
## Other : 82 3rd Qu.: 83.75
## Technical_Degree:132 Max. :100.00
##
## JobInvolvement JobLevel JobRole
## Low : 83 Min. :1.000 Sales_Executive :326
## Medium :375 1st Qu.:1.000 Research_Scientist :292
## High :868 Median :2.000 Laboratory_Technician :259
## Very_High:144 Mean :2.064 Manufacturing_Director :145
## 3rd Qu.:3.000 Healthcare_Representative:131
## Max. :5.000 Manager :102
## (Other) :215
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Low :289 Divorced:327 Min. : 1009 Min. : 2094
## Medium :280 Married :673 1st Qu.: 2911 1st Qu.: 8047
## High :442 Single :470 Median : 4919 Median :14236
## Very_High:459 Mean : 6503 Mean :14313
## 3rd Qu.: 8379 3rd Qu.:20462
## Max. :19999 Max. :26999
##
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## Min. :0.000 No :1054 Min. :11.00 Low : 0
## 1st Qu.:1.000 Yes: 416 1st Qu.:12.00 Good : 0
## Median :2.000 Median :14.00 Excellent :1244
## Mean :2.693 Mean :15.21 Outstanding: 226
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
##
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## Low :276 Min. :0.0000 Min. : 0.00
## Medium :303 1st Qu.:0.0000 1st Qu.: 6.00
## High :459 Median :1.0000 Median :10.00
## Very_High:432 Mean :0.7939 Mean :11.28
## 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Bad : 80 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 Good :344 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Better:893 Median : 5.000 Median : 3.000
## Mean :2.799 Best :153 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
# Train/test split ----
# Seed the RNG *before* the random partition so the same rows are selected on
# every run. Without this the split, and every number derived from it, changes
# each time the script is executed.
# NOTE(review): the recorded output below comes from a run without this seed,
# so re-running is expected to print different figures.
set.seed(123)

# Draw 80% of the rows for training. The recorded training summary shows the
# same No/Yes proportion as the full data, i.e. the draw is balanced on
# Attrition. list = FALSE returns the row indices as a matrix, not a list.
Index <- createDataPartition(attrition$Attrition, p = 0.8, list = FALSE)
train <- attrition[Index, ]
summary(train)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :987 Non-Travel :117 Min. : 102.0
## 1st Qu.:31.00 Yes:190 Travel_Frequently:223 1st Qu.: 464.0
## Median :36.00 Travel_Rarely :837 Median : 801.0
## Mean :37.14 Mean : 797.4
## 3rd Qu.:43.00 3rd Qu.:1147.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human_Resources : 53 Min. : 1.000 Below_College:125
## Research_Development:776 1st Qu.: 2.000 College :236
## Sales :348 Median : 7.000 Bachelor :456
## Mean : 9.105 Master :323
## 3rd Qu.:14.000 Doctor : 37
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender HourlyRate
## Human_Resources : 24 Low :215 Female:470 Min. : 30.00
## Life_Sciences :475 Medium :231 Male :707 1st Qu.: 48.00
## Marketing :128 High :362 Median : 66.00
## Medical :371 Very_High:369 Mean : 65.77
## Other : 68 3rd Qu.: 83.00
## Technical_Degree:111 Max. :100.00
##
## JobInvolvement JobLevel JobRole
## Low : 75 Min. :1.000 Sales_Executive :259
## Medium :299 1st Qu.:1.000 Research_Scientist :230
## High :680 Median :2.000 Laboratory_Technician :214
## Very_High:123 Mean :2.072 Manufacturing_Director :122
## 3rd Qu.:3.000 Healthcare_Representative:103
## Max. :5.000 Manager : 81
## (Other) :168
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Low :230 Divorced:263 Min. : 1009 Min. : 2094
## Medium :234 Married :544 1st Qu.: 2973 1st Qu.: 8192
## High :356 Single :370 Median : 4968 Median :14284
## Very_High:357 Mean : 6544 Mean :14409
## 3rd Qu.: 8346 3rd Qu.:20652
## Max. :19999 Max. :26999
##
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## Min. :0.000 No :846 Min. :11.00 Low : 0
## 1st Qu.:1.000 Yes:331 1st Qu.:12.00 Good : 0
## Median :2.000 Median :14.00 Excellent :992
## Mean :2.754 Mean :15.25 Outstanding:185
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
##
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## Low :221 Min. :0.0000 Min. : 0.0
## Medium :241 1st Qu.:0.0000 1st Qu.: 6.0
## High :364 Median :1.0000 Median :10.0
## Very_High:351 Mean :0.8165 Mean :11.4
## 3rd Qu.:1.0000 3rd Qu.:15.0
## Max. :3.0000 Max. :40.0
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.0 Bad : 66 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.0 Good :265 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.0 Better:715 Median : 5.000 Median : 3.000
## Mean :2.8 Best :131 Mean : 7.067 Mean : 4.282
## 3rd Qu.:3.0 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :6.0 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.212 Mean : 4.141
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
# Hold-out set: every row that was not drawn into the training partition.
# Index is a one-column matrix; flattening it gives a plain vector of row
# numbers to exclude.
test <- attrition[-as.vector(Index), ]
print(summary(test))
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :246 Non-Travel : 33 Min. : 107.0
## 1st Qu.:29.00 Yes: 47 Travel_Frequently: 54 1st Qu.: 472.0
## Median :35.00 Travel_Rarely :206 Median : 805.0
## Mean :36.04 Mean : 822.9
## 3rd Qu.:42.00 3rd Qu.:1206.0
## Max. :60.00 Max. :1498.0
##
## Department DistanceFromHome Education
## Human_Resources : 10 Min. : 1.000 Below_College: 45
## Research_Development:185 1st Qu.: 2.000 College : 46
## Sales : 98 Median : 7.000 Bachelor :116
## Mean : 9.546 Master : 75
## 3rd Qu.:15.000 Doctor : 11
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender HourlyRate
## Human_Resources : 3 Low :69 Female:118 Min. : 30.00
## Life_Sciences :131 Medium :56 Male :175 1st Qu.: 49.00
## Marketing : 31 High :91 Median : 67.00
## Medical : 93 Very_High:77 Mean : 66.38
## Other : 14 3rd Qu.: 85.00
## Technical_Degree: 21 Max. :100.00
##
## JobInvolvement JobLevel JobRole
## Low : 8 Min. :1.000 Sales_Executive :67
## Medium : 76 1st Qu.:1.000 Research_Scientist :62
## High :188 Median :2.000 Laboratory_Technician :45
## Very_High: 21 Mean :2.031 Healthcare_Representative:28
## 3rd Qu.:3.000 Sales_Representative :24
## Max. :5.000 Manufacturing_Director :23
## (Other) :44
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Low : 59 Divorced: 64 Min. : 1081 Min. : 2097
## Medium : 46 Married :129 1st Qu.: 2760 1st Qu.: 7909
## High : 86 Single :100 Median : 4695 Median :14120
## Very_High:102 Mean : 6338 Mean :13927
## 3rd Qu.: 8474 3rd Qu.:20165
## Max. :19847 Max. :26997
##
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## Min. :0.000 No :208 Min. :11.00 Low : 0
## 1st Qu.:1.000 Yes: 85 1st Qu.:12.00 Good : 0
## Median :1.000 Median :14.00 Excellent :252
## Mean :2.451 Mean :15.03 Outstanding: 41
## 3rd Qu.:4.000 3rd Qu.:17.00
## Max. :9.000 Max. :25.00
##
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## Low :55 Min. :0.0000 Min. : 0.0
## Medium :62 1st Qu.:0.0000 1st Qu.: 6.0
## High :95 Median :1.0000 Median : 9.0
## Very_High:81 Mean :0.7031 Mean :10.8
## 3rd Qu.:1.0000 3rd Qu.:15.0
## Max. :3.0000 Max. :37.0
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Bad : 14 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 Good : 79 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Better:178 Median : 5.000 Median : 3.000
## Mean :2.795 Best : 22 Mean : 6.771 Mean : 4.017
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :32.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.092 Mean : 4.051
## 3rd Qu.: 2.000 3rd Qu.: 7.000
## Max. :15.000 Max. :15.000
##
# Baseline model ----
# Naive Bayes classifier of Attrition on every other column of the training
# set. Printing the fit shows the class priors followed by one table per
# predictor (two unnamed numeric columns per class for integer predictors,
# level proportions per class for factors).
# NOTE(review): the seed is kept so the RNG state seen by any later code is
# unchanged; it does not make the train/test split above reproducible.
set.seed(123)
model1 <- naiveBayes(Attrition ~ ., data = train)
print(model1)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.8385726 0.1614274
##
## Conditional probabilities:
## Age
## Y [,1] [,2]
## No 37.79129 8.894183
## Yes 33.77368 9.769098
##
## BusinessTravel
## Y Non-Travel Travel_Frequently Travel_Rarely
## No 0.10840932 0.16514691 0.72644377
## Yes 0.05263158 0.31578947 0.63157895
##
## DailyRate
## Y [,1] [,2]
## No 809.5441 401.1480
## Yes 734.3316 399.6501
##
## Department
## Y Human_Resources Research_Development Sales
## No 0.04356636 0.67882472 0.27760892
## Yes 0.05263158 0.55789474 0.38947368
##
## DistanceFromHome
## Y [,1] [,2]
## No 8.761905 7.931057
## Yes 10.884211 8.531750
##
## Education
## Y Below_College College Bachelor Master Doctor
## No 0.10435664 0.20263425 0.38500507 0.27558257 0.03242148
## Yes 0.11578947 0.18947368 0.40000000 0.26842105 0.02631579
##
## EducationField
## Y Human_Resources Life_Sciences Marketing Medical Other
## No 0.01823708 0.41033435 0.10334347 0.32421479 0.05775076
## Yes 0.03157895 0.36842105 0.13684211 0.26842105 0.05789474
## EducationField
## Y Technical_Degree
## No 0.08611955
## Yes 0.13684211
##
## EnvironmentSatisfaction
## Y Low Medium High Very_High
## No 0.1580547 0.2006079 0.3171226 0.3242148
## Yes 0.3105263 0.1736842 0.2578947 0.2578947
##
## Gender
## Y Female Male
## No 0.4042553 0.5957447
## Yes 0.3736842 0.6263158
##
## HourlyRate
## Y [,1] [,2]
## No 65.83992 20.37998
## Yes 65.40000 19.87256
##
## JobInvolvement
## Y Low Medium High Very_High
## No 0.05167173 0.24721378 0.58763931 0.11347518
## Yes 0.12631579 0.28947368 0.52631579 0.05789474
##
## JobLevel
## Y [,1] [,2]
## No 2.149949 1.1140844
## Yes 1.668421 0.9598019
##
## JobRole
## Y Healthcare_Representative Human_Resources Laboratory_Technician
## No 0.09523810 0.03444782 0.17122594
## Yes 0.04736842 0.05263158 0.23684211
## JobRole
## Y Manager Manufacturing_Director Research_Director Research_Scientist
## No 0.07700101 0.11550152 0.06382979 0.19351570
## Yes 0.02631579 0.04210526 0.01052632 0.20526316
## JobRole
## Y Sales_Executive Sales_Representative
## No 0.21479230 0.03444782
## Yes 0.24736842 0.13157895
##
## JobSatisfaction
## Y Low Medium High Very_High
## No 0.1833840 0.1985816 0.2968592 0.3211753
## Yes 0.2578947 0.2000000 0.3315789 0.2105263
##
## MaritalStatus
## Y Divorced Married Single
## No 0.2421479 0.4751773 0.2826748
## Yes 0.1263158 0.3947368 0.4789474
##
## MonthlyIncome
## Y [,1] [,2]
## No 6857.368 4811.979
## Yes 4916.189 3730.297
##
## MonthlyRate
## Y [,1] [,2]
## No 14383.27 7060.171
## Yes 14543.31 7416.437
##
## NumCompaniesWorked
## Y [,1] [,2]
## No 2.725431 2.497583
## Yes 2.900000 2.669740
##
## OverTime
## Y No Yes
## No 0.7679838 0.2320162
## Yes 0.4631579 0.5368421
##
## PercentSalaryHike
## Y [,1] [,2]
## No 15.27153 3.651562
## Yes 15.16316 3.803328
##
## PerformanceRating
## Y Low Good Excellent Outstanding
## No 0.0000000 0.0000000 0.8429585 0.1570415
## Yes 0.0000000 0.0000000 0.8421053 0.1578947
##
## RelationshipSatisfaction
## Y Low Medium High Very_High
## No 0.1762918 0.2056738 0.3120567 0.3059777
## Yes 0.2473684 0.2000000 0.2947368 0.2578947
##
## StockOptionLevel
## Y [,1] [,2]
## No 0.8672746 0.8684304
## Yes 0.5526316 0.8878179
##
## TotalWorkingYears
## Y [,1] [,2]
## No 11.998987 7.738828
## Yes 8.278947 7.061415
##
## TrainingTimesLastYear
## Y [,1] [,2]
## No 2.824721 1.280481
## Yes 2.673684 1.181355
##
## WorkLifeBalance
## Y Bad Good Better Best
## No 0.04660588 0.22289767 0.62107396 0.10942249
## Yes 0.10526316 0.23684211 0.53684211 0.12105263
##
## YearsAtCompany
## Y [,1] [,2]
## No 7.400203 6.053088
## Yes 5.336842 6.313856
##
## YearsInCurrentRole
## Y [,1] [,2]
## No 4.531915 3.637441
## Yes 2.984211 3.242372
##
## YearsSinceLastPromotion
## Y [,1] [,2]
## No 2.242148 3.198175
## Yes 2.052632 3.263939
##
## YearsWithCurrManager
## Y [,1] [,2]
## No 4.379939 3.614895
## Yes 2.900000 3.132945
# Evaluate on the hold-out set ----
# Help page for naiveBayes(), requested in the conventional topic form and
# only when a live session is available to display it.
if (interactive()) {
  print(help("naiveBayes"))
}

# Predicted class label for every hold-out row.
predict1 <- predict(model1, newdata = test)

# confusionMatrix() reads a table as PREDICTIONS in rows and the REFERENCE
# (true) classes in columns. The table used to be built the other way round
# (truth in rows), which silently swapped Sensitivity/Specificity with
# Pos/Neg Pred Value and reported the predicted-"No" rate as the prevalence.
ctable <- table(Predicted = predict1, Actual = test$Attrition)
ctable
##          Actual
## Predicted  No Yes
##       No  196  15
##       Yes  50  32

# NOTE(review): the statistics below were recomputed by hand from the four
# counts recorded in the original run; re-run the script to regenerate them.
confusionMatrix(ctable)
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  No Yes
##       No  196  15
##       Yes  50  32
##
## Accuracy : 0.7782
## 95% CI : (0.7262, 0.8244)
## No Information Rate : 0.8396
## P-Value [Acc > NIR] : close to 1 (accuracy is below the NIR; not recomputed)
##
## Kappa : 0.367
##
## Mcnemar's Test P-Value : 2.474e-05
##
## Sensitivity : 0.7967
## Specificity : 0.6809
## Pos Pred Value : 0.9289
## Neg Pred Value : 0.3902
## Prevalence : 0.8396
## Detection Rate : 0.6689
## Detection Prevalence : 0.7201
## Balanced Accuracy : 0.7388
##
## 'Positive' Class : No
##
# Project
# Task 1
# Eliminate all numeric variables, then build the model and test it.
# Get the confusion matrix results.
# Task 2
# Convert the numeric variables to categorical ones, then build the model
# and test it. Get the confusion matrix results.
# Task 3
# Compare the confusion matrix results of the three models (the model above
# that uses all variables, plus the models from tasks 1 and 2).
# Which one is better?