This data is from a SPSS Dataset and used for demonstration. This is intended to illstrate the use of a R Markdown Notebook for presentation.
The data was exported to csv and used with R and Rattle. For more information visit www.thinkqlik.com
bank_loan_train<-read.csv("c:/efrayer/R_Projects/BankLoan/data/bankloan_train.csv")
summary(bank_loan_train)
ï..record_id age ed employ
Min. : 1.0 Min. :20.00 Min. :1.000 Min. : 0.000
1st Qu.:175.8 1st Qu.:29.00 1st Qu.:1.000 1st Qu.: 3.000
Median :350.5 Median :34.00 Median :1.000 Median : 7.000
Mean :350.5 Mean :34.86 Mean :1.723 Mean : 8.389
3rd Qu.:525.2 3rd Qu.:40.00 3rd Qu.:2.000 3rd Qu.:12.000
Max. :700.0 Max. :56.00 Max. :5.000 Max. :31.000
address income debtinc creddebt
Min. : 0.000 Min. : 14.0 Min. : 0.40 Min. : 0.010
1st Qu.: 3.000 1st Qu.: 24.0 1st Qu.: 5.00 1st Qu.: 0.370
Median : 7.000 Median : 34.0 Median : 8.60 Median : 0.855
Mean : 8.279 Mean : 45.6 Mean :10.26 Mean : 1.553
3rd Qu.:12.000 3rd Qu.: 55.0 3rd Qu.:14.12 3rd Qu.: 1.905
Max. :34.000 Max. :446.0 Max. :41.30 Max. :20.560
othdebt default preddef1
Min. : 0.050 Min. :0.0000 Min. :0.00052
1st Qu.: 1.048 1st Qu.:0.0000 1st Qu.:0.04890
Median : 1.985 Median :0.0000 Median :0.17574
Mean : 3.058 Mean :0.2614 Mean :0.26143
3rd Qu.: 3.928 3rd Qu.:1.0000 3rd Qu.:0.41882
Max. :27.030 Max. :1.0000 Max. :0.99940
bank_loan_glm=glm(data=bank_loan_train,default~
address+
employ+
debtinc+
creddebt
,family=binomial())
summary(bank_loan_glm)
Call:
glm(formula = default ~ address + employ + debtinc + creddebt,
family = binomial(), data = bank_loan_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4483 -0.6396 -0.3108 0.2583 2.8496
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.79107 0.25154 -3.145 0.00166 **
address -0.08122 0.01960 -4.144 3.41e-05 ***
employ -0.24258 0.02806 -8.646 < 2e-16 ***
debtinc 0.08827 0.01854 4.760 1.93e-06 ***
creddebt 0.57290 0.08725 6.566 5.17e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 804.36 on 699 degrees of freedom
Residual deviance: 556.74 on 695 degrees of freedom
AIC: 566.74
Number of Fisher Scoring iterations: 6
anova(bank_loan_glm)
Analysis of Deviance Table
Model: binomial, link: logit
Response: default
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev
NULL 699 804.36
address 1 20.452 698 783.91
employ 1 49.798 697 734.11
debtinc 1 111.762 696 622.35
creddebt 1 65.613 695 556.74
coefficients(bank_loan_glm)
(Intercept) address employ debtinc creddebt
-0.79107079 -0.08122146 -0.24258492 0.08826530 0.57289682
Evaluate model performance on the training dataset.
ROC Curve: requires the ROCR package.
library(ROCR)
# ROC Curve: requires the ggplot2 package.
library(ggplot2, quietly=TRUE)
RStudio Community is a great place to get help:
https://community.rstudio.com/c/tidyverse.
crs$input <- c("employ", "address", "debtinc", "creddebt")
crs$numeric <- c("employ", "address", "debtinc", "creddebt")
crs$categoric <- NULL
crs$target <- "default"
crs$glm <- glm(default ~ .,
data=crs$dataset[, c(crs$input, crs$target)],
family=binomial(link="logit"))
crs$pr <- predict(crs$glm,
type = "response",
newdata = crs$dataset[,c(crs$input, crs$target)])
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[,c(crs$input, crs$target)]$default)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
pe <- performance(pred, "tpr", "fpr")
au <- performance(pred, "auc")@y.values[[1]]
pd <- data.frame(fpr=unlist(pe@x.values), tpr=unlist(pe@y.values))
p <- ggplot(pd, aes(x=fpr, y=tpr))
p <- p + geom_line(colour="red")
p <- p + xlab("False Positive Rate") + ylab("True Positive Rate")
p <- p + ggtitle("ROC Curve Linear bankloan_train.csv [**train**] default")
p <- p + theme(plot.title=element_text(size=10))
p <- p + geom_line(data=data.frame(), aes(x=c(0,1), y=c(0,1)), colour="grey")
p <- p + annotate("text", x=0.50, y=0.00, hjust=0, vjust=0, size=5,
label=paste("AUC =", round(au, 2)))
print(p)
# Calculate the area under the curve for the plot.
# Remove observations with missing target.
no.miss <- na.omit(crs$dataset[,c(crs$input, crs$target)]$default)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL
if (length(miss.list))
{
pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
pred <- prediction(crs$pr, no.miss)
}
performance(pred, "auc")
An object of class "performance"
Slot "x.name":
[1] "None"
Slot "y.name":
[1] "Area under the ROC curve"
Slot "alpha.name":
[1] "none"
Slot "x.values":
list()
Slot "y.values":
[[1]]
[1] 0.8556193
Slot "alpha.values":
list()