library(ggplot2)
library(boot)
library(randomForest)
#setwd("/media/guerzhoy/Windows/STA303/projects/p2")
#setwd("/media/guerzhoy/Windows/STA303/lectures/lec11")
setwd("c:/STA303/lectures/lec11")
Let’s read in the data and rename the columns and values to something more readable data (note: you didn’t have to rename the values.)
german <- read.csv("german.data", sep = " ", header=F)
attr_info <- "Attribute 1: (qualitative)
AccountStatus
A11 : ... < 0 DM
A12 : 0 <= ... < 200 DM
A13 : ... >= 200 DM / salary assignments for at least 1 year
A14 : no checking account
Attribute 2: (numerical)
Duration
Attribute 3: (qualitative)
CreditHistory
A30 : no credits taken/ all credits paid back duly
A31 : all credits at this bank paid back duly
A32 : existing credits paid back duly till now
A33 : delay in paying off in the past
A34 : critical account/ other credits existing (not at this bank)
Attribute 4: (qualitative)
Purpose
A40 : car (new)
A41 : car (used)
A42 : furniture/equipment
A43 : radio/television
A44 : domestic appliances
A45 : repairs
A46 : education
A48 : retraining
A49 : business
A410 : others
Attribute 5: (numerical)
CreditAmount
Attribute 6: (qualitative)
Account
A61 : ... < 100 DM
A62 : 100 <= ... < 500 DM
A63 : 500 <= ... < 1000 DM
A64 : .. >= 1000 DM
A65 : unknown/ no savings account
Attribute 7: (qualitative)
EmploymentSince
A71 : unemployed
A72 : ... < 1 year
A73 : 1 <= ... < 4 years
A74 : 4 <= ... < 7 years
A75 : .. >= 7 years
Attribute 8: (numerical)
PrecentOfIncome
Attribute 9: (qualitative)
PersonalStatus
A91 : male : divorced/separated
A92 : female : divorced/separated/married
A93 : male : single
A94 : male : married/widowed
A95 : female : single
Attribute 10: (qualitative)
OtherDebtors
A101 : none
A102 : co-applicant
A103 : guarantor
Attribute 11: (numerical)
ResidenceSince
Attribute 12: (qualitative)
Property
A121 : real estate
A122 : if not A121 : building society savings agreement/ life insurance
A123 : if not A121/A122 : car or other, not in attribute 6
A124 : unknown / no property
Attribute 13: (numerical)
Age
Attribute 14: (qualitative)
OtherInstallPlans
A141 : bank
A142 : stores
A143 : none
Attribute 15: (qualitative)
Housing
A151 : rent
A152 : own
A153 : for free
Attribute 16: (numerical)
NumExistingCredits
Attribute 17: (qualitative)
Job
A171 : unemployed/ unskilled - non-resident
A172 : unskilled - resident
A173 : skilled employee / official
A174 : management/ self-employed/highly qualified employee/ officer
Attribute 18: (numerical)
NumMaintenance
Attribute 19: (qualitative)
Telephone
A191 : none
A192 : yes, registered under the customers name
Attribute 20: (qualitative)
ForeignWorker
A201 : yes
A202 : no
Attribute 21: (quantitative)
GoodCredit
1: 1
2: 0
"
attr_list <- as.list(as.vector(strsplit(attr_info, "Attribute ")[[1]])[2:22])
attr_names <- sapply(attr_list, function(attr_desc) strsplit(attr_desc, "\n")[[1]][2])
colnames(german) <- attr_names
for(i in 1:21){
attr_desc <- attr_list[[i]]
if(length(grep("qualitative",attr_desc))>0){
lines_all <- as.vector(strsplit(attr_desc, "\n")[[1]])
lines <- lines_all[3:(length(lines_all)-1)]
str_vals <- sapply(lines, function(line) strsplit(line, " : ")[[1]][2])
german[,i] <-as.numeric(german[,i])
for (val_i in 1:length(lines)){
german[,i] <- ifelse(german[,i]==val_i, str_vals[val_i], german[,i])
}
}
}
Here are the first three lines of german
now:
german[c(1, 2, 3),]
## AccountStatus Duration
## 1 ... < 0 DM 6
## 2 0 <= ... < 200 DM 48
## 3 no checking account 12
## CreditHistory
## 1 critical account/ other credits existing (not at this bank)
## 2 existing credits paid back duly till now
## 3 critical account/ other credits existing (not at this bank)
## Purpose CreditAmount Account
## 1 domestic appliances 1169 unknown/ no savings account
## 2 domestic appliances 5951 ... < 100 DM
## 3 retraining 2096 ... < 100 DM
## EmploymentSince PrecentOfIncome PersonalStatus OtherDebtors
## 1 .. >= 7 years 4 male none
## 2 1 <= ... < 4 years 2 female none
## 3 4 <= ... < 7 years 2 male none
## ResidenceSince Property Age OtherInstallPlans Housing
## 1 4 real estate 67 none own
## 2 2 real estate 22 none own
## 3 3 real estate 49 none own
## NumExistingCredits Job NumMaintenance
## 1 2 skilled employee / official 1
## 2 1 skilled employee / official 1
## 3 1 unskilled - resident 2
## Telephone ForeignWorker GoodCredit
## 1 yes, registered under the customers name yes 1
## 2 none yes 2
## 3 none yes 1
Now, let’s set things up so that Good Credit is 1, and Bad Credit is 0.
german[, "GoodCredit"] <- ifelse(german[, "GoodCredit"]==1, 1, 0)
First, let’s fit the full model (but with no variable transformations/interactions to see which variables seem to matter a lot
summary(glm(GoodCredit~., family=binomial, data=german))
##
## Call:
## glm(formula = GoodCredit ~ ., family = binomial, data = german)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5564 -0.7282 0.3858 0.7001 2.3117
##
## Coefficients:
## Estimate
## (Intercept) 3.937e+00
## AccountStatus... >= 200 DM / salary assignments for at least 1 year 9.730e-01
## AccountStatus0 <= ... < 200 DM 3.700e-01
## AccountStatusno checking account 1.729e+00
## Duration -2.846e-02
## CreditHistorycritical account/ other credits existing (not at this bank) 1.558e+00
## CreditHistorydelay in paying off in the past 9.856e-01
## CreditHistoryexisting credits paid back duly till now 7.085e-01
## CreditHistoryno credits taken/ all credits paid back duly 1.648e-01
## Purposecar (new) -1.848e+00
## Purposecar (used) -2.113e-01
## Purposedomestic appliances -9.688e-01
## Purposeeducation -1.663e+00
## Purposefurniture/equipment -3.882e-01
## Purposeothers -1.169e+00
## Purposeradio/television -1.080e+00
## Purposerepairs -1.321e+00
## Purposeretraining -1.885e+00
## CreditAmount -1.176e-04
## Account... < 100 DM -1.324e+00
## Account100 <= ... < 500 DM -9.870e-01
## Account500 <= ... < 1000 DM -9.358e-01
## Accountunknown/ no savings account -3.689e-01
## EmploymentSince... < 1 year -2.606e-01
## EmploymentSince1 <= ... < 4 years -1.107e-01
## EmploymentSince4 <= ... < 7 years 5.871e-01
## EmploymentSinceunemployed -2.762e-01
## PrecentOfIncome -3.010e-01
## PersonalStatusmale 3.643e-01
## OtherDebtorsguarantor 1.364e+00
## OtherDebtorsnone 3.869e-01
## ResidenceSince 7.259e-03
## Propertyif not A121/A122 6.924e-02
## Propertyreal estate 2.454e-01
## Propertyunknown / no property -4.311e-01
## Age 1.362e-02
## OtherInstallPlansnone 6.247e-01
## OtherInstallPlansstores 1.674e-01
## Housingown -2.616e-01
## Housingrent -7.268e-01
## NumExistingCredits -2.687e-01
## Jobskilled employee / official -5.707e-02
## Jobunemployed/ unskilled - non-resident 5.370e-01
## Jobunskilled - resident -2.452e-02
## NumMaintenance -1.549e-01
## Telephoneyes, registered under the customers name 2.982e-01
## ForeignWorkeryes -1.365e+00
## Std. Error
## (Intercept) 1.756e+00
## AccountStatus... >= 200 DM / salary assignments for at least 1 year 3.669e-01
## AccountStatus0 <= ... < 200 DM 2.166e-01
## AccountStatusno checking account 2.313e-01
## Duration 9.247e-03
## CreditHistorycritical account/ other credits existing (not at this bank) 4.345e-01
## CreditHistorydelay in paying off in the past 4.671e-01
## CreditHistoryexisting credits paid back duly till now 3.812e-01
## CreditHistoryno credits taken/ all credits paid back duly 5.466e-01
## Purposecar (new) 1.179e+00
## Purposecar (used) 1.219e+00
## Purposedomestic appliances 1.179e+00
## Purposeeducation 1.279e+00
## Purposefurniture/equipment 1.391e+00
## Purposeothers 1.199e+00
## Purposeradio/television 1.182e+00
## Purposerepairs 1.381e+00
## Purposeretraining 1.225e+00
## CreditAmount 4.376e-05
## Account... < 100 DM 5.230e-01
## Account100 <= ... < 500 DM 5.721e-01
## Account500 <= ... < 1000 DM 6.414e-01
## Accountunknown/ no savings account 5.622e-01
## EmploymentSince... < 1 year 2.929e-01
## EmploymentSince1 <= ... < 4 years 2.504e-01
## EmploymentSince4 <= ... < 7 years 3.005e-01
## EmploymentSinceunemployed 4.111e-01
## PrecentOfIncome 8.706e-02
## PersonalStatusmale 1.926e-01
## OtherDebtorsguarantor 5.648e-01
## OtherDebtorsnone 4.089e-01
## ResidenceSince 8.595e-02
## Propertyif not A121/A122 2.307e-01
## Propertyreal estate 2.515e-01
## Propertyunknown / no property 4.094e-01
## Age 9.078e-03
## OtherInstallPlansnone 2.380e-01
## OtherInstallPlansstores 4.100e-01
## Housingown 4.469e-01
## Housingrent 4.742e-01
## NumExistingCredits 1.888e-01
## Jobskilled employee / official 2.843e-01
## Jobunemployed/ unskilled - non-resident 6.586e-01
## Jobunskilled - resident 3.487e-01
## NumMaintenance 2.437e-01
## Telephoneyes, registered under the customers name 2.002e-01
## ForeignWorkeryes 6.201e-01
## z value
## (Intercept) 2.242
## AccountStatus... >= 200 DM / salary assignments for at least 1 year 2.652
## AccountStatus0 <= ... < 200 DM 1.708
## AccountStatusno checking account 7.473
## Duration -3.078
## CreditHistorycritical account/ other credits existing (not at this bank) 3.586
## CreditHistorydelay in paying off in the past 2.110
## CreditHistoryexisting credits paid back duly till now 1.858
## CreditHistoryno credits taken/ all credits paid back duly 0.302
## Purposecar (new) -1.568
## Purposecar (used) -0.173
## Purposedomestic appliances -0.822
## Purposeeducation -1.299
## Purposefurniture/equipment -0.279
## Purposeothers -0.975
## Purposeradio/television -0.914
## Purposerepairs -0.957
## Purposeretraining -1.539
## CreditAmount -2.688
## Account... < 100 DM -2.531
## Account100 <= ... < 500 DM -1.725
## Account500 <= ... < 1000 DM -1.459
## Accountunknown/ no savings account -0.656
## EmploymentSince... < 1 year -0.890
## EmploymentSince1 <= ... < 4 years -0.442
## EmploymentSince4 <= ... < 7 years 1.954
## EmploymentSinceunemployed -0.672
## PrecentOfIncome -3.457
## PersonalStatusmale 1.892
## OtherDebtorsguarantor 2.414
## OtherDebtorsnone 0.946
## ResidenceSince 0.084
## Propertyif not A121/A122 0.300
## Propertyreal estate 0.976
## Propertyunknown / no property -1.053
## Age 1.500
## OtherInstallPlansnone 2.625
## OtherInstallPlansstores 0.408
## Housingown -0.585
## Housingrent -1.533
## NumExistingCredits -1.423
## Jobskilled employee / official -0.201
## Jobunemployed/ unskilled - non-resident 0.815
## Jobunskilled - resident -0.070
## NumMaintenance -0.636
## Telephoneyes, registered under the customers name 1.490
## ForeignWorkeryes -2.201
## Pr(>|z|)
## (Intercept) 0.024937
## AccountStatus... >= 200 DM / salary assignments for at least 1 year 0.008004
## AccountStatus0 <= ... < 200 DM 0.087574
## AccountStatusno checking account 7.83e-14
## Duration 0.002087
## CreditHistorycritical account/ other credits existing (not at this bank) 0.000336
## CreditHistorydelay in paying off in the past 0.034843
## CreditHistoryexisting credits paid back duly till now 0.063106
## CreditHistoryno credits taken/ all credits paid back duly 0.762997
## Purposecar (new) 0.116989
## Purposecar (used) 0.862369
## Purposedomestic appliances 0.411277
## Purposeeducation 0.193772
## Purposefurniture/equipment 0.780092
## Purposeothers 0.329529
## Purposeradio/television 0.360828
## Purposerepairs 0.338602
## Purposeretraining 0.123815
## CreditAmount 0.007187
## Account... < 100 DM 0.011378
## Account100 <= ... < 500 DM 0.084512
## Account500 <= ... < 1000 DM 0.144544
## Accountunknown/ no savings account 0.511731
## EmploymentSince... < 1 year 0.373626
## EmploymentSince1 <= ... < 4 years 0.658353
## EmploymentSince4 <= ... < 7 years 0.050749
## EmploymentSinceunemployed 0.501744
## PrecentOfIncome 0.000545
## PersonalStatusmale 0.058530
## OtherDebtorsguarantor 0.015769
## OtherDebtorsnone 0.344136
## ResidenceSince 0.932691
## Propertyif not A121/A122 0.764069
## Propertyreal estate 0.329188
## Propertyunknown / no property 0.292365
## Age 0.133506
## OtherInstallPlansnone 0.008664
## OtherInstallPlansstores 0.683033
## Housingown 0.558245
## Housingrent 0.125360
## NumExistingCredits 0.154801
## Jobskilled employee / official 0.840897
## Jobunemployed/ unskilled - non-resident 0.414858
## Jobunskilled - resident 0.943954
## NumMaintenance 0.524975
## Telephoneyes, registered under the customers name 0.136250
## ForeignWorkeryes 0.027738
##
## (Intercept) *
## AccountStatus... >= 200 DM / salary assignments for at least 1 year **
## AccountStatus0 <= ... < 200 DM .
## AccountStatusno checking account ***
## Duration **
## CreditHistorycritical account/ other credits existing (not at this bank) ***
## CreditHistorydelay in paying off in the past *
## CreditHistoryexisting credits paid back duly till now .
## CreditHistoryno credits taken/ all credits paid back duly
## Purposecar (new)
## Purposecar (used)
## Purposedomestic appliances
## Purposeeducation
## Purposefurniture/equipment
## Purposeothers
## Purposeradio/television
## Purposerepairs
## Purposeretraining
## CreditAmount **
## Account... < 100 DM *
## Account100 <= ... < 500 DM .
## Account500 <= ... < 1000 DM
## Accountunknown/ no savings account
## EmploymentSince... < 1 year
## EmploymentSince1 <= ... < 4 years
## EmploymentSince4 <= ... < 7 years .
## EmploymentSinceunemployed
## PrecentOfIncome ***
## PersonalStatusmale .
## OtherDebtorsguarantor *
## OtherDebtorsnone
## ResidenceSince
## Propertyif not A121/A122
## Propertyreal estate
## Propertyunknown / no property
## Age
## OtherInstallPlansnone **
## OtherInstallPlansstores
## Housingown
## Housingrent
## NumExistingCredits
## Jobskilled employee / official
## Jobunemployed/ unskilled - non-resident
## Jobunskilled - resident
## NumMaintenance
## Telephoneyes, registered under the customers name
## ForeignWorkeryes *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1221.73 on 999 degrees of freedom
## Residual deviance: 901.58 on 953 degrees of freedom
## AIC: 995.58
##
## Number of Fisher Scoring iterations: 5
A quick and dirty way to fit the model is to use the variables which are significant.
Let’s split the data into a training and a test set.
set.seed(0)
idx <- sample(1000, 800)
train_set <- german[idx,]
test_set <- german[-idx,]
cost_classification <- function(r, pi) mean(abs(r-pi) > 0.5)
model1 <- glm(GoodCredit~AccountStatus+Duration+CreditHistory+CreditAmount+Account+EmploymentSince+PrecentOfIncome+PersonalStatus+OtherDebtors+ForeignWorker, family=binomial, data=train_set)
cost_classification(train_set$GoodCredit, predict(model1, train_set, type="response"))
## [1] 0.225
cost_classification(test_set$GoodCredit, predict(model1, test_set, type="response"))
## [1] 0.205
It doesn’t look like there is overfitting (in fact, the classification cost is a little smaller on the test set, just due to random variation). Let’s see how much better we can do on the training set if we include all the variables.
cost_classification(train_set$GoodCredit, predict(glm(GoodCredit~., family=binomial, data=train_set), train_set, type="response"))
## [1] 0.215
So it seems like we can’t do that much better.
Now, let’s fit a Random Forest model.
library(randomForest)
credit.rf <- randomForest(as.factor(train_set$GoodCredit)~as.factor(train_set$AccountStatus)+train_set$Duration+as.factor(train_set$CreditHistory)+train_set$CreditAmount+as.factor(train_set$Account)+as.factor(train_set$EmploymentSince)+train_set$PrecentOfIncome+as.factor(train_set$PersonalStatus)+as.factor(train_set$OtherDebtors)+as.factor(train_set$ForeignWorker), ntrees=10000)
Let’s now evaluate the costs.
train.pred.rf <- ifelse(as.character(predict(credit.rf, type="response"))=="1", 1, 0)
cost_classification(train_set$GoodCredit, train.pred.rf)
## [1] 0.2475
test.pred.rf <- ifelse(as.character(predict(credit.rf, new_data=test_set, type="response"))=="1", 1, 0)
cost_classification(test_set$GoodCredit, test.pred.rf)
## [1] 0.37125
In this case, we are not doing as well as logistic regression on the training set, and random forest overfit pretty badly. Random Forests generally do perform pretty well though!
Finally, let’s try some transformations of the variables and see where that gets us.
model2 <- glm(GoodCredit~AccountStatus+Duration+I(Duration^2)+I(log(1+Duration))+CreditHistory+CreditAmount+I(CreditAmount^2)+I(log(CreditAmount))+Account+EmploymentSince+PrecentOfIncome+I(PrecentOfIncome^2)+I(log(PrecentOfIncome))+PersonalStatus+OtherDebtors+ForeignWorker, family=binomial, data=train_set)
cost_classification(train_set$GoodCredit, predict(model2, train_set, type="response"))
## [1] 0.21875
cost_classification(test_set$GoodCredit, predict(model2, test_set, type="response"))
## [1] 0.2
Looks like we’re doing even better!