library(ggplot2)
library(boot)
library(randomForest)
#setwd("/media/guerzhoy/Windows/STA303/projects/p2")
#setwd("/media/guerzhoy/Windows/STA303/lectures/lec11")
setwd("c:/STA303/lectures/lec11")

Let’s read in the data and rename the columns and values to something more readable data (note: you didn’t have to rename the values.)

german <- read.csv("german.data", sep = " ", header=F)

attr_info <- "Attribute 1: (qualitative) 
AccountStatus
A11 : ... < 0 DM 
A12 : 0 <= ... < 200 DM 
A13 : ... >= 200 DM / salary assignments for at least 1 year 
A14 : no checking account 

Attribute 2: (numerical) 
Duration

Attribute 3: (qualitative) 
CreditHistory
A30 : no credits taken/ all credits paid back duly 
A31 : all credits at this bank paid back duly 
A32 : existing credits paid back duly till now 
A33 : delay in paying off in the past 
A34 : critical account/ other credits existing (not at this bank) 

Attribute 4: (qualitative) 
Purpose
A40 : car (new) 
A41 : car (used) 
A42 : furniture/equipment 
A43 : radio/television 
A44 : domestic appliances 
A45 : repairs 
A46 : education 
A48 : retraining 
A49 : business 
A410 : others 

Attribute 5: (numerical) 
CreditAmount

Attribute 6: (qualitative) 
Account
A61 : ... < 100 DM 
A62 : 100 <= ... < 500 DM 
A63 : 500 <= ... < 1000 DM 
A64 : .. >= 1000 DM 
A65 : unknown/ no savings account 

Attribute 7: (qualitative) 
EmploymentSince
A71 : unemployed 
A72 : ... < 1 year 
A73 : 1 <= ... < 4 years 
A74 : 4 <= ... < 7 years 
A75 : .. >= 7 years 

Attribute 8: (numerical) 
PrecentOfIncome

Attribute 9: (qualitative) 
PersonalStatus
A91 : male : divorced/separated 
A92 : female : divorced/separated/married 
A93 : male : single 
A94 : male : married/widowed 
A95 : female : single 

Attribute 10: (qualitative) 
OtherDebtors
A101 : none 
A102 : co-applicant 
A103 : guarantor 

Attribute 11: (numerical) 
ResidenceSince

Attribute 12: (qualitative) 
Property
A121 : real estate 
A122 : if not A121 : building society savings agreement/ life insurance 
A123 : if not A121/A122 : car or other, not in attribute 6 
A124 : unknown / no property 

Attribute 13: (numerical) 
Age

Attribute 14: (qualitative) 
OtherInstallPlans
A141 : bank 
A142 : stores 
A143 : none 

Attribute 15: (qualitative) 
Housing
A151 : rent 
A152 : own 
A153 : for free 

Attribute 16: (numerical) 
NumExistingCredits

Attribute 17: (qualitative) 
Job
A171 : unemployed/ unskilled - non-resident
A172 : unskilled - resident
A173 : skilled employee / official
A174 : management/ self-employed/highly qualified employee/ officer

Attribute 18: (numerical) 
NumMaintenance

Attribute 19: (qualitative) 
Telephone
A191 : none 
A192 : yes, registered under the customers name 

Attribute 20: (qualitative) 
ForeignWorker
A201 : yes 
A202 : no 

Attribute 21: (quantitative)
GoodCredit
1: 1
2: 0
"

attr_list <- as.list(as.vector(strsplit(attr_info, "Attribute ")[[1]])[2:22])

attr_names <- sapply(attr_list, function(attr_desc) strsplit(attr_desc, "\n")[[1]][2])
colnames(german) <- attr_names

for(i in 1:21){
  attr_desc <- attr_list[[i]]
  if(length(grep("qualitative",attr_desc))>0){
    lines_all <- as.vector(strsplit(attr_desc, "\n")[[1]])
    lines <- lines_all[3:(length(lines_all)-1)]
    str_vals <- sapply(lines, function(line) strsplit(line, " : ")[[1]][2])
    german[,i] <-as.numeric(german[,i])
    
    for (val_i in 1:length(lines)){
      german[,i] <- ifelse(german[,i]==val_i, str_vals[val_i], german[,i])
    }
  }
}

Here are the first three lines of german now:

german[c(1, 2, 3),]
##          AccountStatus Duration
## 1          ... < 0 DM         6
## 2   0 <= ... < 200 DM        48
## 3 no checking account        12
##                                                  CreditHistory
## 1 critical account/ other credits existing (not at this bank) 
## 2                    existing credits paid back duly till now 
## 3 critical account/ other credits existing (not at this bank) 
##                Purpose CreditAmount                      Account
## 1 domestic appliances          1169 unknown/ no savings account 
## 2 domestic appliances          5951                ... < 100 DM 
## 3          retraining          2096                ... < 100 DM 
##       EmploymentSince PrecentOfIncome PersonalStatus OtherDebtors
## 1      .. >= 7 years                4           male        none 
## 2 1 <= ... < 4 years                2         female        none 
## 3 4 <= ... < 7 years                2           male        none 
##   ResidenceSince     Property Age OtherInstallPlans Housing
## 1              4 real estate   67             none     own 
## 2              2 real estate   22             none     own 
## 3              3 real estate   49             none     own 
##   NumExistingCredits                         Job NumMaintenance
## 1                  2 skilled employee / official              1
## 2                  1 skilled employee / official              1
## 3                  1        unskilled - resident              2
##                                   Telephone ForeignWorker GoodCredit
## 1 yes, registered under the customers name           yes           1
## 2                                     none           yes           2
## 3                                     none           yes           1

Now, let’s set things up so that Good Credit is 1, and Bad Credit is 0.

german[, "GoodCredit"] <- ifelse(german[, "GoodCredit"]==1, 1, 0)

Logistic Regression Model

First, let’s fit the full model (but with no variable transformations/interactions to see which variables seem to matter a lot

summary(glm(GoodCredit~., family=binomial, data=german))
## 
## Call:
## glm(formula = GoodCredit ~ ., family = binomial, data = german)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5564  -0.7282   0.3858   0.7001   2.3117  
## 
## Coefficients:
##                                                                             Estimate
## (Intercept)                                                                3.937e+00
## AccountStatus... >= 200 DM / salary assignments for at least 1 year        9.730e-01
## AccountStatus0 <= ... < 200 DM                                             3.700e-01
## AccountStatusno checking account                                           1.729e+00
## Duration                                                                  -2.846e-02
## CreditHistorycritical account/ other credits existing (not at this bank)   1.558e+00
## CreditHistorydelay in paying off in the past                               9.856e-01
## CreditHistoryexisting credits paid back duly till now                      7.085e-01
## CreditHistoryno credits taken/ all credits paid back duly                  1.648e-01
## Purposecar (new)                                                          -1.848e+00
## Purposecar (used)                                                         -2.113e-01
## Purposedomestic appliances                                                -9.688e-01
## Purposeeducation                                                          -1.663e+00
## Purposefurniture/equipment                                                -3.882e-01
## Purposeothers                                                             -1.169e+00
## Purposeradio/television                                                   -1.080e+00
## Purposerepairs                                                            -1.321e+00
## Purposeretraining                                                         -1.885e+00
## CreditAmount                                                              -1.176e-04
## Account... < 100 DM                                                       -1.324e+00
## Account100 <= ... < 500 DM                                                -9.870e-01
## Account500 <= ... < 1000 DM                                               -9.358e-01
## Accountunknown/ no savings account                                        -3.689e-01
## EmploymentSince... < 1 year                                               -2.606e-01
## EmploymentSince1 <= ... < 4 years                                         -1.107e-01
## EmploymentSince4 <= ... < 7 years                                          5.871e-01
## EmploymentSinceunemployed                                                 -2.762e-01
## PrecentOfIncome                                                           -3.010e-01
## PersonalStatusmale                                                         3.643e-01
## OtherDebtorsguarantor                                                      1.364e+00
## OtherDebtorsnone                                                           3.869e-01
## ResidenceSince                                                             7.259e-03
## Propertyif not A121/A122                                                   6.924e-02
## Propertyreal estate                                                        2.454e-01
## Propertyunknown / no property                                             -4.311e-01
## Age                                                                        1.362e-02
## OtherInstallPlansnone                                                      6.247e-01
## OtherInstallPlansstores                                                    1.674e-01
## Housingown                                                                -2.616e-01
## Housingrent                                                               -7.268e-01
## NumExistingCredits                                                        -2.687e-01
## Jobskilled employee / official                                            -5.707e-02
## Jobunemployed/ unskilled - non-resident                                    5.370e-01
## Jobunskilled - resident                                                   -2.452e-02
## NumMaintenance                                                            -1.549e-01
## Telephoneyes, registered under the customers name                          2.982e-01
## ForeignWorkeryes                                                          -1.365e+00
##                                                                           Std. Error
## (Intercept)                                                                1.756e+00
## AccountStatus... >= 200 DM / salary assignments for at least 1 year        3.669e-01
## AccountStatus0 <= ... < 200 DM                                             2.166e-01
## AccountStatusno checking account                                           2.313e-01
## Duration                                                                   9.247e-03
## CreditHistorycritical account/ other credits existing (not at this bank)   4.345e-01
## CreditHistorydelay in paying off in the past                               4.671e-01
## CreditHistoryexisting credits paid back duly till now                      3.812e-01
## CreditHistoryno credits taken/ all credits paid back duly                  5.466e-01
## Purposecar (new)                                                           1.179e+00
## Purposecar (used)                                                          1.219e+00
## Purposedomestic appliances                                                 1.179e+00
## Purposeeducation                                                           1.279e+00
## Purposefurniture/equipment                                                 1.391e+00
## Purposeothers                                                              1.199e+00
## Purposeradio/television                                                    1.182e+00
## Purposerepairs                                                             1.381e+00
## Purposeretraining                                                          1.225e+00
## CreditAmount                                                               4.376e-05
## Account... < 100 DM                                                        5.230e-01
## Account100 <= ... < 500 DM                                                 5.721e-01
## Account500 <= ... < 1000 DM                                                6.414e-01
## Accountunknown/ no savings account                                         5.622e-01
## EmploymentSince... < 1 year                                                2.929e-01
## EmploymentSince1 <= ... < 4 years                                          2.504e-01
## EmploymentSince4 <= ... < 7 years                                          3.005e-01
## EmploymentSinceunemployed                                                  4.111e-01
## PrecentOfIncome                                                            8.706e-02
## PersonalStatusmale                                                         1.926e-01
## OtherDebtorsguarantor                                                      5.648e-01
## OtherDebtorsnone                                                           4.089e-01
## ResidenceSince                                                             8.595e-02
## Propertyif not A121/A122                                                   2.307e-01
## Propertyreal estate                                                        2.515e-01
## Propertyunknown / no property                                              4.094e-01
## Age                                                                        9.078e-03
## OtherInstallPlansnone                                                      2.380e-01
## OtherInstallPlansstores                                                    4.100e-01
## Housingown                                                                 4.469e-01
## Housingrent                                                                4.742e-01
## NumExistingCredits                                                         1.888e-01
## Jobskilled employee / official                                             2.843e-01
## Jobunemployed/ unskilled - non-resident                                    6.586e-01
## Jobunskilled - resident                                                    3.487e-01
## NumMaintenance                                                             2.437e-01
## Telephoneyes, registered under the customers name                          2.002e-01
## ForeignWorkeryes                                                           6.201e-01
##                                                                           z value
## (Intercept)                                                                 2.242
## AccountStatus... >= 200 DM / salary assignments for at least 1 year         2.652
## AccountStatus0 <= ... < 200 DM                                              1.708
## AccountStatusno checking account                                            7.473
## Duration                                                                   -3.078
## CreditHistorycritical account/ other credits existing (not at this bank)    3.586
## CreditHistorydelay in paying off in the past                                2.110
## CreditHistoryexisting credits paid back duly till now                       1.858
## CreditHistoryno credits taken/ all credits paid back duly                   0.302
## Purposecar (new)                                                           -1.568
## Purposecar (used)                                                          -0.173
## Purposedomestic appliances                                                 -0.822
## Purposeeducation                                                           -1.299
## Purposefurniture/equipment                                                 -0.279
## Purposeothers                                                              -0.975
## Purposeradio/television                                                    -0.914
## Purposerepairs                                                             -0.957
## Purposeretraining                                                          -1.539
## CreditAmount                                                               -2.688
## Account... < 100 DM                                                        -2.531
## Account100 <= ... < 500 DM                                                 -1.725
## Account500 <= ... < 1000 DM                                                -1.459
## Accountunknown/ no savings account                                         -0.656
## EmploymentSince... < 1 year                                                -0.890
## EmploymentSince1 <= ... < 4 years                                          -0.442
## EmploymentSince4 <= ... < 7 years                                           1.954
## EmploymentSinceunemployed                                                  -0.672
## PrecentOfIncome                                                            -3.457
## PersonalStatusmale                                                          1.892
## OtherDebtorsguarantor                                                       2.414
## OtherDebtorsnone                                                            0.946
## ResidenceSince                                                              0.084
## Propertyif not A121/A122                                                    0.300
## Propertyreal estate                                                         0.976
## Propertyunknown / no property                                              -1.053
## Age                                                                         1.500
## OtherInstallPlansnone                                                       2.625
## OtherInstallPlansstores                                                     0.408
## Housingown                                                                 -0.585
## Housingrent                                                                -1.533
## NumExistingCredits                                                         -1.423
## Jobskilled employee / official                                             -0.201
## Jobunemployed/ unskilled - non-resident                                     0.815
## Jobunskilled - resident                                                    -0.070
## NumMaintenance                                                             -0.636
## Telephoneyes, registered under the customers name                           1.490
## ForeignWorkeryes                                                           -2.201
##                                                                           Pr(>|z|)
## (Intercept)                                                               0.024937
## AccountStatus... >= 200 DM / salary assignments for at least 1 year       0.008004
## AccountStatus0 <= ... < 200 DM                                            0.087574
## AccountStatusno checking account                                          7.83e-14
## Duration                                                                  0.002087
## CreditHistorycritical account/ other credits existing (not at this bank)  0.000336
## CreditHistorydelay in paying off in the past                              0.034843
## CreditHistoryexisting credits paid back duly till now                     0.063106
## CreditHistoryno credits taken/ all credits paid back duly                 0.762997
## Purposecar (new)                                                          0.116989
## Purposecar (used)                                                         0.862369
## Purposedomestic appliances                                                0.411277
## Purposeeducation                                                          0.193772
## Purposefurniture/equipment                                                0.780092
## Purposeothers                                                             0.329529
## Purposeradio/television                                                   0.360828
## Purposerepairs                                                            0.338602
## Purposeretraining                                                         0.123815
## CreditAmount                                                              0.007187
## Account... < 100 DM                                                       0.011378
## Account100 <= ... < 500 DM                                                0.084512
## Account500 <= ... < 1000 DM                                               0.144544
## Accountunknown/ no savings account                                        0.511731
## EmploymentSince... < 1 year                                               0.373626
## EmploymentSince1 <= ... < 4 years                                         0.658353
## EmploymentSince4 <= ... < 7 years                                         0.050749
## EmploymentSinceunemployed                                                 0.501744
## PrecentOfIncome                                                           0.000545
## PersonalStatusmale                                                        0.058530
## OtherDebtorsguarantor                                                     0.015769
## OtherDebtorsnone                                                          0.344136
## ResidenceSince                                                            0.932691
## Propertyif not A121/A122                                                  0.764069
## Propertyreal estate                                                       0.329188
## Propertyunknown / no property                                             0.292365
## Age                                                                       0.133506
## OtherInstallPlansnone                                                     0.008664
## OtherInstallPlansstores                                                   0.683033
## Housingown                                                                0.558245
## Housingrent                                                               0.125360
## NumExistingCredits                                                        0.154801
## Jobskilled employee / official                                            0.840897
## Jobunemployed/ unskilled - non-resident                                   0.414858
## Jobunskilled - resident                                                   0.943954
## NumMaintenance                                                            0.524975
## Telephoneyes, registered under the customers name                         0.136250
## ForeignWorkeryes                                                          0.027738
##                                                                              
## (Intercept)                                                               *  
## AccountStatus... >= 200 DM / salary assignments for at least 1 year       ** 
## AccountStatus0 <= ... < 200 DM                                            .  
## AccountStatusno checking account                                          ***
## Duration                                                                  ** 
## CreditHistorycritical account/ other credits existing (not at this bank)  ***
## CreditHistorydelay in paying off in the past                              *  
## CreditHistoryexisting credits paid back duly till now                     .  
## CreditHistoryno credits taken/ all credits paid back duly                    
## Purposecar (new)                                                             
## Purposecar (used)                                                            
## Purposedomestic appliances                                                   
## Purposeeducation                                                             
## Purposefurniture/equipment                                                   
## Purposeothers                                                                
## Purposeradio/television                                                      
## Purposerepairs                                                               
## Purposeretraining                                                            
## CreditAmount                                                              ** 
## Account... < 100 DM                                                       *  
## Account100 <= ... < 500 DM                                                .  
## Account500 <= ... < 1000 DM                                                  
## Accountunknown/ no savings account                                           
## EmploymentSince... < 1 year                                                  
## EmploymentSince1 <= ... < 4 years                                            
## EmploymentSince4 <= ... < 7 years                                         .  
## EmploymentSinceunemployed                                                    
## PrecentOfIncome                                                           ***
## PersonalStatusmale                                                        .  
## OtherDebtorsguarantor                                                     *  
## OtherDebtorsnone                                                             
## ResidenceSince                                                               
## Propertyif not A121/A122                                                     
## Propertyreal estate                                                          
## Propertyunknown / no property                                                
## Age                                                                          
## OtherInstallPlansnone                                                     ** 
## OtherInstallPlansstores                                                      
## Housingown                                                                   
## Housingrent                                                                  
## NumExistingCredits                                                           
## Jobskilled employee / official                                               
## Jobunemployed/ unskilled - non-resident                                      
## Jobunskilled - resident                                                      
## NumMaintenance                                                               
## Telephoneyes, registered under the customers name                            
## ForeignWorkeryes                                                          *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1221.73  on 999  degrees of freedom
## Residual deviance:  901.58  on 953  degrees of freedom
## AIC: 995.58
## 
## Number of Fisher Scoring iterations: 5

A quick and dirty way to fit the model is to use the variables which are significant.

Let’s split the data into a training and a test set.

set.seed(0)
idx <- sample(1000, 800)
train_set <- german[idx,]
test_set <- german[-idx,]
cost_classification <- function(r, pi) mean(abs(r-pi) > 0.5)

model1 <- glm(GoodCredit~AccountStatus+Duration+CreditHistory+CreditAmount+Account+EmploymentSince+PrecentOfIncome+PersonalStatus+OtherDebtors+ForeignWorker, family=binomial, data=train_set)
cost_classification(train_set$GoodCredit, predict(model1, train_set, type="response"))
## [1] 0.225
cost_classification(test_set$GoodCredit, predict(model1, test_set, type="response"))
## [1] 0.205

It doesn’t look like there is overfitting (in fact, the classification cost is a little smaller on the test set, just due to random variation). Let’s see how much better we can do on the training set if we include all the variables.

cost_classification(train_set$GoodCredit, predict(glm(GoodCredit~., family=binomial, data=train_set), train_set, type="response"))
## [1] 0.215

So it seems like we can’t do that much better.

Now, let’s fit a Random Forest model.

library(randomForest)
credit.rf <- randomForest(as.factor(train_set$GoodCredit)~as.factor(train_set$AccountStatus)+train_set$Duration+as.factor(train_set$CreditHistory)+train_set$CreditAmount+as.factor(train_set$Account)+as.factor(train_set$EmploymentSince)+train_set$PrecentOfIncome+as.factor(train_set$PersonalStatus)+as.factor(train_set$OtherDebtors)+as.factor(train_set$ForeignWorker), ntrees=10000)

Let’s now evaluate the costs.

train.pred.rf <- ifelse(as.character(predict(credit.rf, type="response"))=="1", 1, 0)
cost_classification(train_set$GoodCredit, train.pred.rf)
## [1] 0.2475
test.pred.rf <- ifelse(as.character(predict(credit.rf, new_data=test_set, type="response"))=="1", 1, 0)
cost_classification(test_set$GoodCredit, test.pred.rf)
## [1] 0.37125

In this case, we are not doing as well as logistic regression on the training set, and random forest overfit pretty badly. Random Forests generally do perform pretty well though!

Finally, let’s try some transformations of the variables and see where that gets us.

model2 <- glm(GoodCredit~AccountStatus+Duration+I(Duration^2)+I(log(1+Duration))+CreditHistory+CreditAmount+I(CreditAmount^2)+I(log(CreditAmount))+Account+EmploymentSince+PrecentOfIncome+I(PrecentOfIncome^2)+I(log(PrecentOfIncome))+PersonalStatus+OtherDebtors+ForeignWorker, family=binomial, data=train_set)
cost_classification(train_set$GoodCredit, predict(model2, train_set, type="response"))
## [1] 0.21875
cost_classification(test_set$GoodCredit, predict(model2, test_set, type="response"))
## [1] 0.2

Looks like we’re doing even better!