library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)
A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of CHD. Many of the CHD positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their CHD event. In some cases the measurements were made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.
Load the data. There are 10 variables.
sbp
: systolic blood pressure
tobacco
: cumulative tobacco (kg)
ldl
: low densiity lipoprotein cholesterol
adiposity
: adiposity
famhist
: family history of heart disease (Present,
Absent)
typea
: type-A behavior
obesity
: obesity
alcohol
: current alcohol consumption
age
: age at onset
chd
response, coronary heart disease
library(GGally)
data = read.csv("https://hastie.su.domains/ElemStatLearn/datasets/SAheart.data", header = T, sep = ",") %>%
select(-row.names)
head(data)
sbp | tobacco | ldl | adiposity | famhist | typea | obesity | alcohol | age | chd |
---|---|---|---|---|---|---|---|---|---|
160 | 12.00 | 5.73 | 23.11 | Present | 49 | 25.30 | 97.20 | 52 | 1 |
144 | 0.01 | 4.41 | 28.61 | Absent | 55 | 28.87 | 2.06 | 63 | 1 |
118 | 0.08 | 3.48 | 32.28 | Present | 52 | 29.14 | 3.81 | 46 | 0 |
170 | 7.50 | 6.41 | 38.03 | Present | 51 | 31.99 | 24.26 | 58 | 1 |
134 | 13.60 | 3.50 | 27.78 | Present | 60 | 25.99 | 57.34 | 49 | 1 |
132 | 6.20 | 6.47 | 36.21 | Present | 62 | 30.77 | 14.14 | 45 | 0 |
dim(data)
## [1] 462 10
table(data$chd)
##
## 0 1
## 302 160
X = data %>% select(-chd)
Y = data %>% select(chd) %>% pull()
n = nrow(X)
p = ncol(X)
Pairwise plots.
ggpairs(data, columns = 1:(ncol(data) - 1), aes(colour = as.factor(chd)))
Age vs chd plot. Why not just using linear regression?
ggplot(data, aes(age, chd))+
geom_point()+
geom_smooth(method = "lm")
ggplot(data, aes(age, chd))+
geom_point()+
geom_smooth(method = "glm", method.args = list(family = "binomial"))
Split the data into train and test sets.
train = sample(1:n, n*0.8)
Xtrain = X[train,]
Ytrain = Y[train]
Xtest = X[-train,]
Ytest = Y[-train]
Fit logistic regression.
GLM = glm(chd ~ ., data = data[train,], family = "binomial")
summary(GLM)
##
## Call:
## glm(formula = chd ~ ., family = "binomial", data = data[train,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.269136 1.529839 -4.752 2.02e-06 ***
## sbp 0.004088 0.006609 0.619 0.536210
## tobacco 0.080000 0.029607 2.702 0.006892 **
## ldl 0.156760 0.067582 2.320 0.020366 *
## adiposity 0.006861 0.034321 0.200 0.841547
## famhistPresent 0.787333 0.257567 3.057 0.002237 **
## typea 0.049869 0.014619 3.411 0.000647 ***
## obesity -0.029012 0.050846 -0.571 0.568277
## alcohol 0.001844 0.004782 0.386 0.699776
## age 0.054363 0.013982 3.888 0.000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 473.8 on 368 degrees of freedom
## Residual deviance: 371.9 on 359 degrees of freedom
## AIC: 391.9
##
## Number of Fisher Scoring iterations: 5
GLM$coefficients
## (Intercept) sbp tobacco ldl adiposity
## -7.269135828 0.004088292 0.080000161 0.156759570 0.006861223
## famhistPresent typea obesity alcohol age
## 0.787332760 0.049868697 -0.029012118 0.001843950 0.054362751
exp(GLM$coefficients)
## (Intercept) sbp tobacco ldl adiposity
## 0.0006967138 1.0040966600 1.0832872425 1.1697143451 1.0068848155
## famhistPresent typea obesity alcohol age
## 2.1975272692 1.0511330701 0.9714046926 1.0018456513 1.0558675494
Predict values.
pred = predict(GLM, newdata = Xtest, type = "response")
Assess the performance.
Ytestpred = (pred > 0.5)*1
tab = table(Ytestpred, Ytest)
tab
## Ytest
## Ytestpred 0 1
## 0 50 16
## 1 9 18
Accuracy, sensitivity and specificity.
(tab[1,1] + tab[2,2])/sum(tab)
## [1] 0.7311828
tab[2,2]/(tab[1,2] + tab[2,2]) # caret::sensitivity(tab[2:1,2:1])
## [1] 0.5294118
tab[1,1]/(tab[1,1] + tab[2,1]) # caret::specificity(tab[2:1,2:1])
## [1] 0.8474576
Change the threshold.
Ytestpred = (pred > 0.8)*1
tab = table(Ytestpred, Ytest)
tab
## Ytest
## Ytestpred 0 1
## 0 58 32
## 1 1 2
Accuracy, sensitivity and specificity.
(tab[1,1] + tab[2,2])/sum(tab)
## [1] 0.6451613
tab[2,2]/(tab[1,2] + tab[2,2]) # caret::sensitivity(tab[2:1,2:1])
## [1] 0.05882353
tab[1,1]/(tab[1,1] + tab[2,1]) # caret::specificity(tab[2:1,2:1])
## [1] 0.9830508
Plot ROC and compute AUCROC.
library(pROC)
ROC = roc(response = Ytest, predictor = pred)
ggroc(ROC)+
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color = "grey", linetype = "dashed")
auc(ROC)
## Area under the curve: 0.7692
Use caret
package for cross-validation and
glmnet
package for sparsity.
Do we need to standardize the data?
summary(X)
## sbp tobacco ldl adiposity
## Min. :101.0 Min. : 0.0000 Min. : 0.980 Min. : 6.74
## 1st Qu.:124.0 1st Qu.: 0.0525 1st Qu.: 3.283 1st Qu.:19.77
## Median :134.0 Median : 2.0000 Median : 4.340 Median :26.11
## Mean :138.3 Mean : 3.6356 Mean : 4.740 Mean :25.41
## 3rd Qu.:148.0 3rd Qu.: 5.5000 3rd Qu.: 5.790 3rd Qu.:31.23
## Max. :218.0 Max. :31.2000 Max. :15.330 Max. :42.49
## famhist typea obesity alcohol
## Length:462 Min. :13.0 Min. :14.70 Min. : 0.00
## Class :character 1st Qu.:47.0 1st Qu.:22.98 1st Qu.: 0.51
## Mode :character Median :53.0 Median :25.80 Median : 7.51
## Mean :53.1 Mean :26.04 Mean : 17.04
## 3rd Qu.:60.0 3rd Qu.:28.50 3rd Qu.: 23.89
## Max. :78.0 Max. :46.58 Max. :147.19
## age
## Min. :15.00
## 1st Qu.:31.00
## Median :45.00
## Mean :42.82
## 3rd Qu.:55.00
## Max. :64.00
X = X %>% mutate(famhist = as.numeric(ifelse(famhist == "Present", 1, 0)))
Xn = scale(X, scale = T, center = F)
PCA embedding.
PCA = prcomp(as.matrix(X))
autoplot(PCA, data = data %>% mutate(chd = as.factor(chd)), colour = 'chd')
Split into train and test again.
Xtrain = Xn[train,]
Xtest = Xn[-train,]
Fit the model.
library(class)
KNN = knn(train = Xtrain, test = Xtest, cl = Ytrain, k = 20, prob = TRUE)
Ptestpred = attributes(KNN)$prob
Ytestpred = (Ptestpred > 0.5)*1
Measure the performance.
tab = table(Ytestpred, Ytest)
tab
## Ytest
## Ytestpred 0 1
## 0 1 0
## 1 58 34
Accuracy, sensitivity and specificity.
(tab[1,1] + tab[2,2])/sum(tab)
tab[2,2]/(tab[1,2] + tab[2,2]) # caret::sensitivity(tab[2:1,2:1])
tab[1,1]/(tab[1,1] + tab[2,1]) # caret::specificity(tab[2:1,2:1])
Compute AUCROC.
ROC = roc(response = Ytest, predictor = Ptestpred)
ggroc(ROC)+
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color = "grey", linetype = "dashed")
auc(ROC)
## Area under the curve: 0.6548
Vary the number of neighbors.
ks = 1:20
accs = c()
for(k in ks){
Ytestpred = knn(train = Xtrain, test = Xtest, cl = Ytrain, k = k)
tab = table(Ytestpred, Ytest)
accs = c(accs, (tab[1,1] + tab[2,2])/sum(tab))
}
df = data.frame(accuracy = accs, k = ks)
ggplot(df, aes(k, accuracy))+
geom_point()+
geom_line()