library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)

South African Heart Disease data

A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of CHD. Many of the CHD positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their CHD event. In some cases the measurements were made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.

Load the data. There are 10 variables.

sbp: systolic blood pressure

tobacco: cumulative tobacco (kg)

ldl: low density lipoprotein cholesterol

adiposity: adiposity

famhist: family history of heart disease (Present, Absent)

typea: type-A behavior

obesity: obesity

alcohol: current alcohol consumption

age: age at onset

chd: response, coronary heart disease

library(GGally)
# Load the SA heart data from the ESL site and drop the row-name column the
# file ships with. Use TRUE, not the reassignable alias T.
data <- read.csv(
  "https://hastie.su.domains/ElemStatLearn/datasets/SAheart.data",
  header = TRUE, sep = ","
) %>%
  select(-row.names)
head(data)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1
## 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1
## 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0
## 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1
## 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1
## 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 0
dim(data)
## [1] 462  10
table(data$chd)
## 
##   0   1 
## 302 160
# Separate the nine predictors from the binary response chd.
X <- data %>% select(-chd)
Y <- data %>% pull(chd)
n <- nrow(X)
p <- ncol(X)

Pairwise plots.

# Pairwise scatter/density plots of all predictors, coloured by CHD status.
ggpairs(data, columns = seq_len(ncol(data) - 1), aes(colour = as.factor(chd)))

Age vs chd plot. Why not just using linear regression?

# A straight-line fit is unbounded, so its fitted values are not probabilities.
ggplot(data, aes(age, chd)) +
  geom_point() +
  geom_smooth(method = "lm")

# A logistic fit keeps fitted probabilities inside (0, 1).
ggplot(data, aes(age, chd)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))

Split the data into train and test sets.

# Reproducible 80/20 split of the row indices. Without set.seed() every knit
# produces a different split and none of the echoed results can be reproduced.
set.seed(1)
train <- sample(n, size = floor(0.8 * n))
Xtrain <- X[train, ]
Ytrain <- Y[train]
Xtest <- X[-train, ]
Ytest <- Y[-train]

Fit logistic regression.

# Logistic regression of chd on all nine risk factors, fit on the training
# rows only. Coefficients are on the log-odds scale.
GLM <- glm(chd ~ ., data = data[train, ], family = "binomial")
summary(GLM)
## 
## Call:
## glm(formula = chd ~ ., family = "binomial", data = data[train, 
##     ])
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -7.269136   1.529839  -4.752 2.02e-06 ***
## sbp             0.004088   0.006609   0.619 0.536210    
## tobacco         0.080000   0.029607   2.702 0.006892 ** 
## ldl             0.156760   0.067582   2.320 0.020366 *  
## adiposity       0.006861   0.034321   0.200 0.841547    
## famhistPresent  0.787333   0.257567   3.057 0.002237 ** 
## typea           0.049869   0.014619   3.411 0.000647 ***
## obesity        -0.029012   0.050846  -0.571 0.568277    
## alcohol         0.001844   0.004782   0.386 0.699776    
## age             0.054363   0.013982   3.888 0.000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 473.8  on 368  degrees of freedom
## Residual deviance: 371.9  on 359  degrees of freedom
## AIC: 391.9
## 
## Number of Fisher Scoring iterations: 5
GLM$coefficients
##    (Intercept)            sbp        tobacco            ldl      adiposity 
##   -7.269135828    0.004088292    0.080000161    0.156759570    0.006861223 
## famhistPresent          typea        obesity        alcohol            age 
##    0.787332760    0.049868697   -0.029012118    0.001843950    0.054362751
# Exponentiated coefficients are odds ratios per one-unit increase.
exp(GLM$coefficients)
##    (Intercept)            sbp        tobacco            ldl      adiposity 
##   0.0006967138   1.0040966600   1.0832872425   1.1697143451   1.0068848155 
## famhistPresent          typea        obesity        alcohol            age 
##   2.1975272692   1.0511330701   0.9714046926   1.0018456513   1.0558675494

Predict values.

# Predicted probabilities P(chd = 1) for the held-out rows.
pred <- predict(GLM, newdata = Xtest, type = "response")

Assess the performance.

# Classify with the conventional 0.5 cutoff and cross-tabulate predictions
# (rows) against observed test labels (columns).
Ytestpred <- as.integer(pred > 0.5)
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0 50 16
##         1  9 18

Accuracy, sensitivity and specificity.

# Accuracy: proportion of test cases on the diagonal of the confusion matrix.
sum(diag(tab)) / sum(tab)
## [1] 0.7311828
# Sensitivity (true positive rate) — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
## [1] 0.5294118
# Specificity (true negative rate) — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])
## [1] 0.8474576

Change the threshold.

# Raising the cutoff to 0.8 trades sensitivity for specificity: far fewer
# cases are called positive.
Ytestpred <- as.integer(pred > 0.8)
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0 58 32
##         1  1  2

Accuracy, sensitivity and specificity.

# Accuracy at the 0.8 cutoff.
sum(diag(tab)) / sum(tab)
## [1] 0.6451613
# Sensitivity — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
## [1] 0.05882353
# Specificity — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])
## [1] 0.9830508

Plot ROC and compute AUCROC.

library(pROC)
# ROC curve sweeps the threshold over [0, 1]; the dashed diagonal marks a
# no-skill classifier (AUC = 0.5).
ROC <- roc(response = Ytest, predictor = pred)
ggroc(ROC) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed")

auc(ROC)
## Area under the curve: 0.7692

Use caret package for cross-validation and glmnet package for sparsity.

K-nearest neighbors

Do we need to standardize the data?

summary(X)
##       sbp           tobacco             ldl           adiposity    
##  Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
##  1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
##  Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
##  Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
##  3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
##  Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
##    famhist              typea         obesity         alcohol      
##  Length:462         Min.   :13.0   Min.   :14.70   Min.   :  0.00  
##  Class :character   1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51  
##  Mode  :character   Median :53.0   Median :25.80   Median :  7.51  
##                     Mean   :53.1   Mean   :26.04   Mean   : 17.04  
##                     3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89  
##                     Max.   :78.0   Max.   :46.58   Max.   :147.19  
##       age       
##  Min.   :15.00  
##  1st Qu.:31.00  
##  Median :45.00  
##  Mean   :42.82  
##  3rd Qu.:55.00  
##  Max.   :64.00
# Encode famhist numerically (Present = 1, Absent = 0) so every column is
# numeric before computing distances; the logical comparison already yields
# 0/1, so no ifelse() is needed.
X <- X %>% mutate(famhist = as.numeric(famhist == "Present"))
# Scale the features for KNN. Use TRUE/FALSE, never the reassignable T/F.
# With center = FALSE, scale() divides each column by its root-mean-square
# rather than its standard deviation.
Xn <- scale(X, scale = TRUE, center = FALSE)

PCA embedding.

# PCA of the (numerically encoded, unscaled) feature matrix; the first two
# principal components are plotted with points coloured by CHD status.
PCA <- prcomp(as.matrix(X))
autoplot(PCA, data = data %>% mutate(chd = as.factor(chd)), colour = 'chd')

Split into train and test again.

# Re-split the *scaled* features, reusing the same train indices as before.
Xtrain <- Xn[train, ]
Xtest <- Xn[-train, ]

Fit the model.

library(class)
# 20-nearest-neighbours classifier on the scaled features.
KNN <- knn(train = Xtrain, test = Xtest, cl = Ytrain, k = 20, prob = TRUE)
# BUG FIX: the "prob" attribute of knn() is the vote share of the *winning*
# class, so it is always >= 0.5 and thresholding it at 0.5 labels (almost)
# every point as the winning class. Convert it to P(chd = 1) first.
winprob <- attr(KNN, "prob")
Ptestpred <- ifelse(KNN == "1", winprob, 1 - winprob)
Ytestpred <- as.integer(Ptestpred > 0.5)

Measure the performance.

# Confusion matrix for the 20-NN classifier on the test set.
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0  1  0
##         1 58 34

Accuracy, sensitivity and specificity.

# Accuracy: diagonal of the confusion matrix over all test cases.
sum(diag(tab)) / sum(tab)
# Sensitivity — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
# Specificity — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])

Compute AUCROC.

# ROC for the KNN scores; dashed diagonal marks a no-skill classifier.
ROC <- roc(response = Ytest, predictor = Ptestpred)
ggroc(ROC) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed")

auc(ROC)
## Area under the curve: 0.6548
## Area under the curve: 0.6548

Vary the number of neighbors.

# Test-set accuracy of KNN for k = 1..20.
ks <- 1:20
accs <- numeric(length(ks))  # preallocate instead of growing with c()
for (i in seq_along(ks)) {
  Ytestpred <- knn(train = Xtrain, test = Xtest, cl = Ytrain, k = ks[i])
  # mean(pred == truth) equals (tab[1,1] + tab[2,2]) / sum(tab) but, unlike
  # tab[2, 2], cannot fail with a subscript error when one class is never
  # predicted (table() would then have a single row).
  accs[i] <- mean(Ytestpred == Ytest)
}
df <- data.frame(accuracy = accs, k = ks)
ggplot(df, aes(k, accuracy)) +
  geom_point() +
  geom_line()