library(ggplot2)
library(dplyr)
library(viridis)
library(ggfortify)
library(factoextra)

South African Heart Disease data

A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of CHD. Many of the CHD positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their CHD event. In some cases the measurements were made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.

Load the data. There are 10 variables.

sbp: systolic blood pressure

tobacco: cumulative tobacco (kg)

ldl: low density lipoprotein cholesterol

adiposity: adiposity

famhist: family history of heart disease (Present, Absent)

typea: type-A behavior

obesity: obesity

alcohol: current alcohol consumption

age: age at onset

chd: response, coronary heart disease

library(GGally)
# Load the SA heart data from the ESL site and drop the row-name column the
# file ships with. Use TRUE, not the reassignable alias T.
data <- read.csv(
  "https://hastie.su.domains/ElemStatLearn/datasets/SAheart.data",
  header = TRUE, sep = ","
) %>%
  select(-row.names)
head(data)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1
## 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1
## 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0
## 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1
## 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1
## 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 0
dim(data)
## [1] 462  10
table(data$chd)
## 
##   0   1 
## 302 160
# Separate the nine predictors from the binary response chd.
X <- data %>% select(-chd)
Y <- data %>% pull(chd)
n <- nrow(X)
p <- ncol(X)

Pairwise plots.

# Pairwise scatter/density plots of all predictors, coloured by CHD status.
ggpairs(data, columns = seq_len(ncol(data) - 1), aes(colour = as.factor(chd)))

Age vs chd plot. Why not just using linear regression?

# A straight-line fit is unbounded, so its fitted values are not probabilities.
ggplot(data, aes(age, chd)) +
  geom_point() +
  geom_smooth(method = "lm")

# A logistic fit keeps fitted probabilities inside (0, 1).
ggplot(data, aes(age, chd)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))

Split the data into train and test sets.

# Reproducible 80/20 split of the row indices. Without set.seed() every knit
# produces a different split and none of the echoed results can be reproduced.
set.seed(1)
train <- sample(n, size = floor(0.8 * n))
Xtrain <- X[train, ]
Ytrain <- Y[train]
Xtest <- X[-train, ]
Ytest <- Y[-train]

Fit logistic regression.

# Logistic regression of chd on all nine risk factors, fit on the training
# rows only. Coefficients are on the log-odds scale.
GLM <- glm(chd ~ ., data = data[train, ], family = "binomial")
summary(GLM)
## 
## Call:
## glm(formula = chd ~ ., family = "binomial", data = data[train, 
##     ])
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -7.269136   1.529839  -4.752 2.02e-06 ***
## sbp             0.004088   0.006609   0.619 0.536210    
## tobacco         0.080000   0.029607   2.702 0.006892 ** 
## ldl             0.156760   0.067582   2.320 0.020366 *  
## adiposity       0.006861   0.034321   0.200 0.841547    
## famhistPresent  0.787333   0.257567   3.057 0.002237 ** 
## typea           0.049869   0.014619   3.411 0.000647 ***
## obesity        -0.029012   0.050846  -0.571 0.568277    
## alcohol         0.001844   0.004782   0.386 0.699776    
## age             0.054363   0.013982   3.888 0.000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 473.8  on 368  degrees of freedom
## Residual deviance: 371.9  on 359  degrees of freedom
## AIC: 391.9
## 
## Number of Fisher Scoring iterations: 5
GLM$coefficients
##    (Intercept)            sbp        tobacco            ldl      adiposity 
##   -7.269135828    0.004088292    0.080000161    0.156759570    0.006861223 
## famhistPresent          typea        obesity        alcohol            age 
##    0.787332760    0.049868697   -0.029012118    0.001843950    0.054362751
# Exponentiated coefficients are odds ratios per one-unit increase.
exp(GLM$coefficients)
##    (Intercept)            sbp        tobacco            ldl      adiposity 
##   0.0006967138   1.0040966600   1.0832872425   1.1697143451   1.0068848155 
## famhistPresent          typea        obesity        alcohol            age 
##   2.1975272692   1.0511330701   0.9714046926   1.0018456513   1.0558675494

Predict values.

# Predicted probabilities P(chd = 1) for the held-out rows.
pred <- predict(GLM, newdata = Xtest, type = "response")

Assess the performance.

# Classify with the conventional 0.5 cutoff and cross-tabulate predictions
# (rows) against observed test labels (columns).
Ytestpred <- as.integer(pred > 0.5)
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0 50 16
##         1  9 18

Accuracy, sensitivity and specificity.

# Accuracy: proportion of test cases on the diagonal of the confusion matrix.
sum(diag(tab)) / sum(tab)
## [1] 0.7311828
# Sensitivity (true positive rate) — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
## [1] 0.5294118
# Specificity (true negative rate) — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])
## [1] 0.8474576

Change the threshold.

# Raising the cutoff to 0.8 trades sensitivity for specificity: far fewer
# cases are called positive.
Ytestpred <- as.integer(pred > 0.8)
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0 58 32
##         1  1  2

Accuracy, sensitivity and specificity.

# Accuracy at the 0.8 cutoff.
sum(diag(tab)) / sum(tab)
## [1] 0.6451613
# Sensitivity — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
## [1] 0.05882353
# Specificity — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])
## [1] 0.9830508

Plot ROC and compute AUCROC.

library(pROC)
# ROC curve sweeps the threshold over [0, 1]; the dashed diagonal marks a
# no-skill classifier (AUC = 0.5).
ROC <- roc(response = Ytest, predictor = pred)
ggroc(ROC) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed")

auc(ROC)
## Area under the curve: 0.7692

Use caret package for cross-validation and glmnet package for sparsity.

K-nearest neighbors

Do we need to standardize the data?

summary(X)
##       sbp           tobacco             ldl           adiposity    
##  Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
##  1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
##  Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
##  Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
##  3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
##  Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
##    famhist              typea         obesity         alcohol      
##  Length:462         Min.   :13.0   Min.   :14.70   Min.   :  0.00  
##  Class :character   1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51  
##  Mode  :character   Median :53.0   Median :25.80   Median :  7.51  
##                     Mean   :53.1   Mean   :26.04   Mean   : 17.04  
##                     3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89  
##                     Max.   :78.0   Max.   :46.58   Max.   :147.19  
##       age       
##  Min.   :15.00  
##  1st Qu.:31.00  
##  Median :45.00  
##  Mean   :42.82  
##  3rd Qu.:55.00  
##  Max.   :64.00
# Encode famhist numerically (Present = 1, Absent = 0) so every column is
# numeric before computing distances; the logical comparison already yields
# 0/1, so no ifelse() is needed.
X <- X %>% mutate(famhist = as.numeric(famhist == "Present"))
# Scale the features for KNN. Use TRUE/FALSE, never the reassignable T/F.
# With center = FALSE, scale() divides each column by its root-mean-square
# rather than its standard deviation.
Xn <- scale(X, scale = TRUE, center = FALSE)

PCA embedding.

# PCA of the (numerically encoded, unscaled) feature matrix; the first two
# principal components are plotted with points coloured by CHD status.
PCA <- prcomp(as.matrix(X))
autoplot(PCA, data = data %>% mutate(chd = as.factor(chd)), colour = 'chd')

Split into train and test again.

# Re-split the *scaled* features, reusing the same train indices as before.
Xtrain <- Xn[train, ]
Xtest <- Xn[-train, ]

Fit the model.

library(class)
# 20-nearest-neighbours classifier on the scaled features.
KNN <- knn(train = Xtrain, test = Xtest, cl = Ytrain, k = 20, prob = TRUE)
# BUG FIX: the "prob" attribute of knn() is the vote share of the *winning*
# class, so it is always >= 0.5 and thresholding it at 0.5 labels (almost)
# every point as the winning class. Convert it to P(chd = 1) first.
winprob <- attr(KNN, "prob")
Ptestpred <- ifelse(KNN == "1", winprob, 1 - winprob)
Ytestpred <- as.integer(Ptestpred > 0.5)

Measure the performance.

# Confusion matrix for the 20-NN classifier on the test set.
tab <- table(Ytestpred, Ytest)
tab
##          Ytest
## Ytestpred  0  1
##         0  1  0
##         1 58 34

Accuracy, sensitivity and specificity.

# Accuracy: diagonal of the confusion matrix over all test cases.
sum(diag(tab)) / sum(tab)
# Sensitivity — cf. caret::sensitivity(tab[2:1, 2:1])
tab[2, 2] / sum(tab[, 2])
# Specificity — cf. caret::specificity(tab[2:1, 2:1])
tab[1, 1] / sum(tab[, 1])

Compute AUCROC.

# ROC for the KNN scores; dashed diagonal marks a no-skill classifier.
ROC <- roc(response = Ytest, predictor = Ptestpred)
ggroc(ROC) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed")

auc(ROC)
## Area under the curve: 0.6548
## Area under the curve: 0.6548

Vary the number of neighbors.

# Test-set accuracy of KNN for k = 1..20.
ks <- 1:20
accs <- numeric(length(ks))  # preallocate instead of growing with c()
for (i in seq_along(ks)) {
  Ytestpred <- knn(train = Xtrain, test = Xtest, cl = Ytrain, k = ks[i])
  # mean(pred == truth) equals (tab[1,1] + tab[2,2]) / sum(tab) but, unlike
  # tab[2, 2], cannot fail with a subscript error when one class is never
  # predicted (table() would then have a single row).
  accs[i] <- mean(Ytestpred == Ytest)
}
df <- data.frame(accuracy = accs, k = ks)
ggplot(df, aes(k, accuracy)) +
  geom_point() +
  geom_line()