################################################################################# ################################################################################# ### Principal Component Analysis ### Auto data set ### Barbora Hladka, Martin Holub ### http://ufal.mff.cuni.cz/course/npfl054 ################################################################################# ################################################################################# library(ISLR) ################ ## EXERCISE #1: Read and explore data ################ a <- Auto[c("mpg", "cylinders", "horsepower", "weight")] apply(a, 2, var) apply(a, 2, mean) plot(a) # scatter plot matrix cov(a) # covariation matrix cor(a) # correlation matrix ########## ## EXERCISE #2: Principal Component Analysis ########## ## scaling pca <- prcomp(a, scale = TRUE) names(pca) # [1] "sdev" "rotation" "center" "scale" "x" summary(pca) # coefficients of the components pca$rotation # loadings pca$center # mean values pca$scale pca$x # scores boxplot(pca$x) pca$sdev # standard deviation of each PC pca.var <- pca$sdev^2 # variance explained by each PC pca.var sum(pca.var) # total variance explained by all PCs # Proportional Variance Explained by each PC (PVE) pve <- pca.var/sum(pca.var) plot(pve, xlab = "Principal Component", main = "Scree plot: Auto data set", ylab = "Proportion of Variance Explained", ylim = c(0,1), type = 'b') plot(cumsum(pve), xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", main = "Scree plot: Auto data set", ylim = c(0,1), type = 'b') # biplot # loadings: how strongly each feature influences a principal component # the angles between the vectors show how features correlate with one another # when two vectors are close (small angle), the two features are positively correlated # if the form a large angle, they are negative correlated # bottom axis: PC1 score # left axis: PC2 score # top axis: loadings on PC1 # right exis: loadings on PC2 biplot(pca, scale = T, # the arrows are scaled to represent the loadings main = "Biplot: scaled Auto data set", xlab = "First Principal Component", ylab = "Second Principal Component") # biplot with the car names biplot(pca, scale = T, xlabs = Auto$name, main = "Biplot: scaled Auto data set", xlab = "First Principal Component", ylab = "Second Principal Component") # unscaling features pca.un <- prcomp(a, scale = FALSE) biplot(pca.un, scale = T, xlabs = rep("O", nrow(a)), main = "Biplot: unscaled Auto data set", xlab = "First Principal Component", ylab = "Second Principal Component")