################################################################################# ################################################################################# ### SVM, PCA ### Protein Ligandability Recognition ### Classification ### Barbora Hladka, Martin Holub ### MFF UK, 2018/19 ### http://ufal.mff.cuni.cz/course/npfl054 ################################################################################# ################################################################################# library(e1071) # SVM ############### library(crossval) # evaluation # basic performance binary classification measures perf <- function(true, pred, neg) { # true -- true prediction # pred -- predicted output # neg -- negative class cm <- confusionMatrix(true, pred, neg) acc <- diagnosticErrors(cm)[[1]] p <- diagnosticErrors(cm)[[4]] r <- diagnosticErrors(cm)[[2]] f1 <- 2*p*r/(p +r) # accuracy -- acc = (TP+TN)/(FP+TN+TP+FN) # sensitivity/recall -- sens = TP/(TP+FN) [true positive rate] # specificity -- spec = TN/(FP+TN) [true negative rate] # precision -- ppv = TP/(FP+TP) [positive predicted value] # npv = TN/(TN+FN) [negative predicted value] # lor = log(TP*TN/(FN*FP)) [diagnostic odds ratio (TP/FP)/(FN/TN)] # output Accuracy, Precision, Recall, F1-measure return ( round(c(acc, p, r, f1),2) ) } ################ ## EXERCISE #1: Read the PLR data for classificatin ################ ## classification # Download https://ufal.mff.cuni.cz/courses/npfl054/materials -> PLR --> set.C.zip data <- read.csv('plr.dataset.C.development.csv', sep="\t", header = T) # target attribute 'class' # unique proteins p <- unique(sort(data$protein_id)) set.seed(123) # set number of test proteins te.p <- 3 # get number of train proteins tr.p <- length(p) - te.p # get train protein indeces tr.i <- sort(sample(p, tr.p)) # get developmnet test protein indeces te.i <- sort(setdiff(p, tr.i)) # get train and developmnet test proteins train <- data[data$protein_id %in% tr.i,] test <- data[data$protein_id %in% te.i,] # feature scaling train[,2:36] <- scale(train[,2:36]) test[,2:36] <- scale(test[,2:36]) train$class <- as.factor(train$class) test$class <- as.factor(test$class) train <- train[,-1] # exclude protein_id test <- test[,-1] # exclude protein_id y.train <- train$class y.test <- test$class ################ ## EXERCISE #2: SVM ################ # training m1.svm <-svm(formula = class ~ ., data = train, type = 'C-classification', kernel = 'linear') # evaluation test.pred.1 <- predict(m1.svm, newdata = test[-36]) cm.1 <- table(y.test, test.pred.1) perf.1 <- perf(y.test, test.pred.1, "0") ################ ## EXERCISE #3: Principal Component Analysis ################ pca <- prcomp(train[,-36]) summary(pca) ################ ## EXERCISE #4: SVM with PCA ################ # data representation using PCs train <- predict(pca, train) test <- predict(pca, test) # select PCs to work with # e.g. 18 PCs that explain 98,8% of total variance pcaComp = 18 train <- as.data.frame(train[,1:pcaComp]) train$class <- y.train test <- as.data.frame(test[,1:pcaComp]) test$class <- y.test # training SVM m2.svm <-svm(formula = class ~ ., data = train, type = 'C-classification', kernel = 'linear') # evaluation test.pred.2 <- predict(m2.svm, newdata = test[1:pcaComp]) cm.2 <- table(y.test, test.pred.2) perf.2 <- perf(y.test, test.pred.2, "0") # compare perf.1 and perf.2 # be careful when interpreting the results # since we have not analyzed the original features yet