#################################################################################
#################################################################################

### SVM, PCA

### Protein Ligandability Recognition

### Classification

### Barbora Hladka, Martin Holub

### MFF UK, 2018/19

### http://ufal.mff.cuni.cz/course/npfl054

#################################################################################
#################################################################################

library(e1071)	    # SVM

###############

library(crossval) # evaluation

# basic performance binary classification measures
perf <- function(true, pred, neg) {
# true -- true prediction
# pred -- predicted output
# neg -- negative class
	cm <- confusionMatrix(true, pred, neg)
	acc <- diagnosticErrors(cm)[[1]]
	p <- diagnosticErrors(cm)[[4]]
	r <- diagnosticErrors(cm)[[2]]
	f1 <- 2*p*r/(p +r)
# accuracy -- acc = (TP+TN)/(FP+TN+TP+FN)
# sensitivity/recall -- sens = TP/(TP+FN) [true positive rate]
# specificity --  spec = TN/(FP+TN) [true negative rate]
# precision -- ppv = TP/(FP+TP) [positive predicted value]
# npv = TN/(TN+FN) [negative predicted value]
# lor = log(TP*TN/(FN*FP)) [diagnostic odds ratio (TP/FP)/(FN/TN)]
	
# output Accuracy, Precision, Recall, F1-measure
	return ( round(c(acc, p, r, f1),2) )
}

################
## EXERCISE #1: Read the PLR data for classificatin
################

## classification
# Download https://ufal.mff.cuni.cz/courses/npfl054/materials -> PLR --> set.C.zip
data <- read.csv('plr.dataset.C.development.csv', sep="\t", header = T)
# target attribute 'class'

# unique proteins
p <- unique(sort(data$protein_id))

set.seed(123)
# set number of test proteins
te.p <- 3
# get number of train proteins
tr.p <- length(p) - te.p
# get train protein indeces
tr.i <- sort(sample(p, tr.p))
# get developmnet test protein indeces
te.i <- sort(setdiff(p, tr.i))

# get train and developmnet test proteins
train <- data[data$protein_id %in% tr.i,]
test <- data[data$protein_id %in% te.i,]

# feature scaling
train[,2:36] <- scale(train[,2:36])
test[,2:36] <- scale(test[,2:36])
train$class <- as.factor(train$class)
test$class <- as.factor(test$class)
train <- train[,-1] # exclude protein_id
test <- test[,-1] # exclude protein_id

y.train <- train$class
y.test <- test$class

################
## EXERCISE #2: SVM
################
# training
m1.svm <-svm(formula = class ~ .,
                 data = train,
                 type = 'C-classification',
                 kernel = 'linear')

# evaluation
test.pred.1 <- predict(m1.svm, newdata = test[-36])
cm.1 <- table(y.test, test.pred.1)
perf.1 <- perf(y.test, test.pred.1, "0")

################
## EXERCISE #3: Principal Component Analysis
################
pca <- prcomp(train[,-36])
summary(pca)

################
## EXERCISE #4: SVM with PCA
################
# data representation using PCs
train <- predict(pca, train)
test <- predict(pca, test)

# select PCs to work with 
# e.g. 18 PCs that explain 98,8% of total variance
pcaComp = 18
train <- as.data.frame(train[,1:pcaComp])
train$class <- y.train
test <- as.data.frame(test[,1:pcaComp])
test$class <- y.test

# training SVM
m2.svm <-svm(formula = class ~ .,
                 data = train,
                 type = 'C-classification',
                 kernel = 'linear')

# evaluation
test.pred.2 <- predict(m2.svm, newdata = test[1:pcaComp])
cm.2 <- table(y.test, test.pred.2)
perf.2 <- perf(y.test, test.pred.2, "0")

# compare perf.1 and perf.2
# be careful when interpreting the results
# since we have not analyzed the original features yet