## SVM modelling
# https://archive.ics.uci.edu/ml/datasets/letter+recognition

## input data
## the data file has no header row, so read it with header = FALSE
## (the default header = TRUE would silently turn the first record into column names)
d <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
              header = FALSE)
names(d)[1] <- "T"   # first column is the letter class; keep the name "T" used below
str(d)
d$T <- as.factor(d$T)
table(d$T)

## split data into training, devel and eval test subsets
## use train and devel for development
## use eval for final evaluation
train <- d[1:16000, ]
dtest <- d[16001:18000, ]
etest <- d[18001:20000, ]

library(e1071)

###############
## Train SVMs with linear and RBF kernels and predict on the devel test data
###############

## linear kernel
lin <- tune.svm(train[, -1], train$T,
                kernel = "linear",
                cost = 10^(1:2),   # wider grid: cost = 10^(-1:2)
                type = "C-classification")
summary(lin)
# best model and its tuned cost
m1 <- lin$best.model
lin.cost <- lin$best.model$cost
# predict on the devel test set using the best model
p1 <- predict(m1, dtest[, -1])
mean(p1 == dtest$T)   # devel accuracy, linear kernel

## RBF kernel
## wider grid: gamma = c(0.1, 1, 10), cost = 10^(-1:2)
rbf <- tune.svm(train[, -1], train$T,
                kernel = "radial",
                gamma = 1,
                cost = 10^(1:2),
                type = "C-classification")
summary(rbf)
# best model and its tuned parameters
m2 <- rbf$best.model
rbf.gamma <- rbf$best.model$gamma
rbf.cost <- rbf$best.model$cost
# predict on the devel test set using the best model
p2 <- predict(m2, dtest[, -1])
mean(p2 == dtest$T)   # devel accuracy, RBF kernel

###############
## Apply k-fold cross-validation
###############
# create 10 folds: shuffle the row indices of the training set
set.seed(1)   # fix the RNG seed so the folds are reproducible
cv.sample <- sample(1:nrow(train))
message("The number of all train data = ", length(cv.sample))
# pad with zeros so the length is a multiple of 10
add.zeros <- 10 - nrow(train) %% 10
if (add.zeros < 10) cv.sample <- c(cv.sample, rep(0, add.zeros))
message("\nAdjusted cv.sample: ", length(cv.sample))
message("Tail of cv.sample:")
print(tail(cv.sample, 10))

## make 10 folds: the matrix is filled column-wise, so each row is one fold
cv.index <- matrix(data = cv.sample, nrow = 10)
message("\nNumbers of data in folds:")
for (i in 1:10) print(length(cv.index[i, ][cv.index[i, ] > 0]))

########################################################################
message("\n***** Doing 10-fold cross validation *****\n")
## vectors with per-fold accuracies for the two models
lin.acc <- numeric(0)
rbf.acc <- numeric(0)
for (i in 1:10) {
  message(i, ". fold:")
  # the zeros are padding, not row indices, so drop them before subsetting
  cv.train <- train[-cv.index[i, ][cv.index[i, ] > 0], ]
  cv.test  <- train[ cv.index[i, ][cv.index[i, ] > 0], ]
  message("\t", "size(cv.train) = ", nrow(cv.train),
          "\tsize(cv.test) = ", nrow(cv.test))
  ## training with the parameters tuned above
  lin.class <- svm(cv.train[, -1], cv.train$T,
                   kernel = "linear",
                   type = "C-classification",
                   cost = lin.cost)
  rbf.class <- svm(cv.train[, -1], cv.train$T,
                   kernel = "radial",
                   type = "C-classification",
                   cost = rbf.cost,
                   gamma = rbf.gamma)
  ### prediction -- linear kernel
  p.lin <- predict(lin.class, cv.test[, -1])
  acc <- mean(p.lin == cv.test$T)
  lin.acc <- c(lin.acc, acc)
  message("\t", "lin accuracy = ", round(100 * acc, 2), "%")
  ### prediction -- RBF kernel
  p.rbf <- predict(rbf.class, cv.test[, -1])
  acc <- mean(p.rbf == cv.test$T)
  rbf.acc <- c(rbf.acc, acc)
  message("\t", "rbf accuracy = ", round(100 * acc, 2), "%")
}
message("\n***** Results of cross-validation process *****\n")
message("lin -- cross-validation accuracies:")
print(round(lin.acc, 3))
message("rbf -- cross-validation accuracies:")
print(round(rbf.acc, 3))

######## Which model is better: compare the worst-fold accuracies
min(lin.acc)
min(rbf.acc)

###############
### train a model with the best parameters on train+dtest and test on eval
# train+dtest
new.train <- rbind(train, dtest)
final.rbf <- svm(new.train[, -1], new.train$T,
                 kernel = "radial",
                 type = "C-classification",
                 cost = rbf.cost,
                 gamma = rbf.gamma)
# predict on the held-out eval set
final.p <- predict(final.rbf, etest[, -1])
mean(final.p == etest$T)   # final eval accuracy
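
## Optional diagnostics (an addition, not part of the original script): a
## confusion matrix on the eval set, assuming the objects final.p and etest
## created above. Rows are predictions, columns are true letters, so
## diag()/colSums() gives the per-letter recall.
conf <- table(predicted = final.p, actual = etest$T)
round(diag(conf) / colSums(conf), 3)   # per-class accuracy (recall)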
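
## Optional sketch (also an addition, not the original comparison): since
## lin.acc and rbf.acc are measured on the same 10 folds, a paired t-test is
## one way to check whether the accuracy gap is systematic rather than fold
## noise; treat the p-value as indicative only, because CV fold results are
## not fully independent.
t.test(lin.acc, rbf.acc, paired = TRUE)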
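
## Note: e1071's svm() can also report k-fold CV accuracy directly via its
## `cross` argument, a built-in alternative to the hand-rolled loop above.
## A minimal sketch, assuming the tuned rbf.cost and rbf.gamma from earlier:
cv.check <- svm(train[, -1], train$T,
                kernel = "radial",
                type = "C-classification",
                cost = rbf.cost, gamma = rbf.gamma,
                cross = 10)
summary(cv.check)   # prints the single-fold accuracies and their mean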