#################################################################################
#################################################################################
### R code for the H-4.1
### Support vector machine classifier
### Barbora Hladka, Martin Holub
### ESSLLI 2013
### http://ufal.mff.cuni.cz/mlnlpr13
#################################################################################
#################################################################################
### Task: Decide whether the given word pair forms a semantic collocation
###       "line" ("lines", "lined")
### Objects: Word pairs identified by two lemmas L1 and L2
### Target class: Class = {0,1}
### Features
## ----------------------------------------------------------------------------
## feature  type         definition
## A1       continuous   Pointwise mutual information
## A2       continuous   First Kulczynski coefficient
## A3       continuous   Unigram subtuples measure
## A4       continuous   Ngram word coocurrence
## A5       continuous   Reverse confusion probability
## A6       continuous   Reverse cross entropy
## A7       continuous   Right context phrasal entropy
## A8       continuous   Log frequency biased mutual dependency
## A9       continuous   Cosine context similarity in boolean vector space
## A10      continuous   Dice context similarity in tf.idf vector space
## A11      categorical  POS1:POS2
## Legend
## POS1  part of speech of L1
## POS2  part of speech of L2
#################################################################################

################################
### Getting example data     ###
################################

## Read a file with examples
examples <- read.table("../data/col.development.csv", header = TRUE)

## Get the number of input examples
num.examples <- nrow(examples)

## Set the number of training examples = 90% of all examples
num.train <- round(0.9 * num.examples)

## Set the number of test examples = 10% of all examples
num.test <- num.examples - num.train

## Check the numbers
num.examples
num.train
num.test

## Randomly split examples into training and test data using sample().
## Use set.seed() to be able to reconstruct the experiment with the SAME
## training and test sets.
set.seed(123)
s <- sample(num.examples)

### Get the training set
## First, generate indices of training examples
indices.train <- s[1:num.train]
## Second, get the training examples (all columns: Class and features A1..A11)
train <- examples[indices.train, ]

### Get the test set
indices.test <- s[(num.train + 1):num.examples]
test <- examples[indices.test, ]

## Check the results
str(train)
str(test)

## Kept from the original script: 'test.classes' is an exact duplicate of
## 'test' (all columns, same rows).  It is never used below.
test.classes <- examples[indices.test, ]

###################################
### Learning from training data ###
###################################

## Use the 'e1071' package
## ! Run install.packages("e1071"), if not installed.
## Load the package "e1071"
library(e1071)

## Helper: print the percentage of correctly predicted classes on a data set.
## pred      predicted classes
## truth     true classes
## n         number of examples in the set
## set.name  label used in the message ("test", "train", "smaller train")
acc.report <- function(pred, truth, n, set.name) {
  message("\nPercentage of the correctly predicted senses on the ", set.name,
          " set = ", round(100 * sum(pred == truth) / n, 2), "%")
}

message("\n\n#####Homework 4.1 #####")

###################################################################################
###################################################################################
message("\n\n All features")

## M9: linear SVM on the first 200 training examples, all 11 features, cost 10
M9 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
          type = "C-classification", data = train[1:200, ],
          kernel = "linear", cost = 10)
P9.test <- predict(M9, test)
P9.train <- predict(M9, train)
P9.train.200 <- predict(M9, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 10, data = train[1:200,] #####")
acc.report(P9.test, test$Class, num.test, "test")
acc.report(P9.train, train$Class, num.train, "train")
acc.report(P9.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## M10: as M9, but with cost 100
M10 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 100)
P10.test <- predict(M10, test)
P10.train <- predict(M10, train)
P10.train.200 <- predict(M10, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 100, data = train[1:200,] #####")
acc.report(P10.test, test$Class, num.test, "test")
acc.report(P10.train, train$Class, num.train, "train")
acc.report(P10.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## Grid-search gamma and cost with 10-fold cross-validation.
## NOTE(review): tune.svm() uses the default (radial) kernel here, while the
## model below is fit with kernel='linear', for which gamma is ignored —
## confirm this mismatch is intentional (kept as in the original exercise).
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
                 data = train[1:200, ],
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M11: linear SVM with the tuned cost (gamma has no effect for a linear kernel)
M11 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = best.cost, gamma = best.gamma)
P11.test <- predict(M11, test)
P11.train <- predict(M11, train)
P11.train.200 <- predict(M11, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train[1:200,] #####")
acc.report(P11.test, test$Class, num.test, "test")
acc.report(P11.train, train$Class, num.train, "train")
acc.report(P11.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
message("\n\n All numerical features")

## M12: linear SVM, numerical features A1..A10 only, cost 10
M12 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 10)
P12.test <- predict(M12, test)
P12.train <- predict(M12, train)
P12.train.200 <- predict(M12, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 10, data = train[1:200,] #####")
acc.report(P12.test, test$Class, num.test, "test")
acc.report(P12.train, train$Class, num.train, "train")
acc.report(P12.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## M13: as M12, but with cost 100.
## (The original formula listed A10 twice; the duplicate term is removed —
## R's formula handling deduplicates it anyway, so the fit is unchanged.)
M13 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 100)
P13.test <- predict(M13, test)
P13.train <- predict(M13, train)
P13.train.200 <- predict(M13, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 100, data = train[1:200,] #####")
acc.report(P13.test, test$Class, num.test, "test")
acc.report(P13.train, train$Class, num.train, "train")
acc.report(P13.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## Grid-search again, numerical features only (same radial/linear caveat as above)
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
                 data = train[1:200, ],
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M14: linear SVM with the tuned cost, numerical features only
M14 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = best.cost, gamma = best.gamma)
P14.test <- predict(M14, test)
P14.train <- predict(M14, train)
P14.train.200 <- predict(M14, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train[1:200,] #####")
acc.report(P14.test, test$Class, num.test, "test")
acc.report(P14.train, train$Class, num.train, "train")
acc.report(P14.train.200, train[1:200, ]$Class, 200, "smaller train")

##############################################################
message("\n\nAll training data")

## M9.t: all 11 features, cost 10, full training set
M9.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
            type = "C-classification", data = train,
            kernel = "linear", cost = 10)
P9.t.test <- predict(M9.t, test)
P9.t.train <- predict(M9.t, train)
message("\n\n##### SVM: kernel='linear', cost = 10, data = train #####")
acc.report(P9.t.test, test$Class, num.test, "test")
acc.report(P9.t.train, train$Class, num.train, "train")

## M12.t: numerical features only, cost 10, full training set
M12.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = 10)
P12.t.test <- predict(M12.t, test)
P12.t.train <- predict(M12.t, train)
message("\n\n##### SVM: kernel='linear', cost = 10, data = train #####")
acc.report(P12.t.test, test$Class, num.test, "test")
acc.report(P12.t.train, train$Class, num.train, "train")

##############################################################
## M10.t: all 11 features, cost 100, full training set
M10.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
             type = "C-classification", data = train,
             kernel = "linear", cost = 100)
P10.t.test <- predict(M10.t, test)
P10.t.train <- predict(M10.t, train)
message("\n\n##### SVM: kernel='linear', cost = 100, data = train #####")
acc.report(P10.t.test, test$Class, num.test, "test")
acc.report(P10.t.train, train$Class, num.train, "train")

## M13.t: numerical features only, cost 100, full training set
M13.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = 100)
P13.t.test <- predict(M13.t, test)
P13.t.train <- predict(M13.t, train)
message("\n\n##### SVM: kernel='linear', cost = 100, data = train #####")
acc.report(P13.t.test, test$Class, num.test, "test")
acc.report(P13.t.train, train$Class, num.train, "train")

###################################################################################
## Tune on the FULL training set this time
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
                 data = train,
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M14.t: tuned cost on the full training set.
## BUG FIX: the original fit this model with data = train[1:200,] while the
## banner below (and the "All training data" section) claim data = train.
M14.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = best.cost, gamma = best.gamma)
P14.t.test <- predict(M14.t, test)
P14.t.train <- predict(M14.t, train)
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train #####")
acc.report(P14.t.test, test$Class, num.test, "test")
acc.report(P14.t.train, train$Class, num.train, "train")