#################################################################################
#################################################################################
### R code for the H-4.1
### Support vector machine classifier
### Barbora Hladka, Martin Holub
### ESSLLI 2013
### http://ufal.mff.cuni.cz/mlnlpr13
#################################################################################
#################################################################################
### Task: Decide whether the given word pair forms a semantic collocation
###       "line" ("lines", "lined")
### Objects: Word pairs identified by two lemmas L1 and L2
### Target class: Class = {0,1}
### Features
## ----------------------------------------------------------------------------
## feature  type         definition
## A1       continuous   Pointwise mutual information
## A2       continuous   First Kulczynski coefficient
## A3       continuous   Unigram subtuples measure
## A4       continuous   Ngram word coocurrence
## A5       continuous   Reverse confusion probability
## A6       continuous   Reverse cross entropy
## A7       continuous   Right context phrasal entropy
## A8       continuous   Log frequency biased mutual dependency
## A9       continuous   Cosine context similarity in boolean vector space
## A10      continuous   Dice context similarity in tf.idf vector space
## A11      categorical  POS1:POS2
## Legend
## POS1  part of speech of L1
## POS2  part of speech of L2
#################################################################################

################################
### Getting example data     ###
################################

## Read a file with examples
examples <- read.table("../data/col.development.csv", header = TRUE)

## Get the number of input examples
num.examples <- nrow(examples)

## Set the number of training examples = 90% of all examples
num.train <- round(0.9 * num.examples)

## Set the number of test examples = 10% of all examples
num.test <- num.examples - num.train

## Check the numbers
num.examples
num.train
num.test

## Randomly split examples into training and test data using sample().
## Use set.seed() to be able to reconstruct the experiment with the SAME
## training and test sets.
set.seed(123)
s <- sample(num.examples)

### Get the training set
## First, generate indices of training examples
indices.train <- s[1:num.train]
## Second, get the training examples (all columns: Class and features A1..A11)
train <- examples[indices.train, ]

### Get the test set
indices.test <- s[(num.train + 1):num.examples]
test <- examples[indices.test, ]

## Check the results
str(train)
str(test)

## Kept from the original script: 'test.classes' is an exact duplicate of
## 'test' (all columns, same rows).  It is never used below.
test.classes <- examples[indices.test, ]

###################################
### Learning from training data ###
###################################

## Use the 'e1071' package
## ! Run install.packages("e1071"), if not installed.
## Load the package "e1071"
library(e1071)

## Helper: print the percentage of correctly predicted classes on a data set.
## pred      predicted classes
## truth     true classes
## n         number of examples in the set
## set.name  label used in the message ("test", "train", "smaller train")
acc.report <- function(pred, truth, n, set.name) {
  message("\nPercentage of the correctly predicted senses on the ", set.name,
          " set = ", round(100 * sum(pred == truth) / n, 2), "%")
}

message("\n\n#####Homework 4.1 #####")

###################################################################################
###################################################################################
message("\n\n All features")

## M9: linear SVM on the first 200 training examples, all 11 features, cost 10
M9 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
          type = "C-classification", data = train[1:200, ],
          kernel = "linear", cost = 10)
P9.test <- predict(M9, test)
P9.train <- predict(M9, train)
P9.train.200 <- predict(M9, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 10, data = train[1:200,] #####")
acc.report(P9.test, test$Class, num.test, "test")
acc.report(P9.train, train$Class, num.train, "train")
acc.report(P9.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## M10: as M9, but with cost 100
M10 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 100)
P10.test <- predict(M10, test)
P10.train <- predict(M10, train)
P10.train.200 <- predict(M10, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 100, data = train[1:200,] #####")
acc.report(P10.test, test$Class, num.test, "test")
acc.report(P10.train, train$Class, num.train, "train")
acc.report(P10.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## Grid-search gamma and cost with 10-fold cross-validation.
## NOTE(review): tune.svm() uses the default (radial) kernel here, while the
## model below is fit with kernel='linear', for which gamma is ignored —
## confirm this mismatch is intentional (kept as in the original exercise).
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
                 data = train[1:200, ],
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M11: linear SVM with the tuned cost (gamma has no effect for a linear kernel)
M11 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = best.cost, gamma = best.gamma)
P11.test <- predict(M11, test)
P11.train <- predict(M11, train)
P11.train.200 <- predict(M11, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train[1:200,] #####")
acc.report(P11.test, test$Class, num.test, "test")
acc.report(P11.train, train$Class, num.train, "train")
acc.report(P11.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
message("\n\n All numerical features")

## M12: linear SVM, numerical features A1..A10 only, cost 10
M12 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 10)
P12.test <- predict(M12, test)
P12.train <- predict(M12, train)
P12.train.200 <- predict(M12, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 10, data = train[1:200,] #####")
acc.report(P12.test, test$Class, num.test, "test")
acc.report(P12.train, train$Class, num.train, "train")
acc.report(P12.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## M13: as M12, but with cost 100.
## (The original formula listed A10 twice; the duplicate term is removed —
## R's formula handling deduplicates it anyway, so the fit is unchanged.)
M13 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = 100)
P13.test <- predict(M13, test)
P13.train <- predict(M13, train)
P13.train.200 <- predict(M13, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = 100, data = train[1:200,] #####")
acc.report(P13.test, test$Class, num.test, "test")
acc.report(P13.train, train$Class, num.train, "train")
acc.report(P13.train.200, train[1:200, ]$Class, 200, "smaller train")

###################################################################################
## Grid-search again, numerical features only (same radial/linear caveat as above)
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
                 data = train[1:200, ],
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M14: linear SVM with the tuned cost, numerical features only
M14 <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
           type = "C-classification", data = train[1:200, ],
           kernel = "linear", cost = best.cost, gamma = best.gamma)
P14.test <- predict(M14, test)
P14.train <- predict(M14, train)
P14.train.200 <- predict(M14, train[1:200, ])
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train[1:200,] #####")
acc.report(P14.test, test$Class, num.test, "test")
acc.report(P14.train, train$Class, num.train, "train")
acc.report(P14.train.200, train[1:200, ]$Class, 200, "smaller train")

##############################################################
message("\n\nAll training data")

## M9.t: all 11 features, cost 10, full training set
M9.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
            type = "C-classification", data = train,
            kernel = "linear", cost = 10)
P9.t.test <- predict(M9.t, test)
P9.t.train <- predict(M9.t, train)
message("\n\n##### SVM: kernel='linear', cost = 10, data = train #####")
acc.report(P9.t.test, test$Class, num.test, "test")
acc.report(P9.t.train, train$Class, num.train, "train")

## M12.t: numerical features only, cost 10, full training set
M12.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = 10)
P12.t.test <- predict(M12.t, test)
P12.t.train <- predict(M12.t, train)
message("\n\n##### SVM: kernel='linear', cost = 10, data = train #####")
acc.report(P12.t.test, test$Class, num.test, "test")
acc.report(P12.t.train, train$Class, num.train, "train")

##############################################################
## M10.t: all 11 features, cost 100, full training set
M10.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10 + A11,
             type = "C-classification", data = train,
             kernel = "linear", cost = 100)
P10.t.test <- predict(M10.t, test)
P10.t.train <- predict(M10.t, train)
message("\n\n##### SVM: kernel='linear', cost = 100, data = train #####")
acc.report(P10.t.test, test$Class, num.test, "test")
acc.report(P10.t.train, train$Class, num.train, "train")

## M13.t: numerical features only, cost 100, full training set
M13.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = 100)
P13.t.test <- predict(M13.t, test)
P13.t.train <- predict(M13.t, train)
message("\n\n##### SVM: kernel='linear', cost = 100, data = train #####")
acc.report(P13.t.test, test$Class, num.test, "test")
acc.report(P13.t.train, train$Class, num.train, "train")

###################################################################################
## Tune on the FULL training set this time
grid <- tune.svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
                 data = train,
                 gamma = 10^seq(-2, 0, by = 0.5), cost = 10^seq(0, 2, by = 0.5))
summary(grid)
best.gamma <- grid$best.parameters[[1]]
best.cost <- grid$best.parameters[[2]]

## M14.t: tuned cost on the full training set.
## BUG FIX: the original fit this model with data = train[1:200,] while the
## banner below (and the "All training data" section) claim data = train.
M14.t <- svm(Class ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + A10,
             type = "C-classification", data = train,
             kernel = "linear", cost = best.cost, gamma = best.gamma)
P14.t.test <- predict(M14.t, test)
P14.t.train <- predict(M14.t, train)
message("\n\n##### SVM: kernel='linear', cost = best.cost, gamma = best.gamma, data = train #####")
acc.report(P14.t.test, test$Class, num.test, "test")
acc.report(P14.t.train, train$Class, num.train, "train")