#################################################################################
#################################################################################
### Regularization glmnet()
### http://cran.revolutionanalytics.com/web/packages/glmnet/glmnet.pdf
### College data set
### https://cran.r-project.org/web/packages/ISLR/ISLR.pdf
### Barbora Hladka, Martin Holub
### http://ufal.mff.cuni.cz/course/npfl054
#################################################################################
#################################################################################

library(e1071)
library(glmnet)
library(ISLR)

#######################
## function definitions
#######################

# add feature names to a regularization plot
add.names <- function(fit) {
    L <- length(fit$lambda)
    x <- log(fit$lambda[L])
    y <- fit$beta[, L]
    labs <- names(y)
    text(x, y, labels = labs, pos = 4, offset = 0.7)
}

###############
## EXERCISE #1: Read data, get training and test sets
###############

# training and test data
set.seed(123)

# do not work with Private, Accept, Enroll
college <- College[, -c(1, 3, 4)]

# split the data into training and test sets 50:50
train.ratio <- 0.5
train <- college[sample(row.names(college), size = round(nrow(college) * train.ratio)), ]
test <- college[!(row.names(college) %in% row.names(train)), ]
nrow(train)
nrow(test)

###############
## EXERCISE #2: Linear regression
## target value: Apps
###############

fit.lm <- lm(Apps ~ ., data = train)
summary(fit.lm)
# Coefficients - features used for prediction
# Estimate     - values of the hypothesis parameters theta_i
# Std. Error   - the average amount that the estimated parameter differs
#                from its actual value
# t value      - t-statistic used to test whether a given parameter
#                is significantly different from zero
# Pr(>|t|)     - 2-tailed p-value used to test the null hypothesis
#                that the parameter is 0

# significance codes
# annotation   p-value         significance level alpha
# ***          [0, 0.001]      0.001
# **           (0.001, 0.01]   0.01
# *            (0.01, 0.05]    0.05
# .            (0.05, 0.1]     0.1
#              (0.1, 1]        1

# alpha = 0.001 (***)
#   the parameter for F.Undergrad is significantly different from zero
# alpha = 0.01 (**)
#   Expend, Grad.Rate
# alpha = 0.05 (*)
#   perc.alumni, Room.Board
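
# An illustrative aside (not part of the original exercise): the significant
# features listed above can also be extracted programmatically from the
# coefficient table returned by summary(); its column "Pr(>|t|)" holds the
# 2-tailed p-values.
coef.table <- summary(fit.lm)$coefficients
# features whose parameters differ significantly from zero at alpha = 0.05
rownames(coef.table)[coef.table[, "Pr(>|t|)"] < 0.05]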
# prediction
pred <- predict(fit.lm, test)

# mean squared error (MSE)
fit.lm.mse <- mean((test$Apps - pred)^2)

###############
## EXERCISE #2: Linear regression with regularization
## target value: Apps
###############

# get the data into the matrix format required by glmnet()
x <- model.matrix(Apps ~ ., data = train)
y <- data.matrix(train$Apps)
x.test <- model.matrix(Apps ~ ., data = test)
y.test <- data.matrix(test$Apps)

# lambda values
grid <- 10^seq(4, -2, length = 100)
summary(grid)

###############
## EXERCISE #2.1: Lasso regularization
###############

# training
fit.lasso <- glmnet(x, y, alpha = 1, lambda = grid)

# explore the fit
grid[50]
grid[60]
fit.lasso$lambda[50]
coef(fit.lasso)[, 50]
sum(abs(coef(fit.lasso)[-1, 50]))    # l1-norm
fit.lasso$lambda[60]
coef(fit.lasso)[, 60]
sum(abs(coef(fit.lasso)[-1, 60]))    # l1-norm

plot(fit.lasso, xvar = "lambda")
add.names(fit.lasso)
# each curve corresponds to one feature path; the numbers above the plot
# give the number of non-zero parameters at a given lambda

print(fit.lasso)
# the Df column is the number of non-zero parameters
# %dev is the percentage of deviance explained

coef(fit.lasso, s = 0.1)    # s is lambda

# prediction with lambda = 50
pred.lasso <- predict(fit.lasso, newx = x.test, s = 50)
# mean squared error
fit.lasso.mse <- mean((test$Apps - pred.lasso)^2)

# prediction with lambda = 0, i.e., linear regression without regularization;
# exact = TRUE refits the model at s = 0 and then also requires the original
# x and y in recent glmnet versions
pred.0 <- predict(fit.lasso, newx = x.test, s = 0, exact = TRUE, x = x, y = y)
# mean squared error
fit.lasso.0.mse <- mean((test$Apps - pred.0)^2)
# In general, if we want to fit an unregularized linear regression,
# we should use the lm() function -- see Exercise #2

## run 10-fold cross-validation for Lasso
set.seed(123)    # the choice of the cross-validation folds is random
fit.lasso.cv <- cv.glmnet(x, y, alpha = 1, lambda = grid)

# explore the fit
plot(fit.lasso.cv)
# cross-validation curve with upper and lower standard deviation curves
# along the lambda sequence
fit.lasso.cv$name          # loss function used in cross-validation
fit.lasso.cv$lambda.min    # lambda that gives the minimum mean cross-validated error
fit.lasso.cv$lambda.1se    # largest lambda such that the error is within
                           # 1 standard error of the minimum
coef(fit.lasso.cv, s = "lambda.min")
plot(fit.lasso.cv$glmnet.fit, "lambda", label = TRUE)
add.names(fit.lasso.cv$glmnet.fit)

# prediction
lambda.best <- fit.lasso.cv$lambda.min
pred.lasso.cv <- predict(fit.lasso.cv, newx = x.test, s = lambda.best)
fit.lasso.cv.mse <- mean((test$Apps - pred.lasso.cv)^2)

###############
## EXERCISE #2.2: Ridge regression
###############

## run 10-fold cross-validation for Ridge regression
set.seed(123)    # the choice of the cross-validation folds is random
fit.ridge.cv <- cv.glmnet(x, y, alpha = 0, lambda = grid)

# prediction
lambda.best <- fit.ridge.cv$lambda.min
pred.ridge.cv <- predict(fit.ridge.cv, newx = x.test, s = lambda.best)
fit.ridge.cv.mse <- mean((test$Apps - pred.ridge.cv)^2)

## compare the test MSE of fit.lm, fit.lasso.cv, fit.ridge.cv
fit.lm.mse
fit.lasso.cv.mse
fit.ridge.cv.mse

# parameter values of fit.lm
lm.coef <- as.matrix(coef(fit.lm))
# parameter values at lambda.min for Ridge regression
ridge <- coef(fit.ridge.cv, s = fit.ridge.cv$lambda.min)
# parameter values at lambda.min for Lasso
lasso <- coef(fit.lasso.cv, s = fit.lasso.cv$lambda.min)

## compare the parameters
cbind2(ridge, lasso)
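
# An illustrative aside (not part of the original comparison): counting the
# non-zero parameters at lambda.min shows that Lasso performs feature
# selection (some parameters become exactly zero), while Ridge only shrinks
# parameters towards zero without eliminating them.
sum(lasso != 0)    # Lasso: typically fewer non-zero parameters (incl. intercept)
sum(ridge != 0)    # Ridge: typically all parameters remain non-zero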
###############
## EXERCISE #3: Logistic regression
## target value: newY
###############

# create a binary attribute, newY, that contains "one" if Apps is at or
# above its median and "zero" otherwise;
# store newY as a factor, as required by glm() and glmnet() with
# family = "binomial"
med <- median(train$Apps)
newY <- rep("one", nrow(train))
newY[train$Apps < med] <- "zero"
train.college <- data.frame(train, newY = factor(newY))
train.college <- train.college[, -1]

med <- median(test$Apps)
newY <- rep("one", nrow(test))
newY[test$Apps < med] <- "zero"
test.college <- data.frame(test, newY = factor(newY))
test.college <- test.college[, -1]

###############
## EXERCISE #3.1: Logistic regression without regularization
## target value: newY
###############

# training; with the alphabetical level order c("one", "zero"),
# glm() models the probability of the second level, "zero"
glm.fit <- glm(formula = newY ~ ., family = binomial, data = train.college)

# prediction
glm.probs <- predict(glm.fit, newdata = test.college, type = "response")
glm.pred <- rep("one", nrow(test.college))
glm.pred[glm.probs > 0.5] <- "zero"

# evaluation
table(glm.pred, test.college$newY)
glm.fit.acc <- mean(glm.pred == test.college$newY)

###############
## EXERCISE #3.2: Logistic regression with Lasso and Ridge regularization
## target value: newY
###############

# get the data into the matrix format required by glmnet()
x <- model.matrix(newY ~ ., data = train.college)
y <- train.college$newY
x.test <- model.matrix(newY ~ ., data = test.college)
y.test <- test.college$newY

## 10-fold cross-validation Ridge regression
fit.log.ridge <- cv.glmnet(x, y, family = "binomial", alpha = 0,
                           type.measure = "class", lambda = grid)

# prediction
lambda.best <- fit.log.ridge$lambda.min
pred.log.ridge <- predict(fit.log.ridge, type = "class", newx = x.test, s = lambda.best)

# evaluation
fit.log.ridge.acc <- mean(pred.log.ridge == y.test)

## 10-fold cross-validation Lasso
# select lambda by misclassification error, as for Ridge above
fit.log.lasso <- cv.glmnet(x, y, family = "binomial", alpha = 1,
                           type.measure = "class", lambda = grid)

# prediction
lambda.best <- fit.log.lasso$lambda.min
pred.log.lasso <- predict(fit.log.lasso, type = "class", newx = x.test, s = lambda.best)

# evaluation
fit.log.lasso.acc <- mean(pred.log.lasso == y.test)

# compare accuracy
fit.log.lasso.acc
fit.log.ridge.acc
glm.fit.acc
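
# An illustrative wrap-up (not part of the original exercises): a confusion
# matrix for the cross-validated Lasso classifier, analogous to the table()
# call used for the unregularized model, and the cross-validation curve of
# the misclassification error along the lambda sequence.
table(pred.log.lasso, test.college$newY)
plot(fit.log.lasso)    # misclassification error vs. log(lambda)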