#################################################################################
#################################################################################
### Linear regression -- analytical solution, Gradient Descent Algorithm, lm()
### Logistic regression -- glm()
### Barbora Hladka, Martin Holub
### http://ufal.mff.cuni.cz/course/npfl054
#################################################################################

###############
## EXERCISE #1: Gradient Descent Algorithm: a simple illustration
## Credit: https://www.r-bloggers.com/gradient-descent-in-r/
###############

# define the loss function
f <- function(x) { 1.2 * (x - 2)^2 + 3.2 }

# df/dx -- derivative of f along x
grad <- function(x) { 1.2 * 2 * (x - 2) }

# assume theta0 = 0
# assume non-zero theta1
theta1s <- seq(0, 4, len = 1000)

# plot the function f
plot(theta1s, f(theta1s), type = "l", xlab = expression(theta[1]), ylab = "")
title(ylab = expression(1.2(theta[1]-2)^2 + 3.2), line = 1.7, cex.lab = 1)

###############
## EXERCISE #1.1: Minimum of f -- closed-form solution
###############

lines(c(2, 2), c(3.2, 8), col = "red", lty = 2)
text(2, 7, "Closed-form solution", col = "red", pos = 4)

###############
## EXERCISE #1.2: Minimum of f -- GDA
###############

theta1 <- 0.1   # initialize theta1
alpha  <- 0.3   # set the learning rate
iter   <- 100   # set the number of iterations

theta1s   <- theta1
f.theta1s <- f(theta1)

for (step in 1:iter) {
  theta1    <- theta1 - alpha * grad(theta1)   # gradient descent update
  theta1s   <- c(theta1s, theta1)
  f.theta1s <- c(f.theta1s, f(theta1))
}

lines(theta1s, f.theta1s, type = "b", col = "blue")
text(0.5, 6, "Gradient Descent", col = "blue", pos = 4)

###############
## EXERCISE #2: Linear regression
###############

###############
## EXERCISE #2.1: Simple linear regression with the Auto data set
###############

## data
library(ISLR)
summary(Auto)
attach(Auto)

message("\n\n##### Display the internal structure of the example data #####")
examples <- cbind(weight, mpg)
num.examples <- nrow(examples)
plot(examples, xlab = "weight", ylab = "mpg",
     pch = 21, cex = 1.5, col = "black", bg = "blue")

message("\n\n##### Correlation of weight and mpg #####")
cor(weight, mpg)

# target attribute: mpg
# feature: weight
# loss function: RSS

## analytical solution
y <- as.matrix(mpg)
# add a column of 1s into X
weight.1 <- rep(1, num.examples)
X <- as.matrix(cbind(weight.1, weight))
theta <- solve(t(X) %*% X) %*% (t(X) %*% y)   # matrix inversion using 'solve'

detach(Auto)

plot(examples, xlab = "weight", ylab = "mpg",
     pch = 21, cex = 1.5, col = "black", bg = "blue")
abline(theta[1], theta[2], col = "red", lwd = 5)

message("\n\n##### Verifying that the sum of residuals equals 0 #####")
colSums(X %*% theta - y)

## solution using lm()

# mpg ~ weight
message("\n\n##### Building a simple linear regression predictor #####")
m.1 <- lm(mpg ~ weight, as.data.frame(examples))
# ~ reads "is modeled as" or "is a function of"
# y ~ model
message("\nModel built.")

# get the hypothesis parameters
m.1
summary(m.1)
coef(m.1)
attributes(m.1)

message("\n\n##### Draw the fitted line #####")
plot(examples, main = "ISLR: Auto data set",
     xlab = "Weight", ylab = "Miles Per Gallon",
     pch = 21, cex = 1.5, col = "black", bg = "blue")
abline(a = m.1$coefficients[[1]], b = m.1$coefficients[[2]], col = "red", lwd = 5)
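
## Added sanity check (a minimal sketch, not part of the original exercise):
## the lm() estimates for mpg ~ weight should match the closed-form
## normal-equation solution `theta` computed above, and the training RSS can
## be recomputed by hand; assumes theta, X, y and m.1 are still in scope
all.equal(as.numeric(coef(m.1)), as.numeric(theta))   # TRUE up to numerical error
rss.hand <- sum((y - X %*% theta)^2)                  # residual sum of squares by hand
rss.lm   <- sum(residuals(m.1)^2)                     # the same quantity from lm()
c(rss.hand, rss.lm)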

# mpg ~ origin
# 1 -- American, 2 -- European, 3 -- Japanese
examples <- Auto[, c(1, 8)]
# make origin categorical
examples$origin <- factor(examples$origin, levels = c(1, 2, 3))

# run lm()
m.2 <- lm(mpg ~ origin, examples)

# hypothesis parameters
m.2
summary(m.2)
coef(m.2)
attributes(m.2)

################
## EXERCISE #2.2: Build a polynomial regression predictor
################

# the plots require the data to be sorted
sorted <- Auto[order(Auto$weight), ]
attach(sorted)

fit1 <- lm(mpg ~ poly(weight, 1), data = sorted)
fit2 <- lm(mpg ~ poly(weight, 2), data = sorted)
fit5 <- lm(mpg ~ poly(weight, 5), data = sorted)

plot(weight, mpg, main = "ISLR: Auto data set",
     xlab = "Weight", ylab = "Miles Per Gallon", pch = 19, col = "red")
points(weight, predict(fit1), type = "l", lwd = 5, col = "blue")
points(weight, predict(fit2), type = "l", lwd = 5, col = "orange")
points(weight, predict(fit5), type = "l", lwd = 5, col = "green")
legend("topright", c("Linear", "Degree 2", "Degree 5"),
       col = c("blue", "orange", "green"), lty = c(1, 1, 1), lwd = c(5, 5, 5))

detach(sorted)

################
## EXERCISE #2.3: Contour and three-dimensional plots: an illustration
################

# Residual Sum of Squares; uses X and y from Exercise #2.1 (mpg ~ weight)
RSS <- function(theta0, theta1) {
  th <- c(theta0, theta1)
  rss <- t(X %*% th - y) %*% (X %*% th - y)
  return(rss)
}

# project (x, y, z) coordinates onto the 2-D perspective plot produced by persp()
trans3d <- function(x, y, z, pmat) {
  tr <- cbind(x, y, z, 1) %*% pmat
  list(x = tr[, 1]/tr[, 4], y = tr[, 2]/tr[, 4])
}

theta0 <- seq(26, 66, 2)
theta1 <- seq(-0.014, 0, 0.001)
VecFun <- Vectorize(RSS)
z <- outer(theta0, theta1, FUN = "VecFun")

# sample points: the closed-form minimum of RSS and two other points
th0.min <- 46.21652
th1.min <- -0.007647
th0.1 <- 38
th1.1 <- -0.004
th0.2 <- 60
th1.2 <- -0.009

# drawing
par(mfrow = c(1, 2))

# 3D surface
res <- persp(theta0, theta1, z, theta = 40, phi = 35, border = "blue", ticktype = 'detailed')
points(trans3d(th0.min, th1.min, RSS(th0.min, th1.min), res), col = 2, cex = 3, pch = 19)
points(trans3d(th0.1, th1.1, RSS(th0.1, th1.1), res), col = "sandybrown", cex = 3, pch = 19)
points(trans3d(th0.2, th1.2, RSS(th0.2, th1.2), res), col = "orchid", cex = 3, pch = 19)

# contour plot
contour(theta0, theta1, z, nlevels = 40, col = "blue")
points(th0.min, th1.min, col = 2, cex = 3, pch = 19)
points(th0.1, th1.1, col = "sandybrown", cex = 3, pch = 19)
points(th0.2, th1.2, col = "orchid", cex = 3, pch = 19)

###############
## EXERCISE #3: Logistic regression with the data set on students
## Credit: https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/
###############

################
## EXERCISE #3.1: Explore the data and split it into training and test sets
################

examples <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/sample.csv", header = TRUE)

## explore the data
str(examples)
num.examples <- nrow(examples)

# encode the target attribute as a factor
examples$hon <- factor(examples$hon, levels = c(0, 1))

## split the data into training and test subsets
# set the number of training examples = 90% of all examples
num.train <- round(0.9 * num.examples)
# set the number of test examples = 10% of all examples
num.test <- num.examples - num.train

# randomly split the examples into training and test data using sample()
# use set.seed() to be able to reconstruct the experiment
# with the SAME training and test sets
set.seed(123)
s <- sample(num.examples)

# get the training set
# first, generate indices of training examples
indices.train <- s[1:num.train]
# second, get the training examples
train <- examples[indices.train, ]

# get the test set
indices.test <- s[(num.train + 1):num.examples]
test <- examples[indices.test, ]

# counts of honours / non-honours students
num.hon <- nrow(subset(examples, hon == 1))
num.not.hon <- nrow(subset(examples, hon == 0))

# estimate p(y = 1) from the data
p <- num.hon/num.examples
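
## Added check (a minimal sketch, not part of the original exercise):
## verify that the training and test indices are disjoint and cover all
## examples, and inspect the class distribution of hon in both subsets
length(intersect(indices.train, indices.test))   # should be 0
nrow(train) + nrow(test) == num.examples         # should be TRUE
table(train$hon)
table(test$hon)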

## odds from the data

# odds of a female being in the honours class: 32:77
p1 <- nrow(subset(examples, (female == 1) & (hon == 1))) / nrow(subset(examples, female == 1))
odds.f.hon <- p1/(1 - p1)

# odds of a male being in the honours class: 17:74
p2 <- nrow(subset(examples, (female == 0) & (hon == 1))) / nrow(subset(examples, female == 0))
odds.m.hon <- p2/(1 - p2)

# ratio of the odds for females to the odds for males
odds.f.hon/odds.m.hon
# [1] 1.809015
# the odds for females are about 81% higher than the odds for males

################
## EXERCISE #3.2: Build a logistic regression classifier with no features
## target value: hon
################

m.0 <- glm(hon ~ 1, data = examples, family = binomial(link = 'logit'))   # intercept-only model
message("\nModel built.")
m.0
# Coefficients:
# (Intercept)
#      -1.125

theta0 <- m.0$coefficients[1]
# (Intercept)
#    -1.12546
# i.e. log(h(x)/(1 - h(x))) = -1.125
# h(x) = p(y = 1 | x) = exp(theta0)/(1 + exp(theta0)) = 0.245 --> see p above
# i.e. theta0 is the estimated log odds of being in the honours class for the whole population

################
## EXERCISE #3.3: Build a logistic regression classifier with a single categorical feature
## target value: hon
################

# hon ~ female
m.1 <- glm(hon ~ female, data = examples, family = binomial(link = 'logit'))
message("\nModel built.")
m.1
# Coefficients:
# (Intercept)      female
#     -1.4709      0.5928

# theta_0 = -1.471 is the log odds for males (male is the reference group)
log(odds.m.hon)
# [1] -1.470852

# theta_1 = 0.5928 is the log of the odds ratio between the female group and the male group
log(odds.f.hon/odds.m.hon)

################
## EXERCISE #3.5: Build a logistic regression classifier with both categorical and continuous features
## target value: hon
## Train on the training data and test on the test data
################

# features read, math and target attribute hon
d <- examples[, c(2, 4, 5)]
train <- train[, c(2, 4, 5)]
test <- test[, c(2, 4, 5)]

# fit logistic regression on the training set
m.2 <- glm(formula = hon ~ ., family = binomial, data = train)

# predict on the test set
prob_pred <- predict(m.2, type = 'response', newdata = test)

# classification rule
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

# build the confusion matrix
cm <- table(test[, 3], y_pred)

# accuracy
sum(diag(cm))/sum(cm)

## visualising the test set results
library(ElemStatLearn)   # not strictly required; the plot below uses only base graphics
set <- test
X1 <- seq(min(set[, 1]) - 2, max(set[, 1]) + 2, by = 0.07)
X2 <- seq(min(set[, 2]) - 2, max(set[, 2]) + 2, by = 0.07)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c('read', 'math')
prob_set <- predict(m.2, type = 'response', newdata = grid_set)
threshold <- 0.5
y_grid <- ifelse(prob_set > threshold, 1, 0)
plot(set[, -3], main = 'Logistic Regression (Test set)',
     xlab = 'read', ylab = 'math',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'lightpink', 'lightblue'))
points(set, pch = 21, cex = 1.5, bg = ifelse(set[, 3] == 1, 'red', 'blue'))
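
## Added follow-up (a minimal sketch, not part of the original exercise):
## interpret the coefficients of m.2 as odds ratios and compare the test-set
## accuracy of the classifier with a majority-class baseline that always
## predicts hon = 0; assumes m.2, test and cm are still in scope
exp(coef(m.2))                        # odds ratios per one-unit increase in read / math
baseline.acc <- mean(test$hon == 0)   # accuracy of always predicting the majority class
model.acc <- sum(diag(cm))/sum(cm)
c(baseline = baseline.acc, model = model.acc)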