#################################################################################
#################################################################################

### R code for lasso
### MOV task

### Help for glmnet
### http://web.stanford.edu/~hastie/Papers/Glmnet_Vignette.pdf

### Barbora Hladka, Martin Holub
### ESSLLI 2015
### http://ufal.mff.cuni.cz/esslli2015

#################################################################################
#################################################################################

#################################################################################
## load the package
# library() stops with an error if glmnet is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(glmnet)

#############
## get the data
# NOTE(review): this script expects load-mov-data.R to define the data frame
# `examples` (it is used below but not created here) -- confirm.
source("load-mov-data.R")

# replace missing imdb_rating with 0
examples$imdb_rating[is.na(examples$imdb_rating)] <- 0

#############
## get folds for k-cross-validation
# we already have the folds
# thus generate foldid for cv.glmnet
# foldid = a vector of values between 1 and nfold
#          identifying what fold each instance is in

# number of input folds
k <- 5

# One (movie, user, fold) table per fold file. The first two columns of each
# cv.test.<i>.csv are taken as the movie and user identifiers.
fold_tables <- lapply(seq_len(k), function(fold) {
  cv.test <- read.csv(paste0("cv.test.", fold, ".csv"), sep = "\t")
  data.frame(
    movie = cv.test[[1]],
    user = cv.test[[2]],
    fold = rep(fold, nrow(cv.test))
  )
})
ff <- do.call(rbind, fold_tables)

# Attach the fold number to every row of `examples`.
# merge() sorts its result on the key columns, so the fold column of the merged
# table is NOT in the row order of `examples`. A row index is carried through
# the merge and used to restore the original order, so that foldid[j] really
# describes row j of x and y below.
key_cols <- intersect(names(ff), names(examples))
stopifnot(length(key_cols) > 0)
examples_keyed <- cbind(
  examples[key_cols],
  .row_id = seq_len(nrow(examples))
)
merged <- merge(examples_keyed, ff, by = key_cols)
# every example must be assigned to exactly one fold
stopifnot(
  nrow(merged) == nrow(examples),
  anyDuplicated(merged$.row_id) == 0L
)
foldid <- merged$fold[order(merged$.row_id)]

#############
## run 5-cross-validation lasso
# model.matrix() expands factors into dummy columns; it also adds an
# "(Intercept)" column, which is redundant because glmnet fits its own
# intercept (kept here so that the coefficient tables keep their layout).
x <- model.matrix(rating ~ age + occupation + genre_drama + imdb_rating,
                  examples)
y <- data.matrix(examples$rating)

# model.matrix() silently drops rows with missing predictors; make sure the
# design matrix, the response and the fold ids still describe the same rows
stopifnot(nrow(x) == nrow(y), length(foldid) == nrow(x))

# alpha = 1 selects the lasso penalty
fit <- cv.glmnet(x, y, foldid = foldid, alpha = 1)

#############
## explore fit

# lambda values
fit$lambda

# cross-validation curve
pdf("lin-reg-lasso-mov-cv-curve.pdf", width = 7, height = 5)
# plot(fit, main = "lin-reg-mov-ridge-cv.glmnet.R")
plot(fit)
dev.off()

# lambda that gives minimum cv mse
fit$lambda.min
i <- which(fit$lambda == fit$lambda.min)

# minimum cv mse
min(fit$cvm)

# mean cross-validation error
fit$cvm

# largest value of lambda whose cv mse is within 1 SE of the minimum
fit$lambda.1se

# the underlying glmnet fit on the full data
fit$glmnet.fit

# mean cross-validation error
fit$cvm

# minimum cve
min(fit$cvm)

# lambda that gives minimum cve
fit$lambda.min
i <- which(fit$lambda == fit$lambda.min)

# what is cve for lambda.1se
i <- which(fit$lambda == fit$lambda.1se)
fit$cvm[i]

# parameter values for lambda.min
coef(fit, s = "lambda.min")

# largest value of lambda whose cv mse is within 1 SE of the minimum
# (this is a regression fit, so the error is MSE, not misclassification)
fit$lambda.1se
coef(fit, s = fit$lambda.1se)

# plot regularization path
# each curve corresponds to a feature
# paths of them against the l1-norm
# number of non-zero parameters above at a given lambda
pdf("lin-reg-lasso-mov-path-norm.pdf", width = 7, height = 5)
plot(fit$glmnet.fit, xvar = "norm", label = TRUE)
dev.off()

# the same paths against log(lambda)
pdf("lin-reg-lasso-mov-path-lambda.pdf", width = 7, height = 5)
plot(fit$glmnet.fit, xvar = "lambda", label = TRUE)
dev.off()

# the same paths against the fraction of deviance explained
# (drawn on the current device, not saved to a file)
plot(fit$glmnet.fit, xvar = "dev", label = TRUE)

# parameter values for lambda.min
lasso.lambda.min <- coef(fit, s = fit$lambda.min)
lasso.lambda.1se <- coef(fit, s = fit$lambda.1se)

# parameter values for lambda = 0
# i.e. unregularized
# exact = TRUE refits the model at s = 0 instead of interpolating along the
# path; glmnet needs the original x and y to do that refit.
zero <- coef(fit, s = 0, exact = TRUE, x = x, y = y)

# side by side: unregularized, lambda.min, lambda.1se
cbind2(cbind2(zero, lasso.lambda.min), lasso.lambda.1se)