#################################################################################
#################################################################################
### R code for Ridge regression
### VPR task: cry
### Help for glmnet:
### http://web.stanford.edu/~hastie/Papers/Glmnet_Vignette.pdf
### Barbora Hladka, Martin Holub
### ESSLLI 2015
### http://ufal.mff.cuni.cz/esslli2015
#################################################################################
#################################################################################

#################################################################################
## load the package
require(glmnet)

#############
## get the data
examples <- read.csv("cry.development.csv", sep="\t")
n <- nrow(examples)

#############
## filter out ineffective features

# remove features with 0s only
c.1 <- examples[,colSums(as.matrix(sapply(examples, as.numeric))) != 0]

# remove features with 1s only
c.2 <- c.1[,colSums(as.matrix(sapply(c.1, as.numeric))) != n]

# remove features with frequency less than or equal to m
m <- 4
c.3 <- c.2[,colSums(as.matrix(sapply(c.2, as.numeric))) > m]

# remove column duplicates and get names of effective features
c.4 <- data.frame(t(unique(t(as.matrix(c.3)))))
c <- c.4[,-c(1,ncol(c.4))]
names <- names(c)

#############
## get folds
# we already have the folds on disk,
# so build the foldid vector that cv.glmnet expects:
# foldid = a vector of values between 1 and nfold
# identifying which fold each instance is in

# number of input folds
k <- 9
f <- c(0,0)  # dummy first row, dropped below
for(i in 1:k){
  cv.test <- read.csv(paste("cry.", i, ".test.csv", sep=""), sep="\t")
  # instance id and fold number
  f <- rbind(f, cbind(cv.test[,1], rep(i, nrow(cv.test))))
}
ff <- data.frame(f[-1,])
names(ff) <- c("id", "fold")
foldid <- (merge(examples, ff, by = "id"))$fold

#############
## binary classification: "1" vs. all
out <- "1"

## output classes
zero <- "0"
one <- "1"
examples$tp <- as.character(examples$tp)
examples[!(examples$tp %in% out),]$tp <- zero
examples[examples$tp %in% out,]$tp <- one
examples$tp <- as.factor(examples$tp)

## run 9-fold cross-validated ridge logistic regression with all effective features
x <- data.matrix(c)
y <- data.matrix(examples$tp)
fit <- cv.glmnet(x, y, family = "binomial", foldid=foldid, type.measure = "class", alpha=0)

## explore fit
# lambda values
fit$lambda

# cross-validation curve
pdf("log-reg-ridge-vpr-bin-curve.pdf", width=7, height=5)
plot(fit)
dev.off()

# lambda that gives the minimum error rate
fit$lambda.min
i <- which(fit$lambda == fit$lambda.min)

# minimum cross-validated misclassification error
min(fit$cvm)

# mean cross-validation error rate for each lambda
fit$cvm

# largest lambda whose cv error is within 1 standard error of the minimum
fit$lambda.1se

fit$glmnet.fit
print(fit)

# plot regularization path
pdf("log-reg-ridge-vpr-bin-path-lambda.pdf", width=7, height=5)
plot(fit$glmnet.fit, "lambda", label=TRUE)
dev.off()
# each curve corresponds to a feature, plotted against log(lambda);
# the axis on top shows the number of nonzero parameters at a given lambda
# (with ridge, all features stay in the model)

# parameter values for lambda.min
ridge <- coef(fit, s=fit$lambda.min)
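# A minimal sketch (not in the original exercise): sanity-check the lambda.min
# model on the development data. predict() on a cv.glmnet object accepts
# s="lambda.min"; type="class" returns hard labels.
pred.bin <- predict(fit, newx = x, s = "lambda.min", type = "class")
table(predicted = pred.bin, actual = examples$tp)  # confusion table
mean(pred.bin == examples$tp)                      # accuracy on the development data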
# parameter values for lambda = 0, i.e. the unregularized fit
# (stored as `unreg`, not `zero`, so the class label above is not overwritten)
unreg <- coef(fit, s = 0, exact = FALSE)
cbind2(unreg, ridge)

#############
## multiclass classification

## get the data
examples <- read.csv("cry.development.csv", sep="\t")
n <- nrow(examples)

#############
## filter out ineffective features

# remove features with 0s only
c.1 <- examples[,colSums(as.matrix(sapply(examples, as.numeric))) != 0]

# remove features with 1s only
c.2 <- c.1[,colSums(as.matrix(sapply(c.1, as.numeric))) != n]

# remove features with frequency less than or equal to m
m <- 4
c.3 <- c.2[,colSums(as.matrix(sapply(c.2, as.numeric))) > m]

# remove column duplicates and get names of effective features
c.4 <- data.frame(t(unique(t(as.matrix(c.3)))))
c <- c.4[,-c(1,ncol(c.4))]
names <- names(c)

## run 9-fold cross-validated ridge logistic regression with all effective features
x <- data.matrix(c)
y <- data.matrix(examples$tp)
fit <- cv.glmnet(x, y, family = "multinomial", foldid=foldid, type.measure = "class", alpha=0)

## explore fit
fit$glmnet.fit
# name of the error measure used
fit$name
fit$lambda

# mean cross-validated misclassification error for each lambda
fit$cvm
# its minimum
min(fit$cvm)
length(fit$lambda); length(fit$cvm)

# lambda that gives the minimum cv error rate
fit$lambda.min
i <- which(fit$lambda == fit$lambda.min)

# parameter values for lambda.min
coef(fit, s = "lambda.min")

# largest lambda whose cv error is within 1 standard error of the minimum
fit$lambda.1se

# parameter values at lambda.1se, one matrix per class (u, x, 1, 4, 7)
coef(fit, s=fit$lambda.1se)[1]
coef(fit, s=fit$lambda.1se)[2]
coef(fit, s=fit$lambda.1se)[3]
coef(fit, s=fit$lambda.1se)[4]
coef(fit, s=fit$lambda.1se)[5]

## plot cross-validation curve
pdf("log-reg-ridge-vpr-multi-curve.pdf", width=8.5, height=5)
plot(fit)
title("Multinomial Family", line=2.5)
dev.off()

# plot regularization paths, one panel per class, against the L1 norm
pdf("log-reg-ridge-vpr-multi-path-norm.pdf", width=8.5, height=5)
op <- par(mfrow=c(1, 5))
plot(fit$glmnet.fit, "norm", label=TRUE)
par(op)
dev.off()

# plot regularization paths, one panel per class, against log(lambda)
pdf("log-reg-ridge-vpr-multi-path-lambda.pdf", width=8.5, height=5)
op <- par(mfrow=c(1, 5))
plot(fit$glmnet.fit, "lambda", label=TRUE)
par(op)
dev.off()
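# A minimal sketch (not in the original exercise): get multiclass predictions
# from the fitted model. type="class" returns the predicted label; type="response"
# returns an n x 5 x 1 array of class probabilities at the chosen lambda.
pred.multi <- predict(fit, newx = x, s = "lambda.min", type = "class")
prob.multi <- predict(fit, newx = x, s = "lambda.min", type = "response")
table(predicted = pred.multi, actual = examples$tp)  # confusion table over the 5 classes
head(prob.multi[, , 1])                              # per-class probabilities, first rows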