################################################################################
### NPFL 054 -- Introduction to Machine Learning
### MFF UK, 2020-21
### Example code by
### (c) M. Holub
################################################################################

# Build and evaluate RF models to predict binary profits using R package randomForest.
# There are 5 features that you can use for prediction:
#   category, sales, assets, marketvalue, country.
#
# • Learn 20 random forests with different numbers of trees, using ntree in seq(100, 2000, 100).
# • For each random forest compute the error rate estimate using 6-fold cross validation. In each
#   cross validation run you will have 1000 examples for training and 200 examples for test.
#   Compute the mean and standard deviation of the error rate.
# • Then compare the cross validation results with the OOB error estimates, and also with test
#   error rates measured using forbes.test.
# • Arrange all your results in a nice table and plot a chart.

prepare_data = function(){
    library(HSAUR)                # load the library with the Forbes data set
    F = Forbes2000                # just to make a copy
    F = F[!is.na(F$profits), ]    # rows with NA values are removed
    # now we select only countries with at least 25 companies in the data
    selected.countries = names(table(F$country)[table(F$country) >= 25])
    F = F[F$country %in% selected.countries, ]
    F$country = droplevels(F$country)
    cat(nrow(F), "observations selected from Forbes2000 data set.\n")
    cat("Selected countries: ", paste(selected.countries, collapse=", "), ".\n", sep="")
    # randomly split the data into two disjoint subsets
    set.seed(123); s = sample(1710)
    forbes.train <<- F[s[1:1200], ]       # training examples
    forbes.test  <<- F[s[1201:1710], ]    # test examples
}

# the numerical values of 'profits' will be reduced to a binary attribute
# forbes.train$profits = factor(forbes.train$profits > 0.2)
# forbes.test$profits = factor(forbes.test$profits > 0.2)

# selected.countries
#  [1] "Australia"      "Canada"         "China"          "France"
#  [5] "Germany"        "India"          "Italy"          "Japan"
#  [9] "Netherlands"    "South Korea"    "Spain"          "Sweden"
# [13] "Switzerland"    "Taiwan"         "United Kingdom" "United States"

# str(F)
# 'data.frame':   1710 obs. of  8 variables:
#  $ rank       : int  1 2 3 4 5 6 7 8 9 10 ...
#  $ name       : chr  "Citigroup" "General Electric" "American Intl Group" "ExxonMobil" ...
#  $ country    : Factor w/ 16 levels "Australia","Canada",..: 16 16 16 16 15 16 15 8 16 16 ...
#  $ category   : Factor w/ 27 levels "Aerospace & defense",..: 2 6 16 19 19 2 2 8 9 20 ...
#  $ sales      : num  94.7 134.2 76.7 222.9 232.6 ...
#  $ profits    : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
#  $ assets     : num  1264 627 648 167 178 ...
#  $ marketvalue: num  255 329 195 277 174 ...

# table(F$profits)
# FALSE  TRUE
#   906   804
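# The class counts above (906 FALSE vs. 804 TRUE after binarization) mean that a
# majority-class predictor is wrong roughly 47% of the time, which puts the RF
# error rates below into perspective. A minimal sanity-check sketch, not part of
# the original assignment; it assumes prepare_data() has been called and
# 'profits' has been binarized as shown above:
#
# baseline.class = names(which.max(table(forbes.train$profits)))
# baseline.err   = 1 - mean(forbes.test$profits == baseline.class)
# cat("Majority-class baseline test error rate:", baseline.err, "\n")
#
# any useful forest should score well below this baseline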
# makes a random division of n indexes into k groups; assumes that k divides n
prepare_cv_folds = function(n, k){
    fold.size = n %/% k
    set.seed(12); s = sample(n)
    f.idx = list()
    for(i in 1:k){
        f.idx[[i]] = s[(1 + (i-1)*fold.size):(i*fold.size)]
        # cat("Fold_", i, "\t", (1 + (i-1)*fold.size), ":", (i*fold.size), " = ", length(f.idx[[i]]), "\n", sep="")
    }
    return(f.idx)
}

################################################################################
################################################################################
#####  TASK 2
################################################################################

prepare_data()
forbes.train$profits = factor(forbes.train$profits > 0.2)
forbes.test$profits  = factor(forbes.test$profits > 0.2)

library(randomForest)

ntree.options = seq(100, 2000, 100)    # there will be 20 random forests with different ntree
nfolds = 6
folds.test.idx = prepare_cv_folds(1200, nfolds)

CVs = list()
RFs = list()
RF.eval = data.frame(ntree = ntree.options, cv.mean = 0, cv.std = 0, oob.err = 0, test.err = 0)

### Random Forests learning, including cross validation

for(i in 1:length(ntree.options)){
    cat("\n")
    cat("*** ntree =", ntree.options[i], "\n")

    ### cross validation process
    err.cv = numeric(nfolds)
    for(k in 1:nfolds){
        rf.cv = randomForest(profits ~ category + sales + assets + marketvalue + country,
                             forbes.train[-folds.test.idx[[k]], ], ntree = ntree.options[i])
        prediction.cv = predict(rf.cv, forbes.train[folds.test.idx[[k]], ], type="class")
        err.cv[k] = 1 - mean(prediction.cv == forbes.train[folds.test.idx[[k]], ]$profits)
        cat("\t\t err.cv", k, err.cv[k], "\n")
    }

    # save the cross validation results
    CVs[[i]] = err.cv
    cat("    CV mean =", mean(CVs[[i]]), "\n")
    cat("    CV std  =", sd(CVs[[i]]), "\n")
    RF.eval$cv.mean[i] = mean(CVs[[i]])
    RF.eval$cv.std[i]  = sd(CVs[[i]])

    ### Random Forest using all training data
    RFs[[i]] = randomForest(profits ~ category + sales + assets + marketvalue + country,
                            forbes.train, ntree = ntree.options[i])

    # get the OOB error estimate -- the "OOB" column of err.rate at the final number of trees
    # (indexing the column by name is safer than linear indexing into the err.rate matrix)
    RF.eval$oob.err[i] = RFs[[i]]$err.rate[ntree.options[i], "OOB"]
    cat("    RF OOB error rate  =", RF.eval$oob.err[i], "\n")
    # print(RFs[[i]]$err.rate[ntree.options[i], ])

    # evaluation of RF using the test set
    prediction.test = predict(RFs[[i]], forbes.test, type="class")
    error.rate.test = 1 - mean(prediction.test == forbes.test$profits)
    RF.eval$test.err[i] = error.rate.test
    cat("    RF test error rate =", error.rate.test, "\n")
}

### now all results are collected in RF.eval

# > str(RF.eval)
# 'data.frame':   20 obs. of  5 variables:
#  $ ntree   : num  100 200 300 400 500 600 700 800 900 1000 ...
#  $ cv.mean : num  0.206 0.202 0.202 0.201 0.203 ...
#  $ cv.std  : num  0.0153 0.0216 0.0175 0.0191 0.0163 ...
#  $ oob.err : num  0.203 0.202 0.188 0.189 0.191 ...
#  $ test.err: num  0.194 0.194 0.202 0.19 0.184 ...

### finally plot the results to compare different models
### and also different error estimates

plot(RF.eval$ntree, RF.eval$cv.mean, pch=3, lwd=2, ylim=c(0.17, 0.22),
     xlab="Number of trees", ylab="Estimated error rate")
points(RF.eval$ntree, RF.eval$test.err, pch=3, lwd=2, col="red")
points(RF.eval$ntree, RF.eval$oob.err,  pch=3, lwd=2, col="blue")

### how to get OOB details -- per-class error rates from the err.rate matrix
# rf.cv.1 = randomForest(profits ~ category + sales + assets + marketvalue + country,
#                        forbes.train[folds.test.idx[[1]], ], ntree = 100)
# rf.cv.1$err.rate[100, ]
#       OOB     FALSE      TRUE
# 0.2300000 0.2844037 0.1648352
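### Two optional finishing touches -- a sketch, not part of the original code.
### First, the chart above is drawn without a legend; this adds one (it assumes
### the plot()/points() calls above have just been executed on the same device).

legend("topright", legend = c("6-fold CV mean", "test set", "OOB"),
       col = c("black", "red", "blue"), pch = 3, pt.lwd = 2, bty = "n")

### Second, one simple way to get the "nice table" the task asks for:
### print RF.eval with the error estimates rounded to 3 significant digits.

print(RF.eval, digits = 3)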