################################################################################
### NPFL 054 -- Introduction to Machine Learning
### MFF UK, 2020-21
### Example code by
### (c) M. Holub
################################################################################

# Build and evaluate RF models to predict binary profits using R package randomForest.
# There are 5 features that you can use for prediction:
#   category, sales, assets, marketvalue, country.
#
# • Learn 20 random forests with different numbers of trees, using ntree in seq(100, 2000, 100).
# • For each random forest compute the error rate estimate using 6-fold cross validation. In each
#   cross validation run you will have 1000 examples for training and 200 examples for test.
#   Compute the mean and standard deviation of the error rate.
# • Then compare the cross validation results with the OOB error estimates, and also with test
#   error rates measured using forbes.test.
# • Arrange all your results in a nice table and plot a chart.

prepare_data = function(){
    library(HSAUR)                # load the library with the Forbes data set
    F = Forbes2000                # just to make a copy
    F = F[!is.na(F$profits), ]    # rows with NA values are removed
    # now we select only countries with at least 25 companies in the data
    selected.countries = names(table(F$country)[table(F$country) >= 25])
    F = F[F$country %in% selected.countries, ]
    F$country = droplevels(F$country)
    cat(nrow(F), "observations selected from Forbes2000 data set.\n")
    cat("Selected countries: ", paste(selected.countries, collapse=", "), ".\n", sep="")
    # randomly split the data into two disjoint subsets
    set.seed(123); s = sample(1710)
    forbes.train <<- F[s[1:1200], ]       # training examples
    forbes.test  <<- F[s[1201:1710], ]    # test examples
}

# the numerical values of 'profits' will be reduced to a binary attribute
# forbes.train$profits = factor(forbes.train$profits > 0.2)
# forbes.test$profits = factor(forbes.test$profits > 0.2)

# selected.countries
#  [1] "Australia"      "Canada"         "China"          "France"
#  [5] "Germany"        "India"          "Italy"          "Japan"
#  [9] "Netherlands"    "South Korea"    "Spain"          "Sweden"
# [13] "Switzerland"    "Taiwan"         "United Kingdom" "United States"

# str(F)
# 'data.frame':   1710 obs. of  8 variables:
#  $ rank       : int  1 2 3 4 5 6 7 8 9 10 ...
#  $ name       : chr  "Citigroup" "General Electric" "American Intl Group" "ExxonMobil" ...
#  $ country    : Factor w/ 16 levels "Australia","Canada",..: 16 16 16 16 15 16 15 8 16 16 ...
#  $ category   : Factor w/ 27 levels "Aerospace & defense",..: 2 6 16 19 19 2 2 8 9 20 ...
#  $ sales      : num  94.7 134.2 76.7 222.9 232.6 ...
#  $ profits    : Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
#  $ assets     : num  1264 627 648 167 178 ...
#  $ marketvalue: num  255 329 195 277 174 ...

# table(F$profits)
# FALSE  TRUE
#   906   804
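# The class counts above (906 FALSE vs. 804 TRUE after binarization) mean that a
# majority-class predictor is wrong roughly 47% of the time, which puts the RF
# error rates below into perspective. A minimal sanity-check sketch, not part of
# the original assignment; it assumes prepare_data() has been called and
# 'profits' has been binarized as shown above:
#
# baseline.class = names(which.max(table(forbes.train$profits)))
# baseline.err   = 1 - mean(forbes.test$profits == baseline.class)
# cat("Majority-class baseline test error rate:", baseline.err, "\n")
#
# any useful forest should score well below this baseline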
# makes a random division of n indexes into k groups; assumes that k divides n
prepare_cv_folds = function(n, k){
    fold.size = n %/% k
    set.seed(12); s = sample(n)
    f.idx = list()
    for(i in 1:k){
        f.idx[[i]] = s[(1 + (i-1)*fold.size):(i*fold.size)]
        # cat("Fold_", i, "\t", (1 + (i-1)*fold.size), ":", (i*fold.size), " = ", length(f.idx[[i]]), "\n", sep="")
    }
    return(f.idx)
}

################################################################################
################################################################################
#####  TASK 2
################################################################################

prepare_data()
forbes.train$profits = factor(forbes.train$profits > 0.2)
forbes.test$profits  = factor(forbes.test$profits > 0.2)

library(randomForest)

ntree.options = seq(100, 2000, 100)    # there will be 20 random forests with different ntree
nfolds = 6
folds.test.idx = prepare_cv_folds(1200, nfolds)

CVs = list()
RFs = list()
RF.eval = data.frame(ntree = ntree.options, cv.mean = 0, cv.std = 0, oob.err = 0, test.err = 0)

### Random Forests learning, including cross validation

for(i in 1:length(ntree.options)){
    cat("\n")
    cat("*** ntree =", ntree.options[i], "\n")

    ### cross validation process
    err.cv = numeric(nfolds)
    for(k in 1:nfolds){
        rf.cv = randomForest(profits ~ category + sales + assets + marketvalue + country,
                             forbes.train[-folds.test.idx[[k]], ], ntree = ntree.options[i])
        prediction.cv = predict(rf.cv, forbes.train[folds.test.idx[[k]], ], type="class")
        err.cv[k] = 1 - mean(prediction.cv == forbes.train[folds.test.idx[[k]], ]$profits)
        cat("\t\t err.cv", k, err.cv[k], "\n")
    }

    # save the cross validation results
    CVs[[i]] = err.cv
    cat("    CV mean =", mean(CVs[[i]]), "\n")
    cat("    CV std  =", sd(CVs[[i]]), "\n")
    RF.eval$cv.mean[i] = mean(CVs[[i]])
    RF.eval$cv.std[i]  = sd(CVs[[i]])

    ### Random Forest using all training data
    RFs[[i]] = randomForest(profits ~ category + sales + assets + marketvalue + country,
                            forbes.train, ntree = ntree.options[i])

    # get the OOB error estimate -- the "OOB" column of err.rate at the final number of trees
    # (indexing the column by name is safer than linear indexing into the err.rate matrix)
    RF.eval$oob.err[i] = RFs[[i]]$err.rate[ntree.options[i], "OOB"]
    cat("    RF OOB error rate  =", RF.eval$oob.err[i], "\n")
    # print(RFs[[i]]$err.rate[ntree.options[i], ])

    # evaluation of RF using the test set
    prediction.test = predict(RFs[[i]], forbes.test, type="class")
    error.rate.test = 1 - mean(prediction.test == forbes.test$profits)
    RF.eval$test.err[i] = error.rate.test
    cat("    RF test error rate =", error.rate.test, "\n")
}

### now all results are collected in RF.eval

# > str(RF.eval)
# 'data.frame':   20 obs. of  5 variables:
#  $ ntree   : num  100 200 300 400 500 600 700 800 900 1000 ...
#  $ cv.mean : num  0.206 0.202 0.202 0.201 0.203 ...
#  $ cv.std  : num  0.0153 0.0216 0.0175 0.0191 0.0163 ...
#  $ oob.err : num  0.203 0.202 0.188 0.189 0.191 ...
#  $ test.err: num  0.194 0.194 0.202 0.19 0.184 ...

### finally plot the results to compare different models
### and also different error estimates

plot(RF.eval$ntree, RF.eval$cv.mean, pch=3, lwd=2, ylim=c(0.17, 0.22),
     xlab="Number of trees", ylab="Estimated error rate")
points(RF.eval$ntree, RF.eval$test.err, pch=3, lwd=2, col="red")
points(RF.eval$ntree, RF.eval$oob.err,  pch=3, lwd=2, col="blue")

### how to get OOB details -- per-class error rates from the err.rate matrix
# rf.cv.1 = randomForest(profits ~ category + sales + assets + marketvalue + country,
#                        forbes.train[folds.test.idx[[1]], ], ntree = 100)
# rf.cv.1$err.rate[100, ]
#       OOB     FALSE      TRUE
# 0.2300000 0.2844037 0.1648352
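### Two optional finishing touches -- a sketch, not part of the original code.
### First, the chart above is drawn without a legend; this adds one (it assumes
### the plot()/points() calls above have just been executed on the same device).

legend("topright", legend = c("6-fold CV mean", "test set", "OOB"),
       col = c("black", "red", "blue"), pch = 3, pt.lwd = 2, bty = "n")

### Second, one simple way to get the "nice table" the task asks for:
### print RF.eval with the error estimates rounded to 3 significant digits.

print(RF.eval, digits = 3)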