booster

library("rbooster")
data(iris)
set.seed(1)

# stratified sampler: draws train_proportion of the indices within each class
cv_sampler <- function(y, train_proportion) {
  unlist(lapply(unique(y), function(m) sample(which(y == m), round(sum(y == m) * train_proportion))))
}

x <- iris[,1:4]
y <- iris[,5]
### multiclass classification
unique(y)
#> [1] setosa     versicolor virginica 
#> Levels: setosa versicolor virginica

train_i <- cv_sampler(y, 0.8)
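# quick sanity check (illustrative, not in the original): the stratified split
# keeps 80% of each class, i.e. 40 of each of the 50 iris species
table(y[train_i])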

x_train <- x[train_i,]
y_train <- y[train_i]

x_test <- x[-train_i,]
y_test <- y[-train_i]
par(mfrow = c(1,2))
# boosting using decision tree (no subsampling: bag_frac = 1)
m <- booster(x_train = x_train, y_train = y_train,
             x_test = x_test, y_test = y_test,
             classifier = "rpart", bag_frac = 1, lambda = 1,
             print_detail = FALSE, print_plot = TRUE, max_iter = 50)
preds <- predict(object = m, newdata = x_test, print_detail = FALSE)
probs <- predict(object = m, newdata = x_test, print_detail = FALSE, type = "prob")
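# illustrative check (not in the original output): cross-tabulate predictions
# against the true labels to see which test cases are misclassified
table(Predicted = preds, Actual = y_test)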

# boosting using decision tree with subsampling (bag_frac = 0.5)
m2 <- booster(x_train = x_train, y_train = y_train,
              x_test = x_test, y_test = y_test,
              classifier = "rpart", bag_frac = 0.5, lambda = 1,
              print_detail = FALSE, print_plot = TRUE, max_iter = 50)

preds2 <- predict(object = m2, newdata = x_test, print_detail = FALSE)
probs2 <- predict(object = m2, newdata = x_test, print_detail = FALSE, type = "prob")
head(probs)
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0
head(probs2)
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0

sum(preds != y_test)/length(y_test)
#> [1] 0.06666667
sum(preds2 != y_test)/length(y_test)
#> [1] 0.06666667
par(mfrow = c(1,2))
# boosting using naive bayes
m <- booster(x_train = x_train, y_train = y_train,
             x_test = x_test, y_test = y_test,
             classifier = "nb", bag_frac = 0.5, lambda = 1,
             print_detail = FALSE, print_plot = TRUE, max_iter = 175)

# boosting using naive bayes with weighted bootstrap
m2 <- booster(x_train = x_train, y_train = y_train,
              x_test = x_test, y_test = y_test, weighted_bootstrap = TRUE,
              classifier = "nb", bag_frac = 0.5, lambda = 1,
              print_detail = FALSE, print_plot = TRUE, max_iter = 175)

preds <- predict(object = m, newdata = x_test, print_detail = FALSE)
probs <- predict(object = m, newdata = x_test, print_detail = FALSE, type = "prob")
preds2 <- predict(object = m2, newdata = x_test, print_detail = FALSE)
probs2 <- predict(object = m2, newdata = x_test, print_detail = FALSE, type = "prob")
head(probs)
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0
head(probs2)
#>         setosa versicolor  virginica
#> [1,] 0.8399374  0.1499947 0.01006787
#> [2,] 0.8727797  0.1171524 0.01006787
#> [3,] 0.8775586  0.1123736 0.01006787
#> [4,] 0.8325694  0.1499947 0.01743590
#> [5,] 0.7961787  0.1723284 0.03149289
#> [6,] 0.8399887  0.1499435 0.01006787

sum(preds != y_test)/length(y_test)
#> [1] 0.06666667
sum(preds2 != y_test)/length(y_test)
#> [1] 0.03333333
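# note: weighted_bootstrap = TRUE resamples the training cases in proportion
# to their boosting weights instead of passing case weights to the classifier,
# which helps when the base learner handles weights poorly; here it lowers the
# naive Bayes test error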

# custom classifier
library(nnet)
classifier <- function(x_train, y_train, weights) {
  x_train <- as.data.frame(x_train)
  # nnet accepts case weights, so the boosting weights can be passed directly
  m <- nnet(y_train ~ ., data = data.frame(x_train, y_train), size = 1,
            weights = weights, trace = FALSE)
  return(m)
}

predicter <- function(model, x_new) {
  x_new <- as.data.frame(x_new)
  # must return predicted class labels for x_new
  preds <- predict(object = model, newdata = x_new, type = "class")
  return(preds)
}
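# optional standalone check (illustrative, not in the original): fit the custom
# pair once with uniform weights to confirm it satisfies booster's
# classifier/predicter contract (note: nnet's random start advances the RNG)
m_single <- classifier(x_train, y_train, rep(1, nrow(x_train)))
head(predicter(m_single, x_test))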
par(mfrow = c(1,2))
# boosting using the custom nnet classifier
m <- booster(x_train = x_train, y_train = y_train,
             x_test = x_test, y_test = y_test,
             classifier = classifier,
             predicter = predicter,
             bag_frac = 0.5, lambda = 1,
             print_detail = FALSE, print_plot = TRUE, max_iter = 100)

# boosting using the custom nnet classifier with weighted bootstrap
m2 <- booster(x_train = x_train, y_train = y_train,
              x_test = x_test, y_test = y_test,
              classifier = classifier,
              predicter = predicter,
              weighted_bootstrap = TRUE,
              bag_frac = 0.5, lambda = 1,
              print_detail = FALSE, print_plot = TRUE, max_iter = 100)

preds <- predict(object = m, newdata = x_test, print_detail = FALSE)
probs <- predict(object = m, newdata = x_test, print_detail = FALSE, type = "prob")
preds2 <- predict(object = m2, newdata = x_test, print_detail = FALSE)
probs2 <- predict(object = m2, newdata = x_test, print_detail = FALSE, type = "prob")
head(probs)
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0
head(probs2)
#>         setosa versicolor virginica
#> [1,] 0.6146007  0.2025681 0.1828312
#> [2,] 0.6146007  0.2025681 0.1828312
#> [3,] 0.6146007  0.2025681 0.1828312
#> [4,] 0.5957865  0.2025681 0.2016454
#> [5,] 0.6146007  0.2025681 0.1828312
#> [6,] 0.6011939  0.2025681 0.1962380

sum(preds != y_test)/length(y_test)
#> [1] 0.06666667
sum(preds2 != y_test)/length(y_test)
#> [1] 0.03333333

# Dataset 2: binary class, a non-linearly separable example
library("imbalance")
data(banana)
x <- banana[,1:2]
y <- banana[,3]
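# check the two-class distribution before splitting
table(y)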
plot(x, col = y)

train_i <- cv_sampler(y, 0.9)
x_train <- x[train_i,]
y_train <- y[train_i]

x_test <- x[-train_i,]
y_test <- y[-train_i]
# rpart
m_rpart <- booster(x_train = x_train, y_train = y_train,
                   classifier = "rpart", x_test = x_test, y_test = y_test,
                   max_iter = 100, weighted_bootstrap = FALSE,
                   lambda = 1, print_detail = FALSE, print_plot = TRUE,
                   bag_frac = 0.5)

preds_rpart <- predict(object = m_rpart, newdata = x_test, type = "pred")
sum(y_test != preds_rpart)/length(y_test) # test error rate
#> [1] 0.03773585
# glm
m_glm <- booster(x_train = x_train, y_train = y_train,
                 classifier = "glm", x_test = x_test, y_test = y_test,
                 max_iter = 500, weighted_bootstrap = FALSE,
                 lambda = 2, print_detail = FALSE, print_plot = TRUE,
                 bag_frac = 0.5)

preds_glm <- predict(object = m_glm, newdata = x_test, type = "pred", print_detail = FALSE)
sum(y_test != preds_glm)/length(y_test) # test error rate
#> [1] 0.03773585
# custom classifier
m_nnet <- booster(x_train = x_train, y_train = y_train,
                  classifier = classifier,
                  predicter = predicter,
                  x_test = x_test, y_test = y_test,
                  max_iter = 300, weighted_bootstrap = FALSE,
                  lambda = 2, print_detail = FALSE, print_plot = TRUE,
                  bag_frac = 0.5)

preds_nnet <- predict(object = m_nnet, newdata = x_test, type = "pred", print_detail = FALSE)
sum(y_test != preds_nnet)/length(y_test) # test error rate
#> [1] 0.04528302
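# convenience summary (not in the original): collect the three test errors
data.frame(model = c("rpart", "glm", "nnet"),
           test_error = c(mean(y_test != preds_rpart),
                          mean(y_test != preds_glm),
                          mean(y_test != preds_nnet)))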

# build a grid over the feature space for plotting decision boundaries
x1_grid <- seq(min(x[,1]), max(x[,1]), length.out = 150)
x2_grid <- seq(min(x[,2]), max(x[,2]), length.out = 150)
grid <- expand.grid(x1_grid, x2_grid)
colnames(grid) <- colnames(x)

prob_rpart <- predict(object = m_rpart, newdata = grid, type = "prob", print_detail = FALSE)[,1]
prob_glm <- predict(object = m_glm, newdata = grid, type = "prob", print_detail = FALSE)[,1]
prob_nnet <- predict(object = m_nnet, newdata = grid, type = "prob", print_detail = FALSE)[,1]

par(mfrow = c(1,3))
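# each panel: training points (small), test points (filled), and the 0.5
# probability contour, i.e. the decision boundary of that boosted model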

plot(x = x_train[,1], y = x_train[,2], col = y_train, cex = 0.5, pch = ".")
points(x = x_test[,1], y = x_test[,2], col = y_test, cex = 1, pch = 16)
contour(x = x1_grid, y = x2_grid, z = matrix(prob_rpart, nrow = 150),
        levels = c(0.5), add = TRUE,
        method = "edge")

plot(x = x_train[,1], y = x_train[,2], col = y_train, cex = 0.5, pch = ".")
points(x = x_test[,1], y = x_test[,2], col = y_test, cex = 1, pch = 16)
contour(x = x1_grid, y = x2_grid, z = matrix(prob_glm, nrow = 150),
        levels = c(0.5), add = TRUE,
        method = "edge")

plot(x = x_train[,1], y = x_train[,2], col = y_train, cex = 0.5, pch = ".")
points(x = x_test[,1], y = x_test[,2], col = y_test, cex = 1, pch = 16)
contour(x = x1_grid, y = x2_grid, z = matrix(prob_nnet, nrow = 150),
        levels = c(0.5), add = TRUE,
        method = "edge")