# Pipeline_function --------------------------------------------------------

#' @title AutoScore STEP(i): Rank variables with machine learning (AutoScore Module 1)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data to be analyzed, for auc-based ranking.
#' @param method method for ranking. Options: 1. `rf` - random forest (default), 2. `auc` - auc-based (required validation set). For "auc", univariate models will be built based on the train set, and the variable ranking is constructed via the AUC performance of corresponding univariate models on the validation set (`validation_set`).
#' @param ntree Number of trees in the random forest (Default: 100).
#' @details The first step in the AutoScore framework is variable ranking. We use random forest (RF),
#' an ensemble machine learning algorithm, to identify the top-ranking predictors for subsequent score generation.
#' This step corresponds to Module 1 in the AutoScore paper.
#' @return Returns a vector containing the list of variables and its ranking generated by machine learning (random forest)
#' @examples
#' # see AutoScore Guidebook for the whole 5-step workflow
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' ranking <- AutoScore_rank(sample_data, ntree = 50)
#' @references
#' \itemize{
#'  \item{Breiman, L. (2001), Random Forests, Machine Learning 45(1), 5-32}
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @importFrom randomForest randomForest importance
#'
AutoScore_rank <- function(train_set, validation_set = NULL, method = "rf", ntree = 100) {
  # Rank every predictor in `train_set` (all columns except `label`).
  #   method = "rf":  variable importance of a random forest fitted on
  #                   `train_set`.
  #   method = "auc": AUC on `validation_set` of a one-variable logistic
  #                   regression fitted on `train_set`.
  # The ranking is printed and returned as a named numeric vector sorted in
  # decreasing order. Any other `method` only raises a warning.
  # set.seed(4)
  if (method == "rf") {
    train_set$label <- as.factor(train_set$label)
    # NOTE(review): `preProcess` is not a documented randomForest argument;
    # presumably it is absorbed by `...` and has no effect -- confirm before
    # removing it.
    model <-
      randomForest::randomForest(label ~ .,
        data = train_set,
        ntree = ntree,
        preProcess = "scale"
      )

    # estimate variable importance
    importance <- randomForest::importance(model, scale = FALSE)

    # summarize importance: sort() drops the matrix dimensions, so the row
    # names are copied into `names` first to survive as element names
    names(importance) <- rownames(importance)
    importance <- sort(importance, decreasing = TRUE)
    cat("The ranking based on variable importance was shown below for each variable: \n")
    print(importance)
    return(importance)
  }


  if (method == "auc") {
    if (is.null(validation_set)) {
      stop("Error: Please specify the validation set","\n",call.=FALSE)
    }
    vars <- names(train_set)
    vars <- vars[vars != "label"]
    train_set$label <- as.factor(train_set$label)
    # variables for which no model can be built keep an AUC of 0
    AUC <- rep(0, length(vars))

    # seq_along() (not 1:length()) so that a data set without predictors
    # yields an empty ranking instead of iterating over c(1, 0)
    for (i in seq_along(vars)) {
      # a constant variable cannot be used to build a model
      if (length(unique(train_set[[vars[i]]])) <= 1) {
        next
      }
      model <-
        glm(label ~ ., data = train_set[c("label", vars[i])], family = binomial(link = "logit"))
      pred <- predict(model, newdata = validation_set[c("label", vars[i])])
      roc_obj <- roc(response = validation_set$label, predictor = as.numeric(pred), quiet = TRUE)
      AUC[i] <- auc(roc_obj)[[1]]
    }

    # summarize importance (AUC)
    names(AUC) <- vars
    AUC <- sort(AUC, decreasing = TRUE)
    cat("The auc-based ranking based on variable importance was shown below for each variable: \n")
    print(AUC)
    return(AUC)
  }
  else {
     warning("Please specify methods among available options: rf, auc\n")
  }
}


#' @title AutoScore STEP(ii): Select the best model with parsimony plot (AutoScore Modules 2+3+4)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param rank The ranking result generated from AutoScore STEP(i) \code{\link{AutoScore_rank}}
#' @param n_min Minimum number of selected variables (Default: 1).
#' @param n_max Maximum number of selected variables (Default: 20).
#' @param max_score Maximum total score (Default: 100).
#' @param cross_validation If set to \code{TRUE}, cross-validation would be used for generating parsimony plot, which is
#'   suitable for small-size data. Default to \code{FALSE}
#' @param fold The number of folds used in cross validation (Default: 10). Available if \code{cross_validation = TRUE}.
#' @param categorize  Methods for categorize continuous variables. Options include "quantile" or "kmeans" (Default: "quantile").
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. (Default: c(0, 0.05, 0.2, 0.8, 0.95, 1)) Available if \code{categorize = "quantile"}.
#' @param max_cluster The max number of cluster (Default: 5). Available if \code{categorize = "kmeans"}.
#' @param do_trace If set to TRUE, all results based on each fold of cross-validation would be printed out and plotted (Default: FALSE). Available if \code{cross_validation = TRUE}.
#' @param auc_lim_min Min y_axis limit in the parsimony plot (Default: 0.5).
#' @param auc_lim_max Max y_axis limit in the parsimony plot (Default: "adaptive").
#' @details This is the second step of the general AutoScore workflow, to generate the parsimony plot to help select a parsimonious model.
#'  This step runs AutoScore Modules 2, 3 and 4 repeatedly to evaluate the performance obtained with different numbers of top-ranked variables.
#'  The generated parsimony plot would give researcher an intuitive figure to choose the best models.
#'  If data size is small (ie, <5000), an independent validation set may not be a wise choice. Then, we suggest using cross-validation
#'  to maximize the utility of data. Set \code{cross_validation=TRUE}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @return List of AUC value for different number of variables
#' @examples
#' \donttest{
#' # see AutoScore Guidebook for the whole 5-step workflow
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0.1, 0.2))
#' train_set <- out_split$train_set
#' validation_set <- out_split$validation_set
#' ranking <- AutoScore_rank(train_set, ntree=100)
#' AUC <- AutoScore_parsimony(
#' train_set,
#' validation_set,
#' rank = ranking,
#' max_score = 100,
#' n_min = 1,
#' n_max = 20,
#' categorize = "quantile",
#' quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1)
#' )}
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N, AutoScore: A Machine Learning-Based Automatic Clinical
#'   Score Generator and Its Application to Mortality Prediction Using Electronic Health Records,
#'   JMIR Med Inform 2020;8(10):e21798, doi: 10.2196/21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import  pROC
AutoScore_parsimony <-
  function(train_set,
           validation_set,
           rank,
           max_score = 100,
           n_min = 1,
           n_max = 20,
           cross_validation = FALSE,
           fold = 10,
           categorize = "quantile",
           quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1),
           max_cluster = 5,
           do_trace = FALSE,
           auc_lim_min = 0.5,
           auc_lim_max = "adaptive") {
    # For every i in n_min:n_max, pass the top-i ranked variables to
    # compute_auc_val() and record the AUC it yields, then draw the
    # parsimony plot.
    #   cross_validation = FALSE: AUC is evaluated on `validation_set`; a
    #     numeric vector named n_min:n_max is returned.
    #   cross_validation = TRUE: `train_set` is split into `fold` parts and
    #     a data frame with one "AUC" column per fold plus their mean in
    #     column "sum" is returned.
    if (n_max > length(rank)) {
      warning(
        "WARNING: the n_max (",
        n_max,
        ") is larger the number of all variables (",
        length(rank),
        "). We Automatically revise the n_max to ",
        length(rank)
      )
      n_max <- length(rank)
    }
    ranked_names <- names(rank)
    n_vars <- n_min:n_max

    # AUC of the score built on `fit_set` from the top-i ranked variables and
    # evaluated on `eval_set` (AutoScore Modules 2/3/4 via compute_auc_val)
    auc_top_i <- function(i, fit_set, eval_set) {
      variable_list <- ranked_names[seq_len(i)]
      model_roc <-
        compute_auc_val(
          fit_set[, c(variable_list, "label")],
          eval_set[, c(variable_list, "label")],
          variable_list,
          categorize,
          quantiles,
          max_cluster,
          max_score
        )
      auc(model_roc)
    }

    # Bar chart of AUC against the variable added at each step; each bar is
    # labelled with the number of variables in the model.
    draw_parsimony_plot <- function(auc_values, plot_title) {
      y_max <- if (identical(auc_lim_max, "adaptive")) max(auc_values) else auc_lim_max
      var_names <- factor(ranked_names[n_vars], levels = ranked_names[n_vars])
      dt <- data.frame(AUC = auc_values, variables = var_names, num = n_vars)
      p <- ggplot(data = dt, mapping = aes_string(x = "variables", y = "AUC")) +
        geom_bar(stat = "identity", fill = "steelblue") +
        coord_cartesian(ylim = c(auc_lim_min, y_max)) +
        theme_bw() +
        labs(x = "", y = "Area Under the Curve", title = plot_title) +
        theme(legend.position = "none",
              axis.text = element_text(size = 12),
              axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

      # Add number of variables to bar (rotated when there are many bars)
      label_angle <- if (nrow(dt) >= 100) 90 else 0
      print(p + geom_text(aes_string(label = "num"), vjust = 1.5,
                          colour = "white", angle = label_angle))
    }

    # Cross Validation scenario
    if (cross_validation == TRUE) {
      n_obs <- nrow(train_set)
      # Fewer than two folds leaves no training data, more folds than rows
      # produces empty folds, and a fractional fold count would give a wrong
      # average below -- fail early with a clear message.
      if (!is.numeric(fold) || length(fold) != 1 || is.na(fold) ||
          fold < 2 || fold != floor(fold) || fold > n_obs) {
        stop(
          "'fold' must be a whole number between 2 and the number of rows of train_set when cross_validation = TRUE",
          call. = FALSE
        )
      }

      # Divide the data equally into n fold, record its index number;
      # the last fold takes whatever rows are left over
      #set.seed(4)
      fold_size <- trunc(n_obs / fold)
      remaining <- seq_len(n_obs)
      index <- vector("list", fold)
      for (k in seq_len(fold - 1)) {
        a <- sample(remaining, fold_size)
        index[[k]] <- a
        remaining <- remaining[!(remaining %in% a)]
      }
      index[[fold]] <- remaining

      # one single-column data frame of AUC values per fold
      fold_auc <- vector("list", fold)

      # for each fold, generate train_set and validation_set
      for (j in seq_len(fold)) {
        validation_set_temp <- train_set[index[[j]], ]
        train_set_tmp <- train_set[-index[[j]], ]

        # Go through AutoScore Module 2/3/4 for each number of variables
        AUC <- vapply(
          n_vars,
          function(i) as.numeric(auc_top_i(i, train_set_tmp, validation_set_temp)),
          numeric(1)
        )
        names(AUC) <- n_vars

        # only print and plot when do_trace = TRUE
        if (do_trace) {
          print(paste("list of AUC values for fold", j))
          print(data.frame(AUC))
          plot(
            AUC,
            main = paste("Parsimony plot (cross validation) for fold", j),
            xlab = "Number of Variables",
            ylab = "Area Under the Curve",
            col = "#2b8cbe",
            lwd = 2,
            type = "o"
          )
        }

        fold_auc[[j]] <- data.frame(AUC)
      }

      # combine the folds (columns are all named "AUC", rows n_min:n_max)
      # and average across folds
      auc_set <- do.call(cbind, fold_auc)
      auc_set$sum <- rowSums(auc_set) / fold
      cat("***list of final mean AUC values through cross-validation are shown below \n")
      print(data.frame(auc_set$sum))

      # output final results and plot parsimony plot
      draw_parsimony_plot(
        auc_set$sum,
        paste("Final parsimony plot based on ", fold, "-fold cross validation", sep = "")
      )

      return(auc_set)
    }

    # if Cross validation is FALSE
    AUC <- numeric(length(n_vars))

    # Go through AutoScore Module 2/3/4 in the loop
    for (k in seq_along(n_vars)) {
      i <- n_vars[k]
      cat(paste("Select", i, "Variable(s):  "))
      step_auc <- auc_top_i(i, train_set, validation_set)
      print(step_auc)
      AUC[k] <- as.numeric(step_auc)
    }

    # output final results and plot parsimony plot
    draw_parsimony_plot(AUC, "Parsimony plot on the validation set")
    names(AUC) <- n_vars
    return(AUC)
  }


#' @title AutoScore STEP(iii): Generate the initial score with the final list of variables (Re-run AutoScore Modules 2+3)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii)\code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param max_score Maximum total score (Default: 100).
#' @param categorize  Methods for categorize continuous variables. Options include "quantile" or "kmeans" (Default: "quantile").
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. (Default: c(0, 0.05, 0.2, 0.8, 0.95, 1)) Available if \code{categorize = "quantile"}.
#' @param max_cluster The max number of cluster (Default: 5). Available if \code{categorize = "kmeans"}.
#' @return Generated \code{cut_vec} for downstream fine-tuning process STEP(iv) \code{\link{AutoScore_fine_tuning}}.
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_weighting <-
  function(train_set,
           validation_set,
           final_variables,
           max_score = 100,
           categorize = "quantile",
           max_cluster = 5,
           quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1)) {
    # Derive cut-offs and an initial scoring table from `train_set`, report
    # the resulting score's performance on `validation_set`, and return the
    # cut-offs (`cut_vec`) for later fine-tuning.

    # Report the selected variables and keep only them plus the outcome
    cat("****Included Variables: \n")
    print(data.frame(variable_name = final_variables))
    keep_cols <- c(final_variables, "label")
    train_sub <- train_set[, keep_cols]
    valid_sub <- validation_set[, keep_cols]

    # AutoScore Module 2: derive "cut_vec" from the training data and apply
    # it to both data sets
    cut_vec <- get_cut_vec(
      train_sub,
      categorize = categorize,
      quantiles = quantiles,
      max_cluster = max_cluster
    )
    train_cat <- transform_df_fixed(train_sub, cut_vec)
    valid_cat <- transform_df_fixed(valid_sub, cut_vec)

    # AutoScore Module 3: score weighting on the training data
    score_table <- compute_score_table(train_cat, max_score, final_variables)
    cat("****Initial Scores: \n")
    print_scoring_table(scoring_table = score_table, final_variable = final_variables)

    # Score the validation set and sum every column except the outcome
    valid_scored <- assign_score(valid_cat, score_table)
    score_cols <- names(valid_scored)[names(valid_scored) != "label"]
    valid_scored$total_score <- rowSums(valid_scored[, score_cols, drop = FALSE])
    y_validation <- valid_scored$label

    # Intermediate evaluation based on the validation set
    plot_roc_curve(valid_scored$total_score, as.numeric(y_validation) - 1)
    cat("***Performance (based on validation set):\n")
    print_roc_performance(y_validation, valid_scored$total_score, threshold = "best")
    cat(
      "***The cutoffs of each variable generated by the AutoScore are saved in cut_vec. You can decide whether to revise or fine-tune them \n"
    )
    return(cut_vec)
  }


#' @title AutoScore STEP(iv): Fine-tune the score by revising cut_vec with domain knowledge (AutoScore Module 5)
#' @description Domain knowledge is essential in guiding risk model development.
#'  For continuous variables, the variable transformation is a data-driven process (based on "quantile" or "kmeans" ).
#'  In this step, the automatically generated cutoff values for each continuous variable can be fine-tuned
#'  by combining, rounding, and adjusting according to the standard clinical norm.  Revised \code{cut_vec} will be input with domain knowledge to
#' update scoring table. User can choose any cut-off values/any number of categories. Then final Scoring table will be generated. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii) \code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param max_score Maximum total score (Default: 100).
#' @param cut_vec Generated from STEP(iii) \code{\link{AutoScore_weighting}}.Please follow the guidebook
#' @return Generated final table of scoring model for downstream testing
#' @examples
#' ## Please see the guidebook or vignettes
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_testing}},Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_fine_tuning <-
  function(train_set,
           validation_set,
           final_variables,
           cut_vec,
           max_score = 100) {
    # Rebuild the scoring table from `train_set` using the user-supplied
    # `cut_vec`, report the score's performance on `validation_set`, and
    # return the scoring table.

    # Keep only the selected variables plus the outcome
    keep_cols <- c(final_variables, "label")
    train_sub <- train_set[, keep_cols]
    valid_sub <- validation_set[, keep_cols]

    # AutoScore Module 2: categorize with the fixed "cut_vec"
    train_cat <- transform_df_fixed(train_sub, cut_vec = cut_vec)
    valid_cat <- transform_df_fixed(valid_sub, cut_vec = cut_vec)

    # AutoScore Module 3: score weighting on the training data
    score_table <- compute_score_table(train_cat, max_score, final_variables)
    cat("***Fine-tuned Scores: \n")
    print_scoring_table(scoring_table = score_table, final_variable = final_variables)

    # Score the validation set and sum every column except the outcome
    valid_scored <- assign_score(valid_cat, score_table)
    score_cols <- names(valid_scored)[names(valid_scored) != "label"]
    valid_scored$total_score <- rowSums(valid_scored[, score_cols, drop = FALSE])
    y_validation <- valid_scored$label

    # Intermediate evaluation based on the validation set after fine-tuning
    plot_roc_curve(valid_scored$total_score, as.numeric(y_validation) - 1)
    cat("***Performance (based on validation set, after fine-tuning):\n")
    print_roc_performance(y_validation, valid_scored$total_score, threshold = "best")
    return(score_table)
  }


#' @title AutoScore STEP(v): Evaluate the final score with ROC analysis (AutoScore Module 6)
#' @description Apply the final scoring table to a test set and evaluate the resulting score.
#'  Variables in \code{test_set} are processed with \code{cut_vec}, points are assigned from \code{scoring_table},
#'  and the points are summed into a total score for each observation. When \code{with_label = TRUE}, the ROC curve
#'  is plotted and the performance at the chosen \code{threshold} is printed; otherwise only the predicted scores are returned.
#'  Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param test_set A processed \code{data.frame} that contains data for testing purpose. This \code{data.frame} should have same format as
#'        \code{train_set} (same variable names and outcomes)
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii) \code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param scoring_table The final scoring table after fine-tuning, generated from STEP(iv) \code{\link{AutoScore_fine_tuning}}.Please follow the guidebook
#' @param cut_vec Generated from STEP(iii) \code{\link{AutoScore_weighting}}.Please follow the guidebook
#' @param threshold Score threshold for the ROC analysis to generate sensitivity, specificity, etc. If set to "best", the optimal threshold will be calculated (Default:"best").
#' @param with_label Set to TRUE if there are labels in the test_set and performance will be evaluated accordingly (Default:TRUE).
#' Set it to "FALSE" if there are not "label" in the "test_set" and the final predicted scores will be the output without performance evaluation.
#' @return A data frame with predicted score and the outcome for downstream visualization.
#' @examples
#' ## Please see the guidebook or vignettes
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{print_roc_performance}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_testing <-
  function(test_set,
           final_variables,
           cut_vec,
           scoring_table,
           threshold = "best",
           with_label = TRUE) {
    # Score `test_set` with `cut_vec` and `scoring_table`. Returns a data
    # frame with the total score (`pred_score`) and the outcome (`Label`,
    # NA when with_label = FALSE). With labels, the ROC curve is plotted and
    # the performance at `threshold` is printed as well.

    # prepare test set: categorization and "assign_score".
    # drop = FALSE keeps a data frame even when a single variable is selected
    # and there is no label column.
    keep_cols <- if (with_label) c(final_variables, "label") else c(final_variables)
    test_set_1 <- test_set[, keep_cols, drop = FALSE]
    test_set_2 <-
      transform_df_fixed(test_set_1, cut_vec = cut_vec)
    test_set_3 <- assign_score(test_set_2, scoring_table)

    # total score = sum of every column except the outcome; a missing total
    # is set to 0
    score_cols <- names(test_set_3)[names(test_set_3) != "label"]
    test_set_3$total_score <-
      rowSums(test_set_3[, score_cols, drop = FALSE])
    test_set_3$total_score[is.na(test_set_3$total_score)] <- 0

    if (!with_label) {
      pred_score <-
        data.frame(pred_score = test_set_3$total_score, Label = NA)
      return(pred_score)
    }

    # Final evaluation based on testing set
    y_test <- test_set_3$label
    plot_roc_curve(test_set_3$total_score, as.numeric(y_test) - 1)
    cat("***Performance using AutoScore:\n")
    print_roc_performance(y_test, test_set_3$total_score, threshold = threshold)
    pred_score <-
      data.frame(pred_score = test_set_3$total_score, Label = y_test)
    return(pred_score)
  }


# Direct_function ---------------------------------------------------------

#' @title AutoScore function: Check whether the input dataset fulfill the requirement of the AutoScore
#' @param data The data to be checked
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' check_data(sample_data)
#' @return No return value, the result of the checking will be printed out.
#' @export
check_data <- function(data) {
  # Check that `data` is ready for AutoScore: an outcome column named
  # "label" exists (stops otherwise) and is binary, variables are
  # factor/numeric/logical, factors have at most 10 levels, names and levels
  # avoid the characters ',', ')' and ']', no variable name is contained in
  # another one, and there are no missing values. Problems other than a
  # missing label are reported as warnings.

  #1. check label and binary (exact name match, no partial `$` matching)
  if (!("label" %in% names(data)))
    stop(
      "ERROR: for this dataset: These is no dependent variable 'label' to indicate the outcome. Please add one first\n"
    )
  if (length(levels(factor(data[["label"]]))) != 2)
    warning("Please keep outcome label variable binary\n")

  #2. check each variable
  non_num_fac <- c()
  fac_large <- c()
  var_names <- names(data)

  for (i in var_names) {
    column <- data[[i]]

    # is.*() instead of comparing class(): class() can have several elements
    # (e.g. ordered factors), which breaks a scalar `&&` condition
    if (!(is.factor(column) || is.numeric(column) || is.logical(column)))
      non_num_fac <- c(non_num_fac, i)
    if (is.factor(column) && length(levels(column)) > 10)
      fac_large <- c(fac_large, i)

    # characters that must not appear in variable names
    for (bad_char in c(",", ")", "]")) {
      if (grepl(bad_char, i, fixed = TRUE))
        warning(
          paste0(
            "WARNING: the dataset has variable names '",
            i,
            "' with character '",
            bad_char,
            "'. Please change it. Consider using '_' to replace\n"
          )
        )
    }

    if (is.factor(column)) {
      if (any(grepl(",", levels(column), fixed = TRUE)))
        warning(
          paste0(
            "WARNING: the dataset has categorical variable '",
            i,
            "', where their levels contain ','. Please use 'levels(*your_variable*)' to change the name of the levels before using the AutoScore. Consider replacing ',' with '_'. Thanks! \n "
          )
        )
    }

    # variable name contained in other names; fixed = TRUE so the name is
    # matched literally rather than interpreted as a regular expression
    containing <- var_names[grepl(i, var_names, fixed = TRUE)]
    if (length(containing) > 1) {
      a <- containing[containing != i]
      warning(
        paste0(
          "WARNING: the dataset has variable name '",
          i,
          "', which is entirely included by other variable names:\n",
          paste(paste0("'", a, "'"), collapse = "  "),
          "\nPlease use 'names(*your_df*)' to change the name of variable '",
          i,
          "' before using the AutoScore. Consider adding '_1', '_2',..., '_x, or other similar stuff at end of that name, such as '",
          paste0(i, "_1") ,
          "', to make them totally different and not contain each other. Thanks!\n "
        )
      )
    }
  }

  # one warning per problem type, listing all affected variables once
  if (!is.null(non_num_fac))
    warning(
      paste(
        "\nWARNING: the dataset has variable of character and user should transform them to factor or numeric before using AutoScore:(consider using 'df$xxx  <- as.factor(df$xxx))' or 'df$xxx  <- as.numeric(df$xxx))'\n",
        paste(non_num_fac, collapse = ", ")
      )
    )
  if (!is.null(fac_large))
    warning(
      paste(
        "\nWARNING: The number of categories for some variables is too many (larger than 10):",
        paste(fac_large, collapse = ", ")
      )
    )

  #3. check missing values
  missing_rate <- colSums(is.na(data))
  if (any(missing_rate > 0)) {
    warning(
      "\n WARNING: Your dataset contains NA. Please handle them before AutoScore. The variables with missing values are shown below:"
    )
    print(missing_rate[missing_rate != 0])
  }
  else
    message("\n missing value check passed.\n")
}
# check_data covers: 1. missing values; 2. whether 'label' exists and is binary; 3. only factor and numeric variables; 4. factors with more than 10 levels


#' @title AutoScore function: Automatically splitting dataset to train, validation and test set
#' @param data The dataset to be split
#' @param ratio The ratio for dividing dataset into training, validation and testing set.(Default: c(0.7, 0.1, 0.2))
#' @param cross_validation If set to \code{TRUE}, cross-validation would be used for generating parsimony plot, which is
#'   suitable for small-size data. Default to \code{FALSE}
#' @return Returns a list containing training, validation and testing set
#' @examples
#' data("sample_data")
#' set.seed(4)
#' #large sample size
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0.1, 0.2))
#' #small sample size (for cross-validation)
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0, 0.3), cross_validation = TRUE)
#' @export
split_data <- function(data, ratio = c(0.7, 0.1, 0.2), cross_validation = FALSE) {
  # Randomly split the rows of `data` according to `ratio`
  # (train, validation, test; normalised by its sum).
  #   cross_validation = FALSE: three disjoint sets are returned.
  #   otherwise: only the test rows are held out and the validation set is
  #     the same as the training set.
  # Returns list(train_set, validation_set, test_set).
  n <- nrow(data)
  all_rows <- seq_len(n)
  total <- sum(ratio)
  n_test <- floor(ratio[3] / total * n)
  n_validation <- floor(ratio[2] / total * n)

  #set.seed(4)
  test_index <- sample.int(n, n_test)

  # Sample positions rather than calling sample() on the remaining rows:
  # sample(x, k) treats a length-one `x` as 1:x.
  # The validation draw is made in both modes so the random-number stream
  # matches earlier behaviour; it is only used without cross-validation.
  candidates <- all_rows[!(all_rows %in% test_index)]
  validate_index <- candidates[sample.int(length(candidates), n_validation)]

  held_out <- if (cross_validation == FALSE) {
    c(validate_index, test_index)
  } else {
    test_index
  }

  # Positive indexing: data[-integer(0), ] would select no rows at all when
  # nothing is held out.
  train_set <- data[setdiff(all_rows, held_out), , drop = FALSE]
  test_set <- data[test_index, , drop = FALSE]
  validation_set <- if (cross_validation == FALSE) {
    data[validate_index, , drop = FALSE]
  } else {
    # cross validation: train = validation
    train_set
  }

  return(list(
    train_set = train_set,
    validation_set = validation_set,
    test_set = test_set
  ))
}


#' @title AutoScore function: Descriptive Analysis
#' @description Compute descriptive table (usually Table 1 in the medical literature) for the dataset.
#' @param df data frame after checking and fulfilling the requirement of AutoScore
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' compute_descriptive_table(sample_data)
#' @return No return value and the result of the descriptive analysis will be printed out.
#' @export
#' @import tableone
compute_descriptive_table <- function(df) {
  # Every column of `df` is summarised, including `label` itself.
  all_variables <- names(df)

  # Table stratified by the outcome column `label`
  table_by_label <- CreateTableOne(
    vars = all_variables,
    strata = "label",
    data = df
  )
  # Table for the whole dataset, without stratification
  table_overall <- CreateTableOne(vars = all_variables, data = df)

  # Stratified table is printed first, then the overall one
  print(table_by_label)
  print(table_overall)
}


#' @title AutoScore function: Univariable Analysis
#' @description Perform univariable analysis and generate the result table with odds ratios.
#'   One logistic regression of \code{label} is fitted per predictor.
#' @param df data frame after checking; it must contain the binary outcome column \code{label}
#' @return A \code{data.frame} with one row per model coefficient (intercepts are dropped) and two columns:
#'   \code{OR}, the odds ratio followed by its Wald confidence interval in parentheses, and \code{p value}.
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' uni_table<-compute_uni_variable_table(sample_data)
#' @export
compute_uni_variable_table <- function(df) {
  predictors <- names(df)[names(df) != "label"]

  # One matrix per predictor: OR, CI bounds and p-value for each coefficient.
  per_variable <- lapply(predictors, function(var_name) {
    model <-
      glm(
        label ~ .,
        data = df[, c("label", var_name), drop = FALSE],
        family = binomial,
        na.action = na.omit
      )
    coefs <- coef(model)
    p_table <- summary(model)$coefficients
    # summary() omits aliased (NA) coefficients, so p-values are aligned to
    # the coefficient vector by name instead of by position.
    estimates <- cbind(
      exp(cbind(OR = coefs, confint.default(model))),
      p_value = p_table[match(names(coefs), rownames(p_table)), "Pr(>|z|)"]
    )
    # Drop the intercept here, by exact name, so that predictors whose own
    # name happens to contain "intercept" are kept.
    estimates[rownames(estimates) != "(Intercept)", , drop = FALSE]
  })

  if (length(per_variable) == 0) {
    return(data.frame(
      OR = character(0),
      `p value` = character(0),
      check.names = FALSE,
      stringsAsFactors = FALSE
    ))
  }

  uni_table <- round(do.call(rbind, per_variable), digits = 3)

  # After rounding to 3 digits, anything below 0.001 is reported as "<0.001".
  p_value <- unname(uni_table[, "p_value"])
  p_value[p_value < 0.001] <- "<0.001"

  data.frame(
    OR = paste0(
      uni_table[, "OR"],
      "(",
      uni_table[, "2.5 %"],
      "-",
      uni_table[, "97.5 %"],
      ")"
    ),
    `p value` = p_value,
    row.names = make.unique(as.character(rownames(uni_table))),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}


#' @title AutoScore function: Multivariate Analysis
#' @description Generate tables for multivariate analysis. A single logistic regression of
#'   \code{label} on all other columns is fitted.
#' @param df data frame after checking; it must contain the binary outcome column \code{label}
#' @return A \code{data.frame} with one row per model coefficient (the intercept is dropped) and two columns:
#'   \code{adjusted_OR}, the adjusted odds ratio followed by its Wald confidence interval in parentheses,
#'   and \code{p value}.
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' multi_table<-compute_multi_variable_table(sample_data)
#' @export
compute_multi_variable_table <- function(df) {
  model <-
    glm(label ~ .,
        data = df,
        family = binomial,
        na.action = na.omit)

  coefs <- coef(model)
  p_table <- summary(model)$coefficients
  # summary() omits aliased (NA) coefficients, so p-values are aligned to the
  # coefficient vector by name instead of by position.
  multi_table <- cbind(
    exp(cbind(adjusted_OR = coefs, confint.default(model))),
    p_value = p_table[match(names(coefs), rownames(p_table)), "Pr(>|z|)"]
  )

  # drop = FALSE keeps the matrix shape when only one predictor remains.
  multi_table <-
    multi_table[rownames(multi_table) != "(Intercept)", , drop = FALSE]
  multi_table <- round(multi_table, digits = 3)

  # After rounding to 3 digits, anything below 0.001 is reported as "<0.001".
  p_value <- unname(multi_table[, "p_value"])
  p_value[p_value < 0.001] <- "<0.001"

  data.frame(
    adjusted_OR = paste0(
      multi_table[, "adjusted_OR"],
      "(",
      multi_table[, "2.5 %"],
      "-",
      multi_table[, "97.5 %"],
      ")"
    ),
    `p value` = p_value,
    row.names = make.unique(as.character(rownames(multi_table))),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}


#' @title AutoScore Function: Print scoring tables for visualization
#' @param scoring_table Raw scoring table generated by AutoScore step(iv) \code{\link{AutoScore_fine_tuning}}
#' @param final_variable Final included variables
#' @return Data frame of formatted scoring table
#' @seealso \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_weighting}}
#' @export
#' @importFrom knitr kable
print_scoring_table <- function(scoring_table, final_variable) {
  # Turns the named score vector `scoring_table` into a readable
  # variable / interval / point table, prints it with kable() and returns the
  # formatted data frame invisibly.
  #library(knitr)
  table_tmp <- data.frame()
  var_name <- names(scoring_table)
  # Strip everything from the first "(" or "[" onwards, so that for interval
  # entries only the part of the name before the interval remains.
  var_name_tmp<-gsub("\\(.*","",var_name)
  var_name_tmp<-gsub("\\[.*","",var_name_tmp)
  for (i in 1:length(final_variable)) {
    var_tmp <- final_variable[i]
    # num <- grepl(var_tmp, var_name)
    # rank_indicator[which(rank_indicator=="")]<-max(as.numeric(rank_indicator[-which(rank_indicator=="")]))+1
    {
      # Entries whose stripped name contains the variable name (regex match).
      num <- grep(var_tmp, var_name_tmp)
      # No comma in the first matching name: the entry is not an interval, so
      # the variable is handled as categorical.
      if (grepl(",", var_name[num][1]) != TRUE) {
        table_1 <-
          data.frame(name = var_name[num], value = unname(scoring_table[num]))
        table_1$rank_indicator <- c(seq(1:nrow(table_1)))
        # The category label is the entry name with the variable name removed.
        interval <-
          c(gsub(
            pattern = var_tmp,
            replacement = "",
            table_1$name
          ))
        table_1$interval <- interval
        # Categories are listed in the sort order of their labels.
        table_2 <- table_1[order(table_1$interval),]
        # The variable name is shown on the first row of its group only.
        table_2$variable <- c(var_tmp, rep("", (nrow(table_2) - 1)))
        # Append a blank separator row after each variable.
        table_3 <- rbind(table_2, rep("", ncol(table_2)))
        table_tmp <- rbind(table_tmp, table_3)
      }
      else
      {
        # Interval entries: require the stripped name to equal the variable.
        num <- grep(paste("^",var_tmp,"$", sep=""), var_name_tmp)
        table_1 <-
          data.frame(name = var_name[num], value = unname(scoring_table[num]))
        # Keep only the text after the last comma, i.e. the upper bound.
        rank_indicator <- gsub(".*,", "", table_1$name)
        rank_indicator <-
          gsub(")", "", rank_indicator)
        # An empty upper bound marks the open-ended top interval; give it a
        # value above every other bound so that it sorts last.
        rank_indicator[which(rank_indicator == "")] <-
          max(as.numeric(rank_indicator[-which(rank_indicator == "")])) + 1
        rank_indicator <- as.numeric(rank_indicator)
        {
          # Exactly two intervals: one cut-off, shown as "<c" and ">=c".
          if (length(rank_indicator) == 2) {
            table_1$rank_indicator <- rank_indicator
            table_2 <- table_1[order(table_1$rank_indicator),]
            interval <- c(paste0("<", table_2$rank_indicator[1]))
            interval <-
              c(interval, paste0(">=", table_2$rank_indicator[length(rank_indicator) -
                                                                1]))
            table_2$interval <- interval
            table_2$variable <- c(var_tmp, rep("", (nrow(
              table_2
            ) - 1)))
            table_3 <- rbind(table_2, rep("", ncol(table_2)))
            table_tmp <- rbind(table_tmp, table_3)
          }
          else{
            # Otherwise: "<first", then "[a,b)" for consecutive bounds, then
            # ">=last finite bound".
            table_1$rank_indicator <- rank_indicator
            table_2 <- table_1[order(table_1$rank_indicator),]
            interval <- c(paste0("<", table_2$rank_indicator[1]))
            for (j in 1:(length(table_2$rank_indicator) - 2)) {
              interval <-
                c(
                  interval,
                  paste0(
                    "[",
                    table_2$rank_indicator[j],
                    ",",
                    table_2$rank_indicator[j + 1],
                    ")"
                  )
                )
            }
            interval <-
              c(interval, paste0(">=", table_2$rank_indicator[length(rank_indicator) -
                                                                1]))
            table_2$interval <- interval
            table_2$variable <- c(var_tmp, rep("", (nrow(
              table_2
            ) - 1)))
            table_3 <- rbind(table_2, rep("", ncol(table_2)))
            table_tmp <- rbind(table_tmp, table_3)
          }
        }
      }
    }
  }
  # Remove the blank separator row that follows the last variable.
  table_tmp <- table_tmp[1:(nrow(table_tmp) - 1),]
  table_final <-
    data.frame(
      variable = table_tmp$variable,
      interval = table_tmp$interval,
      point = table_tmp$value
    )
  table_kable_format <-
    kable(table_final,
          align = "llc",
          caption = "AutoScore-created scoring model",
          format = "rst")
  print(table_kable_format)
  # Returned invisibly so the table is not printed a second time.
  invisible(table_final)
}


#' @title AutoScore function: Print receiver operating characteristic (ROC) performance
#' @description Print receiver operating characteristic (ROC) performance
#' @param label outcome variable
#' @param score predicted score
#' @param threshold Threshold for analyzing sensitivity, specificity and other metrics. Defaults to "best"
#' @seealso \code{\link{AutoScore_testing}}
#' @return No return value and the ROC performance will be printed out directly.
#' @export
#' @import pROC
print_roc_performance <-
  function(label, score, threshold = "best") {
    # Warn about missing scores before building the ROC curve
    n_missing <- sum(is.na(score))
    if (n_missing > 0)
      warning("NA in the score: ", n_missing)

    model_roc <- roc(label, score, quiet = TRUE)
    cat("AUC: ", round(auc(model_roc), 4), "  ")
    print(ci(model_roc))

    # "best" asks pROC for the best threshold, which is then rounded up;
    # any other value is used as given.
    threshold_heading <- "Score threshold: >="
    if (threshold == "best") {
      threshold <-
        ceiling(coords(model_roc, "best", ret = "threshold", transpose = TRUE))
      threshold_heading <- "Best score threshold: >="
    }
    cat(threshold_heading, threshold, "\n")
    cat("Other performance indicators based on this score threshold: \n")

    performance_ci <-
      ci.coords(
        model_roc,
        threshold ,
        ret = c("specificity", "sensitivity", "npv", "ppv"),
        transpose = TRUE
      )

    # Printed line prefix for each metric, in output order
    line_prefix <- c(
      sensitivity = "Sensitivity: ",
      specificity = "Specificity: ",
      ppv = "PPV:         ",
      npv = "NPV:         "
    )
    for (metric in names(line_prefix)) {
      # Element 2 is printed as the estimate, elements 1 and 3 as the CI bounds
      estimate <- performance_ci[[metric]]
      cat(
        line_prefix[[metric]],
        round(estimate[2], 4),
        " 95% CI: ",
        round(estimate[1], 4),
        "-",
        round(estimate[3], 4),
        "\n",
        sep = ""
      )
    }
  }

#' @title AutoScore function: Print conversion table based on final performance evaluation
#' @description Print conversion table based on final performance evaluation
#' @param pred_score A data frame with the outcomes (column \code{Label}) and the final scores (column \code{pred_score}) generated from \code{\link{AutoScore_fine_tuning}}
#' @param by Method for categorizing the threshold: by "risk" or "score". Defaults to "risk"
#' @param values A vector of thresholds for analyzing sensitivity, specificity and other metrics. Defaults to "c(0.01,0.05,0.1,0.2,0.5)"
#' @seealso \code{\link{AutoScore_testing}}
#' @return No return value and the conversion will be printed out directly.
#' @export
#' @import pROC knitr
conversion_table <- function(pred_score,
                             by = "risk",
                             values = c(0.01, 0.05, 0.1, 0.2, 0.5)) {
  # Logistic model that converts the score into a predicted risk
  risk_model <-
    glm(Label ~ pred_score,
        data = pred_score,
        family = binomial(link = "logit"))
  pred_score$pred_risk <-
    predict(risk_model, newdata = pred_score, type = "response")
  conversion <- data.frame(matrix(nrow = 0, ncol = 7))
  risk_roc <- roc(pred_score$Label, pred_score$pred_risk)

  # Pieces shared by both modes
  metrics <- c("accuracy", "sensitivity", "specificity", "ppv", "npv")
  trailing_names <- c(
    "Percentage of patients (%)",
    "Accuracy (95% CI)",
    "Sensitivity (95% CI)",
    "Specificity (95% CI)",
    "PPV (95% CI)",
    "NPV (95% CI)"
  )
  count_rows <- function(d) {
    length(d[, 1])
  }

  if (by == "risk") {
    # One output row per predicted-risk threshold
    for (risk_cut in values) {
      at_or_above <- pred_score[pred_score$pred_risk >= risk_cut, ]
      row <- data.frame(risk_cut)
      row[[1]] <- paste0(row[[1]] * 100, "%")
      names(row)[1] <- "Predicted Risk"
      row$`Score cut-off` <- paste0("", min(at_or_above$pred_score))
      row$`Percentage of patients (%)` <-
        round(count_rows(at_or_above) / count_rows(pred_score), digits = 2) * 100
      performance <- organize_performance(ci.coords(
        risk_roc,
        x = risk_cut,
        input = "threshold",
        ret = metrics
      ))
      conversion <- rbind(conversion, cbind(row, performance))
    }

    names(conversion) <-
      c("Predicted Risk [>=]", "Score cut-off [>=]", trailing_names)
  } else if (by == "score") {
    # One output row per score threshold; the matching risk is the smallest
    # predicted risk among rows at or above that score
    for (score_cut in values) {
      row <- data.frame(score_cut)
      risk <-
        round(min(pred_score[pred_score$pred_score >= score_cut, ]$pred_risk), 3)
      row$risk <- paste0(risk * 100, "%")
      row$`Percentage of patients (%)` <-
        round(
          count_rows(pred_score[pred_score$pred_risk >= risk, ]) /
            count_rows(pred_score),
          digits = 2
        ) * 100
      performance <- organize_performance(ci.coords(
        risk_roc,
        x = risk,
        input = "threshold",
        ret = metrics
      ))
      conversion <- rbind(conversion, cbind(row, performance))
    }

    names(conversion) <-
      c("Score cut-off [>=]", "Predicted Risk [>=]", trailing_names)
  } else {
    stop('ERROR: please specify correct method for categorizing the threshold:  by "risk" or "score".')
  }

  kable(conversion, align = rep("c", times = 7))
}


# Internal_function -------------------------------------------------------
## built-in function for AutoScore below
## Those functions are cited by pipeline functions

#' @title Internal function: Compute scoring table based on training dataset (AutoScore Module 3)
#' @description Compute scoring table based on training dataset
#' @param train_set_2 Processed training set after variable transformation (AutoScore Module 2)
#' @param max_score Maximum total score
#' @param variable_list List of included variables
#' @return A scoring table
compute_score_table <-
  function(train_set_2, max_score, variable_list) {
    #AutoScore Module 3 : Score weighting

    # Fit the logistic regression of `label` on all other columns and return
    # its coefficients, with NA coefficients replaced by 1.
    fit_coefficients <- function(data) {
      model <-
        glm(label ~ ., family = binomial(link = "logit"), data = data)
      coef_vec <- coef(model)
      if (anyNA(coef_vec)) {
        warning(" WARNING: GLM output contains NULL, Replace NULL with 1")
        coef_vec[is.na(coef_vec)] <- 1
      }
      coef_vec
    }

    # First-step logistic regression: only used to choose reference levels
    coef_vec <- fit_coefficients(train_set_2)
    train_set_2 <- change_reference(train_set_2, coef_vec)

    # Second-step logistic regression on the re-levelled data
    coef_vec <- fit_coefficients(train_set_2)

    # rounding for final scoring table "score_table":
    # coefficients are expressed in units of the smallest non-intercept one
    coef_vec_tmp <- round(coef_vec / min(coef_vec[-1]))
    score_table <- add_baseline(train_set_2, coef_vec_tmp)

    # normalization according to "max_score" and regenerate score_table
    # The attainable total is the sum of each variable's largest score. Entries
    # are selected by exact name (variable name followed by a level) rather
    # than by regular expression, so that variables whose names overlap or
    # contain regex metacharacters are not mixed up.
    total <- 0
    for (var_name in variable_list) {
      entry_names <- paste0(var_name, levels(train_set_2[[var_name]]))
      total <-
        total + max(score_table[names(score_table) %in% entry_names])
    }
    round(score_table / (total / max_score))
  }


#' @title Internal function: Compute AUC based on validation set for plotting parsimony (AutoScore Module 4)
#' @description  Compute AUC based on validation set for plotting parsimony
#' @param train_set_1 Processed training set
#' @param validation_set_1 Processed validation set
#' @param max_score Maximum total score
#' @param variable_list List of included variables
#' @param categorize  Method for categorizing continuous variables. Options include "quantile" or "k_means"
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. Available if \code{categorize = "quantile"}.
#' @param max_cluster The max number of cluster (Default: 5). Available if \code{categorize = "k_means"}.
#' @return A List of AUC for parsimony plot
compute_auc_val <-
  function(train_set_1,
           validation_set_1,
           variable_list,
           categorize,
           quantiles,
           max_cluster,
           max_score) {
    # AutoScore Module 2 : cut points come from the training set only and
    # are then applied to both the training and the validation set
    cut_points <- get_cut_vec(
      train_set_1,
      categorize = categorize,
      quantiles = quantiles,
      max_cluster = max_cluster
    )
    train_categorized <- transform_df_fixed(train_set_1, cut_points)
    validation_categorized <- transform_df_fixed(validation_set_1, cut_points)

    na_count <- sum(is.na(validation_categorized))
    if (na_count > 0)
      warning("NA in the validation_set_2: ", na_count)
    na_count <- sum(is.na(train_categorized))
    if (na_count > 0)
      warning("NA in the train_set_2: ", na_count)

    # AutoScore Module 3 : scoring table derived from the training set
    score_table <-
      compute_score_table(train_categorized, max_score, variable_list)
    na_count <- sum(is.na(score_table))
    if (na_count > 0)
      warning("NA in the score_table: ", na_count)

    # Replace every category in the validation set by its score
    validation_scored <- assign_score(validation_categorized, score_table)
    na_count <- sum(is.na(validation_scored))
    if (na_count > 0)
      warning("NA in the validation_set_3: ", na_count)

    # Total score per row = sum over all columns except the outcome
    is_predictor <- names(validation_scored) != "label"
    validation_scored$total_score <-
      rowSums(validation_scored[, is_predictor, drop = FALSE])

    # ROC object of the total score against the outcome
    roc(validation_scored$label, validation_scored$total_score, quiet = TRUE)
  }

#' @title Internal function: Calculate cut_vec from the training set (AutoScore Module 2)
#' @param df training set to be used for calculate the cut vector. The last column is skipped
#'   (callers are expected to place \code{label} there).
#' @param categorize  Method for categorizing continuous variables. Options include "quantile" or "k_means" (Default: "quantile").
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. (Default: c(0, 0.05, 0.2, 0.8, 0.95, 1)) Available if \code{categorize = "quantile"}.
#' @param max_cluster The max number of cluster (Default: 5). Available if \code{categorize = "k_means"}.
#' @return cut_vec for \code{transform_df_fixed}: a named list with one element per non-factor variable,
#'   holding either the interior cut-off values or the marker \code{"let_binary"} when
#'   no interior cut-off is available. An empty list if there is no such variable.
get_cut_vec <-
  function(df,
           quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1),
           #by default
           max_cluster = 5,
           categorize = "quantile") {
    # Generate cut_vec for downstream usage
    cut_vec <- list()

    # seq_len() (rather than 1:n) so that a data frame without predictor
    # columns yields no iterations.
    for (i in seq_len(max(length(df) - 1, 0))) {
      x <- df[[i]]

      # for factor variable: no cut-offs are needed
      if (is.factor(x)) {
        if (nlevels(x) < 10)
          next
        # NOTE(review): a factor with 10 or more levels only triggers this
        # warning and then falls through to the numeric handling below, which
        # is not expected to work on a factor - confirm the intended behaviour.
        warning("WARNING: The number of categories should be less than 10",
                names(df)[i])
      }

      if (categorize == "quantile") {
        ## mode 1 - quantiles
        cut_off <- signif(unique(quantile(x, quantiles)), 3)  # remain 3 digits
      } else if (categorize == "k_means") {
        ## mode 2 k-means clustering: cut-offs are the minimum of each
        ## cluster plus the overall maximum
        clusters <- kmeans(x, max_cluster)
        cut_off_tmp <- c()
        for (j in unique(clusters$cluster)) {
          cut_off_tmp <-
            append(cut_off_tmp, min(x[clusters$cluster == j]))
        }
        cut_off_tmp <- append(cut_off_tmp, max(x))
        cut_off_tmp <- unique(sort(cut_off_tmp))
        cut_off <- unique(signif(cut_off_tmp, 3))
      } else {
        stop('ERROR: please specify correct method for categorizing:  "quantile" or "k_means".')
      }

      l <- list(cut_off)
      names(l)[1] <- names(df)[i]
      cut_vec <- append(cut_vec, l)
    }

    ## delete min and max for each cut-off (min and max will be captured in the new dataset)
    # seq_along() so that an empty cut_vec (e.g. all predictors are factors)
    # is returned as is instead of failing on cut_vec[[1]].
    for (i in seq_along(cut_vec)) {
      if (length(cut_vec[[i]]) <= 2)
        cut_vec[[i]] <- c("let_binary")
      else
        cut_vec[[i]] <- cut_vec[[i]][2:(length(cut_vec[[i]]) - 1)]
    }
    cut_vec
  }

#' @title Internal function: Categorizing continuous variables based on cut_vec (AutoScore Module 2)
#' @param df dataset(training, validation or testing) to be processed
#' @param cut_vec fixed cut vector
#' @return  Processed \code{data.frame} after categorizing based on fixed cut_vec
transform_df_fixed <- function(df, cut_vec) {
  # `cut_vec` has one entry per non-factor variable, in column order;
  # `j` walks through it while `i` walks through the columns.
  j <- 1

  # Loop over all variables except the last column. seq_len() (rather than
  # 1:n) so that a data frame without predictor columns is returned unchanged.
  for (i in seq_len(max(length(df) - 1, 0))) {
    # Factors are left as they are and do not consume an entry of cut_vec
    if (is.factor(df[, i])) {
      if (length(levels(df[, i])) < 10)
        next
      else
        stop("ERROR: The number of categories should be less than 10")
    }

    vec <- df[, i]
    cuts <- cut_vec[[j]]

    if (cuts[1] == "let_binary") {
      # No usable cut-off: keep the most frequent value and collapse every
      # other value into a single "not_<mode>" category
      mode_value <- getmode(vec)
      vec[vec != mode_value] <- paste0("not_", mode_value)
      df[, i] <- as.factor(vec)
    } else {
      # Does this dataset reach below the first / up to or above the last
      # cut-off? If so an outer break is added on that side.
      extends_below <- min(vec) < cuts[1]
      extends_above <- max(vec) >= cuts[length(cuts)]

      breaks <- cuts
      if (extends_below)
        breaks <- c(floor(min(vec)) - 100, breaks)
      if (extends_above)
        breaks <- c(breaks, ceiling(max(vec) + 100))

      # NOTE(review): the padded outer breaks are rounded to 3 significant
      # digits as well; for large magnitudes this may move them inside the
      # data range - confirm.
      breaks <- unique(signif(breaks, 3))
      binned <-
        cut(
          vec,
          breaks = breaks,
          right = FALSE,
          include.lowest = FALSE,
          dig.lab = 3
        )

      ## delete min and max for the Interval after discretion, so that the
      ## outer intervals are labelled as open-ended: "(,a)" and "[b,)"
      if (extends_below)
        levels(binned)[1] <- gsub(".*,", "(,", levels(binned)[1])
      if (extends_above) {
        last <- length(levels(binned))
        levels(binned)[last] <- gsub(",.*", ",)", levels(binned)[last])
      }
      df[, i] <- binned
    }

    j <- j + 1
  }
  df
}


#' @title Internal Function: Plotting ROC curve
#' @param prob Predicted probability
#' @param labels Actual outcome(binary)
#' @param quiet if set to TRUE, there will be no trace printing
#' @return No return value and the ROC curve will be plotted.
#' @import pROC
plot_roc_curve <- function(prob, labels, quiet = TRUE) {
  model_roc <- roc(labels, prob, quiet = quiet)

  # Sorted so that the values are: lower CI bound, AUC, upper CI bound
  auc_ci <- sort(as.numeric(ci.auc(model_roc)))

  # Points of the ROC curve at its local maxima
  curve_points <- data.frame(
    fpr = as.vector(coords(
      model_roc,
      "local maximas",
      ret = "1-specificity",
      transpose = TRUE
    )),
    tpr = as.vector(coords(
      model_roc,
      "local maximas",
      ret = "sensitivity",
      transpose = TRUE
    ))
  )

  # Colours: curve and axis/text
  curve_colour <- rgb(red = 41, green = 70, blue = 76, maxColorValue = 255)
  axis_colour <- rgb(red = 25, green = 24, blue = 24, maxColorValue = 255)

  subtitle_text <- sprintf("AUC=%.3f, 95%% CI: %.3f-%.3f",
                           auc_ci[2], auc_ci[1], auc_ci[3])

  roc_plot <- ggplot(curve_points,
                     aes_string(x = "fpr", ymin = 0, ymax = "tpr")) +
    geom_line(aes_string(y = "tpr"), color = curve_colour, lwd = 1.2) +
    # Dashed diagonal as the chance line
    geom_abline(
      slope = 1,
      intercept = 0,
      lty = 2,
      lwd = 0.3,
      color = axis_colour
    ) +
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1)) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
    labs(
      x = "1-Specificity",
      y = "Sensitivity",
      title = "Receiver Operating Characteristic Curve",
      subtitle = subtitle_text
    ) +
    theme_classic() +
    theme(
      plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"),
      axis.text = element_text(size = 12),
      text = element_text(size = 12, color = axis_colour),
      axis.line = element_blank(),
      axis.ticks.length = unit(0.15, units = "cm"),
      panel.border = element_rect(color = axis_colour, fill = NA)
    )
  print(roc_plot)
}


#' @title Internal Function: Change Reference category after first-step logistic regression (part of AutoScore Module 3)
#' @param df A \code{data.frame} used for logistic regression
#' @param coef_vec Generated from logistic regression
#' @return Processed \code{data.frame} after changing reference category
change_reference <- function(df, coef_vec) {
  # For every factor predictor whose lowest coefficient is negative, make the
  # level with that lowest coefficient the new reference level.

  # delete label first
  df_tmp <- subset(df, select = names(df)[names(df) != "label"])
  names(coef_vec) <- gsub("[`]", "", names(coef_vec)) # remove the possible "`" in the names

  # seq_along() (rather than 1:n) so that zero predictors means zero iterations
  for (i in seq_along(df_tmp)) {
    column <- df_tmp[[i]]
    var_levels <- levels(column)
    # Only factors with levels can be re-levelled
    if (!is.factor(column) || length(var_levels) == 0)
      next

    # Coefficient names are looked up as <variable name><level>
    var_coef_names <- paste0(names(df_tmp)[i], var_levels)
    coef_i <- coef_vec[names(coef_vec) %in% var_coef_names]
    coef_i <- coef_i[!is.na(coef_i)]

    # Nothing to compare (e.g. no coefficient found for this variable):
    # keep the current reference instead of calling min() on an empty vector
    if (length(coef_i) == 0)
      next

    # if min(coef_i) < 0, the current lowest one will be used for reference
    if (min(coef_i) < 0) {
      lowest_name <- names(coef_i)[which.min(coef_i)]
      ref <- var_levels[match(lowest_name, var_coef_names)]
      df_tmp[[i]] <- relevel(column, ref = ref)
    }
  }

  # add label again (it ends up as the last column)
  df_tmp$label <- df$label
  return(df_tmp)
}


#' @title Internal Function: Add baselines after second-step logistic regression (part of AutoScore Module 3)
#' @param df A \code{data.frame} used for logistic regression
#' @param coef_vec Generated from logistic regression
#' @return Processed \code{vector} for generating the scoring table
add_baseline <- function(df, coef_vec) {
  # Coefficient names may carry backticks; strip them before matching
  names(coef_vec) <- gsub("[`]", "", names(coef_vec))

  # Predictors only: the outcome column does not get score entries
  predictors <- subset(df, select = names(df)[names(df) != "label"])

  # One entry per (variable, level) pair, named <variable><level>
  entry_names <- unlist(lapply(names(predictors), function(v) {
    paste0(v, levels(predictors[, v]))
  }))

  # Start from zero everywhere; levels without a coefficient (the reference
  # levels) therefore keep a value of 0
  filled <- numeric(length(entry_names))
  names(filled) <- entry_names

  # Copy over the coefficients that belong to a known entry; anything else
  # in coef_vec (i.e., the intercept) is ignored
  is_known <- names(coef_vec) %in% entry_names
  known_coef <- coef_vec[which(is_known)]
  filled[match(names(known_coef), entry_names)] <- known_coef
  filled
}


#' @title Internal Function: Automatically assign scores to each subjects given new data set and scoring table (Used for intermediate and final evaluation)
#' @param df A \code{data.frame} whose columns, except the last one, hold category labels
#'   (for example the output of \code{transform_df_fixed}). The last column is left untouched.
#' @param score_table A \code{vector} containing the scoring table, named by variable name followed by category label
#' @return Processed \code{data.frame} with assigned scores for each variables. Categories without an entry
#'   in \code{score_table} receive \code{NA}.
assign_score <- function(df, score_table) {
  # seq_len() (rather than 1:n) so that a data frame with a single column is
  # returned unchanged.
  for (i in seq_len(max(length(df) - 1, 0))) {
    categories <- as.character(df[[i]])

    # Look every value up in one step by its exact entry name
    # (<variable name><category>). Replacing values one category at a time
    # would let a score that was just written be mistaken for a category label
    # in a later step (e.g. categories "1"/"0" with scores 0/3).
    entry_names <- paste0(names(df)[i], categories)
    scores <- score_table[match(entry_names, names(score_table))]
    scores[is.na(categories)] <- NA

    df[[i]] <- as.numeric(unname(scores))
  }

  return(df)
}


getmode <- function(vect) {
  # Distinct values, in order of first appearance
  distinct <- unique(vect)
  # Number of occurrences of each distinct value
  occurrences <- tabulate(match(vect, distinct))
  # which.max() picks the first maximum, so ties go to the value seen first
  most_frequent <- which.max(occurrences)
  distinct[most_frequent]
}

organize_performance <- function(w1) {
  # Flatten the input into a data frame and express every value as a
  # percentage (rounded to 3 decimals first)
  pct <- round(data.frame(w1), digits = 3) * 100

  # "<median>% (<lower>-<upper>%)" for one metric, read from the columns
  # <metric>.50., <metric>.2.5. and <metric>.97.5.
  format_metric <- function(metric) {
    paste0(
      pct[[paste0(metric, ".50.")]],
      "% (",
      pct[[paste0(metric, ".2.5.")]],
      "-",
      pct[[paste0(metric, ".97.5.")]],
      "%)"
    )
  }

  data.frame(
    `Accuracy (95% CI)` = format_metric("accuracy"),
    `Sensitivity (95% CI)` = format_metric("sensitivity"),
    `Specificity (95% CI)` = format_metric("specificity"),
    `PPV (95% CI)` = format_metric("ppv"),
    `NPV (95% CI)` = format_metric("npv"),
    check.names = FALSE
  )
}




#' 20000 simulated ICU admission data, with the same distribution as the data in the MIMIC-III ICU database
#'
#' @description 20000 simulated samples, with the same distribution as the data in the MIMIC-III ICU database. It is used for demonstration only in the Guidebook. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#'  \itemize{
#'  \item{Johnson, A., Pollard, T., Shen, L. et al. MIMIC-III, a freely accessible critical care database. Sci Data 3, 160035 (2016).}
#' }
"sample_data"


#' 1000 simulated ICU admission data, with the same distribution as the data in the MIMIC-III ICU database
#'
#' @description 1000 simulated samples, with the same distribution as the data in the MIMIC-III ICU database. It is used for demonstration only in the Guidebook. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#'  \itemize{
#'  \item{Johnson, A., Pollard, T., Shen, L. et al. MIMIC-III, a freely accessible critical care database. Sci Data 3, 160035 (2016).}
#' }
"sample_data_small"

