% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/forest.R
\name{train_forest}
\alias{train_forest}
\title{Train a Recforest Model}
\usage{
train_forest(
  data,
  id_var,
  covariates,
  event,
  time_vars = c("t.start", "t.stop"),
  death_var = NULL,
  n_trees,
  n_bootstrap = NULL,
  seed = NULL,
  mtry,
  minsplit,
  nodesize,
  method,
  min_score,
  max_nodes,
  parallel = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{data}{A data frame containing the dataset to be used for training the model.}

\item{id_var}{The name of the column containing the unique identifier for each subject.}

\item{covariates}{A character vector containing the names of the columns to be used as predictors in the model.}

\item{event}{The name of the column containing the recurrent event indicator.}

\item{time_vars}{A length-2 character vector containing the names of the columns representing the start and stop times (default "t.start" and "t.stop").}

\item{death_var}{The name of the column containing the death indicator or any other terminal event (optional).}

\item{n_trees}{The number of trees to be trained in the recforest model.}

\item{n_bootstrap}{The number of bootstrap samples to be used for training each tree (in-bag sample).
If not provided, it is set to 2/3 of the sample size (in terms of the number of unique \code{id_var}).}

\item{seed}{An optional seed value to be used for reproducibility purposes (NULL by default).}

\item{mtry}{The number of candidate variables randomly drawn at each node of the trees.
This parameter should be tuned by minimizing the OOB error.}

\item{minsplit}{The minimal number of events required to split the node. Cannot be smaller than 2.}

\item{nodesize}{The minimal number of subjects required in both child nodes to split. Cannot be smaller than 1.}

\item{method}{The method to be used for training the model. Currently, the following methods are supported: either "NAa" for the Nelson-Aalen method, with no terminal event and no longitudinal time-dependent features, or "GL" for the Ghosh-Lin modelization step, with a terminal event and/or at least one longitudinal time-dependent feature.}

\item{min_score}{The minimum score required to split a node. This parameter is used only when the method is set to "NAa".}

\item{max_nodes}{The maximum number of nodes per tree.}

\item{parallel}{A logical value indicating whether to use parallel processing for training the trees.}

\item{verbose}{A logical value indicating whether to print progress messages.}
}
\value{
A list containing the following elements:
\item{trees}{A list of trained trees.}
\item{tree_metrics}{A list of metrics for each tree.}
\item{metrics}{A summary of the metrics for all trees.}
\item{columns}{A list of column names used in the training.}
\item{params}{A list of parameters used to set the model.}
\item{n_indiv}{Number of individuals in the dataset.}
\item{n_predictors}{Number of predictors used in the model.}
\item{n_trees}{Number of trees trained.}
\item{n_bootstrap}{Number of bootstrap samples used to grow each tree.}
\item{time}{Computation time used to train the model.}
}
\description{
This function trains a recforest model using the provided data and parameters.
}
\details{
The recforest model aggregates predictions over an ensemble of trees, each constructed using a set of decision nodes based on specific splitting rules.
At each node, a subset of predictors is randomly selected, and an optimal split is determined using an appropriate statistical test.
Depending on the specified \code{method}, the algorithm employs different statistical tests to find the best split:
\itemize{
\item For standard recurrent event data, the pseudo-score test statistic is used to compare two Nelson-Aalen estimates of the mean cumulative function.
\item In the presence of terminal events and/or longitudinal variables, the Ghosh-Lin model is utilized to obtain the Wald test statistic, which provides a more accurate assessment of the split.
}

The trees grow until they meet the stopping criteria, which include a minimum number of events (\code{minsplit}) and a minimum number of individuals in terminal nodes (\code{nodesize}).
The final model is an ensemble of these trees, which helps to reduce overfitting and improve predictive performance by averaging the results on the out-of-bag sample.
}
\examples{
if (interactive()) {
  data("bladder1_recforest")
  # To enable parallel computing:
  # n_cores <- min(future::availableCores(), n_trees)
  # future::plan(future::multisession)
  trained_forest <- train_forest(
    data = bladder1_recforest,
    id_var = "id",
    covariates = c("treatment", "number", "size"),
    time_vars = c("t.start", "t.stop"),
    death_var = "death",
    event = "event",
    n_trees = 5,
    n_bootstrap = 70,
    mtry = 2,
    minsplit = 3,
    nodesize = 15,
    method = "NAa",
    min_score = 5,
    max_nodes = 20,
    seed = 111,
    parallel = FALSE,
    verbose = FALSE
  )
  print(trained_forest)
  summary(trained_forest)
}
}
\references{
Cook, R. J., & Lawless, J. F. (1997). Marginal analysis of recurrent events and a terminating event. Statistics in medicine, 16(8), 911-924.

Ghosh, D., & Lin, D. Y. (2002). Marginal regression models for recurrent and terminal events. Statistica Sinica, 663-688.

Ishwaran, H., Kogalur, U. B., Blackstone, E. H., & Lauer, M. S. (2008). Random survival forests. The Annals of Applied Statistics, 2(3), 841-860.
}
