% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/validate.R
\name{validate}
\alias{validate}
\title{Validate regression models on a test set}
\usage{
validate(
  train_data,
  formulas,
  family,
  test_data = NULL,
  partitions_col = ".partitions",
  control = NULL,
  REML = FALSE,
  cutoff = 0.5,
  positive = 2,
  metrics = list(),
  preprocessing = NULL,
  err_nc = FALSE,
  rm_nc = FALSE,
  parallel = FALSE,
  verbose = FALSE,
  link = deprecated(),
  models = deprecated(),
  model_verbose = deprecated()
)
}
\arguments{
\item{train_data}{\code{data.frame}.

 Can contain a grouping factor for identifying partitions - as made with
 \code{\link[groupdata2:partition]{groupdata2::partition()}}.
 See \code{`partitions_col`}.}

\item{formulas}{Model formulas as strings. (Character)

 E.g. \code{c("y~x", "y~z")}.

 Can contain random effects.

 E.g. \code{c("y~x+(1|r)", "y~z+(1|r)")}.}

\item{family}{Name of the family. (Character)

 Currently supports \strong{\code{"gaussian"}} for linear regression
 with \code{\link[stats:lm]{lm()}} / \code{\link[lme4:lmer]{lme4::lmer()}}
 and \strong{\code{"binomial"}} for binary classification
 with \code{\link[stats:glm]{glm()}} / \code{\link[lme4:glmer]{lme4::glmer()}}.

 See \code{\link[cvms:cross_validate_fn]{cross_validate_fn()}} for use with other model functions.}

\item{test_data}{\code{data.frame}. If specifying \code{`partitions_col`}, this can be \code{NULL}.}

\item{partitions_col}{Name of grouping factor for identifying partitions. (Character)

 Rows with the value \code{1} in \code{`partitions_col`} are used as training set and
 rows with the value \code{2} are used as test set.

 N.B. \strong{Only used if \code{`test_data`} is \code{NULL}}.}

\item{control}{Construct control structures for mixed model fitting
 (with \code{\link[lme4:lmer]{lme4::lmer()}} or \code{\link[lme4:glmer]{lme4::glmer()}}).
 See \code{\link[lme4:lmerControl]{lme4::lmerControl}} and
 \code{\link[lme4:glmerControl]{lme4::glmerControl}}.

 N.B. Ignored if fitting \code{\link[stats:lm]{lm()}} or \code{\link[stats:glm]{glm()}} models.}

\item{REML}{Restricted Maximum Likelihood. (Logical)}

\item{cutoff}{Threshold for predicted classes. (Numeric)

 N.B. \strong{Binomial models only}.}

\item{positive}{Level from dependent variable to predict.
 Either as character (\emph{preferable}) or level index (\code{1} or \code{2} - alphabetically).

 E.g. if we have the levels \code{"cat"} and \code{"dog"} and we want \code{"dog"} to be the positive class,
 we can either provide \code{"dog"} or \code{2}, as alphabetically, \code{"dog"} comes after \code{"cat"}.

 \strong{Note:} For \emph{reproducibility}, it's preferable to \strong{specify the name directly}, as
 different \code{\link[base:locales]{locales}} may sort the levels differently.

 Used when calculating confusion matrix metrics and creating \code{ROC} curves.

 The \code{Process} column in the output can be used to verify this setting.

 N.B. Only affects evaluation metrics, not the model training or returned predictions.

 N.B. \strong{Binomial models only}.}

\item{metrics}{\code{list} for enabling/disabling metrics.

 E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the results,
 and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
 to the classification results.
 Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.

 You can enable/disable all metrics at once by including
 \code{"all" = TRUE/FALSE} in the \code{list}. This is done prior to enabling/disabling
 individual metrics, which is why \code{list("all" = FALSE, "RMSE" = TRUE)}
 would return only the \code{RMSE} metric.

 The \code{list} can be created with
 \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}} or
 \code{\link[cvms:binomial_metrics]{binomial_metrics()}}.

 Also accepts the string \code{"all"}.}

\item{preprocessing}{Name of preprocessing to apply.

 Available preprocessings are:

 \tabular{ll}{
  \strong{Name} \tab \strong{Description} \cr
  "standardize" \tab Centers and scales the numeric predictors.\cr
  "range" \tab Normalizes the numeric predictors to the \code{0}-\code{1} range.
  Values outside the min/max range in the test set are truncated to \code{0}/\code{1}.\cr
  "scale" \tab Scales the numeric predictors to have a standard deviation of one.\cr
  "center" \tab Centers the numeric predictors to have a mean of zero.\cr
 }

 The preprocessing parameters (\code{mean}, \code{SD}, etc.) are extracted from the training set and
 applied to both the training set and the test set.
 They are returned in the \strong{Preprocess} column for inspection.

 N.B. The preprocessings should not affect the results
 to a noticeable degree, although \code{"range"} might due to the truncation.}

\item{err_nc}{Whether to raise an \code{error} if a model does not converge. (Logical)}

\item{rm_nc}{Remove non-converged models from output. (Logical)}

\item{parallel}{Whether to validate the list of models in parallel. (Logical)

 Remember to register a parallel backend first.
 E.g. with \code{doParallel::registerDoParallel}.}

\item{verbose}{Whether to message process information
like the number of model instances to fit and which model function was applied. (Logical)}

\item{link, models, model_verbose}{Deprecated.}
}
\value{
\code{tibble} with the results and model objects.

 \subsection{Shared across families}{

 A nested \code{tibble} with \strong{coefficients} of the models from all iterations.

 Count of \strong{convergence warnings}. Consider discarding models that did not converge.

 Count of \strong{other warnings}. These are warnings without keywords such as "convergence".

 Count of \strong{Singular Fit messages}. See
 \code{\link[lme4:isSingular]{lme4::isSingular}} for more information.

 Nested \code{tibble} with the \strong{warnings and messages} caught for each model.

 Specified \strong{family}.

 Nested \strong{model} objects.

 Name of \strong{dependent} variable.

 Names of \strong{fixed} effects.

 Names of \strong{random} effects, if any.

 Nested \code{tibble} with \strong{preprocess}ing parameters, if any.

 }

 ----------------------------------------------------------------

 \subsection{Gaussian Results}{

 ----------------------------------------------------------------

 \strong{\code{RMSE}}, \strong{\code{MAE}}, \strong{\code{NRMSE(IQR)}},
 \strong{\code{RRSE}}, \strong{\code{RAE}}, \strong{\code{RMSLE}},
 \strong{\code{AIC}}, \strong{\code{AICc}}, and \strong{\code{BIC}}.

 See the additional metrics (disabled by default) at \code{\link[cvms:gaussian_metrics]{?gaussian_metrics}}.

 A nested \code{tibble} with the \strong{predictions} and targets.
 }

 ----------------------------------------------------------------

 \subsection{Binomial Results}{

 ----------------------------------------------------------------

 Based on predictions of the test set,
 a confusion matrix and \code{ROC} curve are used to get the following:

 \code{ROC}:

 \strong{\code{AUC}}, \strong{\code{Lower CI}}, and \strong{\code{Upper CI}}.

 \code{Confusion Matrix}:

 \strong{\code{Balanced Accuracy}},
 \strong{\code{F1}},
 \strong{\code{Sensitivity}},
 \strong{\code{Specificity}},
 \strong{\code{Positive Predictive Value}},
 \strong{\code{Negative Predictive Value}},
 \strong{\code{Kappa}},
 \strong{\code{Detection Rate}},
 \strong{\code{Detection Prevalence}},
 \strong{\code{Prevalence}}, and
 \strong{\code{MCC}} (Matthews correlation coefficient).

 See the additional metrics (disabled by default) at
 \code{\link[cvms:binomial_metrics]{?binomial_metrics}}.

 Also includes:

 A nested \code{tibble} with \strong{predictions}, predicted classes (depends on \code{cutoff}), and the targets.
 Note that the predictions are \emph{not necessarily} of the \emph{specified} \code{positive} class, but of
 the \emph{model's} positive class (second level of dependent variable, alphabetically).

 The \code{\link[pROC:roc]{pROC::roc}} \strong{\code{ROC}} curve object(s).

 A nested \code{tibble} with the \strong{confusion matrix}/matrices.
 The \code{Pos_} columns tell you whether a row is a
 True Positive (\code{TP}), True Negative (\code{TN}),
 False Positive (\code{FP}), or False Negative (\code{FN}),
 depending on which level is the "positive" class. I.e. the level you wish to predict.

 The name of the \strong{Positive Class}.

 }
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("stable")}

 Train linear or logistic regression models on a training set and validate them by
 predicting a test/validation set.
 Returns results in a \code{tibble} for easy reporting, along with the trained models.

 See \code{\link[cvms:validate_fn]{validate_fn()}} for use
 with custom model functions.
}
\details{
Packages used:

 \subsection{Models}{

 Gaussian: \code{\link[stats:lm]{stats::lm}}, \code{\link[lme4:lmer]{lme4::lmer}}

 Binomial: \code{\link[stats:glm]{stats::glm}}, \code{\link[lme4:glmer]{lme4::glmer}}
 }
 \subsection{Results}{
 \subsection{Shared}{

 \code{AIC} : \code{\link[stats:AIC]{stats::AIC}}

 \code{AICc} : \code{\link[MuMIn:AICc]{MuMIn::AICc}}

 \code{BIC} : \code{\link[stats:BIC]{stats::BIC}}

 }
 \subsection{Gaussian}{

 \code{r2m} : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}

 \code{r2c} : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}

 }
 \subsection{Binomial}{

 \code{ROC and AUC}: \code{\link[pROC:roc]{pROC::roc}}

 }
 }
}
\examples{
\donttest{
# Attach packages
library(cvms)
library(groupdata2) # partition()
library(dplyr) # \%>\% arrange()

# Data is part of cvms
data <- participant.scores

# Set seed for reproducibility
set.seed(7)

# Partition data
# Keep as single data frame
# We could also have fed validate() separate train and test sets.
data_partitioned <- partition(
  data,
  p = 0.7,
  cat_col = "diagnosis",
  id_col = "participant",
  list_out = FALSE
) \%>\%
  arrange(.partitions)

# Validate a model

# Gaussian
validate(
  data_partitioned,
  formulas = "score~diagnosis",
  partitions_col = ".partitions",
  family = "gaussian",
  REML = FALSE
)

# Binomial
validate(data_partitioned,
  formulas = "diagnosis~score",
  partitions_col = ".partitions",
  family = "binomial"
)

## Feed separate train and test sets

# Partition data to list of data frames
# The first data frame will be train (70\% of the data)
# The second will be test (30\% of the data)
data_partitioned <- partition(
  data,
  p = 0.7,
  cat_col = "diagnosis",
  id_col = "participant",
  list_out = TRUE
)
train_data <- data_partitioned[[1]]
test_data <- data_partitioned[[2]]

# Validate a model

# Gaussian
validate(
  train_data,
  test_data = test_data,
  formulas = "score~diagnosis",
  family = "gaussian",
  REML = FALSE
)
}
}
\seealso{
Other validation functions: 
\code{\link{cross_validate_fn}()},
\code{\link{cross_validate}()},
\code{\link{validate_fn}()}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
\concept{validation functions}
