% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preference_order.R
\name{preference_order}
\alias{preference_order}
\title{Rank predictors by importance or multicollinearity}
\usage{
preference_order(
  df = NULL,
  responses = NULL,
  predictors = NULL,
  f = f_auto,
  cv_training_fraction = 1,
  cv_iterations = 1,
  seed = 1,
  quiet = FALSE,
  ...
)
}
\arguments{
\item{df}{(required; dataframe, tibble, or sf) A dataframe with responses
(optional) and predictors. Must have at least 10 rows for pairwise
correlation analysis, and \code{10 * (length(predictors) - 1)} for VIF.
Default: NULL.}

\item{responses}{(optional; character, character vector, or NULL) Name of
one or several response variables in \code{df}. Default: NULL.}

\item{predictors}{(optional; character vector or NULL) Names of the
predictors in \code{df}. If NULL, all columns except \code{responses} and
constant/near-zero-variance columns are used. Default: NULL.}

\item{f}{(optional; function name) Unquoted function name without parentheses (see \link{f_functions}). By default calls \code{\link[=f_auto]{f_auto()}}, which selects a suitable function depending on the nature of the response and predictors. Set to NULL if \code{responses = NULL}. If NULL, predictors are ranked from lower to higher multicollinearity. Default: \code{f_auto}}

\item{cv_training_fraction}{(optional, numeric) Value between 0.1 and 1 defining the training fraction used in cross-validation. If 1 (default), no cross-validation is performed, and the resulting metric is computed from all observations and predictions. Automatically set to 1 when \code{cv_iterations = 1}. Default: 1}

\item{cv_iterations}{(optional, integer) Number of cross-validation iterations to perform. The recommended range lies between 30 and 100. In general, smaller datasets and large values of \code{cv_training_fraction} require more iterations to achieve stability. Automatically set to 1 when \code{cv_training_fraction = 1}. Default: 1}

\item{seed}{(optional, integer) Random seed, required for reproducibility when using cross-validation or random forest models. Default: 1}

\item{quiet}{(optional; logical) If FALSE, messages are printed. Default: FALSE.}

\item{...}{(optional) Internal args (e.g. \code{function_name} for
\code{\link{validate_arg_function_name}}, a precomputed correlation matrix
\code{m}, or cross-validation args for \code{\link{preference_order}}).}
}
\value{
dataframe (or a named list of dataframes when more than one response is provided) with the columns:
\itemize{
\item \code{response}: character, response name, if any, or \code{"none"} otherwise.

\item \code{predictor}: character, name of the predictor.

\item \code{f}: name of the function used to compute the preference order. If argument \code{f} is NULL, the value "stats::cor()" is added to this column.

\item \code{metric}: name of the metric used to assess strength of association. Usually one of "R-squared", "AUC" (Area Under the ROC Curve), or "Cramer's V". If \code{f} is a custom function not in \code{\link[=f_functions]{f_functions()}}, then \code{metric} is set to "custom". If \code{f} is NULL, then "1 - R-squared" is returned in this column.

\item \code{score}: value of the metric returned by \code{f} to assess the association between the \code{response} and each given \code{predictor}.

\item \code{rank}: integer value indicating the rank of the predictor.
}
}
\description{
Generates a valid input for the argument \code{preference_order} of the functions \code{\link[=vif_select]{vif_select()}}, \code{\link[=cor_select]{cor_select()}}, \code{\link[=collinear_select]{collinear_select()}}, and \code{\link[=collinear]{collinear()}}. This argument helps preserve important predictors during multicollinearity filtering.

The function works in two different ways:
\itemize{
\item When \code{f} is NULL, it ranks the predictors from lower to higher multicollinearity, computed as one minus the average Pearson correlation between the given predictor and all others. This option is useful when the goal is to limit redundancy in a large dataset and there is no specific model to train in mind.
\item When \code{responses} and \code{f} are not NULL, it ranks the predictors by the strength of their association with a response based on the evaluation of univariate models. This is the best possible option when the end-goal is training a model.
}

The argument \code{f} (requires a valid \code{responses} argument) defines how the strength of association between the response and each predictor is computed. By default it calls \code{\link[=f_auto]{f_auto()}}, which uses \code{\link[=f_auto_rules]{f_auto_rules()}} to select a suitable function depending on the types of the response and the predictors. This option is designed to provide sensible, general-purpose defaults optimized for speed and stability rather than any specific modeling approach.

For more fine-tuned control, the package offers the following \code{f} functions (see \code{\link[=f_functions]{f_functions()}}):

\itemize{
\item \strong{Numeric response}:
\itemize{
\item \code{\link[=f_numeric_glm]{f_numeric_glm()}}: Pearson's R-squared of response versus the predictions of a Gaussian GLM.
\item \code{\link[=f_numeric_gam]{f_numeric_gam()}}: GAM model fitted with \code{mgcv::gam()}.
\item \code{\link[=f_numeric_rf]{f_numeric_rf()}}: Random Forest model fitted with \code{ranger::ranger()}.
}
\item \strong{Integer counts response}:
\itemize{
\item \code{\link[=f_count_glm]{f_count_glm()}}: Pearson's R-squared of a Poisson GLM.
\item \code{\link[=f_count_gam]{f_count_gam()}}: Poisson GAM.
\item \code{\link[=f_count_rf]{f_count_rf()}}: Random Forest model fitted with \code{ranger::ranger()}.
}
\item \strong{Binomial response (1 and 0)}:
\itemize{
\item \code{\link[=f_binomial_glm]{f_binomial_glm()}}: AUC of Quasibinomial GLM with weighted cases.
\item \code{\link[=f_binomial_gam]{f_binomial_gam()}}: AUC of Quasibinomial GAM with weighted cases.
\item \code{\link[=f_binomial_rf]{f_binomial_rf()}}: AUC of a Random Forest model with weighted cases.
}
\item \strong{Categorical response}:
\itemize{
\item \code{\link[=f_categorical_rf]{f_categorical_rf()}}: Cramer's V of the response against the predictions of a classification Random Forest model.
}
}

These functions accept a cross-validation setup via the arguments \code{cv_iterations} and \code{cv_training_fraction}.

Additionally, the argument \code{f} accepts any custom function taking a dataframe with the columns "x" (predictor) and "y" (response) and returning a numeric indicator of association.

Accepts a parallelization setup via \code{future::plan()} and a progress bar via \code{progressr::handlers()} (see examples).

Accepts a character vector of response variables as input for the argument \code{responses}. When more than one response is provided, the output is a named list of preference data frames.
}
\examples{
#load example data
data(
  vi_smol,
  vi_predictors_numeric
)

##OPTIONAL: parallelization setup
# future::plan(
#   future::multisession,
#   workers = future::availableCores() - 1
# )

##OPTIONAL: progress bar
##does not work in R examples
# progressr::handlers(global = TRUE)

#ranking predictors from lower to higher multicollinearity
#------------------------------------------------
x <- preference_order(
  df = vi_smol,
  responses = NULL, #default value
  predictors = vi_predictors_numeric[1:10],
  f = NULL #must be explicit, the default is f_auto
)

x

#automatic selection of ranking function
#------------------------------------------------
#with more than one response, the output is a
#named list of preference dataframes
x <- preference_order(
  df = vi_smol,
  responses = c("vi_numeric", "vi_categorical"),
  predictors = vi_predictors_numeric[1:10],
  f = f_auto
  )

x

#user selection of ranking function
#------------------------------------------------
#quasibinomial GLM for a binomial (1 and 0) response
x <- preference_order(
  df = vi_smol,
  responses = "vi_binomial",
  predictors = vi_predictors_numeric[1:10],
  f = f_binomial_glm
)

x

#cross-validation
#------------------------------------------------
#10 iterations, each with a training fraction of 0.5
x <- preference_order(
  df = vi_smol,
  responses = "vi_binomial",
  predictors = vi_predictors_numeric[1:10],
  f = f_binomial_glm,
  cv_training_fraction = 0.5,
  cv_iterations = 10
)

x

#custom function: squared Pearson correlation
#------------------------------------------------
#receives a dataframe with the columns x (predictor)
#and y (response), and returns a numeric score
#custom functions must accept the ellipsis argument
f_rsquared <- function(df, ...) {
  pearson_r <- stats::cor(
    x = df$x,
    y = df$y,
    use = "complete.obs"
  )
  pearson_r^2
}

#rank predictors with the custom function f_rsquared
#for functions not in f_functions(), the column
#metric of the output is set to custom
x <- preference_order(
  df = vi_smol,
  responses = "vi_numeric",
  predictors = vi_predictors_numeric[1:10],
  f = f_rsquared
)

x

#resetting to sequential processing
#(only needed if the optional parallelization setup was run)
#future::plan(future::sequential)
}
\seealso{
Other preference_order_functions: 
\code{\link{f_binomial_gam}()},
\code{\link{f_binomial_glm}()},
\code{\link{f_binomial_rf}()},
\code{\link{f_categorical_rf}()},
\code{\link{f_count_gam}()},
\code{\link{f_count_glm}()},
\code{\link{f_count_rf}()},
\code{\link{f_numeric_gam}()},
\code{\link{f_numeric_glm}()},
\code{\link{f_numeric_rf}()}
}
\author{
Blas M. Benito, PhD
}
\concept{preference_order_functions}
