% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_MRF_diag.R
\name{cv_MRF_diag}
\alias{cv_MRF_diag}
\alias{cv_MRF_diag_rep}
\alias{cv_MRF_diag_rep_spatial}
\title{MRF cross validation and assessment of predictive performance}
\usage{
cv_MRF_diag(
  data,
  symmetrise,
  n_nodes,
  n_cores,
  sample_seed,
  n_folds,
  n_fold_runs,
  n_covariates,
  compare_null,
  family,
  plot = TRUE,
  cached_model,
  cached_predictions,
  mod_labels = NULL
)

cv_MRF_diag_rep(
  data,
  symmetrise,
  n_nodes,
  n_cores,
  sample_seed,
  n_folds,
  n_fold_runs,
  n_covariates,
  compare_null,
  family,
  plot = TRUE
)

cv_MRF_diag_rep_spatial(
  data,
  coords,
  symmetrise,
  n_nodes,
  n_cores,
  sample_seed,
  n_folds,
  n_fold_runs,
  n_covariates,
  compare_null,
  family,
  plot = TRUE
)
}
\arguments{
\item{data}{Dataframe. The input data where the \code{n_nodes}
left-most variables are variables that are to be represented by nodes in the graph.
Note that \code{NA}'s are allowed for covariates. If present, these missing values
will be imputed from the distribution \code{rnorm(mean = 0, sd = 1)}, which assumes that
all covariates are scaled and centred (i.e. by using the function
\code{\link[base]{scale}} or similar)}

\item{symmetrise}{The method to use for symmetrising corresponding parameter estimates
(which are taken from separate regressions). Options are \code{min} (take the coefficient with the
smallest absolute value), \code{max} (take the coefficient with the largest absolute value)
or \code{mean} (take the mean of the two coefficients). Default is \code{mean}}

\item{n_nodes}{Positive integer. The index of the last column in \code{data}
which is represented by a node in the final graph. Columns with index
greater than \code{n_nodes} are taken as covariates. Default is the number of
columns in \code{data}, corresponding to no additional covariates}

\item{n_cores}{Positive integer. The number of cores to spread the job across using
\code{\link[parallel]{makePSOCKcluster}}. Default is 1 (no parallelisation)}

\item{sample_seed}{Numeric. This seed will be used as the basis
for dividing data into folds. Default is a random seed
between 1 and 100000}

\item{n_folds}{Integer. The number of folds for cross-validation. Default is 10}

\item{n_fold_runs}{Integer. The number of total training runs to perform. During
each run, the data will be split into \code{n_folds} folds and the
observed data in each fold will be compared to their respective predictions.
Defaults to \code{n_folds}}

\item{n_covariates}{Positive integer. The number of covariates in \code{data}, before cross-multiplication}

\item{compare_null}{Logical. If \code{TRUE}, null models will also be run and plotted to
assess the influence of including covariates on model predictive performance.
Default is \code{FALSE}}

\item{family}{The response type. Responses can be quantitative continuous (\code{family = "gaussian"}),
non-negative counts (\code{family = "poisson"}) or binomial 1s and 0s (\code{family = "binomial"}).}

\item{plot}{Logical. If \code{TRUE}, \code{ggplot2} objects are returned. If \code{FALSE},
the prediction metrics are returned as a matrix. Default is \code{TRUE}}

\item{cached_model}{Used by function \code{\link{cv_MRF_diag_rep}} to store an optimised model and prevent
unnecessary replication of node-optimised model fitting}

\item{cached_predictions}{Used by function \code{\link{cv_MRF_diag_rep}} to store predictions from
optimised models and prevent unnecessary replication}

\item{mod_labels}{Optional character string of labels for the two models being compared
(if \code{compare_null == TRUE})}

\item{coords}{A two-column \code{dataframe} (with \code{nrow(coords) == nrow(data)})
representing the spatial coordinates of each observation in \code{data}. Ideally, these
coordinates will represent Latitude and Longitude GPS points for each observation.}
}
\value{
If \code{plot = TRUE}, a \code{ggplot2} object is returned. This will be
a plot containing boxplots of predictive metrics across test sets using the
optimised model (see \code{\link[glmnet]{cv.glmnet}} for further details of \code{lambda1}
optimisation). If \code{plot = FALSE}, a matrix of prediction metrics is returned.
}
\description{
\code{cv_MRF_diag} runs cross validation of \code{\link{MRFcov}} models and tests predictive
performance.

\code{cv_MRF_diag_rep} fits a single node-optimised model
and tests this model's predictive performance across multiple test subsets of the \code{data}.

\code{cv_MRF_diag_rep_spatial} fits a single node-optimised spatial model
and tests this model's predictive performance across multiple test subsets of the \code{data}.
\cr
\cr
All \code{cv_MRF} functions assess model predictive performance and produce
either diagnostic plots or matrices of predictive metrics.
}
\details{
Node-optimised models are fitted using \code{\link[glmnet]{cv.glmnet}},
and these models are used to predict \code{data} test subsets.
Test and training \code{data} subsets are created using \code{\link[caret]{createFolds}}.
\cr
\cr
To account for uncertainty in parameter estimates and in random fold generation, it is recommended
to perform cross-validation multiple times (by controlling the \code{n_fold_runs} argument) using
\code{cv_MRF_diag_rep} to supply a single cached model and that model's predictions.
This is useful for optimising a single model (using \code{\link[glmnet]{cv.glmnet}}) and testing
this model's predictive performance across many test subsets. Alternatively, one can run
\code{cv_MRF_diag} many times to fit different models in each iteration. This will be slower but
technically more sound.
}
\examples{
\donttest{
data("Bird.parasites")
# Generate boxplots of model predictive metrics
cv_MRF_diag(data = Bird.parasites, n_nodes = 4,
           n_cores = 1, family = 'binomial')

# Generate boxplots comparing the CRF to an MRF model (no covariates)
cv_MRF_diag(data = Bird.parasites, n_nodes = 4,
           n_cores = 1, family = 'binomial',
           compare_null = TRUE)

# Replicate 10-fold cross-validation 10 times
cv.preds <- cv_MRF_diag_rep(data = Bird.parasites, n_nodes = 4,
                           n_cores = 1, family = 'binomial',
                           compare_null = TRUE,
                           plot = FALSE, n_fold_runs = 10)

# Plot model sensitivity and \% true predictions
library(ggplot2)
gridExtra::grid.arrange(
 ggplot(data = cv.preds, aes(y = mean_sensitivity, x = model)) +
       geom_boxplot() + theme(axis.text.x = ggplot2::element_blank()) +
       labs(x = ''),
 ggplot(data = cv.preds, aes(y = mean_tot_pred, x = model)) +
       geom_boxplot(),
       ncol = 1,
 heights = c(1, 1))

# Create some sample Poisson data with strong correlations
cov <- rnorm(500, 0.2)
cov2 <- rnorm(500, 1)
sp.2 <- rpois(500, lambda = exp(1.5 + (cov * 0.9)))
poiss.dat <- data.frame(sp.1 = rpois(500, lambda = exp(0.5 + (cov * 0.3))),
                       sp.2 = sp.2,
                       sp.3 = rpois(500, lambda = exp(log(sp.2 + 1) + (cov * -0.5))),
                       cov = cov,
                       cov2 = cov2)

# A CRF should produce a better fit (lower deviance, lower MSE)
cvMRF.poiss <- cv_MRF_diag(data = poiss.dat, n_nodes = 3,
                          n_folds = 10,
                          family = 'poisson',
                          compare_null = TRUE, plot = TRUE)
}

}
\references{
Clark, NJ, Wells, K and Lindberg, O.
Unravelling changing interspecific interactions across environmental gradients
using Markov random fields. (2018). Ecology doi: 10.1002/ecy.2221
\href{https://www.researchgate.net/publication/325184442_Unravelling_changing_interspecific_interactions_across_environmental_gradients_using_Markov_random_fields}{Full text here}.
}
\seealso{
\code{\link{MRFcov}},
\code{\link{predict_MRF}},
\code{\link[glmnet]{cv.glmnet}}
}
