% 2018-04-25 A. Papritz
% R CMD Rdconv -t html -o bla.html cv.georob.Rd ; open bla.html; R CMD Rd2pdf --force cv.georob.Rd; 

\encoding{utf8}
\name{cv.georob}
\alias{cv.georob}

\title{Cross-Validating a Spatial Linear Model Fitted by \code{georob}}

\description{
  This function assesses the goodness-of-fit of a spatial linear model by
  \var{K}-fold cross-validation.  In more detail, the model is re-fitted
  \var{K} times by robust (or Gaussian) (RE)ML, excluding each time
  \var{1/K}th of the data.  The re-fitted models are used to compute robust
  (or customary) external Kriging predictions for the omitted observations.
  If the response variable is log-transformed then the Kriging predictions
  can be optionally transformed back to the original scale of the
  measurements.  S3 methods for evaluating and plotting diagnostic summaries
  of the cross-validation errors are described in the help for the function
  \code{\link{validate.predictions}}.
  }
  
\usage{
\method{cv}{georob}(object, formula = NULL, subset = NULL, 
    method = c("block", "random"), nset = 10, seed = NULL, 
    sets = NULL, duplicates.in.same.set = TRUE, re.estimate = TRUE, 
    param = object[["variogram.object"]][[1]][["param"]],
    fit.param = object[["variogram.object"]][[1]][["fit.param"]],
    aniso = object[["variogram.object"]][[1]][["aniso"]],
    fit.aniso = object[["variogram.object"]][[1]][["fit.aniso"]],
    variogram.object = NULL,
    use.fitted.param = TRUE, return.fit = FALSE, 
    reduced.output = TRUE, lgn = FALSE, 
    mfl.action = c("offset", "stop"),
    ncores = min(nset, detectCores()), verbose = 0, ...)
}

\arguments{

  \item{object}{an object of class \code{"georob"}, see
  \code{\link{georobObject}}.}

  \item{formula}{an optional formula for the regression model passed by
  \code{\link[stats]{update}} to \code{\link{georob}}.}
  
  
  \item{subset}{an optional vector specifying a subset of observations
    to be used in the fitting process.}
    
  \item{method}{keyword, controlling whether subsets are formed by
  partitioning the data set into \code{block}s by \code{\link[stats]{kmeans}}
  (default) or \code{random}ly.  Ignored if \code{sets} is
  non-\code{NULL}.}
    
  \item{nset}{positive integer defining the number \var{K} of subsets into
  which the data set is partitioned (default: \code{nset = 10}).  Ignored
  if \code{sets} is non-\code{NULL}.}
  
  \item{seed}{optional integer seed to initialize random number generation,
  see \code{\link[base]{set.seed}}. Ignored if \code{sets} is non-\code{NULL}.}
  
  \item{sets}{an optional vector of the same length as the response vector
  of the fitted model and with positive integers taking values in
  \eqn{(1,2,\ldots,K)}, defining in this way the \eqn{K} subsets into which
  the data set is split.  If \code{sets = NULL} (default) the partition is
  randomly generated by \code{\link[stats]{kmeans}} or
  \code{\link[stats]{runif}} (using possibly \code{seed}).}
  
  \item{duplicates.in.same.set}{logical controlling whether replicated
  observations at a given location are assigned to the same subset when
  partitioning the data (default \code{TRUE}).}
  
  \item{re.estimate}{logical controlling whether the model is re-fitted to
  the reduced data sets before computing the Kriging predictions
  (\code{TRUE}, default) or whether the model passed in \code{object} is
  used to compute the predictions for the omitted observations, see
  \emph{Details}.}
  
  \item{param}{a named numeric vector or a matrix or data frame with
  initial values of variogram parameters passed by
  \code{\link[stats]{update}} to \code{\link{georob}}.  If \code{param} is
  a matrix (or a data frame) then it must have \code{nset} rows and\cr
  \code{length(object[["variogram.object"]][[1]][["param"]])} columns with
  initial values of variogram parameters for the \code{nset}
  cross-validation sets, and \code{colnames(param)} must match\cr
  \code{names(object[["variogram.object"]][[1]][["param"]])}.}
  
  \item{fit.param}{a named logical vector or a matrix or data frame
  defining which variogram parameters should be adjusted by
  \code{\link[stats]{update}}.  If \code{fit.param} is a matrix (or a data
  frame) then it must have \code{nset} rows and\cr
  \code{length(object[["variogram.object"]][[1]][["fit.param"]])} columns
  with variogram parameter fitting flags for the \code{nset}
  cross-validation sets, and \code{colnames(fit.param)} must match\cr
  \code{names(object[["variogram.object"]][[1]][["fit.param"]])}.}
  
  \item{aniso}{a named numeric vector or a matrix or data frame with
  initial values of anisotropy parameters passed by
  \code{\link[stats]{update}} to \code{\link{georob}}.  If \code{aniso} is
  a matrix (or a data frame) then it must have \code{nset} rows and\cr
  \code{length(object[["variogram.object"]][[1]][["aniso"]])} columns with
  initial values of anisotropy parameters for the \code{nset}
  cross-validation sets, and \code{colnames(aniso)} must match\cr
  \code{names(object[["variogram.object"]][[1]][["aniso"]])}.}
  
  \item{fit.aniso}{a named logical vector or a matrix or data frame
  defining which anisotropy parameters should be adjusted by
  \code{\link[stats]{update}}.  If \code{fit.aniso} is a matrix (or a data
  frame) then it must have \code{nset} rows and\cr
  \code{length(object[["variogram.object"]][[1]][["fit.aniso"]])} columns
  with anisotropy parameter fitting flags for the \code{nset}
  cross-validation sets, and \code{colnames(fit.aniso)} must match\cr
  \code{names(object[["variogram.object"]][[1]][["fit.aniso"]])}.}
  
  \item{variogram.object}{an optional list that gives initial values for
  fitting a possibly nested variogram model for the cross-validation sets.
  Each component is a list with the following components:
  
  \itemize{

      \item{param}: an optional named numeric vector or a matrix or data
      frame with initial values of variogram parameters passed by
      \code{\link[stats]{update}} to \code{\link{georob}}.  If \code{param}
      is a matrix (or a data frame) then it must have \code{nset} rows and\cr
      \code{length(object[["variogram.object"]][[i]][["param"]])} columns
      with initial values of variogram parameters for the \code{nset}
      cross-validation sets (\var{i} is the \var{i}th variogram structure),
      and \code{colnames(param)} must match\cr
      \code{names(object[["variogram.object"]][[i]][["param"]])}.
      
      \item{fit.param}: an optional named logical vector or a matrix or data frame
      defining which variogram parameters should be adjusted by
      \code{\link[stats]{update}}.  If \code{fit.param} is a matrix (or a
      data frame) then it must have \code{nset} rows and\cr
      \code{length(object[["variogram.object"]][[i]][["fit.param"]])}
      columns with variogram parameter fitting flags for the \code{nset}
      cross-validation sets (\var{i} is the \var{i}th variogram structure),
      and \code{colnames(fit.param)} must match\cr
      \code{names(object[["variogram.object"]][[i]][["fit.param"]])}.
      
      \item{aniso}: an optional named numeric vector or a matrix or data frame with
      initial values of anisotropy parameters passed by
      \code{\link[stats]{update}} to \code{\link{georob}}.  If \code{aniso}
      is a matrix (or a data frame) then it must have \code{nset} rows and\cr
      \code{length(object[["variogram.object"]][[i]][["aniso"]])} columns
      with initial values of anisotropy parameters for the \code{nset}
      cross-validation sets (\var{i} is the \var{i}th variogram structure),
      and \code{colnames(aniso)} must match\cr
      \code{names(object[["variogram.object"]][[i]][["aniso"]])}.
      
      \item{fit.aniso}: an optional named logical vector or a matrix or
      data frame defining which anisotropy parameters should be adjusted by
      \code{\link[stats]{update}}.  If \code{fit.aniso} is a matrix (or a
      data frame) then it must have \code{nset} rows and\cr
      \code{length(object[["variogram.object"]][[i]][["fit.aniso"]])}
      columns with anisotropy parameter fitting flags for the \code{nset}
      cross-validation sets (\var{i} is the \var{i}th variogram structure),
      and \code{colnames(fit.aniso)} must match\cr
      \code{names(object[["variogram.object"]][[i]][["fit.aniso"]])}.
  
    }
  
  }
  
  \item{use.fitted.param}{logical scalar controlling whether fitted values
  of \code{param} (and \code{aniso}) are used as initial values when
  variogram parameters are fitted for the cross-validation sets (default
  \code{TRUE}).}
  
  \item{return.fit}{logical controlling whether information about the fit
  should be returned when re-estimating the model with the reduced data
  sets (default \code{FALSE}).}
  
  \item{reduced.output}{logical controlling whether the complete fitted
  model objects, fitted to the reduced data sets, are returned
  (\code{FALSE}) or only some components (\code{TRUE}, default, see
  \emph{Value}).  Ignored if \code{return.fit = FALSE}.}
  
  \item{lgn}{logical controlling whether Kriging predictions of a
  log-transformed response should be transformed back to the original scale
  of the measurements (default \code{FALSE}).}
  
  \item{mfl.action}{character controlling what is done when some levels of
  factor(s) are not present in any of the subsets used to fit the model.
  The function either stops (\code{"stop"}) or treats the respective
  factors as model offset (\code{"offset"}, default).}
  
  \item{ncores}{positive integer controlling how many cores are used for
  parallelized computations, see \emph{Details}.}
  
  \item{verbose}{non-negative integer controlling logging of diagnostic
  messages to the console during model fitting.  Passed by
  \code{\link[stats]{update}} to \code{\link{georob}}.}
    
  \item{\dots}{additional arguments passed by \code{\link[stats]{update}}
  to \code{\link{georob}}, see \emph{Details}.}

}

\details{
  \bold{Note} that \emph{the data frame passed as} \code{data}
  \emph{argument to} \code{georob} \emph{must exist in the user workspace
  when calling \code{cv.georob}}.  
  
  \code{cv.georob} then uses the packages \pkg{parallel}, \pkg{snow}
  and \pkg{snowfall} for parallelized computations.  By default, the
  function uses \eqn{K} CPUs but not more than are physically available (as
  returned by \code{\link[parallel]{detectCores}}).
  
  \code{cv.georob} uses the function \code{\link[stats]{update}} to
  re-estimate the model with the reduced data sets.  Therefore, any
  argument accepted by \code{\link{georob}} except \code{data} can be
  changed when re-fitting the model.  Some of them (e.g. \code{formula},
  \code{subset}, etc.)  are explicit arguments of \code{cv.georob}, but
  also the remaining ones can be passed by \code{\dots} to the function.
  
  Practitioners in geostatistics commonly cross-validate a fitted model
  without re-estimating the model parameters with the reduced data sets.
  This is clearly an unsound practice (see Hastie et al., 2009, sec.
  7.10).  Therefore, the argument \code{re.estimate} should always be set
  to \code{TRUE}.  The alternative is provided only for historic reasons.

}

\value{
  An object of class \code{cv.georob}, which is a list with the two
  components \code{pred} and \code{fit}.  
  
  \code{pred} is a data frame with the coordinates and the
  cross-validation prediction results with the following variables:
  
  \item{subset}{an integer vector defining to which of the \eqn{K} subsets
  an observation was assigned.}
  
  \item{data}{the values of the (possibly log-transformed) response.}
  
  \item{pred}{the Kriging predictions.}
  
  \item{se}{the Kriging standard errors.}
  
  If \code{lgn = TRUE} then \code{pred} has the additional variables:
  
  \item{lgn.data}{the untransformed response.}
  
  \item{lgn.pred}{the unbiased back-transformed predictions of a
  log-transformed response.}
  
  \item{lgn.se}{the Kriging standard errors of the back-transformed
  predictions of a\cr log-transformed response.}
  
  The second component \code{fit} contains either the full outputs of
  \code{georob}, fitted for the \eqn{K} reduced data sets
  (\code{reduced.output = FALSE}), or \eqn{K} lists with the components
  \code{tuning.psi}, \code{converged}, \cr \code{convergence.code},
  \code{gradient}, \code{variogram.object}, \code{coefficients} along with
  the standard errors of
  \eqn{\widehat{\mbox{\boldmath$\beta$\unboldmath}}}{hat\beta}, see
  \code{\link{georobObject}}.
 
}


\references{
Hastie, T., Tibshirani, R. and Friedman, J. (2009) \emph{The Elements of
Statistical Learning; Data Mining, Inference and Prediction}.  New York:
Springer-Verlag.
}

\author{
   Andreas Papritz \email{andreas.papritz@env.ethz.ch}}

\seealso{
  \code{\link{georobIntro}} for a description of the model and a brief summary of the algorithms;
  
  \code{\link{georob}} for (robust) fitting of spatial linear models;
  
  \code{\link{georobObject}} for a description of the class \code{georob};
  
  \code{\link{profilelogLik}} for computing profiles of Gaussian likelihoods;
  
  \code{\link{plot.georob}} for display of (RE)ML variogram estimates;
  
  \code{\link{control.georob}} for controlling the behaviour of \code{georob};
  
  \code{\link{georobModelBuilding}} for stepwise building models of class \code{georob};
%  \code{\link{cv.georob}} for assessing the goodness of a fit by \code{georob}; 
  
  \code{\link{georobMethods}} for further methods for the class \code{georob};
  
  \code{\link{predict.georob}} for computing robust Kriging predictions; 
  
  \code{\link{validate.predictions}} for validating Kriging predictions; 
  
  \code{\link{lgnpp}} for unbiased back-transformation of Kriging predictions 
  of log-transformed data; 
  
  \code{\link{georobSimulation}} for simulating realizations of a Gaussian process
  from a model fitted by \code{georob}; and finally
  
  \code{\link{sample.variogram}} and \code{\link{fit.variogram.model}} 
  for robust estimation and modelling of sample variograms.
}


\examples{
\dontrun{
data(meuse)

r.logzn <- georob(log(zinc) ~ sqrt(dist), data = meuse, locations = ~ x + y,
    variogram.model = "RMexp",
    param = c(variance = 0.15, nugget = 0.05, scale = 200),
    tuning.psi = 1)

r.logzn.cv.1 <- cv(r.logzn, seed = 1, lgn = TRUE, ncores = 1, verbose = 1)
## using multiple cores
## r.logzn.cv.1 <- cv(r.logzn, seed = 1, lgn = TRUE)
r.logzn.cv.2 <- cv(r.logzn, formula = .~. + ffreq, seed = 1, lgn = TRUE, 
    ncores = 1, verbose = 1)
## using multiple cores
## r.logzn.cv.2 <- cv(r.logzn, formula = .~. + ffreq, seed = 1, lgn = TRUE)

plot(r.logzn.cv.1, type = "bs")
plot(r.logzn.cv.2, type = "bs", add = TRUE, col = "red")

legend("topright", lty = 1, col = c("black", "red"), bty = "n",
    legend = c("log(Zn) ~ sqrt(dist)", "log(Zn) ~ sqrt(dist) + ffreq"))}
}

\keyword{models}
\keyword{spatial}
