% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hetTP.R
\name{mleHetTP}
\alias{mleHetTP}
\title{Student-t process modeling with heteroskedastic noise}
\usage{
mleHetTP(
  X,
  Z,
  lower = NULL,
  upper = NULL,
  noiseControl = list(k_theta_g_bounds = c(1, 100), g_max = 10000, g_bounds = c(1e-06,
    0.1), nu_bounds = c(2 + 0.001, 30), sigma2_bounds = c(sqrt(.Machine$double.eps),
    10000)),
  settings = list(linkThetas = "joint", logN = TRUE, initStrategy = "residuals", checkHom
    = TRUE, penalty = TRUE, trace = 0, return.matrices = TRUE, return.hom = FALSE, factr
    = 1e+09),
  covtype = c("Gaussian", "Matern5_2", "Matern3_2"),
  maxit = 100,
  known = list(beta0 = 0),
  init = list(nu = 3),
  eps = sqrt(.Machine$double.eps)
)
}
\arguments{
\item{X}{matrix of all designs, one per row, or list with elements:
\itemize{
  \item \code{X0} matrix of unique design locations, one point per row
  \item \code{Z0} vector of averaged observations, of length \code{nrow(X0)}
  \item \code{mult} number of replicates at designs in \code{X0}, of length \code{nrow(X0)}
}}

\item{Z}{vector of all observations. If using a list with \code{X}, \code{Z} has to be ordered with respect to \code{X0}, and of length \code{sum(mult)}}

\item{lower, upper}{bounds for the \code{theta} parameter (see \code{\link[hetGP]{cov_gen}} for the exact parameterization).
In the multivariate case, it is possible to give vectors for bounds (resp. scalars) for anisotropy (resp. isotropy)}

\item{noiseControl}{list with elements related to optimization of the noise process parameters:
\itemize{
\item \code{g_min}, \code{g_max} minimal and maximal noise to signal ratio (of the mean process)
\item \code{lowerDelta}, \code{upperDelta} optional vectors (or scalars) of bounds on \code{Delta}, of length \code{nrow(X0)} (default to \code{rep(eps, nrow(X0))} and \code{rep(noiseControl$g_max, nrow(X0))} resp., or their \code{log}) 
\item \code{lowerTheta_g}, \code{upperTheta_g} optional vectors of bounds for the lengthscales of the noise process if \code{linkThetas == 'none'}.
Same as for \code{theta} if not provided.
\item \code{k_theta_g_bounds} if \code{linkThetas == 'joint'}, vector with minimal and maximal values for \code{k_theta_g} (default to \code{c(1, 100)}). See Details.
\item \code{g_bounds} vector for minimal and maximal noise to signal ratios for the noise of the noise process, i.e., the smoothing parameter for the noise process.
(default to \code{c(1e-6, 0.1)}).
\item \code{sigma2_bounds}, vector providing minimal and maximal signal variance.
\item \code{nu_bounds}, vector providing minimal and maximal values for the degrees of freedom. 
}}

\item{settings}{list for options about the general modeling procedure, with elements:
\itemize{
  \item \code{linkThetas} defines the relation between lengthscales of the mean and noise processes.
  Either \code{'none'}, \code{'joint'} (default) or \code{'constr'}, see Details.
  \item \code{logN}, when \code{TRUE} (default), the log-noise process is modeled.
  \item \code{initStrategy} one of \code{'simple'}, \code{'residuals'} (default) and \code{'smoothed'} to obtain starting values for \code{Delta}, see Details
  \item \code{penalty} when \code{TRUE}, the penalized version of the likelihood is used (i.e., the sum of the log-likelihoods of the mean and variance processes, see References).
  \item \code{checkHom} when \code{TRUE}, if the log-likelihood with a homoskedastic model is better, then return it.
  \item \code{trace} optional scalar (default to \code{0}). If positive, tracing information on the fitting process.
If \code{1}, information is given about the result of the heterogeneous model optimization.
Level \code{2} gives more details. Level \code{3} additionally displays all details about initialization of hyperparameters.
\item \code{return.matrices} boolean to include the inverse covariance matrix in the object for further use (e.g., prediction).
\item Arguments \code{factr} (default to 1e9) and \code{pgtol} are available to be passed to \code{control} for L-BFGS-B in \code{\link[stats]{optim}}.   
}}

\item{covtype}{covariance kernel type, either \code{'Gaussian'}, \code{'Matern5_2'} or \code{'Matern3_2'}, see \code{\link[hetGP]{cov_gen}}}

\item{maxit}{maximum number of iterations for \code{L-BFGS-B} of \code{\link[stats]{optim}} dedicated to maximum likelihood optimization}

\item{init, known}{optional lists of starting values for mle optimization or that should not be optimized over, respectively.
Values in \code{known} are not modified, while it can happen to those of \code{init}, see Details. 
One can set one or several of the following:
\itemize{
\item \code{theta} lengthscale parameter(s) for the mean process either one value (isotropic) or a vector (anisotropic)
\item \code{Delta} vector of nuggets corresponding to each design in \code{X0}, that are smoothed to give \code{Lambda}
(as the global covariance matrix depends on \code{Delta} and \code{nu_hat}, it is recommended to also pass values for \code{theta})
\item \code{beta0} constant trend of the mean process
\item \code{k_theta_g} constant used to link the lengthscales of the mean and noise processes, when \code{settings$linkThetas == 'joint'}
\item \code{theta_g} either one value (isotropic) or a vector (anisotropic) for lengthscale parameter(s) of the noise process, when \code{settings$linkThetas != 'joint'}
\item \code{g} scalar nugget of the noise process
\item \code{nu} degree of freedom parameter
\item \code{sigma2} scale variance
\item \code{g_H} scalar homoskedastic nugget for the initialisation with a \code{\link[hetGP]{mleHomGP}}. See Details.
}}

\item{eps}{jitter used in the inversion of the covariance matrix for numerical stability}
}
\value{
a list which is given the S3 class \code{"hetTP"}, with elements:
\itemize{
\item \code{theta}: unless given, maximum likelihood estimate (mle) of the lengthscale parameter(s),
\item \code{Delta}: unless given, mle of the nugget vector (non-smoothed),
\item \code{Lambda}: predicted input noise variance at \code{X0}, 
\item \code{sigma2}: plugin estimator of the variance,
\item \code{theta_g}: unless given, mle of the lengthscale(s) of the noise/log-noise process,
\item \code{k_theta_g}: if \code{settings$linkThetas == 'joint'}, mle for the constant by which lengthscale parameters of \code{theta} are multiplied to get \code{theta_g},
\item \code{g}: unless given, mle of the nugget of the noise/log-noise process,
\item \code{trendtype}: either "\code{SK}" if \code{beta0} is provided, else "\code{OK}",
\item \code{beta0} constant trend of the mean process, plugin-estimator unless given,
\item \code{nmean}: plugin estimator for the constant noise/log-noise process mean,
\item \code{ll}: log-likelihood value (\code{ll_non_pen} is the value without the penalty),
\item \code{nit_opt}, \code{msg}: \code{counts} and \code{message} returned by \code{\link[stats]{optim}}
\item \code{modHom}: homoskedastic GP model of class \code{homGP} used for initialization of the mean process,
\item \code{modNugs}: homoskedastic GP model of class \code{homGP} used for initialization of the noise/log-noise process,
\item \code{nu_hat_var}: variance of the noise process,
\item \code{used_args}: list with arguments provided in the call to the function, which is saved in \code{call},
\item \code{X0}, \code{Z0}, \code{Z}, \code{eps}, \code{logN}, \code{covtype}: values given in input,
\item \code{time}: time to train the model, in seconds.
}
}
\description{
Student-t process regression under input dependent noise based on maximum likelihood estimation of the hyperparameters.
A GP is used to model latent (log-) variances.
This function is enhanced to deal with replicated observations.
}
\details{
The global covariance matrix of the model is parameterized as \code{K = sigma2 * C + Lambda * diag(1/mult)},
with \code{C} the correlation matrix between unique designs, depending on the family of kernel used (see \code{\link[hetGP]{cov_gen}} for available choices).
\code{Lambda} is the prediction on the noise level given by a (homoskedastic) GP: \cr
\deqn{\Lambda = C_g(C_g + \mathrm{diag}(g/\mathrm{mult}))^{-1} \Delta} \cr
with \code{C_g} the correlation matrix between unique designs for this second GP, with lengthscales hyperparameters \code{theta_g} and nugget \code{g}
and \code{Delta} the variance level at \code{X0} that are estimated.

It is generally recommended to use \code{\link[hetGP]{find_reps}} to pre-process the data, to rescale the inputs to the unit cube and to normalize the outputs.

The noise process lengthscales can be set in several ways:
\itemize{
\item using \code{k_theta_g} (\code{settings$linkThetas == 'joint'}), supposed to be greater than one by default. 
In this case lengthscales of the noise process are multiples of those of the mean process.
\item if \code{settings$linkThetas == 'constr'}, then the lower bound on \code{theta_g} corresponds to the estimated values of a homoskedastic GP fit.
\item else lengthscales between the mean and noise process are independent (both either anisotropic or not).
}

When no starting nor fixed parameter values are provided with \code{init} or \code{known}, 
the initialization process consists of fitting first an homoskedastic model of the data, called \code{modHom}.
Unless provided with \code{init$theta}, initial lengthscales are taken at 10\% of the range determined with \code{lower} and \code{upper},
while \code{init$g_H} may be used to pass an initial nugget value.
The resulting lengthscales provide initial values for \code{theta} (or update them if given in \code{init}). \cr \cr
If necessary, a second homoskedastic model, \code{modNugs}, is fitted to the empirical residual variance between the prediction
 given by \code{modHom} at \code{X0} and \code{Z} (up to \code{modHom$nu_hat}).
Note that when specifying \code{settings$linkThetas == 'joint'}, then this second homoskedastic model has fixed lengthscale parameters.
Starting values for \code{theta_g} and \code{g} are extracted from \code{modNugs}.\cr \cr
Finally, three initialization schemes for \code{Delta} are available with \code{settings$initStrategy}: 
\itemize{
\item for \code{settings$initStrategy == 'simple'}, \code{Delta} is simply initialized to the estimated \code{g} value of \code{modHom}. 
Note that this procedure may fail when \code{settings$penalty == TRUE}.
\item for \code{settings$initStrategy == 'residuals'}, \code{Delta} is initialized to the estimated residual variance from the homoskedastic mean prediction.
\item for \code{settings$initStrategy == 'smoothed'}, \code{Delta} takes the values predicted by \code{modNugs} at \code{X0}.
}

Notice that \code{lower} and \code{upper} bounds cannot be equal for \code{\link[stats]{optim}}.
}
\examples{
##------------------------------------------------------------
## Example 1: Heteroskedastic TP modeling on the motorcycle data
##------------------------------------------------------------
set.seed(32)

## motorcycle data
library(MASS)
X <- matrix(mcycle$times, ncol = 1)
Z <- mcycle$accel
nvar <- 1
plot(X, Z, ylim = c(-160, 90), ylab = 'acceleration', xlab = "time")


## Model fitting
model <- mleHetTP(X = X, Z = Z, lower = rep(0.1, nvar), upper = rep(50, nvar),
                  covtype = "Matern5_2")
            
## Display averaged observations
points(model$X0, model$Z0, pch = 20)

## A quick view of the fit                  
summary(model)

## Create a prediction grid and obtain predictions
xgrid <- matrix(seq(0, 60, length.out = 301), ncol = 1) 
preds <- predict(x = xgrid, object =  model)

## Display mean predictive surface
lines(xgrid, preds$mean, col = 'red', lwd = 2)

## Student-t quantiles for two-sided 95\% intervals (2.5\% in each tail)
df_pred <- model$nu + nrow(X)
q_lo <- qt(0.025, df = df_pred)
q_hi <- qt(0.975, df = df_pred)

## Display 95\% confidence intervals
lines(xgrid, preds$mean + sqrt(preds$sd2) * q_lo, col = 2, lty = 2)
lines(xgrid, preds$mean + sqrt(preds$sd2) * q_hi, col = 2, lty = 2)
## Display 95\% prediction intervals (predictive variance plus predicted noise variance)
lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * q_lo, col = 3, lty = 2)
lines(xgrid, preds$mean + sqrt(preds$sd2 + preds$nugs) * q_hi, col = 3, lty = 2)

##------------------------------------------------------------
## Example 2: 2D Heteroskedastic TP modeling
##------------------------------------------------------------
## Fix the RNG seed so the random design and simulated noise below are reproducible
set.seed(1)
## Input dimension; gives the length of the lower/upper lengthscale bounds in the fit below
nvar <- 2
  
## Branin-type test function with inputs rescaled from [0,1]^2
## x: matrix with one point per row (two columns), or a single point given as a vector
## Returns one function value per row of x
branin <- function(x){
  ## Promote a single point given as a plain vector to a one-row matrix
  if(is.null(nrow(x))) x <- matrix(x, nrow = 1)

  ## Map the unit-square coordinates to the ranges used by the formula
  u <- x[,1] * 15 - 5
  v <- x[,2] * 15

  quad <- (v - 5/(4 * pi^2) * (u^2) + 5/pi * u - 6)^2
  quad + 10 * (1 - 1/(8 * pi)) * cos(u) + 10
}

## Noise field, used as the standard deviation of the simulated observations
## x: matrix with one point per row, or a single point given as a vector
## Returns one value per row of x
noiseFun <- function(x){
  ## Promote a single point given as a plain vector to a one-row matrix
  if(is.null(nrow(x))) x <- matrix(x, nrow = 1)

  oscillation <- 2*sin(x[,1]*pi)*cos(x[,2]*3*pi)
  growth <- 5*rowSums(x^2)
  1/5*(3*(2 + oscillation + growth))
}

## Data generating function combining mean and noise fields:
## branin(x) plus zero-mean Gaussian noise with standard deviation noiseFun(x)
## x: matrix with one point per row, or a single point given as a vector
ftest <- function(x){
  ## Accept a single point given as a plain vector, as branin() and noiseFun() do;
  ## otherwise nrow(x) is NULL and rnorm() has no valid sample size
  if(is.null(nrow(x)))
    x <- matrix(x, nrow = 1)
  return(branin(x) + rnorm(nrow(x), mean = 0, sd = noiseFun(x)))
}

## Grid of predictive locations
ngrid <- 51
xgrid <- matrix(seq(0, 1, length.out = ngrid), ncol = 1) 
Xgrid <- as.matrix(expand.grid(xgrid, xgrid))

## Unique (randomly chosen) design locations
n <- 100
Xu <- matrix(runif(n * 2), n)

## Select replication sites randomly
X <- Xu[sample(1:n, 20*n, replace = TRUE),]

## obtain training data response at design locations X
Z <- ftest(X)

## Formatting of data for model creation (find replicated observations) 
prdata <- find_reps(X, Z, rescale = FALSE, normalize = FALSE)

## Model fitting
model <- mleHetTP(X = list(X0 = prdata$X0, Z0 = prdata$Z0, mult = prdata$mult), Z = prdata$Z,
                  lower = rep(0.01, nvar), upper = rep(10, nvar),
                  covtype = "Matern5_2")

## a quick view into the data stored in the "hetTP"-class object
summary(model)                  
             
## prediction from the fit on the grid     
preds <- predict(x = Xgrid, object =  model)

## Visualization of the predictive surface
## (keep the previous graphical parameters so they can be restored afterwards)
oldpar <- par(mfrow = c(2, 2))
contour(x = xgrid,  y = xgrid, z = matrix(branin(Xgrid), ngrid), 
  main = "Branin function", nlevels = 20)
points(X, col = 'blue', pch = 20)
contour(x = xgrid,  y = xgrid, z = matrix(preds$mean, ngrid), 
  main = "Predicted mean", nlevels = 20)
points(X, col = 'blue', pch = 20)
contour(x = xgrid,  y = xgrid, z = matrix(noiseFun(Xgrid), ngrid), 
  main = "Noise standard deviation function", nlevels = 20)
points(X, col = 'blue', pch = 20)
contour(x = xgrid,  y= xgrid, z = matrix(sqrt(preds$nugs), ngrid), 
  main = "Predicted noise values", nlevels = 20)
points(X, col = 'blue', pch = 20)
## Restore the graphical parameters that were in effect before the 2 x 2 layout
par(oldpar)
}
\references{
M. Binois, Robert B. Gramacy, M. Ludkovski (2018), Practical heteroskedastic Gaussian process modeling for large simulation experiments,
Journal of Computational and Graphical Statistics, 27(4), 808--821.\cr 
Preprint available on arXiv:1611.05902.\cr \cr
 
A. Shah, A. Wilson, Z. Ghahramani (2014), Student-t processes as alternatives to Gaussian processes, Artificial Intelligence and Statistics, 877--885.
}
\seealso{
\code{\link[hetGP]{predict.hetTP}} for predictions. 
\code{summary} and \code{plot} functions are available as well.
}
