% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/GnE.R
\name{GnE}
\alias{GnE}
\title{Penalized factorial regression using glmnet}
\usage{
GnE(
  dat,
  Y,
  G,
  E,
  K = NULL,
  indices = NULL,
  indicesData = NULL,
  testEnv = NULL,
  weight = NULL,
  outputFile = NULL,
  corType = c("pearson", "spearman"),
  partition = data.frame(),
  nfolds = 10,
  alpha = 1,
  lambda = NULL,
  penG = 0,
  penE = 0,
  scaling = c("train", "all", "no"),
  quadratic = FALSE,
  verbose = FALSE
)
}
\arguments{
\item{dat}{A \code{data.frame} with data from multi-environment trials.
Each row corresponds to a particular genotype in a particular environment.
The data do not need to be balanced, i.e. an environment does not need to
contain all genotypes. \code{dat} should contain the training as well as the
test environments (see \code{testEnv}).}

\item{Y}{The trait to be analyzed: either of type character, in which case
it should be one of the column names in \code{dat}, or numeric, in which
case the Yth column of \code{dat} will be analyzed.}

\item{G}{The column in \code{dat} containing the factor genotype (either
character or numeric).}

\item{E}{The column in \code{dat} containing the factor environment
(either character or numeric).}

\item{K}{A kinship matrix. Used for replacing the estimated genotypic main
effect and each of the sensitivities by genomic prediction from a g-BLUP
model for each of them. If \code{NULL}, the estimated effects from the model
are returned and used for constructing predictions.}

\item{indices}{The columns in \code{dat} containing the environmental
indices (vector of type character). Alternatively, if the indices are always
constant within environments (i.e. not genotype dependent), the
environmental data can also be provided using the argument \code{indicesData}
(see below).}

\item{indicesData}{An optional \code{data.frame} containing environmental
indices (covariates); one value for each environment and index. It should
have the environment names as row names (corresponding to the names
contained in \code{dat$E}); the column names are the indices. If
\code{indices} (see before) is also provided, the latter will be ignored.}

\item{testEnv}{vector (character). Data from these environments are not used
for fitting the model. Accuracy is evaluated for training and test
environments separately. The default is \code{NULL}, i.e. no test
environments, in which case the whole data set is training. It is also
possible that there are test environments, but without any data; in this
case, no accuracy is reported for test environments (CHECK correctness).}

\item{weight}{Numeric vector of length \code{nrow(dat)}, specifying the
weight (inverse variance) of each observation, used in glmnet. Default
\code{NULL}, giving constant weights.}

\item{outputFile}{The file name of the output files, without the .csv extension,
which is added by the function. If not \code{NULL}, the prediction accuracies
for training and test environments are written to separate files. If
\code{NULL} the output is not written to file.}

\item{corType}{Type of correlation: Pearson (default) or Spearman rank correlation.}

\item{partition}{\code{data.frame} with columns E and partition. The column
E should contain the training environments (type character); partition
should be of type integer. Environments in the same fold should have
the same integer value. Default is \code{data.frame()}, in which case the
function uses leave-one-environment-out cross-validation. If \code{NULL},
the (inner) training sets used for cross-validation will be drawn randomly
from all observations, ignoring the environment structure. In the latter
case, the number of folds (nfolds) can be specified.}

\item{nfolds}{Default 10. If \code{partition} is \code{NULL}, this can be
used to specify the number of folds to be used in glmnet.}

\item{alpha}{Type of penalty, as in glmnet (1 = LASSO, 0 = ridge; in between
= elastic net). Default is 1.}

\item{lambda}{Numeric vector; defines the grid over which the penalty lambda
is optimized in cross validation. Default: NULL (defined by glmnet).
Important special case: lambda = 0 (no penalty).}

\item{penG}{numeric; default 0. If 1, genotypic main effects are
penalized. If 0, they are not. Any non-negative real number is allowed.}

\item{penE}{numeric; default 0. If 1, environmental main effects are
penalized. If 0, they are not. Any non-negative real number is allowed.}

\item{scaling}{determines how the environmental variables are scaled.
"train" : all data (test and training environments) are scaled
using the mean and standard deviation in the training environments.
"all" : using the mean and standard deviation of all environments.
"no" : No scaling.}

\item{quadratic}{boolean; default \code{FALSE}. If \code{TRUE}, quadratic
terms (i.e., squared indices) are added to the model.}

\item{verbose}{boolean; default \code{FALSE}. If \code{TRUE}, the accuracies
per environment are printed on screen.}
}
\value{
A list with the following elements:
\describe{
\item{predTrain}{A data.frame with predictions for the training set}
\item{predTest}{A data.frame with predictions for the test set}
\item{resTrain}{A data.frame with residuals for the training set}
\item{resTest}{A data.frame with residuals for the test set}
\item{mu}{the estimated overall mean}
\item{envInfoTrain}{The estimated environmental main effects, and the
predicted effects, obtained when the former are regressed on the averaged
indices, using penalized regression}
\item{envInfoTest}{The predicted environmental main effects for the test
environments, obtained from penalized regression using the estimated
main effects for the training environments and the averaged indices}
\item{parGeno}{data.frame containing the estimated genotypic main effects
(first column) and sensitivities (subsequent columns)}
\item{trainAccuracyEnv}{a data.frame with the accuracy (r) for each
training environment, as well as the root mean square error (RMSE), mean
absolute deviation (MAD) and rank (the latter is a proportion: how many
of the best 5 genotypes are in the top 10). To be removed or further
developed. All these quantities are also evaluated for a model with only
genotypic and environmental main effects (columns rMain, RMSEMain and
rankMain)}
\item{testAccuracyEnv}{A data.frame with the accuracy for each test
environment, with the same columns as trainAccuracyEnv}
\item{trainAccuracyGeno}{a data.frame with the accuracy (r) for each
genotype, averaged over the training environments}
\item{testAccuracyGeno}{a data.frame with the accuracy (r) for each
genotype, averaged over the test environments}
\item{lambda}{The value of lambda selected using cross validation}
\item{lambdaSequence}{The values of lambda used in the fits of glmnet. If
\code{lambda} was provided as input, the value of \code{lambda} is
returned}
\item{RMSEtrain}{The root mean squared error on the training environments}
\item{RMSEtest}{The root mean squared error on the test environments}
\item{Y}{The name of the trait that was predicted, i.e. the column name
in \code{dat} that was used}
\item{G}{The genotype label that was used, i.e. the argument G that was
used}
\item{E}{The environment label that was used, i.e. the argument E that
was used}
\item{indices}{The indices that were used, i.e. the argument indices that
was used}
\item{quadratic}{The quadratic option that was used}
}
}
\description{
\loadmathjax
Based on multi-environment field trials, fits the factorial regression model
\mjeqn{Y_{ij} = \mu + e_j + g_i + \sum_{k=1}^s \beta_{ik} x_{ijk} +
\epsilon_{ij},}{Y_ij = mu + e_j + g_i + sum_(k=1)^s beta_ik x_ijk + epsilon_ij,} with environmental main effects \mjeqn{e_j}{e_j},
genotypic main effects \mjeqn{g_{i}}{g_i} and genotype-specific
environmental sensitivities \mjeqn{\beta_{ik}}{beta_ik}. See e.g. Millet
et al. 2019 and Bustos-Korts et al. 2019. There are \mjeqn{s}{s}
environmental indices with values \mjeqn{x_{ijk}}{x_ijk}. Optionally,
predictions can be made for a set of test environments, for which
environmental indices are available. The new environments must contain the
same set of genotypes, or a subset.

Penalization: the model above is fitted using glmnet, simultaneously
penalizing \mjeqn{e_j}{e_j}, \mjeqn{g_i}{g_i} and
\mjeqn{\beta_{ik}}{beta_ik}. If \code{penG = 0} and \code{penE = 0}, the main
effects \mjeqn{g_i}{g_i} and \mjeqn{e_j}{e_j} are not penalized. If these
parameters are 1, the main effects are penalized to the same degree as
the sensitivities. Any non-negative values are allowed. Cross validation is
performed with each fold containing a number of environments (details below).

After fitting the model, it is possible to replace the estimated
genotypic main effects and sensitivities by
their predicted genetic values. Specifically, if a kinship matrix \code{K}
is provided, the function performs genomic prediction with g-BLUP for the
genotypic main effect and each of the sensitivities in turn.

Predictions for the test environments are first constructed using the
estimated genotypic main effects and sensitivities; next, predicted
environmental main effects are added. The latter are obtained by
regressing the estimated environmental main effects for the training
environments on the average values of the indices in these environments,
as in Millet et al. 2019.
}
\examples{
## load the data, which are contained in the package
data(drops_GE)
data(drops_GnE)

## We remove identifiers that we don't need.
drops_GE_GnE <- rbind(drops_GE[, -c(2, 4)], drops_GnE[, -c(2, 4)])

## Define indices.
ind <- colnames(drops_GE)[13:23]

## Define test environments.
testenv <- levels(drops_GnE$Experiment)

## Additive model, only main effects (set the penalty parameter to a large value).
Additive_model <- GnE(drops_GE_GnE, Y = "grain.yield", lambda = 100000,
                      G = "Variety_ID", E = "Experiment", testEnv = testenv,
                      indices = ind, penG = FALSE, penE = FALSE,
                      alpha = 0.5, scaling = "train")
\donttest{
## Full model, no penalization (set the penalty parameter to zero).
Full_model <- GnE(drops_GE_GnE, Y = "grain.yield", lambda = 0,
                  G = "Variety_ID", E = "Experiment", testEnv = testenv,
                  indices = ind, penG = FALSE, penE = FALSE,
                  alpha = 0.5, scaling = "train")

## Elastic Net model, set alpha parameter to 0.5.
Elnet_model <- GnE(drops_GE_GnE, Y = "grain.yield", lambda = NULL,
                   G = "Variety_ID", E = "Experiment", testEnv = testenv,
                   indices = ind, penG = FALSE, penE = FALSE,
                   alpha = 0.5, scaling = "train")

## Lasso model, set alpha parameter to 1.
Lasso_model <- GnE(drops_GE_GnE, Y = "grain.yield", lambda = NULL,
                   G = "Variety_ID", E = "Experiment", testEnv = testenv,
                   indices = ind, penG = FALSE, penE = FALSE,
                   alpha = 1, scaling = "train")

## Ridge model, set alpha parameter to 0.
Ridge_model <- GnE(drops_GE_GnE, Y = "grain.yield", lambda = NULL,
                   G = "Variety_ID", E = "Experiment", testEnv = testenv,
                   indices = ind, penG = FALSE, penE = FALSE,
                   alpha = 0, scaling = "train")
}

}
\references{
Millet, E.J., Kruijer, W., Coupel-Ledru, A. et al. Genomic
prediction of maize yield across European environmental conditions. Nat
Genet 51, 952–956 (2019). \doi{10.1038/s41588-019-0414-y}
}
