% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generateData.R
\name{generateData}
\alias{generateData}
\title{Quickly generate synthetic data for simulation studies}
\usage{
generateData(
  X = NULL,
  Beta = NULL,
  E = NULL,
  Theta = NULL,
  Sigma.X = NULL,
  n,
  p,
  q,
  rho,
  missing.type = "MCAR",
  Beta.row.sparsity = 0.2,
  Beta.elm.sparsity = 0.2,
  with.seed = NULL
)
}
\arguments{
\item{X}{(Optional) a user-supplied predictor matrix (\eqn{n\times p}). The default is \code{'NULL'} and the program simulates the rows of \code{'X'} independently from \eqn{\mathcal{MVN}}(\eqn{0_p}, \eqn{\mathbf{\Sigma}_X}). A user-supplied matrix overrides this default, and the argument \code{'Sigma.X'} for \eqn{\mathbf{\Sigma}_X} will be ignored.}

\item{Beta}{(Optional) a user-supplied regression coefficient matrix \eqn{\mathbf{B}} (\eqn{p\times q}). The default is \code{'NULL'} and the program will generate a sparse \eqn{\mathbf{B}} in which the nonzero elements are independently drawn from \eqn{\mathcal{N}(0, 1)}; the row sparsity and element sparsity of \eqn{\mathbf{B}} are controlled by the arguments \code{'Beta.row.sparsity'} and \code{'Beta.elm.sparsity'}, respectively. A user-supplied matrix overrides this default, and \code{'Beta.row.sparsity'} and \code{'Beta.elm.sparsity'} will be ignored.}

\item{E}{(Optional) a user-supplied error matrix (\eqn{n\times q}). The default is \code{'NULL'} and the program simulates the rows of \code{'E'} independently from \eqn{\mathcal{MVN}}(\eqn{0_q}, \eqn{\mathbf{\Theta}^{-1}}). A response matrix \code{'Y'} without missing values is given by \code{'Y = X \%*\% Beta + E'}. A user-supplied matrix overrides this default, and the argument \code{'Theta'} for \eqn{\mathbf{\Theta}} will be ignored.}

\item{Theta}{(Optional) a user-supplied positive definite precision (inverse covariance) matrix \eqn{\mathbf{\Theta}} (\eqn{q\times q}) for the response variables. The default is \code{'NULL'} and the program will generate a block-structured matrix having four blocks corresponding to four types of network structures: independent, weak graph, strong graph and chain. This is only needed when \code{'E = NULL'}.}

\item{Sigma.X}{(Optional) a user-supplied positive definite covariance matrix \eqn{\mathbf{\Sigma}_X} (\eqn{p\times p}) for the predictor variables. The samples of \code{'X'} are independently drawn from a multivariate Gaussian distribution \eqn{\mathcal{MVN}}(\eqn{0_p}, \eqn{\mathbf{\Sigma}_X}). If \code{'Sigma.X = NULL'} (default), the program uses an AR(1) covariance with 0.7 autocorrelation (i.e., \eqn{[\mathbf{\Sigma}_X]_{jk} = 0.7^{|j-k|}}). This is only needed when \code{'X = NULL'}.}

\item{n}{Sample size.}

\item{p}{The dimensionality of the predictors.}

\item{q}{The dimensionality of the responses.}

\item{rho}{A scalar or a numeric vector of length \eqn{q} specifying the approximate proportion of missing values in each column of the response matrix.}

\item{missing.type}{Character string: can be "\code{MCAR}" (default), "\code{MAR}" or "\code{MNAR}".}

\item{Beta.row.sparsity}{A Bernoulli parameter between 0 and 1 controlling the approximate proportion of rows where at least one element could be nonzero in \eqn{\mathbf{B}}; the default is \code{'Beta.row.sparsity = 0.2'}. This is only needed when \code{'Beta = NULL'}.}

\item{Beta.elm.sparsity}{A Bernoulli parameter between 0 and 1 controlling the approximate proportion of nonzero elements among the rows of \eqn{\mathbf{B}} where not all elements are zeros; the default is \code{'Beta.elm.sparsity = 0.2'}. This is only needed when \code{'Beta = NULL'}.}

\item{with.seed}{A random number seed for the generative process.}
}
\value{
This function returns a \code{'list'} consisting of the following components:
\item{\code{X}}{A simulated (or the user-supplied) predictor matrix (\eqn{n\times p}).}
\item{\code{Y}}{A simulated response matrix without missing values (\eqn{n\times q}).}
\item{\code{Z}}{A simulated response matrix with missing values coded as \code{'NA'}s (\eqn{n\times q}).}
\item{\code{Beta}}{The regression coefficient matrix \eqn{\mathbf{B}} for the generative model (\eqn{p\times q}).}
\item{\code{Theta}}{The precision matrix \eqn{\mathbf{\Theta}} for the generative model (\eqn{q\times q}).}
\item{\code{rho}}{A vector of length \eqn{q} storing the specified missing rate for each column of the response matrix.}
\item{\code{missing.type}}{Character string: the type of missing mechanism used to generate missing values in the response matrix.}
}
\description{
The \sQuote{\code{generateData}} function is used to readily produce synthetic data with randomly/systematically-missing values from a conditional Gaussian graphical model. 
This function supports three types of missing mechanisms that can be specified by users -- missing completely at random (MCAR), missing at random (MAR) and 
missing not at random (MNAR).
}
\details{
The dataset is simulated through the following steps:
\enumerate{
  \item If \code{'X = NULL'} (default), the function \sQuote{\code{MASS::mvrnorm(n, mu = rep(0, p), Sigma = Sigma.X)}} is used to simulate \code{'n'} samples from a \code{'p'}-variate Gaussian distribution for generating a predictor matrix \code{'X'};
  \item If \code{'Beta = NULL'} (default), the function \sQuote{\code{stats::rnorm(p*q, 0, 1)}} is used to fill an empty (\eqn{p \times q}) dimensional matrix \code{'Beta'}, of which the row sparsity and element sparsity are later controlled by the auxiliary arguments \code{'Beta.row.sparsity'} and \code{'Beta.elm.sparsity'}, respectively;
  \item If \code{'E = NULL'} (default), the function \sQuote{\code{MASS::mvrnorm(n, mu = rep(0, q), Sigma = solve(Theta))}} is used to simulate \code{'n'} samples from a \code{'q'}-variate Gaussian distribution for generating an error matrix \code{'E'};
  \item A complete response matrix \code{'Y'} without missing values is then generated by the command \code{'Y = X \%*\% Beta + E'};
  \item To get a response matrix \code{'Z'} := \eqn{f}(\code{'Y'}) corrupted by missing data, the values in \code{'Y'} are partially replaced with \code{'NA'}s following the strategy specified by the arguments \code{'missing.type'} and \code{'rho'}.
}

To better illustrate step 5 above, suppose for all \code{i = 1,...,n} and \code{j = 1,...,q}: \code{'Y[i, j]'} is replaced with \code{'NA'}
if \code{'M[i, j] == 1'}, where \code{'M'} is an indicator matrix of missingness having the same dimension as \code{'Y'}.
The value of \code{'M[i, j]'} is partially controlled by the arguments \code{'missing.type'} and \code{'rho'}. 
Below we sum up the three built-in missing mechanisms supported by the \sQuote{\code{generateData}} function:
\itemize{
  \item \code{'missing.type'} == "\code{MCAR}": \code{'Y[i, j] <- NA'} if \code{'M[i, j] == 1'}, where \code{'M[i, j] = rbinom(1, 1, rho[j])'};
  \item \code{'missing.type'} == "\code{MAR}": \code{'Y[i, j] <- NA'} if \code{'M[i, j] == 1'}, where \code{'M[i, j] = rbinom(1, 1, rho[j] * c / (1 + exp(-(X \%*\% Beta)[i, j])))'}, in which \code{c} is a constant correcting the missing rate of the \code{j}th column of \code{'Y'} to \code{'rho[j]'};
  \item \code{'missing.type'} == "\code{MNAR}": \code{'Y[i, j] <- NA'} if \code{'M[i, j] == 1'}, where \code{'M[i, j] = 1 * (Y[i, j] < Tj)'}, in which \code{'Tj = quantile(Y[ , j], rho[j])'}.
}
Of the aforementioned missing mechanisms, "\code{MCAR}" is random, and the other two are systematic. 
Under "\code{MCAR}", \code{'M[i, j]'} is not related to \code{'Y'} or to \code{'X'};
under "\code{MAR}", \code{'M[i, j]'} is related to \code{'X'}, but not related to \code{'Y'} after \code{'X'} is controlled; 
under "\code{MNAR}", \code{'M[i, j]'} is related to \code{'Y'} itself, even after \code{'X'} is controlled.
}
\examples{
## Simulate a dataset with response values missing completely at random (MCAR), 
## the overall missing rate is around 10\%.
sim.dat <- generateData(n = 300, p = 50, q = 20, rho = 0.1, missing.type = "MCAR")
## -------------------------------------------------------------------------------
## Fit a missoNet model using the simulated dataset.
X <- sim.dat$X  # predictor matrix
Y <- sim.dat$Z  # corrupted response matrix
fit <- missoNet(X = X, Y = Y, lambda.Beta = 0.1, lambda.Theta = 0.1)


## Simulate a dataset with response values missing at random (MAR), the approximate 
## missing rate for each column of the response matrix is specified through a vector 'rho'.
## 
## The row sparsity and element sparsity of the auto-generated 'Beta' could be 
## adjusted correspondingly by using 'Beta.row.sparsity' and 'Beta.elm.sparsity'.
n <- 300; p <- 50; q <- 20
rho <- runif(q, min = 0, max = 0.2)
sim.dat <- generateData(n = n, p = p, q = q, rho = rho, missing.type = "MAR",
                        Beta.row.sparsity = 0.3, Beta.elm.sparsity = 0.2)


## Simulate a dataset with response values missing not at random (MNAR), 
## using the user-supplied 'Beta' and 'Theta'.
n <- 300; p <- 50; q <- 20
Beta <- matrix(rnorm(p*q, 0, 1), p, q)  # a nonsparse 'Beta' (p x q)
Theta <- diag(q)  # a diagonal 'Theta' (q x q)
sim.dat <- generateData(Beta = Beta, Theta = Theta, n = n, p = p, q = q,
                        rho = 0.1, missing.type = "MNAR")
## ---------------------------------------------------------------------          
## Specifying just one of 'Beta' and 'Theta' is also allowed.
sim.dat <- generateData(Theta = Theta, n = n, p = p, q = q,
                        rho = 0.1, missing.type = "MNAR")


## User-supplied 'X', 'Beta' and 'E', in which case 'Y' is deterministic.
n <- 300; p <- 50; q <- 20
X <- matrix(rnorm(n*p, 0, 1), n, p)
Beta <- matrix(rnorm(p*q, 0, 1), p, q)
E <- mvtnorm::rmvnorm(n, rep(0, q), sigma = diag(q))
sim.dat <- generateData(X = X, Beta = Beta, E = E, n = n, p = p, q = q,
                        rho = 0.1, missing.type = "MCAR")
}
\author{
Yixiao Zeng \email{yixiao.zeng@mail.mcgill.ca}, Celia M.T. Greenwood and Archer Yi Yang.
}
