% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/simulation_functions.R
\name{s_generate_data}
\alias{s_generate_data}
\title{Generate linear response data and test and training sets for simulation study}
\usage{
s_generate_data(p, X, beta, binary_outcome = FALSE,
  cluster_distance = c("corr", "corr0", "corr1", "tom", "tom0", "tom1",
  "diffcorr", "difftom", "corScor", "tomScor", "fisherScore"), n, n0,
  include_interaction = F, signal_to_noise_ratio = 1,
  eclust_distance = c("fisherScore", "corScor", "diffcorr", "difftom"),
  cluster_method = c("hclust", "protoclust"), cut_method = c("dynamic",
  "gap", "fixed"), distance_method = c("euclidean", "maximum", "manhattan",
  "canberra", "binary", "minkowski"), n_clusters,
  agglomeration_method = c("complete", "average", "ward.D2", "single",
  "ward.D", "mcquitty", "median", "centroid"), nPC = 1, K.max = 10,
  B = 10)
}
\arguments{
\item{p}{number of genes in design matrix}

\item{X}{gene expression matrix of size n x p using the
\code{generate_blocks} function}

\item{beta}{true beta coefficient vector}

\item{binary_outcome}{Logical. Should a binary outcome be generated. Default
is \code{FALSE}. See details on how a binary outcome is generated}

\item{cluster_distance}{character representing which matrix from the training
set that you want to use to cluster the genes. Must be one of the following
\itemize{ \item corr, corr0, corr1, tom, tom0, tom1, diffcorr, difftom,
corScor, tomScor, fisherScore }}

\item{n}{total number of subjects}

\item{n0}{total number of subjects with E=0}

\item{include_interaction}{Should an interaction with the environment be
generated as part of the response. Default is FALSE.}

\item{signal_to_noise_ratio}{signal to noise ratio, default is 1}

\item{eclust_distance}{character representing which matrix from the training
set that you want to use to cluster the genes based on the environment. See
\code{cluster_distance} for avaialble options. Should be different from
\code{cluster_distance}. For example, if \code{cluster_distance=corr} and
\code{EclustDistance=fisherScore}. That is, one should be based on
correlations ignoring the environment, and the other should be based on
correlations accounting for the environment. This function will always
return this add on}

\item{cluster_method}{Cluster the data using hierarchical clustering or
prototype clustering. Defaults \code{cluster_method="hclust"}. Other option
is \code{\link[protoclust]{protoclust}}, however this package must be
installed before proceeding with this option}

\item{cut_method}{what method to use to cut the dendrogram. \code{'dynamic'}
refers to \code{dynamicTreeCut} library. \code{'gap'} is Tibshirani's gap
statistic \code{\link[cluster]{clusGap}} using the \code{'Tibs2001SEmax'}
rule. \code{'fixed'} is a fixed number specified by the \code{n_clusters}
argument}

\item{distance_method}{one of "euclidean","maximum","manhattan", "canberra",
"binary","minkowski" to be passed to \code{\link[stats]{dist}} function.}

\item{n_clusters}{Number of clusters specified by the user. Only applicable
when \code{cut_method="fixed"}}

\item{agglomeration_method}{the agglomeration method to be used. This should
be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
"complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC)
or "centroid" (= UPGMC).}

\item{nPC}{number of principal components to extract from each cluster.
Default is 1. Only 1 or 2 is allowed.}

\item{K.max}{the maximum number of clusters to consider, must be at least
two. Only used if \code{cutMethod='gap'}}

\item{B}{integer, number of Monte Carlo (“bootstrap”) samples. Only used if
\code{cutMethod='gap'}}
}
\value{
list of (in the following order) \describe{ \item{beta_truth}{a 1
  column matrix containing the true beta coefficient vector}
  \item{similarity}{an object of class similarity which is the similarity
  matrix specified by the \code{cluster_distance}
  argument}\item{similarityEclust}{an object of class similarity which is the
  similarity matrix specified by the \code{eclust_distance} argument}
  \item{DT}{data.table of simulated data from the \code{s_response} function}
  \item{Y}{The simulated response} \item{X0}{the n0 x p design matrix for the
  unexposed subjects} \item{X1}{the n1 x p design matrix for the exposed
  subjects} \item{X_train}{the training design matrix for all subjects}
  \item{X_test}{the test set design matrix for all subjects}
  \item{Y_train}{the training set response} \item{Y_test}{the test set
  response} \item{DT_train}{the training response and training design matrix
  in a single data.frame object} \item{DT_test}{the test response and
  training design matrix in a single data.frame object} \item{S0}{a character
  vector of the active genes i.e. the ones that are associated with the
  response} \item{n_clusters_All}{the number of clusters identified by using
  the similarity matrix specified by the \code{cluster_distance} argument}
  \item{n_clusters_Eclust}{the number of clusters identified by using the
  similarity matrix specified by the \code{eclust_distance}
  argument}\item{n_clusters_Addon}{the sum of \code{n_clusters_All} and
  \code{n_clusters_Eclust}} \item{clustersAll}{the cluster membership of each
  gene based on the \code{cluster_distance} matrix} \item{clustersAddon}{the
  cluster membership of each gene based on both the \code{cluster_distance}
  matrix and the \code{eclust_distance} matrix. Note that each gene will
  appear twice here}\item{clustersEclust}{the cluster membership of each gene
  based on the \code{eclust_distance} matrix}
  \item{gene_groups_inter}{cluster membership of each gene with a penalty
  factor used for the group lasso} \item{gene_groups_inter_Addon}{cluster
  membership of each gene with a penalty factor used for the group lasso}
  \item{tom_train_all}{the TOM matrix based on all training subjects}
  \item{tom_train_diff}{the absolute difference of the exposed and unexposed
  TOM matrices: \eqn{|TOM_{E=1} - TOM_{E=0}|}} \item{tom_train_e1}{the TOM
  matrix based on training exposed subjects only} \item{tom_train_e0}{the TOM
  matrix based on training unexposed subjects only} \item{corr_train_all}{the
  Pearson correlation matrix based on all training subjects}
  \item{corr_train_diff}{the absolute difference of the exposed and unexposed
  Pearson correlation matrices: \eqn{|Cor_{E=1} - Cor_{E=0}|}}
  \item{corr_train_e1}{the Pearson correlation matrix based on training
  exposed subjects only} \item{corr_train_e0}{the Pearson correlation matrix
  based on training unexposed subjects only}\item{fisherScore}{The fisher
  scoring matrix. see \code{\link{u_fisherZ}} for details}\item{corScor}{The
  correlation scoring matrix: \eqn{|Cor_{E=1} + Cor_{E=0} - 2|}}
  \item{mse_null}{The MSE for the null model}\item{DT_train_folds}{The 10
  training folds used for the stability measures}\item{X_train_folds}{The 10
  X training folds (the same as in DT_train_folds)}\item{Y_train_folds}{The
  10 Y training folds (the same as in DT_train_folds)} }
}
\description{
create a function that takes as input, the number of genes, the
  true beta vector, the gene expression matrix created from the
  generate_blocks function and returns a list of data matrix, as well as
  correlation matrices, TOM matrices, cluster information, training and test
  data
}
\details{
To generate a binary outcome we first generate a continuous outcome
  Y which is \eqn{X^T \beta}, defined \eqn{p = 1/(1 + exp(-Y ))} and used
  this to generate a two-class outcome z with \eqn{Pr(z = 1) = p} and
  \eqn{Pr(z = 0) = 1 - p}.
}
\note{
this function calls the \code{s_response} to generate phenotype as a
  function of the gene expression data. This function also returns other
  information derived from the simulated data including the test and training
  sets, the correlation and TOM matrices and the clusters.

the PCs and averages need to be calculated in the fitting functions,
  because these will change based on the CV fold
}
\examples{
library(magrittr)

# simulation parameters
rho = 0.90; p = 500 ;SNR = 1 ; n = 200; n0 = n1 = 100 ; nActive = p*0.10 ; cluster_distance = "tom";
Ecluster_distance = "difftom"; rhoOther = 0.6; betaMean = 2;
alphaMean = 1; betaE = 3; distanceMethod = "euclidean"; clustMethod = "hclust";
cutMethod = "dynamic"; agglomerationMethod = "average"

#in this simulation its blocks 3 and 4 that are important
#leaveOut:  optional specification of modules that should be left out
#of the simulation, that is their genes will be simulated as unrelated
#("grey"). This can be useful when simulating several sets, in some which a module
#is present while in others it is absent.
d0 <- s_modules(n = n0, p = p, rho = 0, exposed = FALSE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.01,
                maxCor = 1,
                corPower = 1,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE,
                leaveOut = 1:4)

d1 <- s_modules(n = n1, p = p, rho = rho, exposed = TRUE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.4,
                maxCor = 1,
                corPower = 0.3,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE)

truemodule1 <- d1$setLabels

X <- rbind(d0$datExpr, d1$datExpr) \%>\%
  magrittr::set_colnames(paste0("Gene", 1:p)) \%>\%
  magrittr::set_rownames(paste0("Subject",1:n))

betaMainEffect <- vector("double", length = p)
betaMainInteractions <- vector("double", length = p)

# the first nActive/2 in the 3rd block are active
betaMainEffect[which(truemodule1 \%in\% 3)[1:(nActive/2)]] <- runif(
  nActive/2, betaMean - 0.1, betaMean + 0.1)

# the first nActive/2 in the 4th block are active
betaMainEffect[which(truemodule1 \%in\% 4)[1:(nActive/2)]] <- runif(
  nActive/2, betaMean+2 - 0.1, betaMean+2 + 0.1)
betaMainInteractions[which(betaMainEffect!=0)] <- runif(nActive, alphaMean - 0.1, alphaMean + 0.1)
beta <- c(betaMainEffect, betaE, betaMainInteractions)
\dontrun{
result <- s_generate_data(p = p, X = X,
                          beta = beta,
                          include_interaction = TRUE,
                          cluster_distance = cluster_distance,
                          n = n, n0 = n0,
                          eclust_distance = Ecluster_distance,
                          signal_to_noise_ratio = SNR,
                          distance_method = distanceMethod,
                          cluster_method = clustMethod,
                          cut_method = cutMethod,
                          agglomeration_method = agglomerationMethod,
                          nPC = 1)
names(result)
}
}

