% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/runSimulation.R
\name{runSimulation}
\alias{head.SimDesign}
\alias{print.SimDesign}
\alias{runSimulation}
\alias{summary.SimDesign}
\alias{tail.SimDesign}
\title{Run a Monte Carlo simulation given a data.frame of conditions and simulation functions}
\usage{
runSimulation(design, replications, generate, analyse, summarise,
  fixed_objects = NULL, packages = NULL, filename = "SimDesign-results",
  save = FALSE, save_results = FALSE, save_seeds = FALSE,
  load_seed = NULL, seed = NULL, parallel = FALSE,
  ncores = parallel::detectCores(), cl = NULL, MPI = FALSE,
  max_errors = 50, as.factor = TRUE, save_generate_data = FALSE,
  save_details = list(), edit = "none", progress = FALSE,
  verbose = TRUE)

\method{print}{SimDesign}(x, drop.extras = FALSE, drop.design = FALSE,
  format.time = TRUE, ...)

\method{head}{SimDesign}(x, ...)

\method{tail}{SimDesign}(x, ...)

\method{summary}{SimDesign}(object, ...)
}
\arguments{
\item{design}{a \code{data.frame} object containing the Monte Carlo simulation conditions to
be studied, where each row represents a unique condition}

\item{replications}{number of replications to perform per condition (i.e., each row in \code{design}).
Must be greater than 0}

\item{generate}{user-defined data and parameter generating function.
See \code{\link{Generate}} for details}

\item{analyse}{user-defined computation function which acts on the data generated from
\code{\link{Generate}}. See \code{\link{Analyse}} for details}

\item{summarise}{optional (but recommended) user-defined summary function to be used
after all the replications have completed within each \code{design} condition. Omitting this function
will return a list of matrices (or a single matrix, if only one row in \code{design} is supplied)
or more general objects (such as lists) containing the results returned from \code{\link{Analyse}}.
Omitting this function is only recommended for didactic purposes because it leaves out a large amount of
information (e.g., try-errors, warning messages, etc) and generally is not as flexible internally. See
the \code{save_results} option for a better alternative to storing the Generate-Analyse results}

\item{fixed_objects}{(optional) an object (usually a \code{list})
containing additional user-defined objects
that should remain fixed across conditions. This is useful when including
long fixed vectors/matrices of population parameters, data
that should be used across all conditions and replications (e.g., including a fixed design matrix
for linear regression), or simply can be used to control constant global elements such as sample size}

\item{packages}{a character vector of external packages to be used during the simulation (e.g.,
\code{c('MASS', 'mvtnorm', 'simsem')} ). Use this input when \code{parallel = TRUE} or
\code{MPI = TRUE} to use non-standard functions from additional packages,
otherwise the functions must be made available by using explicit
\code{\link{library}} or \code{\link{require}} calls within the provided simulation functions.
Alternatively, functions can be called explicitly without attaching the package with \code{::}
(e.g., \code{mvtnorm::rmvnorm()})}

\item{filename}{(optional) the name of the \code{.rds} file to save the final simulation results to
when \code{save = TRUE}.
When \code{NULL}, the final simulation object is not saved to the drive. As well,
if the same file name already exists in the working directory at the time of saving then a new
file will be generated instead and a warning will be thrown; this helps avoid accidentally overwriting
existing files. Default is \code{'SimDesign-results'}}

\item{save}{logical; save the simulation state and final results to the hard-drive? This is useful
for simulations which require an extended amount of time. When \code{TRUE}, a temp file
will be created in the working directory which allows the simulation state to be saved
and recovered (in case of power outages, crashes, etc). To recover your simulation at the last known
location simply rerun the same code you used to initially define the simulation and the object
will automatically be detected and read-in. Upon completion, and if \code{filename} is not
\code{NULL}, the final results will also be saved to the working directory. Default is \code{FALSE}}

\item{save_results}{logical; save the results returned from \code{\link{Analyse}} to external
\code{.rds} files located in the defined \code{save_results_dirname} directory/folder?
Use this if you would like to keep track of the individual parameters returned from the analyses.
Each saved object will contain a list of three elements containing the condition (row from \code{design}),
results (as a \code{list} or \code{matrix}), and try-errors. When \code{TRUE}, a temp file will be used to track the simulation
state (in case of power outages, crashes, etc). When \code{TRUE}, temporary files will also be saved
to the working directory (in the same way as when \code{save = TRUE}) to better track the state of the simulation.
See \code{\link{SimResults}} for an example of how to read these \code{.rds} files back into R
after the simulation is complete. Default is \code{FALSE}}

\item{save_seeds}{logical; save the \code{.Random.seed} states prior to performing each replication into
plain text files located in the defined \code{save_seeds_dirname} directory/folder?
Use this if you would like to keep track of the simulation state within each replication and design
condition. Primarily, this is useful for completely replicating any cell in the simulation if need be,
especially when tracking down hard-to-find errors and bugs. As well, see the \code{load_seed} input
to load a given \code{.Random.seed} to exactly replicate the generated data and analysis state (handy
for debugging). When \code{TRUE}, temporary files will also be saved
to the working directory (in the same way as when \code{save = TRUE}) to better track the state of the simulation.
Default is \code{FALSE}}

\item{load_seed}{a character object indicating which file to load from when the \code{.Random.seed}s have
been saved (after a run with \code{save_seeds = TRUE}). E.g., \code{load_seed = 'design-row-2/seed-1'}
will load the first seed in the second row of the \code{design} input. Note that it is important NOT
to modify the \code{design} input object, otherwise the path may not point to the correct saved location.
Default is \code{NULL}}

\item{seed}{a vector of integers to be used for reproducibility.
The length of the vector must be equal to the number of rows in \code{design}.
This argument calls \code{\link{set.seed}} or
\code{\link{clusterSetRNGStream}} for each condition, respectively,
but will not be run when \code{MPI = TRUE}.
Default is \code{NULL}, indicating that no seed is set for each condition}

\item{parallel}{logical; use parallel processing from the \code{parallel} package over each
unique condition?}

\item{ncores}{number of cores to be used in parallel execution. Default uses all available}

\item{cl}{cluster object defined by \code{\link{makeCluster}} used to run code in parallel.
If \code{NULL} and \code{parallel = TRUE}, a local cluster object will be defined which
selects the maximum number of cores available
and will stop the cluster when the simulation is complete. Note that supplying a \code{cl}
object will automatically set the \code{parallel} argument to \code{TRUE}}

\item{MPI}{logical; use the \code{foreach} package in a form usable by MPI to run simulation
in parallel on a cluster? Default is \code{FALSE}}

\item{max_errors}{the simulation will terminate when more than this number of consecutive errors are thrown in any
given condition. The purpose of this is to indicate that likely something problematic is going
wrong in the generate-analyse phases and should be inspected. Default is 50}

\item{as.factor}{logical; coerce the input \code{design} elements into \code{factor}s when the
simulation is complete? If the column inputs are numeric then these will be treated
as \code{ordered}. Default is \code{TRUE}}

\item{save_generate_data}{logical; save the data returned from \code{\link{Generate}} to external \code{.rds} files
located in the defined \code{save_generate_data_dirname} directory/folder?
It is generally recommended to leave this argument as \code{FALSE} because saving datasets will often consume
a large amount of disk space, and by and large saving data is not required or recommended for simulations.
A more space-friendly version is available when using the \code{save_seeds} flag.
When \code{TRUE}, temporary files will also be saved
to the working directory (in the same way as when \code{save = TRUE}) to better track the state of the simulation.
Default is \code{FALSE}}

\item{save_details}{a list pertaining to information about how and where files should be saved
  when \code{save}, \code{save_results}, or \code{save_generate_data} are triggered.

  \describe{

    \item{\code{safe}}{logical; trigger whether safe-saving should be performed. When \code{TRUE} files
      will never be overwritten accidentally, and where appropriate the program will either stop or generate
      new files with unique names. Default is \code{TRUE}}

    \item{\code{compname}}{name of the computer running the simulation. Normally this doesn't need
      to be modified, but in the event that a manual node breaks down while running a simulation the
      results from the temp files may be resumed on another computer by changing the name of the
      node to match the broken computer. Default is the result of evaluating \code{unname(Sys.info()['nodename'])}}

    \item{\code{tmpfilename}}{the name of the temporary \code{.rds} file when any of the \code{save} flags are used.
       This file will be read-in if it is in the working directory and the simulation will continue
       at the last point this file was saved
       (useful in case of power outages or broken nodes). Finally, this file will be deleted when the
       simulation is complete. Default is the system name (\code{compname}) appended
       to \code{'SIMDESIGN-TEMPFILE_'}}

    \item{\code{save_results_dirname}}{a string indicating the name of the folder to save
      result objects to when \code{save_results = TRUE}. If a directory/folder does not exist
      in the current working directory then a unique one will be created automatically. Default is
      \code{'SimDesign-results_'} with the associated \code{compname} appended}

    \item{\code{save_seeds_dirname}}{a string indicating the name of the folder to save
      \code{.Random.seed} objects to when \code{save_seeds = TRUE}. If a directory/folder does not exist
      in the current working directory then one will be created automatically. Default is
      \code{'SimDesign-seeds_'} with the associated \code{compname} appended}

    \item{\code{save_generate_data_dirname}}{a string indicating the name of the folder to save
      data objects to when \code{save_generate_data = TRUE}. If a directory/folder does not exist
      in the current working directory then one will be created automatically.
      Within this folder nested directories will be created associated with each row in \code{design}.
      Default is \code{'SimDesign-generate-data_'} with the \code{compname} appended}

  }}

\item{edit}{a string indicating where to initiate a \code{browser()} call for editing and debugging.
  General options are \code{'none'} (default) and \code{'all'}, which are used
  to disable debugging and to debug all the user defined functions, respectively.
  Specific options include: \code{'generate'}
  to edit the data simulation function, \code{'analyse'} to edit the computational function, and
  \code{'summarise'} to  edit the aggregation function.

  Alternatively, users may place \code{\link{browser}} calls within the respective functions for
  debugging at specific lines (note: parallel computation flags will automatically be disabled
  when a \code{browser()} is detected)}

\item{progress}{logical; display a progress bar for each simulation condition?
This is useful when simulation conditions take a long time to run.
Uses the \code{pbapply} package to display the progress. Default is \code{FALSE}}

\item{verbose}{logical; print messages to the R console? Default is \code{TRUE}}

\item{x}{SimDesign object returned from \code{\link{runSimulation}}}

\item{drop.extras}{logical; don't print information about warnings, errors, simulation time, and replications?
Default is \code{FALSE}}

\item{drop.design}{logical; don't include information about the (potentially factorized) simulation design?
This may be useful if you wish to \code{cbind()} the original design \code{data.frame} to the simulation
results instead of using the auto-factorized version. Default is \code{FALSE}}

\item{format.time}{logical; format \code{SIM_TIME} into a day/hour/min/sec character vector? Default is
\code{TRUE}}

\item{...}{additional arguments}

\item{object}{SimDesign object returned from \code{\link{runSimulation}}}
}
\value{
a \code{data.frame} (also of class \code{'SimDesign'})
  with the original \code{design} conditions in the left-most columns,
  simulation results and ERROR/WARNING's (if applicable) in the middle columns,
  and additional information (such as REPLICATIONS, SIM_TIME, COMPLETED, and SEED) in the right-most
  columns.
}
\description{
This function runs a Monte Carlo simulation study given a set of predefined simulation functions,
design conditions, and number of replications. Results can be saved as temporary files in case of interruptions
and may be restored by re-running \code{runSimulation}, provided that the respective temp
file can be found in the working directory. \code{runSimulation} supports parallel
and cluster computing, global and local debugging, error handling (including fail-safe
stopping when functions fail too often, even across nodes), and tracking of error and warning messages.
For convenience, all functions available in the R workspace are exported across all computational nodes
so that they are more easily accessible (however, other R objects are not, and therefore
must be passed to the \code{fixed_objects} input to become available across nodes).
For a didactic presentation of the package refer to Sigal and Chalmers (2016).
}
\details{
The strategy for organizing the Monte Carlo simulation work-flow is to

\describe{
   \item{1)}{Define a suitable \code{design} data.frame object containing fixed conditional
      information about the Monte Carlo simulations. This is often expedited by using the
      \code{\link{expand.grid}} function, and if necessary using the \code{\link{subset}}
      function to remove redundant or non-applicable rows}
   \item{2)}{Define the three step functions to generate the data (\code{\link{Generate}}),
      analyse the generated data by computing the respective parameter estimates, detection rates,
      etc (\code{\link{Analyse}}), and finally summarise the results across the total
      number of replications (\code{\link{Summarise}}). Note that these functions can be
      automatically generated by using the \code{\link{SimFunctions}} function.
   }
   \item{3)}{Pass the above objects to the \code{runSimulation} function, and declare the
      number of replications to perform with the \code{replications} input. This function will accept
      a \code{design} data.frame object and will return a suitable data.frame object with the
      simulation results}
   \item{4)}{Analyze the output from \code{runSimulation}, possibly using ANOVA techniques
     (\code{\link{SimAnova}}) and generating suitable plots and tables}
}

For a skeleton version of the work-flow, which is often useful when initially defining a simulation,
see \code{\link{SimFunctions}}. This function will write template simulation code
to one/two files so that modifying the required functions and objects can begin immediately
with minimal error. This means that you can focus on your Monte Carlo simulation immediately rather
than worrying about the administrative code-work required to organize the simulation work-flow.

Additional information for each condition is also contained in the \code{data.frame} object returned by
\code{runSimulation}: \code{REPLICATIONS} to indicate the number of Monte Carlo replications,
\code{SIM_TIME} to indicate how long (in seconds) it took to complete
all the Monte Carlo replications for each respective design condition,
\code{COMPLETED} to indicate the date on which the given simulation condition completed,
\code{SEED} if the \code{seed} argument
was used, columns containing the number of replications which had to be re-run due to errors (where the error messages
represent the names of the columns prefixed with a \code{ERROR:} string), and
columns containing the number of warnings prefixed with a \code{WARNING:} string.

Additional examples, presentation files, and tutorials can be found on the package wiki located at
\url{https://github.com/philchalmers/SimDesign/wiki}.
}
\section{Saving data, results, seeds, and the simulation state}{


To conserve RAM, temporary objects (such as data generated across conditions and replications)
are discarded; however, these can be saved to the hard-disk by passing the appropriate flags.
For longer simulations it is recommended to use \code{save = TRUE} to temporarily save the
simulation state, and to use the \code{save_results} flag to write the analysis results
to the hard-disk.

The generated data can be saved by passing
\code{save_generate_data = TRUE}, however it is often more memory efficient to use the
\code{save_seeds} option instead to only save R's \code{.Random.seed} state instead (still
allowing for complete reproducibility); individual \code{.Random.seed} terms may also be read in with the
\code{load_seed} input to reproduce the exact simulation state at any given replication. Finally,
providing a vector of integers to the \code{seed} argument is also possible to ensure
that each simulation condition is completely reproducible under the single/multi-core method selected.

Finally, when the Monte Carlo simulation is complete
it is recommended to write the results to a hard-drive for safe keeping, particularly with the
\code{save} and \code{filename} arguments provided (for reasons that are more obvious in the parallel computation
descriptions below). Using the \code{filename} argument (along with \code{save = TRUE})
supplied is much safer than using something
like \code{\link{saveRDS}} directly because files will never accidentally be overwritten,
and instead a new file name will be created when a conflict arises; this type of safety
is prevalent in many aspects of the package and helps to avoid many unrecoverable (yet surprisingly common)
mistakes.
}

\section{Resuming temporary results}{


In the event of a computer crash, power outage, etc, if \code{save = TRUE} was used
then the original code used to execute \code{runSimulation()} need only be re-run to resume the simulation.
The saved temp file will be read into the function automatically, and the simulation will continue
on the condition where it left off before the simulation state was terminated.
}

\section{A note on parallel computing}{


When running simulations in parallel (either with \code{parallel = TRUE} or \code{MPI = TRUE})
R objects defined in the global environment will generally \emph{not} be visible across nodes.
Hence, you may see errors such as \code{Error: object 'something' not found} if you try to use an object
that is defined in the workspace but is not passed to \code{runSimulation}.
To avoid this type of error, simply pass additional objects to the
\code{fixed_objects} input (usually it's convenient to supply a named list of these objects).
Fortunately, however, \emph{custom functions defined in the global environment are exported across
nodes automatically}. This makes it convenient when writing code because custom functions will
always be available across nodes if they are visible in the R workspace. As well, note the
\code{packages} input to declare packages which must be loaded via \code{library()} in order to make
specific non-standard R functions available across nodes.
}

\section{Cluster computing}{


SimDesign code may be released to a computing system which supports parallel cluster computations using
the industry standard Message Passing Interface (MPI) form. This simply
requires that the computers be setup using the usual MPI requirements (typically, running some flavor
of Linux, have password-less open-SSH access, IP addresses have been added to the \code{/etc/hosts} file
or \code{~/.ssh/config}, etc).
More generally though, these resources are widely available through professional
organizations dedicated to super-computing.

To setup the R code for an MPI cluster one need only add the argument \code{MPI = TRUE},
wrap the appropriate MPI directives around \code{runSimulation}, and submit the
files using the suitable BASH commands to execute the \code{mpirun} tool. For example,

\describe{
  \item{\code{library(doMPI)}}{}
  \item{\code{cl <- startMPIcluster()}}{}
  \item{\code{registerDoMPI(cl)}}{}
  \item{\code{runSimulation(design=Design, replications=1000, save=TRUE, filename='mysimulation',
    generate=Generate, analyse=Analyse, summarise=Summarise,  MPI=TRUE)}}{}
  \item{\code{closeCluster(cl)}}{}
  \item{\code{mpi.quit()}}{}
}

The necessary SimDesign files must be uploaded to the dedicated master node
so that a BASH call to \code{mpirun} can be used to distribute the work across slaves.
For instance, if the following BASH command is run on the master node then 16 processes
will be summoned (1 master, 15 slaves) across the computers named \code{localhost}, \code{slave1},
and \code{slave2} in the ssh \code{config} file.

\code{mpirun -np 16 -H localhost,slave1,slave2 R --slave -f simulation.R}
}

\section{Network computing}{


If you have access to a set of computers which can be linked via secure-shell (ssh) on the same LAN then
Network computing (a.k.a., a Beowulf cluster) may be a viable and useful option.
This approach is similar to the MPI computing approach
except that it offers more localized control and requires more hands-on administrative access to the master
and slave nodes. The setup generally requires that the master node
has \code{SimDesign} installed and the slave/master nodes have all the required R packages pre-installed
(Unix utilities such as \code{dsh} are very useful for this purpose). Finally,
the master node must have ssh access to the slave nodes, each slave node must have ssh access
with the master node, and a cluster object (\code{cl}) from the \code{parallel} package must be defined on the
master node.

Setup for network computing is generally more straightforward and controlled
than the setup for MPI jobs in that it only requires the specification of a) the respective
IP addresses within a defined R script, and b) the user name
(if different from the master node's user name. Otherwise, only a) is required).
However, on Linux I have found it is also important to include relevant information about the host names
and IP addresses in the \code{/etc/hosts} file on the master and slave nodes, and to ensure that
the selected port (passed to \code{\link{makeCluster}}) on the master node is not hindered by a firewall.

As an example, using the following code the master node (primary) will spawn 7 slaves and 1 master,
while a separate computer on the network with the associated IP address will spawn an additional 6 slaves.
Information will be collected on the master node, which is also where the files
and objects will be saved using the \code{save} inputs (if requested).

\describe{
  \item{\code{library(parallel)}}{}
  \item{\code{primary <- '192.168.2.1'}}{}
  \item{\code{IPs <- list(list(host=primary, user='myname', ncore=8), list(host='192.168.2.2', user='myname', ncore=6))}}{}
  \item{\code{spec <- lapply(IPs, function(IP) rep(list(list(host=IP$host, user=IP$user)), IP$ncore))}}{}
  \item{\code{spec <- unlist(spec, recursive=FALSE)}}{}
  \item{\code{cl <- makeCluster(master=primary, spec=spec)}}{}
  \item{\code{Final <- runSimulation(..., cl=cl)}}{}
  \item{\code{stopCluster(cl)}}{}
}

The object \code{cl} is passed to \code{runSimulation} on the master node
and the computations are distributed across the respective
IP addresses. Finally, it's usually good practice to use \code{stopCluster(cl)}
when all the simulations are said and done to release the communication between the computers,
which is what the above code shows.

Alternatively, if you have provided suitable names for each respective slave node, as well as the master,
then you can define the \code{cl} object using these instead (rather than supplying the IP addresses in
your R script). This requires that the master node has itself and all the slave nodes defined in the
\code{/etc/hosts} and \code{~/.ssh/config} files, while the slave nodes require themselves and the
master node in the same files (only 2 IP addresses required on each slave).
Following this setup, and assuming the user name is the same across all nodes,
the \code{cl} object could instead be defined with

\describe{
  \item{\code{library(parallel)}}{}
  \item{\code{primary <- 'master'}}{}
  \item{\code{IPs <- list(list(host=primary, ncore=8), list(host='slave', ncore=6))}}{}
  \item{\code{spec <- lapply(IPs, function(IP) rep(list(list(host=IP$host)), IP$ncore))}}{}
  \item{\code{spec <- unlist(spec, recursive=FALSE)}}{}
  \item{\code{cl <- makeCluster(master=primary, spec=spec)}}{}
  \item{\code{Final <- runSimulation(..., cl=cl)}}{}
  \item{\code{stopCluster(cl)}}{}
}

Or, even more succinctly if all communication elements required are identical to the master node,

\describe{
  \item{\code{library(parallel)}}{}
  \item{\code{primary <- 'master'}}{}
  \item{\code{spec <- c(rep(primary, 8), rep('slave', 6))}}{}
  \item{\code{cl <- makeCluster(master=primary, spec=spec)}}{}
  \item{\code{Final <- runSimulation(..., cl=cl)}}{}
  \item{\code{stopCluster(cl)}}{}
}
}

\section{Poor man's cluster computing for independent nodes}{


In the event that you do not have access to a Beowulf-type cluster (described in the section on
"Network Computing") but have multiple personal
computers then the simulation code can be manually distributed across each independent computer instead.
This simply requires passing a smaller value to the \code{replications} argument on each computer and later
aggregating the results using the \code{\link{aggregate_simulations}} function.

For instance, if you have two computers available on different networks and wanted a total of 500 replications you
could pass \code{replications = 300} to one computer and \code{replications = 200} to the other along
with a \code{filename} argument (or simply saving the final objects as \code{.rds} files manually after
\code{runSimulation()} has finished). This will create two distinct \code{.rds} files which can be
combined later with the \code{\link{aggregate_simulations}} function. The benefit of this approach over
MPI or setting up a Beowulf cluster is that computers need not be linked on the same network,
and, should the need arise, the temporary
simulation results can be migrated to another computer in case of a complete hardware failure by moving the
saved temp files to another node, modifying
the suitable \code{compname} input to \code{save_details} (or, if the \code{filename} and \code{tmpfilename}
were modified, matching those files accordingly), and resuming the simulation as normal.

Note that this is also a useful tactic if the MPI or Network computing options require you to
submit smaller jobs due to time and resource constraint-related reasons,
where fewer replications/nodes should be requested. After all the jobs are completed and saved to their
respective files, \code{\link{aggregate_simulations}}
can then collapse the files as if the simulations were run all at once. Hence, SimDesign makes submitting
smaller jobs to super-computing resources considerably less error prone than managing a number of smaller
jobs manually.
}
\examples{

#-------------------------------------------------------------------------------
# Example 1: Sampling distribution of mean

# This example demonstrates some of the simpler uses of SimDesign,
# particularly for classroom settings. The only factor varied in this simulation
# is sample size.

# skeleton functions to be saved and edited
SimFunctions()

#### Step 1 --- Define your conditions under study and create design data.frame

Design <- data.frame(N = c(10, 20, 30))

#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 2 --- Define generate, analyse, and summarise functions

# help(Generate)
Generate <- function(condition, fixed_objects = NULL){
    # draw N observations (N taken from the current design row) from a
    # normal population with mean 10 and SD 5
    rnorm(n = condition$N, mean = 10, sd = 5)
}

# help(Analyse)
Analyse <- function(condition, dat, fixed_objects = NULL){
    # the statistic of interest is the mean of the generated sample vector
    mean(dat)
}

# help(Summarise)
Summarise <- function(condition, results, fixed_objects = NULL){
    # summarise the replicated sample means: their average (mu) and their
    # standard deviation (SE)
    c(mu = mean(results), SE = sd(results))
}


#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 3 --- Collect results by looping over the rows in design

# run the simulation
Final <- runSimulation(design=Design, replications=1000,
                       generate=Generate, analyse=Analyse, summarise=Summarise)
Final


#~~~~~~~~~~~~~~~~~~~~~~~~
#### Extras
# compare SE estimates to the true SEs from the formula sigma/sqrt(N)
5 / sqrt(Design$N)

# To store the results from the analyse function either
#   a) omit a definition of summarise(), or
#   b) pass save_results = TRUE to runSimulation() and read the results in with SimResults()

# e.g., the a) approach
results <- runSimulation(design=Design, replications=1000,
                       generate=Generate, analyse=Analyse)
str(results)
head(results[[1]])

# or b) approach
Final <- runSimulation(design=Design, replications=1000, save_results=TRUE,
                       generate=Generate, analyse=Analyse, summarise=Summarise)
results <- SimResults(Final)
str(results)
head(results[[1]]$results)

# remove the saved results from the hard-drive if you no longer want them
SimClean(results = TRUE)




#-------------------------------------------------------------------------------
# Example 2: t-test and Welch test when varying sample size, group sizes, and SDs

# skeleton functions to be saved and edited
SimFunctions()

\dontrun{
# in real-world simulations it's often better/easier to save
# these functions directly to your hard-drive with
SimFunctions('my-simulation')
}

#### Step 1 --- Define your conditions under study and create design data.frame

Design <- expand.grid(sample_size = c(30, 60, 90, 120),
                      group_size_ratio = c(1, 4, 8),
                      standard_deviation_ratio = c(.5, 1, 2))
dim(Design)
head(Design)

#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 2 --- Define generate, analyse, and summarise functions

Generate <- function(condition, fixed_objects = NULL){
    # pull the design factors for this condition
    # (alternatively, could use Attach() to make objects available)
    N <- condition$sample_size
    grs <- condition$group_size_ratio
    sd_ratio <- condition$standard_deviation_ratio

    # Split N between the two groups according to the group size ratio.
    # One group size is rounded and the other is taken as the remainder, so
    # both sizes are whole numbers that always sum to N. Without rounding,
    # rnorm() and rep() silently truncate fractional sizes and the data set
    # ends up with fewer than N rows (e.g., N = 30 with a ratio of 8)
    if(grs < 1){
        N2 <- round(N / (1/grs + 1))
        N1 <- N - N2
    } else {
        N1 <- round(N / (grs + 1))
        N2 <- N - N1
    }

    # group 1 is standard normal; the SD of group 2 is set by the SD ratio
    group1 <- rnorm(N1)
    group2 <- rnorm(N2, sd=sd_ratio)
    dat <- data.frame(group = c(rep('g1', N1), rep('g2', N2)), DV = c(group1, group2))
    dat
}

Analyse <- function(condition, dat, fixed_objects = NULL){
    # p-values from two t-tests comparing DV across the groups: one with
    # the t.test() defaults and one assuming equal variances
    p_welch <- t.test(DV ~ group, data = dat)$p.value
    p_pooled <- t.test(DV ~ group, data = dat, var.equal = TRUE)$p.value

    # return the p-values as a named vector; naming each element makes
    # them easy to refer to later
    c(welch = p_welch, independent = p_pooled)
}

Summarise <- function(condition, results, fixed_objects = NULL){
    # summarise the p-values with EDR() at alpha = .05; other cutoffs of
    # interest (e.g., .1 or .01) could be used here instead
    EDR(results, alpha = .05)
}


#~~~~~~~~~~~~~~~~~~~~~~~~
#### Step 3 --- Collect results by looping over the rows in design

# first, a quick trial with only a few replications to check that everything runs
Final <- runSimulation(design = Design,
                       replications = 5,
                       generate = Generate,
                       analyse = Analyse,
                       summarise = Summarise)
head(Final)

\dontrun{
# complete run with 1000 replications per condition
Final <- runSimulation(design = Design, replications = 1000,
                       generate = Generate, analyse = Analyse, summarise = Summarise,
                       parallel = TRUE)
head(Final, digits = 3)
View(Final)

## save final results to a file upon completion (not run)
runSimulation(design = Design, replications = 1000,
              generate = Generate, analyse = Analyse, summarise = Summarise,
              parallel = TRUE, save = TRUE, filename = 'mysim')



## Debug the generate function (see ?browser for help on debugging).
##   At the prompt, type help to see the available commands (e.g., n, c, where, ...),
##   ls() to see what has been defined, and Q to quit the debugger
runSimulation(design = Design, replications = 1000, parallel = TRUE,
              generate = Generate, analyse = Analyse, summarise = Summarise,
              edit = 'generate')

## Alternatively, place a browser() within the desired function line to
##   jump to a specific location
Summarise <- function(condition, results, fixed_objects = NULL){
    #find results of interest here (e.g., alpha < .1, .05, .01)
    # (results is passed whole: the example defines no column-name vector to subset by)
    ret <- EDR(results, alpha = .05)
    # execution pauses on the next line, with condition, results, and ret
    # all available for inspection
    browser()
    ret
}

# run again, now using the Summarise definition that contains the browser() call
runSimulation(design = Design, replications = 1000, parallel = TRUE,
              generate = Generate, analyse = Analyse, summarise = Summarise)




## EXTRA: To run the simulation on a MPI cluster, use the following setup on each node (not run)
# library(doMPI)
# cl <- startMPIcluster()
# registerDoMPI(cl)
# Final <- runSimulation(design=Design, replications=1000, MPI=TRUE, save=TRUE,
#                        generate=Generate, analyse=Analyse, summarise=Summarise)
# saveRDS(Final, 'mysim.rds')
# closeCluster(cl)
# mpi.quit()


## Similarly, run simulation on a network linked via ssh
##  (two way ssh key-paired connection must be possible between master and slave nodes)
##
## define IP addresses, including primary IP
# primary <- '192.168.2.20'
# IPs <- list(
#     list(host=primary, user='phil', ncore=8),
#     list(host='192.168.2.17', user='phil', ncore=8)
# )
# spec <- lapply(IPs, function(IP)
#                    rep(list(list(host=IP$host, user=IP$user)), IP$ncore))
# spec <- unlist(spec, recursive=FALSE)
#
# cl <- parallel::makeCluster(type='PSOCK', master=primary, spec=spec)
# Final <- runSimulation(design=Design, replications=1000, parallel = TRUE, save=TRUE,
#                        generate=Generate, analyse=Analyse, summarise=Summarise, cl=cl)

#~~~~~~~~~~~~~~~~~~~~~~~~
###### Post-analysis: Analyze the results via functions like lm() or SimAnova(), and create
###### tables (dplyr) or plots (ggplot2) to help visualize the results.
###### This is where you get to be a data analyst!

library(dplyr)
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr releases
Final2 <- as_tibble(Final)
# means of the welch and independent columns across all rows
Final2 \%>\% summarise(mean(welch), mean(independent))
# the same means within each combination of SD ratio and group-size ratio
Final2 \%>\% group_by(standard_deviation_ratio, group_size_ratio) \%>\%
   summarise(mean(welch), mean(independent))

# quick ANOVA analysis method with all two-way interactions
SimAnova(~ (sample_size + group_size_ratio + standard_deviation_ratio)^2,
         Final)

# or a more specific ANOVA, naming the outcome on the left-hand side
SimAnova(independent ~ (group_size_ratio + standard_deviation_ratio)^2, Final)

# make some plots
library(ggplot2)
library(reshape2)
keep <- c('group_size_ratio', "standard_deviation_ratio", "welch", "independent")
welch_ind <- Final[, keep]
# long format: the first two columns are kept as identifiers
dd <- melt(welch_ind, id.vars = names(welch_ind)[1:2])

# horizontal reference lines used by both plots:
# solid at .05, dotted at .075 and .025
ref_lines <- list(
    geom_abline(intercept=0.05, slope=0, col = 'red'),
    geom_abline(intercept=0.075, slope=0, col = 'red', linetype='dotted'),
    geom_abline(intercept=0.025, slope=0, col = 'red', linetype='dotted')
)

ggplot(dd, aes(factor(group_size_ratio), value)) +
    ref_lines +
    geom_boxplot() + facet_wrap(~variable)

ggplot(dd, aes(factor(group_size_ratio), value, fill = factor(standard_deviation_ratio))) +
    ref_lines +
    geom_boxplot() + facet_grid(variable~standard_deviation_ratio) +
    theme(legend.position = 'none')

}

}
\references{
Sigal, M. J., & Chalmers, R. P. (2016). Play it again: Teaching statistics with Monte
Carlo simulation. \emph{Journal of Statistics Education, 24}(3), 136-156.
}
\seealso{
\code{\link{Generate}}, \code{\link{Analyse}}, \code{\link{Summarise}},
  \code{\link{SimFunctions}}, \code{\link{SimClean}}, \code{\link{SimAnova}}, \code{\link{SimResults}},
  \code{\link{aggregate_simulations}}, \code{\link{Attach}}, \code{\link{SimShiny}}
}

